├── src ├── notebook │ ├── CV.pdf │ ├── skills_extraction.py │ ├── url_data_scientist_loc_bangalore.json │ ├── skills.csv │ ├── jd_data_extraction.ipynb │ └── job_recommendation.ipynb ├── components │ ├── job_recommender.py │ ├── jd_data_cleaner.py │ └── jd_data_extractor.py └── data │ ├── url_data_scientist_loc_bangalore.json │ └── skills.csv ├── myenv ├── Scripts │ ├── f2py.exe │ ├── ftfy.exe │ ├── nltk.exe │ ├── pip.exe │ ├── pip3.exe │ ├── tqdm.exe │ ├── ttx.exe │ ├── ipython.exe │ ├── jupyter.exe │ ├── pathy.exe │ ├── pip3.10.exe │ ├── python.exe │ ├── pythonw.exe │ ├── spacy.exe │ ├── wheel.exe │ ├── fonttools.exe │ ├── ipython3.exe │ ├── pyftmerge.exe │ ├── python_d.exe │ ├── pythonw_d.exe │ ├── streamlit.exe │ ├── watchmedo.exe │ ├── chardetect.exe │ ├── jsonschema.exe │ ├── jupyter-run.exe │ ├── markdown-it.exe │ ├── normalizer.exe │ ├── pyftsubset.exe │ ├── pygmentize.exe │ ├── pyresparser.exe │ ├── jupyter-kernel.exe │ ├── jupyter-migrate.exe │ ├── jupyter-kernelspec.exe │ ├── jupyter-troubleshoot.exe │ ├── docx2txt │ ├── deactivate.bat │ ├── streamlit.cmd │ ├── activate.bat │ ├── activate │ ├── plac_runner.py │ ├── pywin32_testall.py │ ├── pdf2txt.py │ └── dumppdf.py ├── pyvenv.cfg ├── etc │ └── jupyter │ │ └── nbconfig │ │ └── notebook.d │ │ └── pydeck.json ├── docx-template │ ├── docProps │ │ └── thumbnail.jpeg │ ├── _rels │ │ └── .rels │ └── word │ │ ├── settings.xml │ │ ├── fontTable.xml │ │ ├── styles.xml │ │ ├── theme │ │ └── theme1.xml │ │ └── numbering.xml └── share │ ├── jupyter │ ├── kernels │ │ └── python3 │ │ │ ├── logo-32x32.png │ │ │ ├── logo-64x64.png │ │ │ ├── kernel.json │ │ │ └── logo-svg.svg │ └── nbextensions │ │ └── pydeck │ │ └── extensionRequires.js │ └── man │ └── man1 │ ├── ipython.1 │ └── ttx.1 ├── utilities └── resumes │ └── CV.pdf ├── chromedriver_win32 └── chromedriver.exe ├── requirements.txt ├── setup.py ├── __init__.py ├── .gitignore └── README.md /src/notebook/CV.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/src/notebook/CV.pdf -------------------------------------------------------------------------------- /myenv/Scripts/f2py.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/f2py.exe -------------------------------------------------------------------------------- /myenv/Scripts/ftfy.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ftfy.exe -------------------------------------------------------------------------------- /myenv/Scripts/nltk.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/nltk.exe -------------------------------------------------------------------------------- /myenv/Scripts/pip.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pip.exe -------------------------------------------------------------------------------- /myenv/Scripts/pip3.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pip3.exe -------------------------------------------------------------------------------- /myenv/Scripts/tqdm.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/tqdm.exe -------------------------------------------------------------------------------- 
/myenv/Scripts/ttx.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ttx.exe -------------------------------------------------------------------------------- /myenv/pyvenv.cfg: -------------------------------------------------------------------------------- 1 | home = C:\Program Files\Python310 2 | include-system-site-packages = false 3 | version = 3.10.10 4 | -------------------------------------------------------------------------------- /myenv/Scripts/ipython.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ipython.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter.exe -------------------------------------------------------------------------------- /myenv/Scripts/pathy.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pathy.exe -------------------------------------------------------------------------------- /myenv/Scripts/pip3.10.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pip3.10.exe -------------------------------------------------------------------------------- /myenv/Scripts/python.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/python.exe 
-------------------------------------------------------------------------------- /myenv/Scripts/pythonw.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pythonw.exe -------------------------------------------------------------------------------- /myenv/Scripts/spacy.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/spacy.exe -------------------------------------------------------------------------------- /myenv/Scripts/wheel.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/wheel.exe -------------------------------------------------------------------------------- /utilities/resumes/CV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/utilities/resumes/CV.pdf -------------------------------------------------------------------------------- /myenv/Scripts/fonttools.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/fonttools.exe -------------------------------------------------------------------------------- /myenv/Scripts/ipython3.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/ipython3.exe -------------------------------------------------------------------------------- /myenv/Scripts/pyftmerge.exe: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pyftmerge.exe -------------------------------------------------------------------------------- /myenv/Scripts/python_d.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/python_d.exe -------------------------------------------------------------------------------- /myenv/Scripts/pythonw_d.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pythonw_d.exe -------------------------------------------------------------------------------- /myenv/Scripts/streamlit.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/streamlit.exe -------------------------------------------------------------------------------- /myenv/Scripts/watchmedo.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/watchmedo.exe -------------------------------------------------------------------------------- /myenv/Scripts/chardetect.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/chardetect.exe -------------------------------------------------------------------------------- /myenv/Scripts/jsonschema.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jsonschema.exe -------------------------------------------------------------------------------- 
/myenv/Scripts/jupyter-run.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-run.exe -------------------------------------------------------------------------------- /myenv/Scripts/markdown-it.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/markdown-it.exe -------------------------------------------------------------------------------- /myenv/Scripts/normalizer.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/normalizer.exe -------------------------------------------------------------------------------- /myenv/Scripts/pyftsubset.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pyftsubset.exe -------------------------------------------------------------------------------- /myenv/Scripts/pygmentize.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pygmentize.exe -------------------------------------------------------------------------------- /myenv/Scripts/pyresparser.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/pyresparser.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-kernel.exe: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-kernel.exe -------------------------------------------------------------------------------- /myenv/etc/jupyter/nbconfig/notebook.d/pydeck.json: -------------------------------------------------------------------------------- 1 | { 2 | "load_extensions": { 3 | "pydeck/extension": true 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /chromedriver_win32/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/chromedriver_win32/chromedriver.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-migrate.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-migrate.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-kernelspec.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-kernelspec.exe -------------------------------------------------------------------------------- /myenv/Scripts/jupyter-troubleshoot.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/Scripts/jupyter-troubleshoot.exe -------------------------------------------------------------------------------- /myenv/docx-template/docProps/thumbnail.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/docx-template/docProps/thumbnail.jpeg 
-------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/logo-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/share/jupyter/kernels/python3/logo-32x32.png -------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/logo-64x64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abbas99-hub/Job-Recommendation-System/HEAD/myenv/share/jupyter/kernels/python3/logo-64x64.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | seaborn 4 | matplotlib 5 | selenium 6 | tqdm 7 | bs4 8 | regex 9 | docx 10 | nltk 11 | spacy 12 | python-docx 13 | scikit-learn 14 | ftfy 15 | PyPDF2 16 | pyresparser 17 | streamlit 18 | -e . 
-------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "argv": [ 3 | "python", 4 | "-m", 5 | "ipykernel_launcher", 6 | "-f", 7 | "{connection_file}" 8 | ], 9 | "display_name": "Python 3 (ipykernel)", 10 | "language": "python", 11 | "metadata": { 12 | "debugger": true 13 | } 14 | } -------------------------------------------------------------------------------- /myenv/Scripts/docx2txt: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | 3 | import docx2txt 4 | 5 | if __name__ == '__main__': 6 | import sys 7 | args = docx2txt.process_args() 8 | text = docx2txt.process(args.docx, args.img_dir) 9 | output = getattr(sys.stdout, 'buffer', sys.stdout) 10 | output.write(text.encode('utf-8')) 11 | -------------------------------------------------------------------------------- /myenv/share/jupyter/nbextensions/pydeck/extensionRequires.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | define(function() { 3 | 'use strict'; 4 | requirejs.config({ 5 | map: { 6 | '*': { 7 | '@deck.gl/jupyter-widget': 'nbextensions/pydeck/index' 8 | } 9 | } 10 | }); 11 | // Export the required load_ipython_extension function 12 | return { 13 | load_ipython_extension: function() {} 14 | }; 15 | }); 16 | -------------------------------------------------------------------------------- /myenv/Scripts/deactivate.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | if defined _OLD_VIRTUAL_PROMPT ( 4 | set "PROMPT=%_OLD_VIRTUAL_PROMPT%" 5 | ) 6 | set _OLD_VIRTUAL_PROMPT= 7 | 8 | if defined _OLD_VIRTUAL_PYTHONHOME ( 9 | set "PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME%" 10 | set 
_OLD_VIRTUAL_PYTHONHOME= 11 | ) 12 | 13 | if defined _OLD_VIRTUAL_PATH ( 14 | set "PATH=%_OLD_VIRTUAL_PATH%" 15 | ) 16 | 17 | set _OLD_VIRTUAL_PATH= 18 | 19 | set VIRTUAL_ENV= 20 | set VIRTUAL_ENV_PROMPT= 21 | 22 | :END 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages,setup 2 | 3 | HYPHEN_DOT_E='-e .' 4 | def get_requirements(file_path:str)->list[str]: 5 | requirements=[] 6 | with open(file_path) as file_obj: 7 | requirements=file_obj.readlines() 8 | requirements=[req.replace("\n"," ") for req in requirements] 9 | if HYPHEN_DOT_E in requirements: 10 | requirements.remove(HYPHEN_DOT_E) 11 | return requirements 12 | 13 | 14 | 15 | setup( 16 | name='Job_Recommendation_System', 17 | version='0.0.1', 18 | author='Abbas Behrainwala', 19 | author_email='abbasbehrain95@gmail.com', 20 | packages=find_packages(), 21 | install_requires=get_requirements('requirements.txt') 22 | 23 | ) -------------------------------------------------------------------------------- /myenv/Scripts/streamlit.cmd: -------------------------------------------------------------------------------- 1 | rem Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022) 2 | rem 3 | rem Licensed under the Apache License, Version 2.0 (the "License"); 4 | rem you may not use this file except in compliance with the License. 5 | rem You may obtain a copy of the License at 6 | rem 7 | rem http://www.apache.org/licenses/LICENSE-2.0 8 | rem 9 | rem Unless required by applicable law or agreed to in writing, software 10 | rem distributed under the License is distributed on an "AS IS" BASIS, 11 | rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | rem See the License for the specific language governing permissions and 13 | rem limitations under the License. 
14 | 15 | @echo OFF 16 | python -m streamlit %* 17 | -------------------------------------------------------------------------------- /myenv/docx-template/_rels/.rels: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /myenv/Scripts/activate.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem This file is UTF-8 encoded, so we need to update the current code page while executing it 4 | for /f "tokens=2 delims=:." %%a in ('"%SystemRoot%\System32\chcp.com"') do ( 5 | set _OLD_CODEPAGE=%%a 6 | ) 7 | if defined _OLD_CODEPAGE ( 8 | "%SystemRoot%\System32\chcp.com" 65001 > nul 9 | ) 10 | 11 | set VIRTUAL_ENV=C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv 12 | 13 | if not defined PROMPT set PROMPT=$P$G 14 | 15 | if defined _OLD_VIRTUAL_PROMPT set PROMPT=%_OLD_VIRTUAL_PROMPT% 16 | if defined _OLD_VIRTUAL_PYTHONHOME set PYTHONHOME=%_OLD_VIRTUAL_PYTHONHOME% 17 | 18 | set _OLD_VIRTUAL_PROMPT=%PROMPT% 19 | set PROMPT=(myenv) %PROMPT% 20 | 21 | if defined PYTHONHOME set _OLD_VIRTUAL_PYTHONHOME=%PYTHONHOME% 22 | set PYTHONHOME= 23 | 24 | if defined _OLD_VIRTUAL_PATH set PATH=%_OLD_VIRTUAL_PATH% 25 | if not defined _OLD_VIRTUAL_PATH set _OLD_VIRTUAL_PATH=%PATH% 26 | 27 | set PATH=%VIRTUAL_ENV%\Scripts;%PATH% 28 | set VIRTUAL_ENV_PROMPT=(myenv) 29 | 30 | :END 31 | if defined _OLD_CODEPAGE ( 32 | "%SystemRoot%\System32\chcp.com" %_OLD_CODEPAGE% > nul 33 | set _OLD_CODEPAGE= 34 | ) 35 | -------------------------------------------------------------------------------- /src/notebook/skills_extraction.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy.matcher import Matcher 3 | import PyPDF2 4 | import os 5 | 6 | # Load the Spacy English model 7 | nlp = 
spacy.load('en_core_web_sm') 8 | import csv 9 | from spacy.matcher import Matcher 10 | import csv 11 | 12 | # Read skills from CSV file 13 | file_path=r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\skills.csv' 14 | with open(file_path, 'r') as file: 15 | csv_reader = csv.reader(file) 16 | skills = [row for row in csv_reader] 17 | 18 | # Create pattern dictionaries from skills 19 | skill_patterns = [[{'LOWER': skill}] for skill in skills[0]] 20 | 21 | # Create a Matcher object 22 | matcher = Matcher(nlp.vocab) 23 | 24 | # Add skill patterns to the matcher 25 | for pattern in skill_patterns: 26 | matcher.add('Skills', [pattern]) 27 | 28 | # Function to extract skills from text 29 | def extract_skills(text): 30 | doc = nlp(text) 31 | matches = matcher(doc) 32 | skills = set() 33 | for match_id, start, end in matches: 34 | skill = doc[start:end].text 35 | skills.add(skill) 36 | return skills 37 | 38 | # Function to extract text from PDF 39 | def extract_text_from_pdf(file_path:str): 40 | with open(file_path, 'rb') as f: 41 | pdf_reader = PyPDF2.PdfReader(f) 42 | text = '' 43 | for page in pdf_reader.pages: 44 | text += page.extract_text() 45 | return text 46 | 47 | def skills_extractor(file_path): 48 | # Extract text from PDF 49 | path=r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\notebook' 50 | full_file_path = os.path.join(path, file_path) 51 | resume_text = extract_text_from_pdf(full_file_path) 52 | 53 | # Extract skills from resume text 54 | skills = list(extract_skills(resume_text)) 55 | return skills 56 | 57 | 58 | -------------------------------------------------------------------------------- /myenv/docx-template/word/settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 
37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import PyPDF2 4 | from pyresparser import ResumeParser 5 | from sklearn.neighbors import NearestNeighbors 6 | from src.components.job_recommender import ngrams,getNearestN,jd_df 7 | import src.notebook.skills_extraction as skills_extraction 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | 10 | 11 | # Function to process the resume and recommend jobs 12 | def process_resume(file_path): 13 | # Extract text from PDF resume 14 | resume_skills=skills_extraction.skills_extractor(file_path) 15 | 16 | # Perform job recommendation based on parsed resume data 17 | skills=[] 18 | skills.append(' '.join(word for word in resume_skills)) 19 | 20 | 21 | # Feature Engineering: 22 | vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False) 23 | tfidf = vectorizer.fit_transform(skills) 24 | 25 | 26 | nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf) 27 | jd_test = (jd_df['Processed_JD'].values.astype('U')) 28 | 29 | distances, indices = getNearestN(jd_test) 30 | test = list(jd_test) 31 | matches = [] 32 | 33 | for i,j in enumerate(indices): 34 | dist=round(distances[i][0],2) 35 | temp = [dist] 36 | matches.append(temp) 37 | 38 | matches = pd.DataFrame(matches, columns=['Match confidence']) 39 | 40 | # Following recommends Top 5 Jobs based on candidate resume: 41 | jd_df['match']=matches['Match confidence'] 42 | 43 | return jd_df.head(5).sort_values('match') 44 | 45 | # Streamlit app 46 | def main(): 47 | st.title("Job Recommendation App") 48 | st.write("Upload your resume in PDF format") 49 | 50 | # File uploader 51 | uploaded_file = st.file_uploader("Choose a file", type=['pdf']) 52 | 53 | if uploaded_file is not None: 54 | # Process resume and recommend jobs 55 | 
file_path=uploaded_file.name 56 | df_jobs = process_resume(file_path) 57 | 58 | # Display recommended jobs as DataFrame 59 | st.write("Recommended Jobs:") 60 | st.dataframe(df_jobs[['Job Title','Company Name','Location','Industry','Sector','Average Salary']]) 61 | 62 | # Run the Streamlit app 63 | if __name__ == '__main__': 64 | main() 65 | -------------------------------------------------------------------------------- /myenv/share/man/man1/ipython.1: -------------------------------------------------------------------------------- 1 | .\" Hey, EMACS: -*- nroff -*- 2 | .\" First parameter, NAME, should be all caps 3 | .\" Second parameter, SECTION, should be 1-8, maybe w/ subsection 4 | .\" other parameters are allowed: see man(7), man(1) 5 | .TH IPYTHON 1 "July 15, 2011" 6 | .\" Please adjust this date whenever revising the manpage. 7 | .\" 8 | .\" Some roff macros, for reference: 9 | .\" .nh disable hyphenation 10 | .\" .hy enable hyphenation 11 | .\" .ad l left justify 12 | .\" .ad b justify to both left and right margins 13 | .\" .nf disable filling 14 | .\" .fi enable filling 15 | .\" .br insert line break 16 | .\" .sp insert n+1 empty lines 17 | .\" for manpage-specific macros, see man(7) and groff_man(7) 18 | .\" .SH section heading 19 | .\" .SS secondary section heading 20 | .\" 21 | .\" 22 | .\" To preview this page as plain text: nroff -man ipython.1 23 | .\" 24 | .SH NAME 25 | ipython \- Tools for Interactive Computing in Python. 26 | .SH SYNOPSIS 27 | .B ipython 28 | .RI [ options ] " files" ... 29 | 30 | .B ipython subcommand 31 | .RI [ options ] ... 32 | 33 | .SH DESCRIPTION 34 | An interactive Python shell with automatic history (input and output), dynamic 35 | object introspection, easier configuration, command completion, access to the 36 | system shell, integration with numerical and scientific computing tools, 37 | web notebook, Qt console, and more. 
38 | 39 | For more information on how to use IPython, see 'ipython \-\-help', 40 | or 'ipython \-\-help\-all' for all available command\(hyline options. 41 | 42 | .SH "ENVIRONMENT VARIABLES" 43 | .sp 44 | .PP 45 | \fIIPYTHONDIR\fR 46 | .RS 4 47 | This is the location where IPython stores all its configuration files. The default 48 | is $HOME/.ipython if IPYTHONDIR is not defined. 49 | 50 | You can see the computed value of IPYTHONDIR with `ipython locate`. 51 | 52 | .SH FILES 53 | 54 | IPython uses various configuration files stored in profiles within IPYTHONDIR. 55 | To generate the default configuration files and start configuring IPython, 56 | do 'ipython profile create', and edit '*_config.py' files located in 57 | IPYTHONDIR/profile_default. 58 | 59 | .SH AUTHORS 60 | IPython is written by the IPython Development Team . 61 | -------------------------------------------------------------------------------- /myenv/Scripts/activate: -------------------------------------------------------------------------------- 1 | # This file must be used with "source bin/activate" *from bash* 2 | # you cannot run it directly 3 | 4 | deactivate () { 5 | # reset old environment variables 6 | if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then 7 | PATH="${_OLD_VIRTUAL_PATH:-}" 8 | export PATH 9 | unset _OLD_VIRTUAL_PATH 10 | fi 11 | if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then 12 | PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}" 13 | export PYTHONHOME 14 | unset _OLD_VIRTUAL_PYTHONHOME 15 | fi 16 | 17 | # This should detect bash and zsh, which have a hash command that must 18 | # be called to get it to forget past commands. 
Without forgetting 19 | # past commands the $PATH changes we made may not be respected 20 | if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then 21 | hash -r 2> /dev/null 22 | fi 23 | 24 | if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then 25 | PS1="${_OLD_VIRTUAL_PS1:-}" 26 | export PS1 27 | unset _OLD_VIRTUAL_PS1 28 | fi 29 | 30 | unset VIRTUAL_ENV 31 | unset VIRTUAL_ENV_PROMPT 32 | if [ ! "${1:-}" = "nondestructive" ] ; then 33 | # Self destruct! 34 | unset -f deactivate 35 | fi 36 | } 37 | 38 | # unset irrelevant variables 39 | deactivate nondestructive 40 | 41 | VIRTUAL_ENV="C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv" 42 | export VIRTUAL_ENV 43 | 44 | _OLD_VIRTUAL_PATH="$PATH" 45 | PATH="$VIRTUAL_ENV/Scripts:$PATH" 46 | export PATH 47 | 48 | # unset PYTHONHOME if set 49 | # this will fail if PYTHONHOME is set to the empty string (which is bad anyway) 50 | # could use `if (set -u; : $PYTHONHOME) ;` in bash 51 | if [ -n "${PYTHONHOME:-}" ] ; then 52 | _OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}" 53 | unset PYTHONHOME 54 | fi 55 | 56 | if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then 57 | _OLD_VIRTUAL_PS1="${PS1:-}" 58 | PS1="(myenv) ${PS1:-}" 59 | export PS1 60 | VIRTUAL_ENV_PROMPT="(myenv) " 61 | export VIRTUAL_ENV_PROMPT 62 | fi 63 | 64 | # This should detect bash and zsh, which have a hash command that must 65 | # be called to get it to forget past commands. 
Without forgetting 66 | # past commands the $PATH changes we made may not be respected 67 | if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then 68 | hash -r 2> /dev/null 69 | fi 70 | -------------------------------------------------------------------------------- /myenv/docx-template/word/fontTable.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/components/job_recommender.py: -------------------------------------------------------------------------------- 1 | import re 2 | from ftfy import fix_text 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | import re 5 | from sklearn.neighbors import NearestNeighbors 6 | import numpy as np 7 | import pandas as pd 8 | import nltk 9 | from nltk.corpus import stopwords 10 | stopw = set(stopwords.words('english')) 11 | from pyresparser import ResumeParser 12 | import os 13 | from docx import Document 14 | import src.notebook.skills_extraction as skills_extraction 15 | 16 | # Load dataset: 17 | jd_df=pd.read_csv(r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\jd_structured_data.csv') 18 | 19 | # Load the extracted resume skills: 20 | file_path=r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\utilities\resumes\CV.pdf' 21 | skills=[] 22 | skills.append(' '.join(word for word in skills_extraction.skills_extractor(file_path))) 23 | 24 | def ngrams(string, n=3): 25 | string = fix_text(string) # fix text 26 | string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars 27 | string = string.lower() 28 | chars_to_remove = 
[")","(",".","|","[","]","{","}","'"] 29 | rx = '[' + re.escape(''.join(chars_to_remove)) + ']' 30 | string = re.sub(rx, '', string) 31 | string = string.replace('&', 'and') 32 | string = string.replace(',', ' ') 33 | string = string.replace('-', ' ') 34 | string = string.title() # normalise case - capital at start of each word 35 | string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single 36 | string = ' '+ string +' ' # pad names for ngrams... 37 | string = re.sub(r'[,-./]|\sBD',r'', string) 38 | ngrams = zip(*[string[i:] for i in range(n)]) 39 | return [''.join(ngram) for ngram in ngrams] 40 | 41 | vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False) 42 | tfidf = vectorizer.fit_transform(skills) 43 | 44 | nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf) 45 | jd_test = (jd_df['Processed_JD'].values.astype('U')) 46 | 47 | def getNearestN(query): 48 | queryTFIDF_ = vectorizer.transform(query) 49 | distances, indices = nbrs.kneighbors(queryTFIDF_) 50 | return distances, indices 51 | 52 | distances, indices = getNearestN(jd_test) 53 | test = list(jd_test) 54 | matches = [] 55 | 56 | for i,j in enumerate(indices): 57 | dist=round(distances[i][0],2) 58 | 59 | temp = [dist] 60 | matches.append(temp) 61 | 62 | matches = pd.DataFrame(matches, columns=['Match confidence']) 63 | 64 | # Following recommends Top 5 Jobs based on candidate resume: 65 | jd_df['match']=matches['Match confidence'] 66 | jd_df.head(5).sort_values('match') -------------------------------------------------------------------------------- /myenv/Scripts/plac_runner.py: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | from __future__ import with_statement 3 | import os 4 | import sys 5 | import shlex 6 | import plac 7 | 8 | 9 | def run(fnames, cmd, verbose): 10 | "Run batch scripts and tests" 
11 | for fname in fnames: 12 | with open(fname) as f: 13 | lines = list(f) 14 | if not lines[0].startswith('#!'): 15 | sys.exit('Missing or incorrect shebang line!') 16 | firstline = lines[0][2:] # strip the shebang 17 | init_args = shlex.split(firstline) 18 | tool = plac.import_main(*init_args) 19 | command = getattr(plac.Interpreter(tool), cmd) # doctest or execute 20 | if verbose: 21 | sys.stdout.write('Running %s with %s' % (fname, firstline)) 22 | command(lines[1:], verbose=verbose) 23 | 24 | 25 | @plac.annotations( 26 | verbose=('verbose mode', 'flag', 'v'), 27 | interactive=('run plac tool in interactive mode', 'flag', 'i'), 28 | multiline=('run plac tool in multiline mode', 'flag', 'm'), 29 | serve=('run plac server', 'option', 's', int), 30 | batch=('run plac batch files', 'flag', 'b'), 31 | test=('run plac test files', 'flag', 't'), 32 | fname='script to run (.py or .plac or .placet)', 33 | extra='additional arguments', 34 | ) 35 | def main(verbose, interactive, multiline, serve, batch, test, fname='', 36 | *extra): 37 | "Runner for plac tools, plac batch files and plac tests" 38 | baseparser = plac.parser_from(main) 39 | if not fname: 40 | baseparser.print_help() 41 | elif sys.argv[1] == fname: # script mode 42 | plactool = plac.import_main(fname) 43 | plactool.prog = os.path.basename(sys.argv[0]) + ' ' + fname 44 | out = plac.call(plactool, sys.argv[2:], eager=False) 45 | if plac.iterable(out): 46 | for output in out: 47 | print(output) 48 | else: 49 | print(out) 50 | elif interactive or multiline or serve: 51 | plactool = plac.import_main(fname, *extra) 52 | plactool.prog = '' 53 | i = plac.Interpreter(plactool) 54 | if interactive: 55 | i.interact(verbose=verbose) 56 | elif multiline: 57 | i.multiline(verbose=verbose) 58 | elif serve: 59 | i.start_server(serve) 60 | elif batch: 61 | run((fname,) + extra, 'execute', verbose) 62 | elif test: 63 | run((fname,) + extra, 'doctest', verbose) 64 | print('run %s plac test(s)' % (len(extra) + 1)) 65 | else: 66 
| baseparser.print_usage() 67 | 68 | 69 | main.add_help = False 70 | 71 | if __name__ == '__main__': 72 | plac.call(main) 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include 
Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /src/components/jd_data_cleaner.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | import re 5 | import nltk 6 | from nltk.corpus import stopwords 7 | stopw = set(stopwords.words('english')) 8 | 9 | # Load the dataset: 10 | unstructured_df=pd.read_csv('jd_unstructured_data.csv') 11 | 12 | def convert_salary(value): 13 | if 'Unknown' in value: 14 | return None 15 | elif '-' in value: 16 | values = re.findall(r'\$\d+K', value) 17 | min_value = int(values[0].replace('$', '').replace('K', '')) if values else None 18 | max_value = int(values[1].replace('$', '').replace('K', '')) if len(values) > 1 else None 19 | if min_value and max_value: 20 | return (min_value + max_value) / 2 21 | elif min_value: 22 | return min_value 23 | elif max_value: 24 | return max_value 25 | else: 26 | return None 27 | else: 28 | return int(re.findall(r'\$\d+K', value)[0].replace('$', '').replace('K', '')) 29 | 30 | def convert_revenue(value): 31 | if 'Unknown' in value: 32 | return None 33 | elif ' to ' in value: 34 | values = re.findall(r'\d+\.?\d*', value) 35 | min_revenue = float(values[0]) 36 | max_revenue = float(values[1]) 37 | unit = value.split()[-2] 38 | if unit == 'billion': 39 | min_revenue *= 1000 40 | max_revenue *= 1000 41 | return (min_revenue + max_revenue) / 2 42 | else: 43 | numerical_values = re.findall(r'\d+\.?\d*', value) 44 | if numerical_values: 45 | return float(numerical_values[0]) 46 | else: 47 | return None 48 | 49 | # Define a function to convert the size value 50 | def convert_size(value): 51 | if 'Unknown' in value: 52 | return None 53 | elif ' to ' in value: 54 | sizes = value.split(' to ') 55 | min_size = int(sizes[0].replace('+', '').replace(',', '').split()[0]) 56 | max_size = int(sizes[1].replace('+', '').replace(',', '').split()[0]) 57 
| return (min_size + max_size) / 2 58 | else: 59 | return int(value.replace('+', '').replace(',', '').split()[0]) 60 | 61 | # Apply the conversion function to the "Salary Column" column 62 | unstructured_df['Average Salary'] = unstructured_df['Salary Estimate'].apply(convert_salary) 63 | 64 | # Apply the conversion function to the "Revenue" column 65 | unstructured_df['Average Revenue'] = unstructured_df['Revenue'].apply(convert_revenue) 66 | 67 | # Extract the company name by splitting on '\r\n' and selecting the first element 68 | unstructured_df['Company Name'] = unstructured_df['Company Name'].str.split('\r\n').str[0] 69 | 70 | 71 | # Apply the conversion function to the "Size" column 72 | unstructured_df['Size'] = unstructured_df['Size'].apply(convert_size) 73 | 74 | # remove stopwords and pre-process Job Description Column: 75 | unstructured_df['Processed_JD']=unstructured_df['Job Description'].apply(lambda x: ' '.join([word for word in str(x).split() if len(word)>2 and word not in (stopw)])) 76 | 77 | 78 | # Drop Unwanted Columns: 79 | unstructured_df=unstructured_df.drop(['Unnamed: 0','Salary Estimate','Revenue','Job Description'],axis=1) 80 | 81 | # Check for Null Value after data pre-processing: 82 | unstructured_df.isnull().sum() 83 | 84 | # Replace the null values with average value of each columns: 85 | # Calculate the average value of column B 86 | size_average = unstructured_df['Size'].mean() 87 | salary_average=unstructured_df['Average Salary'].mean() 88 | revenue_average=unstructured_df['Average Revenue'].mean() 89 | 90 | # Replace null values with the average 91 | unstructured_df['Size'].fillna(size_average, inplace=True) 92 | unstructured_df['Average Salary'].fillna(salary_average, inplace=True) 93 | unstructured_df['Average Revenue'].fillna(revenue_average, inplace=True) 94 | 95 | # Convert DataFrame to CSV file 96 | 
unstructured_df.to_csv(r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\jd_structured_data.csv', index=False) -------------------------------------------------------------------------------- /myenv/Scripts/pywin32_testall.py: -------------------------------------------------------------------------------- 1 | """A test runner for pywin32""" 2 | import os 3 | import site 4 | import subprocess 5 | import sys 6 | 7 | # locate the dirs based on where this script is - it may be either in the 8 | # source tree, or in an installed Python 'Scripts' tree. 9 | this_dir = os.path.dirname(__file__) 10 | site_packages = [ 11 | site.getusersitepackages(), 12 | ] + site.getsitepackages() 13 | 14 | failures = [] 15 | 16 | 17 | # Run a test using subprocess and wait for the result. 18 | # If we get an returncode != 0, we know that there was an error, but we don't 19 | # abort immediately - we run as many tests as we can. 20 | def run_test(script, cmdline_extras): 21 | dirname, scriptname = os.path.split(script) 22 | # some tests prefer to be run from their directory. 23 | cmd = [sys.executable, "-u", scriptname] + cmdline_extras 24 | print("--- Running '%s' ---" % script) 25 | sys.stdout.flush() 26 | result = subprocess.run(cmd, check=False, cwd=dirname) 27 | print("*** Test script '%s' exited with %s" % (script, result.returncode)) 28 | sys.stdout.flush() 29 | if result.returncode: 30 | failures.append(script) 31 | 32 | 33 | def find_and_run(possible_locations, extras): 34 | for maybe in possible_locations: 35 | if os.path.isfile(maybe): 36 | run_test(maybe, extras) 37 | break 38 | else: 39 | raise RuntimeError( 40 | "Failed to locate a test script in one of %s" % possible_locations 41 | ) 42 | 43 | 44 | def main(): 45 | import argparse 46 | 47 | code_directories = [this_dir] + site_packages 48 | 49 | parser = argparse.ArgumentParser( 50 | description="A script to trigger tests in all subprojects of PyWin32." 
51 | ) 52 | parser.add_argument( 53 | "-no-user-interaction", 54 | default=False, 55 | action="store_true", 56 | help="(This is now the default - use `-user-interaction` to include them)", 57 | ) 58 | 59 | parser.add_argument( 60 | "-user-interaction", 61 | action="store_true", 62 | help="Include tests which require user interaction", 63 | ) 64 | 65 | parser.add_argument( 66 | "-skip-adodbapi", 67 | default=False, 68 | action="store_true", 69 | help="Skip the adodbapi tests; useful for CI where there's no provider", 70 | ) 71 | 72 | args, remains = parser.parse_known_args() 73 | 74 | # win32, win32ui / Pythonwin 75 | 76 | extras = [] 77 | if args.user_interaction: 78 | extras += ["-user-interaction"] 79 | extras.extend(remains) 80 | scripts = [ 81 | "win32/test/testall.py", 82 | "Pythonwin/pywin/test/all.py", 83 | ] 84 | for script in scripts: 85 | maybes = [os.path.join(directory, script) for directory in code_directories] 86 | find_and_run(maybes, extras) 87 | 88 | # win32com 89 | maybes = [ 90 | os.path.join(directory, "win32com", "test", "testall.py") 91 | for directory in [ 92 | os.path.join(this_dir, "com"), 93 | ] 94 | + site_packages 95 | ] 96 | extras = remains + ["1"] # only run "level 1" tests in CI 97 | find_and_run(maybes, extras) 98 | 99 | # adodbapi 100 | if not args.skip_adodbapi: 101 | maybes = [ 102 | os.path.join(directory, "adodbapi", "test", "adodbapitest.py") 103 | for directory in code_directories 104 | ] 105 | find_and_run(maybes, remains) 106 | # This script has a hard-coded sql server name in it, (and markh typically 107 | # doesn't have a different server to test on) but there is now supposed to be a server out there on the Internet 108 | # just to run these tests, so try it... 
109 | maybes = [ 110 | os.path.join(directory, "adodbapi", "test", "test_adodbapi_dbapi20.py") 111 | for directory in code_directories 112 | ] 113 | find_and_run(maybes, remains) 114 | 115 | if failures: 116 | print("The following scripts failed") 117 | for failure in failures: 118 | print(">", failure) 119 | sys.exit(1) 120 | print("All tests passed \\o/") 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![jsr2](https://github.com/abbas99-hub/Job-Recommendation-System/assets/60792939/ffa4a634-42d7-491e-89f8-07a137322876) 2 | 3 | # Job Recommendation System using Machine Learning 4 | This repository contains the code and instructions to build a job recommendation system using machine learning. The system is designed to provide personalized job recommendations based on user preferences and historical job data. The data for this project is scraped from Glassdoor, and the system is deployed using the Azure cloud platform. 5 | 6 | ## Business Understanding 7 | The goal of this project is to develop a job recommendation system that helps users find relevant job opportunities based on their preferences and historical data. By leveraging machine learning techniques, we aim to provide personalized recommendations that align with the user's skills, interests, and career goals. The system will take into account various factors such as job title, salary estimate, company rating, location, industry, and more to generate accurate recommendations. 8 | 9 | ## Data Scraping 10 | To collect the necessary data for training our recommendation system, we will scrape job-related information from Glassdoor. 
The following columns will be extracted: 11 | 12 | Job Title 13 | Salary Estimate 14 | Job Description 15 | Rating 16 | Company Name 17 | Location 18 | Headquarters 19 | Size 20 | Founded 21 | Type of Ownership 22 | Industry 23 | Sector 24 | Revenue 25 | Competitors 26 | 27 | ## Feature Engineering 28 | Once the data is collected, we will perform feature engineering to preprocess and transform the raw data into a suitable format for training our recommendation model. This step includes: 29 | 30 | Handling Missing Data: Deal with missing values in the dataset by either imputing them or removing the corresponding rows/columns. 31 | Encoding Categorical Variables: Convert categorical variables such as job title, location, industry, and sector into numerical representations using techniques like one-hot encoding or label encoding. 32 | Feature Scaling: Normalize numerical features, such as salary estimate and company rating, to ensure they have a similar scale and prevent dominance of certain features in the model. 33 | 34 | ## Machine Learning Techniques: 35 | To provide personalized job recommendations, we employ the TF-IDF (Term Frequency-Inverse Document Frequency) vectorization technique. The "job_recommender.py" component plays a crucial role in this process. It utilizes the TF-IDF vectorizer from the scikit-learn library to transform job descriptions and user preferences into numerical feature vectors. These vectors capture the importance of each word in the documents, enabling the system to find similar job opportunities based on user preferences. The Nearest Neighbors algorithm is then used to identify the most relevant job recommendations. 36 | 37 | skill extractor segment provides functions and utilities to extract skills from a PDF file using the Spacy library and perform text processing and matching operations. These extracted skills can be used for further analysis and processing in the job recommendation system. 
38 | 39 | ## Streamlit Application 40 | To make the job recommendation system easily accessible and user-friendly, we have developed a Streamlit application. Streamlit provides an intuitive web interface where users can upload their resumes. The application processes the user input, applies the machine learning models, and displays the top-recommended jobs based on the user's preferences and historical data. 41 | 42 | ## Model Deployment using Azure Cloud 43 | To make the job recommendation system accessible to users, we will deploy the model on the Azure cloud platform. The deployment process involves the following steps: 44 | 45 | * Model Serialization: Serialize the trained model to a format compatible with the Azure cloud deployment. 46 | * Model Containerization: Package the serialized model along with the necessary dependencies and environment specifications into a container using tools like Docker. 47 | * Azure Container Registry: Create a container registry on Azure to store the model container and related artifacts securely. 48 | * Azure Kubernetes Service (AKS): Deploy the model container as a scalable microservice using AKS, which provides orchestration and management capabilities. 49 | * API Development: Develop an API that allows users to interact with the deployed model and request personalized job recommendations. 50 | * Integration and Testing: Integrate the API with other components of the job recommendation system, and perform thorough testing to ensure its functionality and performance. 51 | * Deployment Monitoring: Monitor the deployed model and API to track usage, and performance metrics, and address any potential issues or errors. 
52 | 53 | ## Usage 54 | To use the job recommendation system, follow the instructions below: 55 | 56 | * Clone this repository: git clone 57 | * Install the required dependencies: pip install -r requirements.txt 58 | * Run the command: streamlit run __init__.py ( For Local Server ) 59 | * Access the deployed job recommendation API and make requests to receive personalized recommendations. 60 | 61 | #### Please feel free to contribute to this project by submitting pull requests or opening issues. 62 | -------------------------------------------------------------------------------- /myenv/docx-template/word/styles.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /myenv/share/man/man1/ttx.1: -------------------------------------------------------------------------------- 1 | .Dd May 18, 2004 2 | .\" ttx is not specific to any OS, but contrary to what groff_mdoc(7) 3 | .\" seems to imply, entirely omitting the .Os macro causes 'BSD' to 4 | .\" be used, so I give a zero-width space as its argument. 5 | .Os \& 6 | .\" The "FontTools Manual" argument apparently has no effect in 7 | .\" groff 1.18.1. I think it is a bug in the -mdoc groff package. 8 | .Dt TTX 1 "FontTools Manual" 9 | .Sh NAME 10 | .Nm ttx 11 | .Nd tool for manipulating TrueType and OpenType fonts 12 | .Sh SYNOPSIS 13 | .Nm 14 | .Bk 15 | .Op Ar option ... 16 | .Ek 17 | .Bk 18 | .Ar file ... 19 | .Ek 20 | .Sh DESCRIPTION 21 | .Nm 22 | is a tool for manipulating TrueType and OpenType fonts. It can convert 23 | TrueType and OpenType fonts to and from an 24 | .Tn XML Ns -based format called 25 | .Tn TTX . 26 | .Tn TTX 27 | files have a 28 | .Ql .ttx 29 | extension. 
30 | .Pp 31 | For each 32 | .Ar file 33 | argument it is given, 34 | .Nm 35 | detects whether it is a 36 | .Ql .ttf , 37 | .Ql .otf 38 | or 39 | .Ql .ttx 40 | file and acts accordingly: if it is a 41 | .Ql .ttf 42 | or 43 | .Ql .otf 44 | file, it generates a 45 | .Ql .ttx 46 | file; if it is a 47 | .Ql .ttx 48 | file, it generates a 49 | .Ql .ttf 50 | or 51 | .Ql .otf 52 | file. 53 | .Pp 54 | By default, every output file is created in the same directory as the 55 | corresponding input file and with the same name except for the 56 | extension, which is substituted appropriately. 57 | .Nm 58 | never overwrites existing files; if necessary, it appends a suffix to 59 | the output file name before the extension, as in 60 | .Pa Arial#1.ttf . 61 | .Ss "General options" 62 | .Bl -tag -width ".Fl t Ar table" 63 | .It Fl h 64 | Display usage information. 65 | .It Fl d Ar dir 66 | Write the output files to directory 67 | .Ar dir 68 | instead of writing every output file to the same directory as the 69 | corresponding input file. 70 | .It Fl o Ar file 71 | Write the output to 72 | .Ar file 73 | instead of writing it to the same directory as the 74 | corresponding input file. 75 | .It Fl v 76 | Be verbose. Write more messages to the standard output describing what 77 | is being done. 78 | .It Fl a 79 | Allow virtual glyphs ID's on compile or decompile. 80 | .El 81 | .Ss "Dump options" 82 | The following options control the process of dumping font files 83 | (TrueType or OpenType) to 84 | .Tn TTX 85 | files. 86 | .Bl -tag -width ".Fl t Ar table" 87 | .It Fl l 88 | List table information. Instead of dumping the font to a 89 | .Tn TTX 90 | file, display minimal information about each table. 91 | .It Fl t Ar table 92 | Dump table 93 | .Ar table . 94 | This option may be given multiple times to dump several tables at 95 | once. When not specified, all tables are dumped. 96 | .It Fl x Ar table 97 | Exclude table 98 | .Ar table 99 | from the list of tables to dump. 
This option may be given multiple 100 | times to exclude several tables from the dump. The 101 | .Fl t 102 | and 103 | .Fl x 104 | options are mutually exclusive. 105 | .It Fl s 106 | Split tables. Dump each table to a separate 107 | .Tn TTX 108 | file and write (under the name that would have been used for the output 109 | file if the 110 | .Fl s 111 | option had not been given) one small 112 | .Tn TTX 113 | file containing references to the individual table dump files. This 114 | file can be used as input to 115 | .Nm 116 | as long as the referenced files can be found in the same directory. 117 | .It Fl i 118 | .\" XXX: I suppose OpenType programs (exist and) are also affected. 119 | Don't disassemble TrueType instructions. When this option is specified, 120 | all TrueType programs (glyph programs, the font program and the 121 | pre-program) are written to the 122 | .Tn TTX 123 | file as hexadecimal data instead of 124 | assembly. This saves some time and results in smaller 125 | .Tn TTX 126 | files. 127 | .It Fl y Ar n 128 | When decompiling a TrueType Collection (TTC) file, 129 | decompile font number 130 | .Ar n , 131 | starting from 0. 132 | .El 133 | .Ss "Compilation options" 134 | The following options control the process of compiling 135 | .Tn TTX 136 | files into font files (TrueType or OpenType): 137 | .Bl -tag -width ".Fl t Ar table" 138 | .It Fl m Ar fontfile 139 | Merge the input 140 | .Tn TTX 141 | file 142 | .Ar file 143 | with 144 | .Ar fontfile . 145 | No more than one 146 | .Ar file 147 | argument can be specified when this option is used. 148 | .It Fl b 149 | Don't recalculate glyph bounding boxes. Use the values in the 150 | .Tn TTX 151 | file as is. 152 | .El 153 | .Sh "THE TTX FILE FORMAT" 154 | You can find some information about the 155 | .Tn TTX 156 | file format in 157 | .Pa documentation.html . 
158 | In particular, you will find in that file the list of tables understood by 159 | .Nm 160 | and the relations between TrueType GlyphIDs and the glyph names used in 161 | .Tn TTX 162 | files. 163 | .Sh EXAMPLES 164 | In the following examples, all files are read from and written to the 165 | current directory. Additionally, the name given for the output file 166 | assumes in every case that it did not exist before 167 | .Nm 168 | was invoked. 169 | .Pp 170 | Dump the TrueType font contained in 171 | .Pa FreeSans.ttf 172 | to 173 | .Pa FreeSans.ttx : 174 | .Pp 175 | .Dl ttx FreeSans.ttf 176 | .Pp 177 | Compile 178 | .Pa MyFont.ttx 179 | into a TrueType or OpenType font file: 180 | .Pp 181 | .Dl ttx MyFont.ttx 182 | .Pp 183 | List the tables in 184 | .Pa FreeSans.ttf 185 | along with some information: 186 | .Pp 187 | .Dl ttx -l FreeSans.ttf 188 | .Pp 189 | Dump the 190 | .Sq cmap 191 | table from 192 | .Pa FreeSans.ttf 193 | to 194 | .Pa FreeSans.ttx : 195 | .Pp 196 | .Dl ttx -t cmap FreeSans.ttf 197 | .Sh NOTES 198 | On MS\-Windows and MacOS, 199 | .Nm 200 | is available as a graphical application to which files can be dropped. 201 | .Sh SEE ALSO 202 | .Pa documentation.html 203 | .Pp 204 | .Xr fontforge 1 , 205 | .Xr ftinfo 1 , 206 | .Xr gfontview 1 , 207 | .Xr xmbdfed 1 , 208 | .Xr Font::TTF 3pm 209 | .Sh AUTHORS 210 | .Nm 211 | was written by 212 | .An -nosplit 213 | .An "Just van Rossum" Aq just@letterror.com . 214 | .Pp 215 | This manual page was written by 216 | .An "Florent Rougon" Aq f.rougon@free.fr 217 | for the Debian GNU/Linux system based on the existing FontTools 218 | documentation. It may be freely used, modified and distributed without 219 | restrictions. 
220 | .\" For Emacs: 221 | .\" Local Variables: 222 | .\" fill-column: 72 223 | .\" sentence-end: "[.?!][]\"')}]*\\($\\| $\\| \\| \\)[ \n]*" 224 | .\" sentence-end-double-space: t 225 | .\" End: -------------------------------------------------------------------------------- /src/data/url_data_scientist_loc_bangalore.json: -------------------------------------------------------------------------------- 1 | [ 2 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=117&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_24fdd564&cb=1586328300996&jobListingId=3548697507", 3 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=120&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_385c924b&cb=1586328300998&jobListingId=3334835027", 4 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=129&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_07047cd3&cb=1586328301010&jobListingId=3255119944", 5 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=124&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_12a2f280&cb=1586328301002&jobListingId=3309508178", 6 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=125&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f977b9a1&cb=1586328301002&jobListingId=3463137315", 7 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=121&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f41df190&cb=1586328300999&jobListingId=3522398014", 8 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=114&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eed74d53&cb=1586328300994&jobListingId=3548424285", 9 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=101&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b527f613&cb=1586328300984&jobListingId=3463396953", 10 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=102&ao=883172&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_4cce804c&cb=1586328300985&jobListingId=3463302895", 11 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=108&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_d4e51354&cb=1586328300990&jobListingId=3548552003", 12 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=118&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_86e6a5cf&cb=1586328300997&jobListingId=3361772952", 13 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=122&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9a16a794&cb=1586328301000&jobListingId=3224747590", 14 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=109&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b44be053&cb=1586328300991&jobListingId=3284143205", 15 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=103&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b9149c3f&cb=1586328300986&jobListingId=3488569582", 16 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=127&ao=140609&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_1bd12ebc&cb=1586328301004&jobListingId=3463632306", 17 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=110&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b987a82e&cb=1586328300992&jobListingId=3394241447", 18 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=112&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0f7674db&cb=1586328300993&jobListingId=3442340171", 19 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=126&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_5bd8e316&cb=1586328301003&jobListingId=3549041922", 20 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=111&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_588dd741&cb=1586328300992&jobListingId=3255119711", 21 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=113&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_779c66c0&cb=1586328300994&jobListingId=3285076786", 22 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=123&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_175a925f&cb=1586328301001&jobListingId=3442891307", 23 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=116&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_86d86e28&cb=1586328300996&jobListingId=3548427144", 24 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=104&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eca49f2c&cb=1586328300987&jobListingId=3463894344", 25 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=130&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_78b076c2&cb=1586328301010&jobListingId=3548552092", 26 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=106&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_a61639ea&cb=1586328300988&jobListingId=3548704688", 27 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=105&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0e5497d0&cb=1586328300987&jobListingId=3488034422", 28 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=128&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_64c1d4e8&cb=1586328301009&jobListingId=3393519661", 29 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=119&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9264dd03&cb=1586328300998&jobListingId=3361772911", 30 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=115&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_37b252b0&cb=1586328300995&jobListingId=3548425012", 31 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=107&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_68954416&cb=1586328300989&jobListingId=3548552132" 32 | ] -------------------------------------------------------------------------------- 
/src/notebook/url_data_scientist_loc_bangalore.json: -------------------------------------------------------------------------------- 1 | [ 2 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=117&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_24fdd564&cb=1586328300996&jobListingId=3548697507", 3 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=120&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_385c924b&cb=1586328300998&jobListingId=3334835027", 4 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=129&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_07047cd3&cb=1586328301010&jobListingId=3255119944", 5 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=124&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_12a2f280&cb=1586328301002&jobListingId=3309508178", 6 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=125&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f977b9a1&cb=1586328301002&jobListingId=3463137315", 7 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=121&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_f41df190&cb=1586328300999&jobListingId=3522398014", 8 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=114&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eed74d53&cb=1586328300994&jobListingId=3548424285", 9 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=101&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b527f613&cb=1586328300984&jobListingId=3463396953", 10 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=102&ao=883172&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_4cce804c&cb=1586328300985&jobListingId=3463302895", 11 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=108&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_d4e51354&cb=1586328300990&jobListingId=3548552003", 12 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=118&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_86e6a5cf&cb=1586328300997&jobListingId=3361772952", 13 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=122&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9a16a794&cb=1586328301000&jobListingId=3224747590", 14 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=109&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b44be053&cb=1586328300991&jobListingId=3284143205", 15 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=103&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b9149c3f&cb=1586328300986&jobListingId=3488569582", 16 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=127&ao=140609&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_1bd12ebc&cb=1586328301004&jobListingId=3463632306", 17 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=110&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_b987a82e&cb=1586328300992&jobListingId=3394241447", 18 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=112&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0f7674db&cb=1586328300993&jobListingId=3442340171", 19 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=126&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_5bd8e316&cb=1586328301003&jobListingId=3549041922", 20 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=111&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_588dd741&cb=1586328300992&jobListingId=3255119711", 21 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=113&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_779c66c0&cb=1586328300994&jobListingId=3285076786", 22 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=123&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_175a925f&cb=1586328301001&jobListingId=3442891307", 23 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=116&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_86d86e28&cb=1586328300996&jobListingId=3548427144", 24 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=104&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_eca49f2c&cb=1586328300987&jobListingId=3463894344", 25 | 
"https://www.glassdoor.co.in/partner/jobListing.htm?pos=130&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_78b076c2&cb=1586328301010&jobListingId=3548552092", 26 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=106&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&ea=1&cs=1_a61639ea&cb=1586328300988&jobListingId=3548704688", 27 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=105&ao=4120&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_0e5497d0&cb=1586328300987&jobListingId=3488034422", 28 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=128&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_64c1d4e8&cb=1586328301009&jobListingId=3393519661", 29 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=119&ao=437149&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_9264dd03&cb=1586328300998&jobListingId=3361772911", 30 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=115&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_37b252b0&cb=1586328300995&jobListingId=3548425012", 31 | "https://www.glassdoor.co.in/partner/jobListing.htm?pos=107&ao=389273&s=58&guid=000001715888ecafb95d504b04025cde&src=GD_JOB_AD&t=SR&extid=1&exst=&ist=L&ast=L&vt=w&slr=true&cs=1_68954416&cb=1586328300989&jobListingId=3548552132" 32 | ] -------------------------------------------------------------------------------- /myenv/docx-template/word/theme/theme1.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /myenv/share/jupyter/kernels/python3/logo-svg.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 23 | 25 | 26 | 28 | image/svg+xml 29 | 31 | 32 | 33 | 34 | 61 | 63 | 65 | 69 | 73 | 74 | 76 | 80 | 84 | 85 | 87 | 91 | 95 | 96 | 98 | 102 | 106 | 107 | 109 | 113 | 117 | 118 | 120 | 124 | 128 | 129 | 138 | 147 | 157 | 167 | 177 | 187 | 197 | 207 | 218 | 228 | 238 | 249 | 250 | 254 | 258 | 265 | 266 | -------------------------------------------------------------------------------- /myenv/Scripts/pdf2txt.py: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | """A command line tool for extracting text and images from PDF and 3 | output it to plain text, html, xml or tags.""" 4 | import argparse 5 | import logging 6 | import sys 7 | from typing import Any, Container, Iterable, List, Optional 8 | 9 | import pdfminer.high_level 10 | from pdfminer.layout import LAParams 11 | from pdfminer.utils import AnyIO 12 | 13 | logging.basicConfig() 14 | 15 | OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag")) 16 | 17 | 18 | def float_or_disabled(x: str) -> Optional[float]: 19 | if x.lower().strip() == "disabled": 20 | return None 21 | try: 22 | return float(x) 23 | except ValueError: 24 | raise argparse.ArgumentTypeError("invalid float value: {}".format(x)) 25 | 26 | 27 | def extract_text( 28 | files: Iterable[str] = [], 29 | outfile: str = "-", 30 | laparams: Optional[LAParams] = None, 31 | output_type: str = "text", 32 | codec: str = "utf-8", 33 | strip_control: bool = False, 34 | maxpages: int = 0, 35 | page_numbers: Optional[Container[int]] = None, 36 | password: str = "", 37 | scale: float = 1.0, 38 | rotation: int = 0, 39 | layoutmode: str = "normal", 40 | output_dir: Optional[str] = None, 41 | debug: bool = False, 42 | disable_caching: bool = False, 43 | **kwargs: Any 44 | ) -> AnyIO: 45 | if not files: 
46 | raise ValueError("Must provide files to work upon!") 47 | 48 | if output_type == "text" and outfile != "-": 49 | for override, alttype in OUTPUT_TYPES: 50 | if outfile.endswith(override): 51 | output_type = alttype 52 | 53 | if outfile == "-": 54 | outfp: AnyIO = sys.stdout 55 | if sys.stdout.encoding is not None: 56 | codec = "utf-8" 57 | else: 58 | outfp = open(outfile, "wb") 59 | 60 | for fname in files: 61 | with open(fname, "rb") as fp: 62 | pdfminer.high_level.extract_text_to_fp(fp, **locals()) 63 | return outfp 64 | 65 | 66 | def create_parser() -> argparse.ArgumentParser: 67 | parser = argparse.ArgumentParser(description=__doc__, add_help=True) 68 | parser.add_argument( 69 | "files", 70 | type=str, 71 | default=None, 72 | nargs="+", 73 | help="One or more paths to PDF files.", 74 | ) 75 | 76 | parser.add_argument( 77 | "--version", 78 | "-v", 79 | action="version", 80 | version="pdfminer.six v{}".format(pdfminer.__version__), 81 | ) 82 | parser.add_argument( 83 | "--debug", 84 | "-d", 85 | default=False, 86 | action="store_true", 87 | help="Use debug logging level.", 88 | ) 89 | parser.add_argument( 90 | "--disable-caching", 91 | "-C", 92 | default=False, 93 | action="store_true", 94 | help="If caching or resources, such as fonts, should be disabled.", 95 | ) 96 | 97 | parse_params = parser.add_argument_group( 98 | "Parser", description="Used during PDF parsing" 99 | ) 100 | parse_params.add_argument( 101 | "--page-numbers", 102 | type=int, 103 | default=None, 104 | nargs="+", 105 | help="A space-seperated list of page numbers to parse.", 106 | ) 107 | parse_params.add_argument( 108 | "--pagenos", 109 | "-p", 110 | type=str, 111 | help="A comma-separated list of page numbers to parse. 
" 112 | "Included for legacy applications, use --page-numbers " 113 | "for more idiomatic argument entry.", 114 | ) 115 | parse_params.add_argument( 116 | "--maxpages", 117 | "-m", 118 | type=int, 119 | default=0, 120 | help="The maximum number of pages to parse.", 121 | ) 122 | parse_params.add_argument( 123 | "--password", 124 | "-P", 125 | type=str, 126 | default="", 127 | help="The password to use for decrypting PDF file.", 128 | ) 129 | parse_params.add_argument( 130 | "--rotation", 131 | "-R", 132 | default=0, 133 | type=int, 134 | help="The number of degrees to rotate the PDF " 135 | "before other types of processing.", 136 | ) 137 | 138 | la_params = LAParams() # will be used for defaults 139 | la_param_group = parser.add_argument_group( 140 | "Layout analysis", description="Used during layout analysis." 141 | ) 142 | la_param_group.add_argument( 143 | "--no-laparams", 144 | "-n", 145 | default=False, 146 | action="store_true", 147 | help="If layout analysis parameters should be ignored.", 148 | ) 149 | la_param_group.add_argument( 150 | "--detect-vertical", 151 | "-V", 152 | default=la_params.detect_vertical, 153 | action="store_true", 154 | help="If vertical text should be considered during layout analysis", 155 | ) 156 | la_param_group.add_argument( 157 | "--line-overlap", 158 | type=float, 159 | default=la_params.line_overlap, 160 | help="If two characters have more overlap than this they " 161 | "are considered to be on the same line. The overlap is specified " 162 | "relative to the minimum height of both characters.", 163 | ) 164 | la_param_group.add_argument( 165 | "--char-margin", 166 | "-M", 167 | type=float, 168 | default=la_params.char_margin, 169 | help="If two characters are closer together than this margin they " 170 | "are considered to be part of the same line. 
The margin is " 171 | "specified relative to the width of the character.", 172 | ) 173 | la_param_group.add_argument( 174 | "--word-margin", 175 | "-W", 176 | type=float, 177 | default=la_params.word_margin, 178 | help="If two characters on the same line are further apart than this " 179 | "margin then they are considered to be two separate words, and " 180 | "an intermediate space will be added for readability. The margin " 181 | "is specified relative to the width of the character.", 182 | ) 183 | la_param_group.add_argument( 184 | "--line-margin", 185 | "-L", 186 | type=float, 187 | default=la_params.line_margin, 188 | help="If two lines are close together they are considered to " 189 | "be part of the same paragraph. The margin is specified " 190 | "relative to the height of a line.", 191 | ) 192 | la_param_group.add_argument( 193 | "--boxes-flow", 194 | "-F", 195 | type=float_or_disabled, 196 | default=la_params.boxes_flow, 197 | help="Specifies how much a horizontal and vertical position of a " 198 | "text matters when determining the order of lines. The value " 199 | "should be within the range of -1.0 (only horizontal position " 200 | "matters) to +1.0 (only vertical position matters). You can also " 201 | "pass `disabled` to disable advanced layout analysis, and " 202 | "instead return text based on the position of the bottom left " 203 | "corner of the text box.", 204 | ) 205 | la_param_group.add_argument( 206 | "--all-texts", 207 | "-A", 208 | default=la_params.all_texts, 209 | action="store_true", 210 | help="If layout analysis should be performed on text in figures.", 211 | ) 212 | 213 | output_params = parser.add_argument_group( 214 | "Output", description="Used during output generation." 215 | ) 216 | output_params.add_argument( 217 | "--outfile", 218 | "-o", 219 | type=str, 220 | default="-", 221 | help="Path to file where output is written. 
" 222 | 'Or "-" (default) to write to stdout.', 223 | ) 224 | output_params.add_argument( 225 | "--output_type", 226 | "-t", 227 | type=str, 228 | default="text", 229 | help="Type of output to generate {text,html,xml,tag}.", 230 | ) 231 | output_params.add_argument( 232 | "--codec", 233 | "-c", 234 | type=str, 235 | default="utf-8", 236 | help="Text encoding to use in output file.", 237 | ) 238 | output_params.add_argument( 239 | "--output-dir", 240 | "-O", 241 | default=None, 242 | help="The output directory to put extracted images in. If not given, " 243 | "images are not extracted.", 244 | ) 245 | output_params.add_argument( 246 | "--layoutmode", 247 | "-Y", 248 | default="normal", 249 | type=str, 250 | help="Type of layout to use when generating html " 251 | "{normal,exact,loose}. If normal,each line is" 252 | " positioned separately in the html. If exact" 253 | ", each character is positioned separately in" 254 | " the html. If loose, same result as normal " 255 | "but with an additional newline after each " 256 | "text line. Only used when output_type is html.", 257 | ) 258 | output_params.add_argument( 259 | "--scale", 260 | "-s", 261 | type=float, 262 | default=1.0, 263 | help="The amount of zoom to use when generating html file. " 264 | "Only used when output_type is html.", 265 | ) 266 | output_params.add_argument( 267 | "--strip-control", 268 | "-S", 269 | default=False, 270 | action="store_true", 271 | help="Remove control statement from text. 
" 272 | "Only used when output_type is xml.", 273 | ) 274 | 275 | return parser 276 | 277 | 278 | def parse_args(args: Optional[List[str]]) -> argparse.Namespace: 279 | parsed_args = create_parser().parse_args(args=args) 280 | 281 | # Propagate parsed layout parameters to LAParams object 282 | if parsed_args.no_laparams: 283 | parsed_args.laparams = None 284 | else: 285 | parsed_args.laparams = LAParams( 286 | line_overlap=parsed_args.line_overlap, 287 | char_margin=parsed_args.char_margin, 288 | line_margin=parsed_args.line_margin, 289 | word_margin=parsed_args.word_margin, 290 | boxes_flow=parsed_args.boxes_flow, 291 | detect_vertical=parsed_args.detect_vertical, 292 | all_texts=parsed_args.all_texts, 293 | ) 294 | 295 | if parsed_args.page_numbers: 296 | parsed_args.page_numbers = {x - 1 for x in parsed_args.page_numbers} 297 | 298 | if parsed_args.pagenos: 299 | parsed_args.page_numbers = {int(x) - 1 for x in parsed_args.pagenos.split(",")} 300 | 301 | if parsed_args.output_type == "text" and parsed_args.outfile != "-": 302 | for override, alttype in OUTPUT_TYPES: 303 | if parsed_args.outfile.endswith(override): 304 | parsed_args.output_type = alttype 305 | 306 | return parsed_args 307 | 308 | 309 | def main(args: Optional[List[str]] = None) -> int: 310 | parsed_args = parse_args(args) 311 | outfp = extract_text(**vars(parsed_args)) 312 | outfp.close() 313 | return 0 314 | 315 | 316 | if __name__ == "__main__": 317 | sys.exit(main()) 318 | -------------------------------------------------------------------------------- /src/data/skills.csv: -------------------------------------------------------------------------------- 1 | technical 
skills,ajenti,django-suit,django-xadmin,flask-admin,flower,grappelli,wooey,algorithms,pypattyrn,python-patterns,sortedcontainers,django-simple-captcha,django-simple-spam-blocker,django-compressor,django-pipeline,django-storages,fanstatic,fileconveyor,flask-assets,jinja-assets-compressor,webassets,audiolazy,audioread,beets,dejavu,django-elastic-transcoder,eyed3,id3reader,m3u8,mingus,pyaudioanalysis,pydub,pyechonest,talkbox,timeside,tinytag,authomatic,django-allauth,django-oauth-toolkit,flask-oauthlib,oauthlib,python-oauth2,python-social-auth,rauth,sanction,jose,pyjwt,python-jws,python-jwt,bitbake,buildout,platformio,pybuilder,scons,django-cms,djedi-cms,feincms,kotti,mezzanine,opps,plone,quokka,wagtail,widgy,beaker,diskcache,django-cache-machine,django-cacheops,django-viewlet,dogpile.cache,hermescache,johnny-cache,pylibmc,errbot,coala,code2flow,pycallgraph,flake8,pylama,pylint,mypy,asciimatics,cement,click,cliff,clint,colorama,docopt,gooey,python-fire,python-prompt-toolkit,aws-cli,bashplotlib,caniusepython3,cookiecutter,doitlive,howdoi,httpie,mycli,pathpicker,percol,pgcli,saws,thefuck,try,python-future,python-modernize,six,opencv,pyocr,pytesseract,simplecv,eventlet,gevent,multiprocessing,threading,tomorrow,uvloop,config,configobj,configparser,profig,python-decouple,cryptography,hashids,paramiko,passlib,pynacl,blaze,orange,pandas,cerberus,colander,jsonschema,schematics,valideer,voluptuous,altair,bokeh,ggplot,matplotlib,pygal,pygraphviz,pyqtgraph,seaborn,vispy,pickledb,pipelinedb,tinydb,zodb,mysql,mysql-python,mysqlclient,oursql,pymysql,postgresql,psycopg2,queries,txpostgres,apsw,pymssql,nosql,cassandra-python-driver,happybase,plyvel,py2neo,pycassa,pymongo,redis-py,telephus,txredis,arrow,chronyk,dateutil,delorean,moment,pendulum,pytime,pytz,when.py,ipdb,pdb++,pudb,remote-pdb,wdb,line_profiler,memory_profiler,profiling,vprof,caffe,keras,mxnet,neupy,pytorch,tensorflow,theano,ansible,cloud-init,cuisine,docker,fabric,fabtools,honcho,openstack,pexpect,psutil,saltstack,superv
isor,dh-virtualenv,nuitka,py2app,py2exe,pyinstaller,pynsist,sphinx,awesome-sphinxdoc,mkdocs,pdoc,pycco,s3cmd,s4cmd,you-get,youtube-dl,alipay,cartridge,django-oscar,django-shop,merchant,money,python-currencies,forex-python,shoop,emacs,elpy,sublime,anaconda,sublimejedi,vim,jedi-vim,python-mode,youcompleteme,ptvs,visual,python,magic,liclipse,pycharm,spyder,envelopes,flanker,imbox,inbox.py,lamson,marrow,modoboa,nylas,yagmail,pipenv,p,pyenv,venv,virtualenv,virtualenvwrapper,imghdr,mimetypes,path.py,pathlib,python-magic,unipath,watchdog,cffi,ctypes,pycuda,swig,deform,django-bootstrap3,django-crispy-forms,django-remote-forms,wtforms,cytoolz,fn.py,funcy,toolz,curses,enaml,flexx,kivy,pyglet,pygobject,pyqt,pyside,pywebview,tkinter,toga,urwid,wxpython,cocos2d,panda3d,pygame,pyogre,pyopengl,pysdl2,renpy,django-countries,geodjango,geoip,geojson,geopy,pygeoip,beautifulsoup,bleach,cssutils,html5lib,lxml,markupsafe,pyquery,untangle,weasyprint,xmldataset,xmltodict,grequests,httplib2,requests,treq,urllib3,ino,keyboard,mouse,pingo,pyro,pyuserinput,scapy,wifi,hmap,imgseek,nude.py,pagan,pillow,pybarcode,pygram,python-qrcode,quads,scikit-image,thumbor,wand,clpython,cpython,cython,grumpy,ironpython,jython,micropython,numba,peachpy,pyjion,pypy,pysec,pyston,stackless,interactive,bpython,jupyter,ptpython,babel,pyicu,apscheduler,django-schedule,doit,gunnery,joblib,plan,schedule,spiff,taskflow,eliot,logbook,logging,sentry,metrics,nupic,scikit-learn,spark,vowpal_porpoise,xgboost,pyspark,luigi,mrjob,streamparse,dask,python(x,y),pythonlibs,pythonnet,pywin32,winpython,gensim,jieba,langid.py,nltk,pattern,polyglot,snownlp,spacy,textblob,mininet,pox,pyretic,sdx,asyncio,diesel,pulsar,pyzmq,twisted,txzmq,napalm,django-activity-stream,stream-framework,django,sqlalchemy,awesome-sqlalchemy,orator,peewee,ponyorm,pydal,python-sql,pip,conda,curdling,pip-tools,wheel,warehouse,bandersnatch,devpi,localshop,carteblanche,django-guardian,django-rules,delegator.py subprocesses 
for,sarge,sh,celery,huey,mrq,rq,simpleq,annoy,fastfm,implicit,libffm,lightfm,surprise,tensorrec,django-rest-framework,django-tastypie,flask,eve,flask-api-utils,flask-api,flask-restful,flask-restless,pyramid,cornice,falcon,hug,restless,ripozo,sandman,apistar,simplejsonrpcserver,simplexmlrpcserver,zerorpc,astropy,bcbio-nextgen,bccb,biopython,cclib,networkx,nipy,numpy,obspy,pydy,pymc,rdkit,scipy,statsmodels,sympy,zipline,simpy,django-haystack,elasticsearch-dsl-py,elasticsearch-py,esengine,pysolr,solrpy,whoosh,marshmallow,apex,python-lambda,zappa,tablib,marmir,openpyxl,pyexcel,python-docx,relatorio,unoconv,xlsxwriter,xlwings,xlwt / xlrd,pdf,pdfminer,pypdf2,reportlab,markdown,mistune,python-markdown,yaml,pyyaml,csvkit,unp,cactus,hyde,lektor,nikola,pelican,tinkerer,django-taggit,genshi,jinja2,mako,hypothesis,mamba,nose,nose2,pytest,robot,unittest,green,tox,locust,pyautogui,selenium,sixpack,splinter,doublex,freezegun,httmock,httpretty,mock,responses,vcr.py,factory_boy,mixer,model_mommy,mimesis,fake2db,faker,radar,chardet,difflib,ftfy,fuzzywuzzy,levenshtein,pangu.py,pyfiglet,pypinyin,shortuuid,unidecode,uniout,xpinyin,slugify,awesome-slugify,python-slugify,unicode-slugify,parser,phonenumbers,ply,pygments,pyparsing,python-nameparser,python-user-agents,sqlparse,apache-libcloud,boto3,django-wordpress,facebook-sdk,facepy,gmail,google-api-python-client,gspread,twython,furl,purl,pyshorteners,short_url,webargs,moviepy,scikit-video,wsgi-compatible,bjoern,fapws3,gunicorn,meinheld,netius,paste,rocket,uwsgi,waitress,werkzeug,haul,html2text,lassie,micawber,newspaper,opengraph,python-goose,python-readability,sanitize,sumy,textract,cola,demiurge,feedparser,grab,mechanicalsoup,portia,pyspider,robobrowser,scrapy,bottle,cherrypy,awesome-django,awesome-flask,awesome-pyramid,sanic,tornado,turbogears,web2py,github,autobahnpython,crossbar,django-socketio,websocket-for-python,javascript,php,c#,c++,ruby,css,c,objective-c,shell,scala,swift,matlab,clojure,octave,machine learning,data 
analytics,predictive analytics,html,js,accounts payable,receivables,inventory controls,payroll,deposits,bank reconciliation,planning and enacting cash-flows,report preparation,financial models,financial controls,documentation,time management,schedules,benchmarking,future state assessment,business process re-engineering,as-is analysis,defining solutions and scope,gap analysis,role change,wireframing,prototyping,user stories,financial analysis/modeling,swot analysis,quickbooks,quicken,erp,enterprise resource planning,spanish,german,rest,soap,json,website,ui,ux,design,crm,cms,communication,coding,windows,servers,unix,linux,redhat,solaris,java,perl,vb script,xml,database,oracle,microsoft sql,sql,microsoft word,microsoft powerpoint,powerpoint,word,excel,visio,microsoft visio,microsoft excel,adobe,photoshop,hadoop,hbase,hive,zookeeper,openserver,auto cad,pl/sql,ruby on rails,asp,jsp,operations,technical,training,sales,marketing,reporting,compliance,strategy,research,analytical,engineering,policies,budget,finance,project management,health,customer service,content,presentation,brand,presentations,safety,certification,seo,digital marketing,accounting,regulations,legal,engagement,analytics,distribution,coaching,testing,vendors,consulting,writing,contracts,inventory,retail,healthcare,regulatory,scheduling,construction,logistics,mobile,c�(programming language),correspondence,controls,human resources,specifications,recruitment,procurement,partnership,partnerships,management experience,negotiation,hardware,programming,agile,forecasting,advertising,business development,audit,architecture,supply chain,governance,staffing,continuous improvement,product development,networking,recruiting,product management,sap,troubleshooting,computer science,budgeting,electrical,customer experience,economics,information technology,transportation,social media,automation,lifecycle,filing,modeling,investigation,editing,purchasing,kpis,hospital,forecasts,acquisition,expenses,billing,workflow,product 
owner,analyze,cross functional,business process,process,improvement,pivot tables,pivot,vlookups,sharepoint,microsoft sharepoint,access database,access,test case,jira,tfs,hp alm,tableau,business object,business intelligence,jad,solicitation,kaban,vue.js,sketch,indesign,illustrator,english,french,active directory,data center,solution architecture,dns,network design,open source,desktop support,application support,administration,change management,video,invoices,administrative support,payments,lean,process improvement,installation,risk management,transactions,investigations,r (programming language),data analysis,statistics,protocols,program management,quality assurance,banking,outreach,sourcing,microsoft office,merchandising,r,teaching,pharmaceutical,fulfillment,positioning,tax,service delivery,investigate,editorial,account management,valid drivers license,electronics,pr,public relations,assembly,facebook,spreadsheets,recruit,proposal,data entry,hotel,ordering,branding,life cycle,real estate,relationship management,researching,process improvements,chemistry,saas,cad,sales experience,mathematics,customer-facing,audio,project management skills,six sigma,hospitality,mechanical engineering,auditing,employee relations,android,security clearance,licensing,fundraising,repairs,iso,market research,business strategy,pmp,data management,quality control,reconciliation,conversion,business analysis,financial analysis,ecommerce,client service,publishing,supervising,complex projects,key performance indicators,scrum,sports,e-commerce,journalism,d (programming language),data collection,higher education,marketing programs,financial management,business plans,user experience,client relationships,cloud,analytical skills,cisco,internal stakeholders,product marketing,regulatory requirements,itil,information security,aviation,supply chain management,industry experience,autocad,purchase orders,acquisitions,tv,instrumentation,strategic direction,law enforcement,call center,experiments,technical 
skills,human resource,business cases,build relationships,invoicing,support services,marketing strategy,operating systems,biology,start-up,electrical engineering,workflows,routing,non-profit,marketing plans,due diligence,business management,iphone,architectures,reconcile,dynamic environment,external partners,asset management,emea,intranet,sops,sas,digital media,prospecting,financial reporting,project delivery,operational excellence,standard operating procedures,technical knowledge,on-call,talent management,stakeholder management,tablets,analyze data,financial statements,microsoft office suite,fitness,case management,value proposition,industry trends,rfp,broadcast,portfolio management,fabrication,financial performance,customer requirements,psychology,marketing materials,resource management,physics,mortgage,development activities,end user,business planning,root cause,analysis,leadership development,relationship building,sdlc,on-boarding,quality standards,regulatory compliance,aws,kpi,status reports,product line,drafting,phone calls,product knowledge,business stakeholders,technical issues,admissions,supervisory experience,usability,pharmacy,commissioning,project plan,ms excel,fda,test plans,variances,financing,travel arrangements,internal customers,medical device,counsel,inventory management,performance metrics,lighting,outsourcing,performance improvement,management consulting,graphic design,transport,information management,.net,startup,matrix,front-end,project planning,business systems,accounts receivable,public health,hris,instructional design,in-store,employee engagement,cost effective,sales management,api,adobe creative suite,twitter,program development,event planning,cash flow,strategic plans,vendor management,trade shows,hotels,segmentation,contract management,gis,talent acquisition,photography,internal communications,client services,ibm,financial reports,product quality,beverage,strong analytical skills,underwriting,cpr,mining,sales 
goals,chemicals,scripting,migration,software engineering,mis,therapeutic,general ledger,ms project,standardization,retention,spelling,media relations,os,daily operations,immigration,product design,etl,field sales,driving record,peoplesoft,benchmark,quality management,apis,test cases,internal controls,telecom,business issues,research projects,data quality,strategic initiatives,office software,cfa,co-op,big data,journal entries,vmware,help desk,statistical analysis,datasets,alliances,solidworks,prototype,lan,sci,budget management,rfps,flex,gaap,experimental,cpg,information system,customer facing,process development,web services,international,travel,revenue growth,software development life cycle,operations management,computer applications,risk assessments,sales operations,raw materials,internal audit,physical security,sql server,affiliate,computer software,manage projects,business continuity,litigation,it infrastructure,cost reduction,small business,annual budget,ios,html5,real-time,consulting experience,circuits,risk assessment,cross-functional team,public policy,analyzing data,consulting services,google drive,ad words,pay per click,email,db2,expense tracking,reports,wordpress,yoast,ghostwriting,corel draw,automated billing,system,customer management,debugging,system administration,network configuration,software installation,security,tech support,updates,tci/ip,dhcp,wan/lan,ubuntu,virtualized networks,network automation,cloud management,ai,salesforce,mango db,math,calculus,product launch,mvp 2 | -------------------------------------------------------------------------------- /src/notebook/skills.csv: -------------------------------------------------------------------------------- 1 | technical 
skills,ajenti,django-suit,django-xadmin,flask-admin,flower,grappelli,wooey,algorithms,pypattyrn,python-patterns,sortedcontainers,django-simple-captcha,django-simple-spam-blocker,django-compressor,django-pipeline,django-storages,fanstatic,fileconveyor,flask-assets,jinja-assets-compressor,webassets,audiolazy,audioread,beets,dejavu,django-elastic-transcoder,eyed3,id3reader,m3u8,mingus,pyaudioanalysis,pydub,pyechonest,talkbox,timeside,tinytag,authomatic,django-allauth,django-oauth-toolkit,flask-oauthlib,oauthlib,python-oauth2,python-social-auth,rauth,sanction,jose,pyjwt,python-jws,python-jwt,bitbake,buildout,platformio,pybuilder,scons,django-cms,djedi-cms,feincms,kotti,mezzanine,opps,plone,quokka,wagtail,widgy,beaker,diskcache,django-cache-machine,django-cacheops,django-viewlet,dogpile.cache,hermescache,johnny-cache,pylibmc,errbot,coala,code2flow,pycallgraph,flake8,pylama,pylint,mypy,asciimatics,cement,click,cliff,clint,colorama,docopt,gooey,python-fire,python-prompt-toolkit,aws-cli,bashplotlib,caniusepython3,cookiecutter,doitlive,howdoi,httpie,mycli,pathpicker,percol,pgcli,saws,thefuck,try,python-future,python-modernize,six,opencv,pyocr,pytesseract,simplecv,eventlet,gevent,multiprocessing,threading,tomorrow,uvloop,config,configobj,configparser,profig,python-decouple,cryptography,hashids,paramiko,passlib,pynacl,blaze,orange,pandas,cerberus,colander,jsonschema,schematics,valideer,voluptuous,altair,bokeh,ggplot,matplotlib,pygal,pygraphviz,pyqtgraph,seaborn,vispy,pickledb,pipelinedb,tinydb,zodb,mysql,mysql-python,mysqlclient,oursql,pymysql,postgresql,psycopg2,queries,txpostgres,apsw,pymssql,nosql,cassandra-python-driver,happybase,plyvel,py2neo,pycassa,pymongo,redis-py,telephus,txredis,arrow,chronyk,dateutil,delorean,moment,pendulum,pytime,pytz,when.py,ipdb,pdb++,pudb,remote-pdb,wdb,line_profiler,memory_profiler,profiling,vprof,caffe,keras,mxnet,neupy,pytorch,tensorflow,theano,ansible,cloud-init,cuisine,docker,fabric,fabtools,honcho,openstack,pexpect,psutil,saltstack,superv
isor,dh-virtualenv,nuitka,py2app,py2exe,pyinstaller,pynsist,sphinx,awesome-sphinxdoc,mkdocs,pdoc,pycco,s3cmd,s4cmd,you-get,youtube-dl,alipay,cartridge,django-oscar,django-shop,merchant,money,python-currencies,forex-python,shoop,emacs,elpy,sublime,anaconda,sublimejedi,vim,jedi-vim,python-mode,youcompleteme,ptvs,visual,python,magic,liclipse,pycharm,spyder,envelopes,flanker,imbox,inbox.py,lamson,marrow,modoboa,nylas,yagmail,pipenv,p,pyenv,venv,virtualenv,virtualenvwrapper,imghdr,mimetypes,path.py,pathlib,python-magic,unipath,watchdog,cffi,ctypes,pycuda,swig,deform,django-bootstrap3,django-crispy-forms,django-remote-forms,wtforms,cytoolz,fn.py,funcy,toolz,curses,enaml,flexx,kivy,pyglet,pygobject,pyqt,pyside,pywebview,tkinter,toga,urwid,wxpython,cocos2d,panda3d,pygame,pyogre,pyopengl,pysdl2,renpy,django-countries,geodjango,geoip,geojson,geopy,pygeoip,beautifulsoup,bleach,cssutils,html5lib,lxml,markupsafe,pyquery,untangle,weasyprint,xmldataset,xmltodict,grequests,httplib2,requests,treq,urllib3,ino,keyboard,mouse,pingo,pyro,pyuserinput,scapy,wifi,hmap,imgseek,nude.py,pagan,pillow,pybarcode,pygram,python-qrcode,quads,scikit-image,thumbor,wand,clpython,cpython,cython,grumpy,ironpython,jython,micropython,numba,peachpy,pyjion,pypy,pysec,pyston,stackless,interactive,bpython,jupyter,ptpython,babel,pyicu,apscheduler,django-schedule,doit,gunnery,joblib,plan,schedule,spiff,taskflow,eliot,logbook,logging,sentry,metrics,nupic,scikit-learn,spark,vowpal_porpoise,xgboost,pyspark,luigi,mrjob,streamparse,dask,python(x,y),pythonlibs,pythonnet,pywin32,winpython,gensim,jieba,langid.py,nltk,pattern,polyglot,snownlp,spacy,textblob,mininet,pox,pyretic,sdx,asyncio,diesel,pulsar,pyzmq,twisted,txzmq,napalm,django-activity-stream,stream-framework,django,sqlalchemy,awesome-sqlalchemy,orator,peewee,ponyorm,pydal,python-sql,pip,conda,curdling,pip-tools,wheel,warehouse,bandersnatch,devpi,localshop,carteblanche,django-guardian,django-rules,delegator.py subprocesses 
for,sarge,sh,celery,huey,mrq,rq,simpleq,annoy,fastfm,implicit,libffm,lightfm,surprise,tensorrec,django-rest-framework,django-tastypie,flask,eve,flask-api-utils,flask-api,flask-restful,flask-restless,pyramid,cornice,falcon,hug,restless,ripozo,sandman,apistar,simplejsonrpcserver,simplexmlrpcserver,zerorpc,astropy,bcbio-nextgen,bccb,biopython,cclib,networkx,nipy,numpy,obspy,pydy,pymc,rdkit,scipy,statsmodels,sympy,zipline,simpy,django-haystack,elasticsearch-dsl-py,elasticsearch-py,esengine,pysolr,solrpy,whoosh,marshmallow,apex,python-lambda,zappa,tablib,marmir,openpyxl,pyexcel,python-docx,relatorio,unoconv,xlsxwriter,xlwings,xlwt / xlrd,pdf,pdfminer,pypdf2,reportlab,markdown,mistune,python-markdown,yaml,pyyaml,csvkit,unp,cactus,hyde,lektor,nikola,pelican,tinkerer,django-taggit,genshi,jinja2,mako,hypothesis,mamba,nose,nose2,pytest,robot,unittest,green,tox,locust,pyautogui,selenium,sixpack,splinter,doublex,freezegun,httmock,httpretty,mock,responses,vcr.py,factory_boy,mixer,model_mommy,mimesis,fake2db,faker,radar,chardet,difflib,ftfy,fuzzywuzzy,levenshtein,pangu.py,pyfiglet,pypinyin,shortuuid,unidecode,uniout,xpinyin,slugify,awesome-slugify,python-slugify,unicode-slugify,parser,phonenumbers,ply,pygments,pyparsing,python-nameparser,python-user-agents,sqlparse,apache-libcloud,boto3,django-wordpress,facebook-sdk,facepy,gmail,google-api-python-client,gspread,twython,furl,purl,pyshorteners,short_url,webargs,moviepy,scikit-video,wsgi-compatible,bjoern,fapws3,gunicorn,meinheld,netius,paste,rocket,uwsgi,waitress,werkzeug,haul,html2text,lassie,micawber,newspaper,opengraph,python-goose,python-readability,sanitize,sumy,textract,cola,demiurge,feedparser,grab,mechanicalsoup,portia,pyspider,robobrowser,scrapy,bottle,cherrypy,awesome-django,awesome-flask,awesome-pyramid,sanic,tornado,turbogears,web2py,github,autobahnpython,crossbar,django-socketio,websocket-for-python,javascript,php,c#,c++,ruby,css,c,objective-c,shell,scala,swift,matlab,clojure,octave,machine learning,data 
analytics,predictive analytics,html,js,accounts payable,receivables,inventory controls,payroll,deposits,bank reconciliation,planning and enacting cash-flows,report preparation,financial models,financial controls,documentation,time management,schedules,benchmarking,future state assessment,business process re-engineering,as-is analysis,defining solutions and scope,gap analysis,role change,wireframing,prototyping,user stories,financial analysis/modeling,swot analysis,quickbooks,quicken,erp,enterprise resource planning,spanish,german,rest,soap,json,website,ui,ux,design,crm,cms,communication,coding,windows,servers,unix,linux,redhat,solaris,java,perl,vb script,xml,database,oracle,microsoft sql,sql,microsoft word,microsoft powerpoint,powerpoint,word,excel,visio,microsoft visio,microsoft excel,adobe,photoshop,hadoop,hbase,hive,zookeeper,openserver,auto cad,pl/sql,ruby on rails,asp,jsp,operations,technical,training,sales,marketing,reporting,compliance,strategy,research,analytical,engineering,policies,budget,finance,project management,health,customer service,content,presentation,brand,presentations,safety,certification,seo,digital marketing,accounting,regulations,legal,engagement,analytics,distribution,coaching,testing,vendors,consulting,writing,contracts,inventory,retail,healthcare,regulatory,scheduling,construction,logistics,mobile,c�(programming language),correspondence,controls,human resources,specifications,recruitment,procurement,partnership,partnerships,management experience,negotiation,hardware,programming,agile,forecasting,advertising,business development,audit,architecture,supply chain,governance,staffing,continuous improvement,product development,networking,recruiting,product management,sap,troubleshooting,computer science,budgeting,electrical,customer experience,economics,information technology,transportation,social media,automation,lifecycle,filing,modeling,investigation,editing,purchasing,kpis,hospital,forecasts,acquisition,expenses,billing,workflow,product 
owner,analyze,cross functional,business process,process,improvement,pivot tables,pivot,vlookups,sharepoint,microsoft sharepoint,access database,access,test case,jira,tfs,hp alm,tableau,business object,business intelligence,jad,solicitation,kaban,vue.js,sketch,indesign,illustrator,english,french,active directory,data center,solution architecture,dns,network design,open source,desktop support,application support,administration,change management,video,invoices,administrative support,payments,lean,process improvement,installation,risk management,transactions,investigations,r (programming language),data analysis,statistics,protocols,program management,quality assurance,banking,outreach,sourcing,microsoft office,merchandising,r,teaching,pharmaceutical,fulfillment,positioning,tax,service delivery,investigate,editorial,account management,valid drivers license,electronics,pr,public relations,assembly,facebook,spreadsheets,recruit,proposal,data entry,hotel,ordering,branding,life cycle,real estate,relationship management,researching,process improvements,chemistry,saas,cad,sales experience,mathematics,customer-facing,audio,project management skills,six sigma,hospitality,mechanical engineering,auditing,employee relations,android,security clearance,licensing,fundraising,repairs,iso,market research,business strategy,pmp,data management,quality control,reconciliation,conversion,business analysis,financial analysis,ecommerce,client service,publishing,supervising,complex projects,key performance indicators,scrum,sports,e-commerce,journalism,d (programming language),data collection,higher education,marketing programs,financial management,business plans,user experience,client relationships,cloud,analytical skills,cisco,internal stakeholders,product marketing,regulatory requirements,itil,information security,aviation,supply chain management,industry experience,autocad,purchase orders,acquisitions,tv,instrumentation,strategic direction,law enforcement,call center,experiments,technical 
skills,human resource,business cases,build relationships,invoicing,support services,marketing strategy,operating systems,biology,start-up,electrical engineering,workflows,routing,non-profit,marketing plans,due diligence,business management,iphone,architectures,reconcile,dynamic environment,external partners,asset management,emea,intranet,sops,sas,digital media,prospecting,financial reporting,project delivery,operational excellence,standard operating procedures,technical knowledge,on-call,talent management,stakeholder management,tablets,analyze data,financial statements,microsoft office suite,fitness,case management,value proposition,industry trends,rfp,broadcast,portfolio management,fabrication,financial performance,customer requirements,psychology,marketing materials,resource management,physics,mortgage,development activities,end user,business planning,root cause,analysis,leadership development,relationship building,sdlc,on-boarding,quality standards,regulatory compliance,aws,kpi,status reports,product line,drafting,phone calls,product knowledge,business stakeholders,technical issues,admissions,supervisory experience,usability,pharmacy,commissioning,project plan,ms excel,fda,test plans,variances,financing,travel arrangements,internal customers,medical device,counsel,inventory management,performance metrics,lighting,outsourcing,performance improvement,management consulting,graphic design,transport,information management,.net,startup,matrix,front-end,project planning,business systems,accounts receivable,public health,hris,instructional design,in-store,employee engagement,cost effective,sales management,api,adobe creative suite,twitter,program development,event planning,cash flow,strategic plans,vendor management,trade shows,hotels,segmentation,contract management,gis,talent acquisition,photography,internal communications,client services,ibm,financial reports,product quality,beverage,strong analytical skills,underwriting,cpr,mining,sales 
goals,chemicals,scripting,migration,software engineering,mis,therapeutic,general ledger,ms project,standardization,retention,spelling,media relations,os,daily operations,immigration,product design,etl,field sales,driving record,peoplesoft,benchmark,quality management,apis,test cases,internal controls,telecom,business issues,research projects,data quality,strategic initiatives,office software,cfa,co-op,big data,journal entries,vmware,help desk,statistical analysis,datasets,alliances,solidworks,prototype,lan,sci,budget management,rfps,flex,gaap,experimental,cpg,information system,customer facing,process development,web services,international,travel,revenue growth,software development life cycle,operations management,computer applications,risk assessments,sales operations,raw materials,internal audit,physical security,sql server,affiliate,computer software,manage projects,business continuity,litigation,it infrastructure,cost reduction,small business,annual budget,ios,html5,real-time,consulting experience,circuits,risk assessment,cross-functional team,public policy,analyzing data,consulting services,google drive,ad words,pay per click,email,db2,expense tracking,reports,wordpress,yoast,ghostwriting,corel draw,automated billing,system,customer management,debugging,system administration,network configuration,software installation,security,tech support,updates,tci/ip,dhcp,wan/lan,ubuntu,virtualized networks,network automation,cloud management,ai,salesforce,mango db,math,calculus,product launch,mvp 2 | -------------------------------------------------------------------------------- /src/components/jd_data_extractor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from time import sleep 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import 
# ---------------------------------------------------------------------------
# Glassdoor job-description scraper.
#   1) openbrowser/geturl harvest posting URLs for a keyword + location id.
#   2) scrape_job_details visits each URL and extracts the posting fields.
#   3) get_jobs is a generic Glassdoor scraper that also reads company info.
# NOTE(review): written against the Selenium 3 API (executable_path,
# find_element_by_*) and Glassdoor's old page markup; both are brittle, and
# the hard-coded Windows paths only exist on the original author's machine.
# ---------------------------------------------------------------------------

CHROMEDRIVER_PATH = r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\chromedriver_win32\chromedriver.exe'
OUTPUT_CSV = r'C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\src\data\jd_unstructured_data.csv'
URL_JSON = 'url_data_scientist_loc_bangalore.json'


def openbrowser(locid, key):
    """Open Chrome on the Glassdoor search-results page.

    Args:
        locid: Glassdoor numeric location id (e.g. 4477468 = Bangalore).
        key:   search keyword; spaces are joined with '+' for the query string.

    Returns:
        The live WebDriver positioned on the first results page.
    """
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    driver.wait = WebDriverWait(driver, 5)
    driver.maximize_window()
    query = '+'.join(key.split())
    driver.get(
        "https://www.glassdoor.co.in/Job/jobs.htm?suggestCount=0&suggestChosen=true"
        "&clickSource=searchBtn&typedKeyword={0}&sc.keyword={0}&locT=C&locId={1}"
        "&jobType=fulltime&fromAge=1&radius=6&cityId=-1&minRating=0.0&industryId=-1"
        "&sgocId=-1&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0"
        .format(query, locid))
    return driver


def geturl(driver, min_urls=20):
    """Collect posting URLs by paging through results until *min_urls* found.

    The driver is always quit before returning (the original leaked it when
    enough URLs were found before the last page).
    """
    urls = set()
    while len(urls) < min_urls:
        print(len(urls))
        soup = BeautifulSoup(driver.page_source, "lxml")
        for listing in soup.find_all("li", {"class": "jl"}):
            urls.add('https://www.glassdoor.co.in{}'.format(listing.find('a')['href']))
        # Stop when the "next page" arrow is missing or disabled.
        next_li = soup.find("li", {"class": "next"})
        if next_li is None or next_li.find('a') is None:
            break
        try:
            driver.find_element_by_class_name("next").click()
            time.sleep(2)
        except ElementClickInterceptedException:
            # An overlay intercepted the click; retry on the next pass.
            pass
    driver.quit()
    return list(urls)


def scrape_job_details(urls):
    """Visit each posting URL and extract its fields.

    Returns a dict keyed by 1-based index with url, Position, Company,
    Location and Job_Description per posting.  Postings whose markup cannot
    be parsed are skipped — the original reused the previous posting's
    values (a bug) and raised NameError if the very first posting failed.
    """
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
    data = {}
    for index, page_url in enumerate(tqdm(urls), start=1):
        driver.wait = WebDriverWait(driver, 2)
        driver.maximize_window()
        driver.get(page_url)
        try:
            position = driver.find_element_by_tag_name('h2').text
            company = driver.find_element_by_xpath("//span[@class='strong ib']").text
            location = driver.find_element_by_xpath("//span[@class='subtle ib']").text
            jd = driver.find_element_by_id("JobDescriptionContainer").text
        except NoSuchElementException:
            continue  # skip postings with unexpected markup
        data[index] = {
            'url': page_url,
            'Position': position,
            'Company': company,
            'Location': location,
            'Job_Description': jd,
        }
    driver.quit()
    return data


def _element_text_or_default(driver, xpath, default=-1):
    """Return ``.text`` of the element at *xpath*, or *default* when absent."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default


def _company_overview(driver):
    """Read the company "Overview" tab; every missing field defaults to -1.

    Rarely, some job postings do not have the "Company" tab at all — then
    all fields are -1 (matches the original behaviour).
    """
    labels = ("Headquarters", "Size", "Founded", "Type",
              "Industry", "Sector", "Revenue", "Competitors")
    try:
        driver.find_element_by_xpath(
            './/div[@class="tab" and @data-tab-type="overview"]').click()
    except NoSuchElementException:
        return {label: -1 for label in labels}
    xpath = ('.//div[@class="infoEntity"]//label[text()="{}"]'
             '//following-sibling::*')
    return {label: _element_text_or_default(driver, xpath.format(label))
            for label in labels}


def get_jobs(keyword, num_jobs, verbose, path, slp_time):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Args:
        keyword:  search phrase typed into the Glassdoor search box.
        num_jobs: stop after this many postings have been collected.
        verbose:  print every scraped field for debugging.
        path:     filesystem path to the chromedriver executable.
        slp_time: seconds to wait for each results page to load.

    Returns:
        pandas.DataFrame with one row per posting.
    '''
    options = webdriver.ChromeOptions()
    # Uncomment to scrape without opening a new Chrome window every time.
    # options.add_argument('headless')
    driver = webdriver.Chrome(executable_path=path, options=options)
    driver.set_window_size(1120, 1000)

    driver.get("https://www.glassdoor.com/Job/jobs.htm?suggestCount=0"
               "&suggestChosen=false&clickSource=searchBtn&typedKeyword="
               + keyword + "&sc.keyword=" + keyword + "&locT=&locId=&jobType=")
    jobs = []

    while len(jobs) < num_jobs:
        # Let the page load; tune slp_time to your connection speed.
        time.sleep(slp_time)

        # Dismiss the "Sign Up" prompt if it appears.
        try:
            driver.find_element_by_class_name("selected").click()
        except ElementClickInterceptedException:
            pass
        time.sleep(.1)
        try:
            driver.find_element_by_css_selector('[alt="Close"]').click()
            print(' x out worked')
        except NoSuchElementException:
            print(' x out failed')

        # One clickable button per listing on the current results page.
        job_buttons = driver.find_elements_by_class_name("jl")
        for job_button in job_buttons:
            print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
            if len(jobs) >= num_jobs:
                break
            job_button.click()
            time.sleep(1)

            # Bounded retry while the details pane loads.  The original used
            # a bare `except` in an unbounded loop, which could hang forever
            # on a posting that never renders.
            details = None
            for _ in range(5):
                try:
                    details = {
                        "Company Name": driver.find_element_by_xpath('.//div[@class="employerName"]').text,
                        "Location": driver.find_element_by_xpath('.//div[@class="location"]').text,
                        "Job Title": driver.find_element_by_xpath('.//div[contains(@class, "title")]').text,
                        "Job Description": driver.find_element_by_xpath('.//div[@class="jobDescriptionContent desc"]').text,
                    }
                    break
                except NoSuchElementException:
                    time.sleep(5)
            if details is None:
                continue  # give up on this posting

            # -1 is the "not found" sentinel; downstream code relies on it.
            salary_estimate = _element_text_or_default(driver, './/span[@class="gray salary"]')
            rating = _element_text_or_default(driver, './/span[@class="rating"]')
            overview = _company_overview(driver)

            if verbose:
                print("Job Title: {}".format(details["Job Title"]))
                print("Salary Estimate: {}".format(salary_estimate))
                print("Job Description: {}".format(details["Job Description"][:500]))
                print("Rating: {}".format(rating))
                print("Company Name: {}".format(details["Company Name"]))
                print("Location: {}".format(details["Location"]))
                print("Headquarters: {}".format(overview["Headquarters"]))
                print("Size: {}".format(overview["Size"]))
                print("Founded: {}".format(overview["Founded"]))
                print("Type of Ownership: {}".format(overview["Type"]))
                print("Industry: {}".format(overview["Industry"]))
                print("Sector: {}".format(overview["Sector"]))
                print("Revenue: {}".format(overview["Revenue"]))
                print("Competitors: {}".format(overview["Competitors"]))

            jobs.append({"Job Title": details["Job Title"],
                         "Salary Estimate": salary_estimate,
                         "Job Description": details["Job Description"],
                         "Rating": rating,
                         "Company Name": details["Company Name"],
                         "Location": details["Location"],
                         "Headquarters": overview["Headquarters"],
                         "Size": overview["Size"],
                         "Founded": overview["Founded"],
                         "Type of ownership": overview["Type"],
                         "Industry": overview["Industry"],
                         "Sector": overview["Sector"],
                         "Revenue": overview["Revenue"],
                         "Competitors": overview["Competitors"]})

        # Advance to the next results page, or stop if there is none.
        try:
            driver.find_element_by_xpath('.//li[@class="next"]//a').click()
        except NoSuchElementException:
            print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            break

    return pd.DataFrame(jobs)


def main():
    """Run the full pipeline: URL harvest -> detail scrape -> CSV export."""
    search_driver = openbrowser(locid=4477468, key='"Data Scientist"')
    with open(URL_JSON, 'w') as f:
        json.dump(geturl(search_driver), f, indent=4)
    print("file created")

    with open(URL_JSON, 'r') as f:
        urls = json.load(f)

    jd = pd.DataFrame(scrape_job_details(urls)).transpose()
    jd = jd[['url', 'Position', 'Company', 'Location', 'Job_Description']]
    jd.to_csv(OUTPUT_CSV)
    print('file created')

    # BUG FIX: the original passed the live WebDriver object as the
    # chromedriver *path* argument of get_jobs; pass the path instead.
    unstructured_data_df = get_jobs('data scientist', 1000, False,
                                    CHROMEDRIVER_PATH, 15)
    unstructured_data_df.to_csv(OUTPUT_CSV, index=False)


if __name__ == "__main__":
    main()
| 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | -------------------------------------------------------------------------------- /myenv/Scripts/dumppdf.py: -------------------------------------------------------------------------------- 1 | #!C:\Users\Admin\ML_Projects\Job_Recommendation_System\Job-Recommendation-System\myenv\Scripts\python.exe 2 | """Extract pdf structure in XML format""" 3 | import logging 4 | import os.path 5 | import re 6 | import sys 7 | from typing import Any, 
logging.basicConfig()
logger = logging.getLogger(__name__)

# Characters that must become numeric XML entities: C0 controls, the XML
# metacharacters & < > ", PDF string delimiters ( ) ' \, and all non-ASCII.
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def escape(s: Union[str, bytes]) -> str:
    """Return *s* with XML-unsafe characters replaced by ``&#NNN;`` entities.

    Bytes input is interpreted as latin-1 so every byte maps to one char.
    """
    if isinstance(s, bytes):
        us = str(s, "latin-1")
    else:
        us = s
    return ESC_PAT.sub(lambda m: "&#%d;" % ord(m.group(0)), us)


def dumpxml(out: TextIO, obj: object, codec: Optional[str] = None) -> None:
    """Recursively serialize a pdfminer PDF object as XML onto *out*.

    The render this file was recovered from stripped every XML tag literal
    out of the ``out.write`` format strings (e.g. ``'\\n' % len(obj)``, a
    guaranteed TypeError); they are restored here from the pdfminer.six
    ``dumppdf`` tool.
    """
    if obj is None:
        out.write("<null />")
        return

    if isinstance(obj, dict):
        out.write('<dict size="%d">\n' % len(obj))
        for (k, v) in obj.items():
            out.write("<key>%s</key>\n" % k)
            out.write("<value>")
            dumpxml(out, v)
            out.write("</value>\n")
        out.write("</dict>")
        return

    if isinstance(obj, list):
        out.write('<list size="%d">\n' % len(obj))
        for v in obj:
            dumpxml(out, v)
            out.write("\n")
        out.write("</list>")
        return

    if isinstance(obj, (str, bytes)):
        out.write('<string size="%d">%s</string>' % (len(obj), escape(obj)))
        return

    if isinstance(obj, PDFStream):
        if codec == "raw":
            # Bug (upstream): writing bytes to a text stream raises TypeError.
            out.write(obj.get_rawdata())  # type: ignore [arg-type]
        elif codec == "binary":
            # Bug (upstream): writing bytes to a text stream raises TypeError.
            out.write(obj.get_data())  # type: ignore [arg-type]
        else:
            out.write("<stream>\n<props>\n")
            dumpxml(out, obj.attrs)
            out.write("\n</props>\n")
            if codec == "text":
                data = obj.get_data()
                out.write('<data size="%d">%s</data>\n' % (len(data), escape(data)))
            out.write("</stream>")
        return

    if isinstance(obj, PDFObjRef):
        out.write('<ref id="%d" />' % obj.objid)
        return

    if isinstance(obj, PSKeyword):
        # Likely bug (upstream): obj.name may be bytes, not str.
        out.write("<keyword>%s</keyword>" % obj.name)  # type: ignore [str-bytes-safe]
        return

    if isinstance(obj, PSLiteral):
        # Likely bug (upstream): obj.name may be bytes, not str.
        out.write("<literal>%s</literal>" % obj.name)  # type: ignore [str-bytes-safe]
        return

    if isnumber(obj):
        out.write("<number>%s</number>" % obj)
        return

    raise TypeError(obj)


def dumptrailers(
    out: TextIO, doc: PDFDocument, show_fallback_xref: bool = False
) -> None:
    """Dump every trailer dictionary in *doc* as ``<trailer>`` elements."""
    for xref in doc.xrefs:
        if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
            out.write("<trailer>\n")
            dumpxml(out, xref.get_trailer())
            out.write("\n</trailer>\n\n")
    no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
    if no_xrefs and not show_fallback_xref:
        msg = (
            "This PDF does not have an xref. Use --show-fallback-xref if "
            "you want to display the content of a fallback xref that "
            "contains all objects."
        )
        logger.warning(msg)
    return
def dumpallobjs(
    out: TextIO,
    doc: PDFDocument,
    codec: Optional[str] = None,
    show_fallback_xref: bool = False,
) -> None:
    """Dump every object in *doc* (plus trailers) wrapped in a ``<pdf>`` root.

    XML tag literals were stripped by the render this file was recovered
    from; restored from the pdfminer.six ``dumppdf`` tool.
    """
    visited = set()
    out.write("<pdf>")
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue  # the same objid can appear in several xref tables
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                out.write('<object id="%d">\n' % objid)
                dumpxml(out, obj, codec=codec)
                out.write("\n</object>\n\n")
            except PDFObjectNotFound as e:
                print("not found: %r" % e)
    dumptrailers(out, doc, show_fallback_xref)
    out.write("</pdf>")
    return


def dumpoutline(
    outfp: TextIO,
    fname: str,
    objids: Any,
    pagenos: Container[int],
    password: str = "",
    dumpall: bool = False,
    codec: Optional[str] = None,
    extractdir: Optional[str] = None,
) -> None:
    """Dump the document outline (table of contents) of *fname* as XML.

    objids/pagenos/dumpall/codec/extractdir are unused here; the signature
    is kept parallel with the other sub-commands so the CLI can dispatch
    uniformly.
    """
    fp = open(fname, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    # Map page object ids to 1-based page numbers so destinations can be
    # reported as printable page numbers.
    pages = {
        page.pageid: pageno
        for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
    }

    def resolve_dest(dest: object) -> Any:
        # Follow named destinations, /D dictionaries and indirect refs
        # until we reach the concrete destination array.
        if isinstance(dest, (str, bytes)):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest["D"]
        if isinstance(dest, PDFObjRef):
            dest = dest.resolve()
        return dest

    try:
        outlines = doc.get_outlines()
        outfp.write("<outlines>\n")
        for (level, title, dest, a, se) in outlines:
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                # No direct destination: try a /GoTo action instead.
                action = a
                if isinstance(action, dict):
                    subtype = action.get("S")
                    if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                        dest = resolve_dest(action["D"])
                        pageno = pages[dest[0].objid]
            s = escape(title)
            outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
            if dest is not None:
                outfp.write("<dest>")
                dumpxml(outfp, dest)
                outfp.write("</dest>\n")
            if pageno is not None:
                outfp.write("<pageno>%r</pageno>\n" % pageno)
            outfp.write("</outline>\n")
        outfp.write("</outlines>\n")
    except PDFNoOutlines:
        # A PDF without an outline is not an error; emit nothing.
        pass
    parser.close()
    fp.close()
    return


LITERAL_FILESPEC = LIT("Filespec")
LITERAL_EMBEDDEDFILE = LIT("EmbeddedFile")


def extractembedded(fname: str, password: str, extractdir: str) -> None:
    """Extract every embedded file in *fname* into *extractdir*.

    Raises PDFValueError for malformed Filespec entries and IOError if a
    target file already exists (never overwrites).
    """

    def extract1(objid: int, obj: Dict[str, Any]) -> None:
        # Prefer the Unicode file name (/UF) over the legacy /F entry.
        filename = os.path.basename(obj.get("UF") or cast(bytes, obj.get("F")).decode())
        fileref = obj["EF"].get("UF") or obj["EF"].get("F")
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            error_msg = (
                "unable to process PDF: reference for %r is not a "
                "PDFStream" % filename
            )
            raise PDFValueError(error_msg)
        if fileobj.get("Type") is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                "unable to process PDF: reference for %r "
                "is not an EmbeddedFile" % (filename)
            )
        path = os.path.join(extractdir, "%.6d-%s" % (objid, filename))
        if os.path.exists(path):
            raise IOError("file exists: %r" % path)
        print("extracting: %r" % path)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # `with` closes the output file even if get_data() raises
        # (the original leaked the handle on error).
        with open(path, "wb") as out:
            out.write(fileobj.get_data())
        return

    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        extracted_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if (
                    objid not in extracted_objids
                    and isinstance(obj, dict)
                    and obj.get("Type") is LITERAL_FILESPEC
                ):
                    extracted_objids.add(objid)
                    extract1(objid, obj)
    return
248 | extract1(objid, obj) 249 | return 250 | 251 | 252 | def dumppdf( 253 | outfp: TextIO, 254 | fname: str, 255 | objids: Iterable[int], 256 | pagenos: Container[int], 257 | password: str = "", 258 | dumpall: bool = False, 259 | codec: Optional[str] = None, 260 | extractdir: Optional[str] = None, 261 | show_fallback_xref: bool = False, 262 | ) -> None: 263 | fp = open(fname, "rb") 264 | parser = PDFParser(fp) 265 | doc = PDFDocument(parser, password) 266 | if objids: 267 | for objid in objids: 268 | obj = doc.getobj(objid) 269 | dumpxml(outfp, obj, codec=codec) 270 | if pagenos: 271 | for (pageno, page) in enumerate(PDFPage.create_pages(doc)): 272 | if pageno in pagenos: 273 | if codec: 274 | for obj in page.contents: 275 | obj = stream_value(obj) 276 | dumpxml(outfp, obj, codec=codec) 277 | else: 278 | dumpxml(outfp, page.attrs) 279 | if dumpall: 280 | dumpallobjs(outfp, doc, codec, show_fallback_xref) 281 | if (not objids) and (not pagenos) and (not dumpall): 282 | dumptrailers(outfp, doc, show_fallback_xref) 283 | fp.close() 284 | if codec not in ("raw", "binary"): 285 | outfp.write("\n") 286 | return 287 | 288 | 289 | def create_parser() -> ArgumentParser: 290 | parser = ArgumentParser(description=__doc__, add_help=True) 291 | parser.add_argument( 292 | "files", 293 | type=str, 294 | default=None, 295 | nargs="+", 296 | help="One or more paths to PDF files.", 297 | ) 298 | 299 | parser.add_argument( 300 | "--version", 301 | "-v", 302 | action="version", 303 | version="pdfminer.six v{}".format(pdfminer.__version__), 304 | ) 305 | parser.add_argument( 306 | "--debug", 307 | "-d", 308 | default=False, 309 | action="store_true", 310 | help="Use debug logging level.", 311 | ) 312 | procedure_parser = parser.add_mutually_exclusive_group() 313 | procedure_parser.add_argument( 314 | "--extract-toc", 315 | "-T", 316 | default=False, 317 | action="store_true", 318 | help="Extract structure of outline", 319 | ) 320 | procedure_parser.add_argument( 321 | 
"--extract-embedded", "-E", type=str, help="Extract embedded files" 322 | ) 323 | 324 | parse_params = parser.add_argument_group( 325 | "Parser", description="Used during PDF parsing" 326 | ) 327 | parse_params.add_argument( 328 | "--page-numbers", 329 | type=int, 330 | default=None, 331 | nargs="+", 332 | help="A space-seperated list of page numbers to parse.", 333 | ) 334 | parse_params.add_argument( 335 | "--pagenos", 336 | "-p", 337 | type=str, 338 | help="A comma-separated list of page numbers to parse. Included for " 339 | "legacy applications, use --page-numbers for more idiomatic " 340 | "argument entry.", 341 | ) 342 | parse_params.add_argument( 343 | "--objects", 344 | "-i", 345 | type=str, 346 | help="Comma separated list of object numbers to extract", 347 | ) 348 | parse_params.add_argument( 349 | "--all", 350 | "-a", 351 | default=False, 352 | action="store_true", 353 | help="If the structure of all objects should be extracted", 354 | ) 355 | parse_params.add_argument( 356 | "--show-fallback-xref", 357 | action="store_true", 358 | help="Additionally show the fallback xref. Use this if the PDF " 359 | "has zero or only invalid xref's. This setting is ignored if " 360 | "--extract-toc or --extract-embedded is used.", 361 | ) 362 | parse_params.add_argument( 363 | "--password", 364 | "-P", 365 | type=str, 366 | default="", 367 | help="The password to use for decrypting PDF file.", 368 | ) 369 | 370 | output_params = parser.add_argument_group( 371 | "Output", description="Used during output generation." 372 | ) 373 | output_params.add_argument( 374 | "--outfile", 375 | "-o", 376 | type=str, 377 | default="-", 378 | help='Path to file where output is written. 
Or "-" (default) to ' 379 | "write to stdout.", 380 | ) 381 | codec_parser = output_params.add_mutually_exclusive_group() 382 | codec_parser.add_argument( 383 | "--raw-stream", 384 | "-r", 385 | default=False, 386 | action="store_true", 387 | help="Write stream objects without encoding", 388 | ) 389 | codec_parser.add_argument( 390 | "--binary-stream", 391 | "-b", 392 | default=False, 393 | action="store_true", 394 | help="Write stream objects with binary encoding", 395 | ) 396 | codec_parser.add_argument( 397 | "--text-stream", 398 | "-t", 399 | default=False, 400 | action="store_true", 401 | help="Write stream objects as plain text", 402 | ) 403 | 404 | return parser 405 | 406 | 407 | def main(argv: Optional[List[str]] = None) -> None: 408 | parser = create_parser() 409 | args = parser.parse_args(args=argv) 410 | 411 | if args.debug: 412 | logging.getLogger().setLevel(logging.DEBUG) 413 | 414 | if args.outfile == "-": 415 | outfp = sys.stdout 416 | else: 417 | outfp = open(args.outfile, "w") 418 | 419 | if args.objects: 420 | objids = [int(x) for x in args.objects.split(",")] 421 | else: 422 | objids = [] 423 | 424 | if args.page_numbers: 425 | pagenos = {x - 1 for x in args.page_numbers} 426 | elif args.pagenos: 427 | pagenos = {int(x) - 1 for x in args.pagenos.split(",")} 428 | else: 429 | pagenos = set() 430 | 431 | password = args.password 432 | 433 | if args.raw_stream: 434 | codec: Optional[str] = "raw" 435 | elif args.binary_stream: 436 | codec = "binary" 437 | elif args.text_stream: 438 | codec = "text" 439 | else: 440 | codec = None 441 | 442 | for fname in args.files: 443 | if args.extract_toc: 444 | dumpoutline( 445 | outfp, 446 | fname, 447 | objids, 448 | pagenos, 449 | password=password, 450 | dumpall=args.all, 451 | codec=codec, 452 | extractdir=None, 453 | ) 454 | elif args.extract_embedded: 455 | extractembedded(fname, password=password, extractdir=args.extract_embedded) 456 | else: 457 | dumppdf( 458 | outfp, 459 | fname, 460 | objids, 461 | 
pagenos, 462 | password=password, 463 | dumpall=args.all, 464 | codec=codec, 465 | extractdir=None, 466 | show_fallback_xref=args.show_fallback_xref, 467 | ) 468 | 469 | outfp.close() 470 | 471 | 472 | if __name__ == "__main__": 473 | main() 474 | -------------------------------------------------------------------------------- /src/notebook/jd_data_extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from tqdm import tqdm\n", 11 | "from time import sleep\n", 12 | "from selenium import webdriver\n", 13 | "from selenium.webdriver.common.by import By\n", 14 | "from selenium.webdriver.support.ui import WebDriverWait\n", 15 | "from selenium.webdriver.support import expected_conditions as EC\n", 16 | "from selenium.common.exceptions import TimeoutException\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "from selenium.common.exceptions import ElementClickInterceptedException\n", 19 | "from selenium.common.exceptions import NoSuchElementException\n", 20 | "import json\n", 21 | "import urllib\n", 22 | "import time" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "driver = webdriver.Chrome(executable_path=r'C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\chromedriver_win32\\chromedriver.exe')\n", 32 | "\n", 33 | "def openbrowser(locid, key):\n", 34 | " driver.wait = WebDriverWait(driver, 5)\n", 35 | " driver.maximize_window()\n", 36 | " words = key.split()\n", 37 | " txt ='' \n", 38 | " for w in words:\n", 39 | " txt +=(w+'+')\n", 40 | " #print (txt)\n", 41 | " driver.get(\"https://www.glassdoor.co.in/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword={}\n", 42 | " 
&sc.keyword\"={}&locT=C&locId={}&jobType=fulltime&fromAge=1&radius=6&cityId=-1&minRating=0.0&industryId=-1\n", 43 | " &sgocId=-1&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0\".format(txt[:-1],txt[:-1], locid))\n", 44 | " return driver\n", 45 | "\n", 46 | "def geturl(driver):\n", 47 | " url = set()\n", 48 | " while True:\n", 49 | " print(len(url))\n", 50 | " if len(url)>=20:\n", 51 | " break\n", 52 | " soup1 = BeautifulSoup(driver.page_source, \"lxml\")\n", 53 | " \n", 54 | " main = soup1.find_all(\"li\",{\"class\":\"jl\"})\n", 55 | " \n", 56 | " for m in main:\n", 57 | " url.add('https://www.glassdoor.co.in{}'.format(m.find('a')['href'])) \n", 58 | " try:\n", 59 | " next_element = soup1.find(\"li\", {\"class\": \"next\"})\n", 60 | " try:\n", 61 | " next_exist = next_element.find('a')\n", 62 | " except AttributeError:\n", 63 | " driver.quit()\n", 64 | " break\n", 65 | " except NoSuchElementException:\n", 66 | " driver.quit()\n", 67 | " break\n", 68 | " if next_exist:\n", 69 | " \n", 70 | " driver.find_element_by_class_name(\"next\").click()\n", 71 | " time.sleep(2)\n", 72 | " else:\n", 73 | " driver.quit()\n", 74 | " break\n", 75 | " except ElementClickInterceptedException:\n", 76 | " pass\n", 77 | " \n", 78 | " return list(url)\n", 79 | "\n", 80 | "x =openbrowser(locid =4477468, key='\"Data Scientist\"')\n", 81 | "with open('url_data_scientist_loc_bangalore.json','w') as f:\n", 82 | " json.dump(geturl(driver),f, indent = 4)\n", 83 | " print(\"file created\")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "with open('url_data_scientist_loc_bangalore.json','r') as f:\n", 93 | " url = json.load(f)\n", 94 | "data ={} \n", 95 | "i = 1\n", 96 | "jd_df = pd.DataFrame()\n", 97 | "driver = webdriver.Chrome(executable_path=r'C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\chromedriver_win32\\chromedriver.exe')\n", 98 | "\n", 
99 | "for u in tqdm(url):\n", 100 | " driver.wait = WebDriverWait(driver, 2)\n", 101 | " driver.maximize_window()\n", 102 | " driver.get(u)\n", 103 | " soup = BeautifulSoup(driver.page_source, \"lxml\")\n", 104 | " try:\n", 105 | " \n", 106 | " header = soup.find(\"div\",{\"class\":\"header cell info\"})\n", 107 | " position = driver.find_element_by_tag_name('h2').text\n", 108 | " company = driver.find_element_by_xpath(\"//span[@class='strong ib']\").text\n", 109 | " location = driver.find_element_by_xpath(\"//span[@class='subtle ib']\").text\n", 110 | " jd_temp = driver.find_element_by_id(\"JobDescriptionContainer\")\n", 111 | " jd = jd_temp.text\n", 112 | " info = soup.find_all(\"infoEntity\")\n", 113 | " except IndexError:\n", 114 | " print('IndexError: list index out of range')\n", 115 | " except NoSuchElementException:\n", 116 | " pass\n", 117 | " data[i] = {\n", 118 | " 'url' :u,\n", 119 | " 'Position':position,\n", 120 | " 'Company': company,\n", 121 | " 'Location' :location,\n", 122 | " 'Job_Description' :jd\n", 123 | " }\n", 124 | " i+=1 \n", 125 | "driver.quit()\n", 126 | "jd_df = pd.DataFrame(data)\n", 127 | "jd = jd_df.transpose()\n", 128 | "\n", 129 | "jd = jd[['url','Position','Company','Location','Job_Description']]\n", 130 | "jd.to_csv('unstructured_data.csv')\n", 131 | "print('file created')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "def get_jobs(keyword, num_jobs, verbose, path, slp_time):\n", 141 | " \n", 142 | " '''Gathers jobs as a dataframe, scraped from Glassdoor'''\n", 143 | " \n", 144 | " #Initializing the webdriver\n", 145 | " options = webdriver.ChromeOptions()\n", 146 | " \n", 147 | " #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n", 148 | " #options.add_argument('headless')\n", 149 | " \n", 150 | " #Change the path to where chromedriver is in your home folder.\n", 151 | " driver = 
webdriver.Chrome(executable_path=path, options=options)\n", 152 | " driver.set_window_size(1120, 1000)\n", 153 | " \n", 154 | " url = \"https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword=\"+keyword+\"&sc.keyword=\"+keyword+\"&locT=&locId=&jobType=\"\n", 155 | " #url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=\"' + keyword + '\"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n", 156 | " driver.get(url)\n", 157 | " jobs = []\n", 158 | "\n", 159 | " while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n", 160 | "\n", 161 | " #Let the page load. Change this number based on your internet speed.\n", 162 | " #Or, wait until the webpage is loaded, instead of hardcoding it.\n", 163 | " time.sleep(slp_time)\n", 164 | "\n", 165 | " #Test for the \"Sign Up\" prompt and get rid of it.\n", 166 | " try:\n", 167 | " driver.find_element_by_class_name(\"selected\").click()\n", 168 | " except ElementClickInterceptedException:\n", 169 | " pass\n", 170 | "\n", 171 | " time.sleep(.1)\n", 172 | "\n", 173 | " try:\n", 174 | " driver.find_element_by_css_selector('[alt=\"Close\"]').click() #clicking to the X.\n", 175 | " print(' x out worked')\n", 176 | " except NoSuchElementException:\n", 177 | " print(' x out failed')\n", 178 | " pass\n", 179 | "\n", 180 | " \n", 181 | " #Going through each job in this page\n", 182 | " job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. 
These are the buttons we're going to click.\n", 183 | " for job_button in job_buttons: \n", 184 | "\n", 185 | " print(\"Progress: {}\".format(\"\" + str(len(jobs)) + \"/\" + str(num_jobs)))\n", 186 | " if len(jobs) >= num_jobs:\n", 187 | " break\n", 188 | "\n", 189 | " job_button.click() #You might \n", 190 | " time.sleep(1)\n", 191 | " collected_successfully = False\n", 192 | " \n", 193 | " while not collected_successfully:\n", 194 | " try:\n", 195 | " company_name = driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text\n", 196 | " location = driver.find_element_by_xpath('.//div[@class=\"location\"]').text\n", 197 | " job_title = driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text\n", 198 | " job_description = driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text\n", 199 | " collected_successfully = True\n", 200 | " except:\n", 201 | " time.sleep(5)\n", 202 | "\n", 203 | " try:\n", 204 | " salary_estimate = driver.find_element_by_xpath('.//span[@class=\"gray salary\"]').text\n", 205 | " except NoSuchElementException:\n", 206 | " salary_estimate = -1 #You need to set a \"not found value. It's important.\"\n", 207 | " \n", 208 | " try:\n", 209 | " rating = driver.find_element_by_xpath('.//span[@class=\"rating\"]').text\n", 210 | " except NoSuchElementException:\n", 211 | " rating = -1 #You need to set a \"not found value. It's important.\"\n", 212 | "\n", 213 | " #Printing for debugging\n", 214 | " if verbose:\n", 215 | " print(\"Job Title: {}\".format(job_title))\n", 216 | " print(\"Salary Estimate: {}\".format(salary_estimate))\n", 217 | " print(\"Job Description: {}\".format(job_description[:500]))\n", 218 | " print(\"Rating: {}\".format(rating))\n", 219 | " print(\"Company Name: {}\".format(company_name))\n", 220 | " print(\"Location: {}\".format(location))\n", 221 | "\n", 222 | " #Going to the Company tab...\n", 223 | " #clicking on this:\n", 224 | " #
Company
\n", 225 | " try:\n", 226 | " driver.find_element_by_xpath('.//div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n", 227 | "\n", 228 | " try:\n", 229 | " #
\n", 230 | " # \n", 231 | " # San Francisco, CA\n", 232 | " #
\n", 233 | " headquarters = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Headquarters\"]//following-sibling::*').text\n", 234 | " except NoSuchElementException:\n", 235 | " headquarters = -1\n", 236 | "\n", 237 | " try:\n", 238 | " size = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Size\"]//following-sibling::*').text\n", 239 | " except NoSuchElementException:\n", 240 | " size = -1\n", 241 | "\n", 242 | " try:\n", 243 | " founded = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Founded\"]//following-sibling::*').text\n", 244 | " except NoSuchElementException:\n", 245 | " founded = -1\n", 246 | "\n", 247 | " try:\n", 248 | " type_of_ownership = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Type\"]//following-sibling::*').text\n", 249 | " except NoSuchElementException:\n", 250 | " type_of_ownership = -1\n", 251 | "\n", 252 | " try:\n", 253 | " industry = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Industry\"]//following-sibling::*').text\n", 254 | " except NoSuchElementException:\n", 255 | " industry = -1\n", 256 | "\n", 257 | " try:\n", 258 | " sector = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Sector\"]//following-sibling::*').text\n", 259 | " except NoSuchElementException:\n", 260 | " sector = -1\n", 261 | "\n", 262 | " try:\n", 263 | " revenue = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Revenue\"]//following-sibling::*').text\n", 264 | " except NoSuchElementException:\n", 265 | " revenue = -1\n", 266 | "\n", 267 | " try:\n", 268 | " competitors = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Competitors\"]//following-sibling::*').text\n", 269 | " except NoSuchElementException:\n", 270 | " competitors = -1\n", 271 | "\n", 272 | " except NoSuchElementException: #Rarely, some job postings do not have the \"Company\" 
tab.\n", 273 | " headquarters = -1\n", 274 | " size = -1\n", 275 | " founded = -1\n", 276 | " type_of_ownership = -1\n", 277 | " industry = -1\n", 278 | " sector = -1\n", 279 | " revenue = -1\n", 280 | " competitors = -1\n", 281 | "\n", 282 | " \n", 283 | " if verbose:\n", 284 | " print(\"Headquarters: {}\".format(headquarters))\n", 285 | " print(\"Size: {}\".format(size))\n", 286 | " print(\"Founded: {}\".format(founded))\n", 287 | " print(\"Type of Ownership: {}\".format(type_of_ownership))\n", 288 | " print(\"Industry: {}\".format(industry))\n", 289 | " print(\"Sector: {}\".format(sector))\n", 290 | " print(\"Revenue: {}\".format(revenue))\n", 291 | " print(\"Competitors: {}\".format(competitors))\n", 292 | "\n", 293 | " jobs.append({\"Job Title\" : job_title,\n", 294 | " \"Salary Estimate\" : salary_estimate,\n", 295 | " \"Job Description\" : job_description,\n", 296 | " \"Rating\" : rating,\n", 297 | " \"Company Name\" : company_name,\n", 298 | " \"Location\" : location,\n", 299 | " \"Headquarters\" : headquarters,\n", 300 | " \"Size\" : size,\n", 301 | " \"Founded\" : founded,\n", 302 | " \"Type of ownership\" : type_of_ownership,\n", 303 | " \"Industry\" : industry,\n", 304 | " \"Sector\" : sector,\n", 305 | " \"Revenue\" : revenue,\n", 306 | " \"Competitors\" : competitors})\n", 307 | " #add job to jobs\n", 308 | " \n", 309 | " \n", 310 | " #Clicking on the \"next page\" button\n", 311 | " try:\n", 312 | " driver.find_element_by_xpath('.//li[@class=\"next\"]//a').click()\n", 313 | " except NoSuchElementException:\n", 314 | " print(\"Scraping terminated before reaching target number of jobs. Needed {}, got {}.\".format(num_jobs, len(jobs)))\n", 315 | " break\n", 316 | "\n", 317 | " return pd.DataFrame(jobs) #This line converts the dictionary object into a pandas DataFrame." 
318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "path = r\"C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\chromedriver_win32\\chromedriver.exe\"\n", 327 | "\n", 328 | "unstructured_data_df = get_jobs('data scientist',1000, False, path, 15)\n", 329 | "\n", 330 | "unstructured_data_df.to_csv('unstructured_data.csv', index = False)" 331 | ] 332 | } 333 | ], 334 | "metadata": { 335 | "kernelspec": { 336 | "display_name": "myenv", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.10.10" 351 | }, 352 | "orig_nbformat": 4, 353 | "vscode": { 354 | "interpreter": { 355 | "hash": "ae6b9c19ba8290d367f751939abe8de5af7ecdf4fdf442937bc3215b661f3d40" 356 | } 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 2 361 | } 362 | -------------------------------------------------------------------------------- /src/notebook/job_recommendation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "from ftfy import fix_text\n", 11 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 12 | "import re\n", 13 | "from sklearn.neighbors import NearestNeighbors\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "import nltk\n", 17 | "from nltk.corpus import stopwords\n", 18 | "stopw = set(stopwords.words('english'))\n", 19 | "from pyresparser import ResumeParser\n", 20 | "import os\n", 21 | "from docx import Document\n", 22 | "from 
skills_extraction import skills_extractor" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
Job TitleRatingCompany NameLocationHeadquartersSizeFoundedType of ownershipIndustrySectorCompetitorsAverage SalaryAverage RevenueProcessed_JD
0Data Scientist3.8Tecolote ResearchAlbuquerque, NMGoleta, CA750.51973Company - PrivateAerospace & DefenseAerospace & Defense-172.075.000000Data Scientist Location: Albuquerque, Educatio...
1Healthcare Data Scientist3.4University of Maryland Medical SystemLinthicum, MDBaltimore, MD10000.01984Other OrganizationHealth Care Services & HospitalsHealth Care-187.53500.000000What You Will Do: General Summary The Healthca...
2Data Scientist4.8KnowBe4Clearwater, FLClearwater, FL750.52010Company - PrivateSecurity ServicesBusiness Services-185.0300.000000KnowBe4, Inc. high growth information security...
3Data Scientist3.8PNNLRichland, WARichland, WA3000.51965GovernmentEnergyOil, Gas, Energy & UtilitiesOak Ridge National Laboratory, National Renewa...76.5250500.000000*Organization Job ID** Job ID: 310709 Director...
4Data Scientist2.9Affinity SolutionsNew York, NYNew York, NY125.51998Company - PrivateAdvertising & MarketingBusiness ServicesCommerce Signals, Cardlytics, Yodlee114.524319.000761Data Scientist Affinity Solutions Marketing Cl...
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " Job Title Rating Company Name \n", 159 | "0 Data Scientist 3.8 Tecolote Research \\\n", 160 | "1 Healthcare Data Scientist 3.4 University of Maryland Medical System \n", 161 | "2 Data Scientist 4.8 KnowBe4 \n", 162 | "3 Data Scientist 3.8 PNNL \n", 163 | "4 Data Scientist 2.9 Affinity Solutions \n", 164 | "\n", 165 | " Location Headquarters Size Founded Type of ownership \n", 166 | "0 Albuquerque, NM Goleta, CA 750.5 1973 Company - Private \\\n", 167 | "1 Linthicum, MD Baltimore, MD 10000.0 1984 Other Organization \n", 168 | "2 Clearwater, FL Clearwater, FL 750.5 2010 Company - Private \n", 169 | "3 Richland, WA Richland, WA 3000.5 1965 Government \n", 170 | "4 New York, NY New York, NY 125.5 1998 Company - Private \n", 171 | "\n", 172 | " Industry Sector \n", 173 | "0 Aerospace & Defense Aerospace & Defense \\\n", 174 | "1 Health Care Services & Hospitals Health Care \n", 175 | "2 Security Services Business Services \n", 176 | "3 Energy Oil, Gas, Energy & Utilities \n", 177 | "4 Advertising & Marketing Business Services \n", 178 | "\n", 179 | " Competitors Average Salary \n", 180 | "0 -1 72.0 \\\n", 181 | "1 -1 87.5 \n", 182 | "2 -1 85.0 \n", 183 | "3 Oak Ridge National Laboratory, National Renewa... 76.5 \n", 184 | "4 Commerce Signals, Cardlytics, Yodlee 114.5 \n", 185 | "\n", 186 | " Average Revenue Processed_JD \n", 187 | "0 75.000000 Data Scientist Location: Albuquerque, Educatio... \n", 188 | "1 3500.000000 What You Will Do: General Summary The Healthca... \n", 189 | "2 300.000000 KnowBe4, Inc. high growth information security... \n", 190 | "3 250500.000000 *Organization Job ID** Job ID: 310709 Director... \n", 191 | "4 24319.000761 Data Scientist Affinity Solutions Marketing Cl... 
" 192 | ] 193 | }, 194 | "execution_count": 4, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "# Load dataset:\n", 201 | "jd_df=pd.read_csv('jd_structured_data.csv')\n", 202 | "jd_df.head()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 7, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# Load the extracted resume skills:\n", 212 | "file_path=r'C:\\Users\\Admin\\ML_Projects\\Job_Recommendation_System\\Job-Recommendation-System\\src\\notebook\\CV.pdf'\n", 213 | "skills=[]\n", 214 | "skills.append(' '.join(word for word in skills_extractor(file_path)))" 215 | ] 216 | }, 217 | { 218 | "attachments": {}, 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "# Feature Engineering" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 8, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "def ngrams(string, n=3):\n", 232 | " string = fix_text(string) # fix text\n", 233 | " string = string.encode(\"ascii\", errors=\"ignore\").decode() #remove non ascii chars\n", 234 | " string = string.lower()\n", 235 | " chars_to_remove = [\")\",\"(\",\".\",\"|\",\"[\",\"]\",\"{\",\"}\",\"'\"]\n", 236 | " rx = '[' + re.escape(''.join(chars_to_remove)) + ']'\n", 237 | " string = re.sub(rx, '', string)\n", 238 | " string = string.replace('&', 'and')\n", 239 | " string = string.replace(',', ' ')\n", 240 | " string = string.replace('-', ' ')\n", 241 | " string = string.title() # normalise case - capital at start of each word\n", 242 | " string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single\n", 243 | " string = ' '+ string +' ' # pad names for ngrams...\n", 244 | " string = re.sub(r'[,-./]|\\sBD',r'', string)\n", 245 | " ngrams = zip(*[string[i:] for i in range(n)])\n", 246 | " return [''.join(ngram) for ngram in ngrams]\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | 
"execution_count": 9, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)\n", 256 | "tfidf = vectorizer.fit_transform(skills)\n" 257 | ] 258 | }, 259 | { 260 | "attachments": {}, 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "# Job Recommender" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 12, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)\n", 274 | "jd_test = (jd_df['Processed_JD'].values.astype('U'))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 11, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "def getNearestN(query):\n", 284 | " queryTFIDF_ = vectorizer.transform(query)\n", 285 | " distances, indices = nbrs.kneighbors(queryTFIDF_)\n", 286 | " return distances, indices\n" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 13, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "distances, indices = getNearestN(jd_test)\n", 296 | "test = list(jd_test) \n", 297 | "matches = []" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 14, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "for i,j in enumerate(indices):\n", 307 | " dist=round(distances[i][0],2)\n", 308 | " \n", 309 | " temp = [dist]\n", 310 | " matches.append(temp)\n", 311 | " \n", 312 | "matches = pd.DataFrame(matches, columns=['Match confidence'])" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 23, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "
\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
Job TitleRatingCompany NameLocationHeadquartersSizeFoundedType of ownershipIndustrySectorCompetitorsAverage SalaryAverage RevenueProcessed_JDmatch
4Data Scientist2.9Affinity SolutionsNew York, NYNew York, NY125.51998Company - PrivateAdvertising & MarketingBusiness ServicesCommerce Signals, Cardlytics, Yodlee114.524319.000761Data Scientist Affinity Solutions Marketing Cl...0.73
0Data Scientist3.8Tecolote ResearchAlbuquerque, NMGoleta, CA750.51973Company - PrivateAerospace & DefenseAerospace & Defense-172.075.000000Data Scientist Location: Albuquerque, Educatio...0.74
2Data Scientist4.8KnowBe4Clearwater, FLClearwater, FL750.52010Company - PrivateSecurity ServicesBusiness Services-185.0300.000000KnowBe4, Inc. high growth information security...0.79
3Data Scientist3.8PNNLRichland, WARichland, WA3000.51965GovernmentEnergyOil, Gas, Energy & UtilitiesOak Ridge National Laboratory, National Renewa...76.5250500.000000*Organization Job ID** Job ID: 310709 Director...0.80
1Healthcare Data Scientist3.4University of Maryland Medical SystemLinthicum, MDBaltimore, MD10000.01984Other OrganizationHealth Care Services & HospitalsHealth Care-187.53500.000000What You Will Do: General Summary The Healthca...0.85
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " Job Title Rating Company Name \n", 455 | "4 Data Scientist 2.9 Affinity Solutions \\\n", 456 | "0 Data Scientist 3.8 Tecolote Research \n", 457 | "2 Data Scientist 4.8 KnowBe4 \n", 458 | "3 Data Scientist 3.8 PNNL \n", 459 | "1 Healthcare Data Scientist 3.4 University of Maryland Medical System \n", 460 | "\n", 461 | " Location Headquarters Size Founded Type of ownership \n", 462 | "4 New York, NY New York, NY 125.5 1998 Company - Private \\\n", 463 | "0 Albuquerque, NM Goleta, CA 750.5 1973 Company - Private \n", 464 | "2 Clearwater, FL Clearwater, FL 750.5 2010 Company - Private \n", 465 | "3 Richland, WA Richland, WA 3000.5 1965 Government \n", 466 | "1 Linthicum, MD Baltimore, MD 10000.0 1984 Other Organization \n", 467 | "\n", 468 | " Industry Sector \n", 469 | "4 Advertising & Marketing Business Services \\\n", 470 | "0 Aerospace & Defense Aerospace & Defense \n", 471 | "2 Security Services Business Services \n", 472 | "3 Energy Oil, Gas, Energy & Utilities \n", 473 | "1 Health Care Services & Hospitals Health Care \n", 474 | "\n", 475 | " Competitors Average Salary \n", 476 | "4 Commerce Signals, Cardlytics, Yodlee 114.5 \\\n", 477 | "0 -1 72.0 \n", 478 | "2 -1 85.0 \n", 479 | "3 Oak Ridge National Laboratory, National Renewa... 76.5 \n", 480 | "1 -1 87.5 \n", 481 | "\n", 482 | " Average Revenue Processed_JD match \n", 483 | "4 24319.000761 Data Scientist Affinity Solutions Marketing Cl... 0.73 \n", 484 | "0 75.000000 Data Scientist Location: Albuquerque, Educatio... 0.74 \n", 485 | "2 300.000000 KnowBe4, Inc. high growth information security... 0.79 \n", 486 | "3 250500.000000 *Organization Job ID** Job ID: 310709 Director... 0.80 \n", 487 | "1 3500.000000 What You Will Do: General Summary The Healthca... 
0.85 " 488 | ] 489 | }, 490 | "execution_count": 23, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "# Following recommends Top 5 Jobs based on candidate resume:\n", 497 | "jd_df['match']=matches['Match confidence']\n", 498 | "jd_df.head(5).sort_values('match')" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [] 507 | } 508 | ], 509 | "metadata": { 510 | "kernelspec": { 511 | "display_name": "myenv", 512 | "language": "python", 513 | "name": "python3" 514 | }, 515 | "language_info": { 516 | "codemirror_mode": { 517 | "name": "ipython", 518 | "version": 3 519 | }, 520 | "file_extension": ".py", 521 | "mimetype": "text/x-python", 522 | "name": "python", 523 | "nbconvert_exporter": "python", 524 | "pygments_lexer": "ipython3", 525 | "version": "3.10.10" 526 | }, 527 | "orig_nbformat": 4, 528 | "vscode": { 529 | "interpreter": { 530 | "hash": "ae6b9c19ba8290d367f751939abe8de5af7ecdf4fdf442937bc3215b661f3d40" 531 | } 532 | } 533 | }, 534 | "nbformat": 4, 535 | "nbformat_minor": 2 536 | } 537 | --------------------------------------------------------------------------------