├── .gitignore ├── images ├── homepage.png ├── serpapi.png └── relevant_search.jpg ├── requirements.txt ├── Dockerfile ├── docker-compose.yml ├── pages ├── 4_Sponsors.py ├── 2_Analytics.py ├── 1_Post_a_Job.py └── 3_Scrape_Jobs_🔒.py ├── Makefile ├── css.py ├── README.md ├── google_analytics.py ├── Home.py └── search_index.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pem 2 | *.pyc 3 | *.env 4 | *.json -------------------------------------------------------------------------------- /images/homepage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpalbrecht/job-search-engine/HEAD/images/homepage.png -------------------------------------------------------------------------------- /images/serpapi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpalbrecht/job-search-engine/HEAD/images/serpapi.png -------------------------------------------------------------------------------- /images/relevant_search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dpalbrecht/job-search-engine/HEAD/images/relevant_search.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.24.0 2 | altair<5 3 | boto3==1.26.65 4 | opensearch-py==2.1.1 5 | beautifulsoup4==4.11.2 6 | awscli==1.27.142 7 | plotly 8 | google-search-results 9 | python-dotenv 10 | google-auth 11 | google-analytics-data -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | 3 | 4 | WORKDIR /app 5 | COPY . /app 6 | RUN pip3 install -r /app/requirements.txt 7 | 8 | EXPOSE 8501 9 | 10 | ENTRYPOINT ["streamlit", "run", "Home.py", "--server.port=8501", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | 4 | job-search-engine: 5 | image: 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine 6 | logging: 7 | driver: "json-file" 8 | options: 9 | max-size: "10m" 10 | max-file: "1" 11 | ports: 12 | - "8501:8501" 13 | env_file: 14 | - ./.env -------------------------------------------------------------------------------- /pages/4_Sponsors.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import css; css.set_page_style('Next Search Job • Sponsors') 3 | 4 | 5 | 6 | # Title 7 | st.markdown('

><h1>Thank You to Our Sponsors</h1></div>
', unsafe_allow_html=True) 8 | st.markdown('
', unsafe_allow_html=True) 9 | 10 | 11 | # Sponsors 12 | cols = st.columns(5) 13 | with cols[0]: 14 | st.image('images/serpapi.png', use_column_width=True) 15 | st.markdown("""SerpApi kindly donated a year's worth of Developer access.""", unsafe_allow_html=True) -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | run-local: 2 | docker run -p 8501:8501 --env-file ./.env --rm job-search-engine 3 | 4 | tag-push: 5 | aws ecr get-login-password | docker login --username AWS --password-stdin 838424036277.dkr.ecr.us-west-1.amazonaws.com 6 | docker tag job-search-engine:latest 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine:latest 7 | docker push 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine:latest 8 | 9 | pull: 10 | aws ecr get-login-password | docker login --username AWS --password-stdin 838424036277.dkr.ecr.us-west-1.amazonaws.com 11 | docker pull 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine:latest 12 | 13 | run-prod: 14 | docker-compose --env-file ./.env up 15 | -------------------------------------------------------------------------------- /css.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | 4 | 5 | def set_page_style(page_title): 6 | st.set_page_config(page_title=page_title, page_icon='images/relevant_search.jpg', layout='wide') 7 | margins_css = """ 8 | 28 | """ 29 | st.markdown(margins_css, unsafe_allow_html=True) 30 | -------------------------------------------------------------------------------- /pages/2_Analytics.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | sys.path.append('..') 4 | import css; css.set_page_style('Next Search Job • Analytics') 5 | import plotly.graph_objs as go 6 | import datetime 7 | from google_analytics import query_google_analytics 8 | from dotenv import load_dotenv; load_dotenv() 9 | 10 | 11 | 12 | st.markdown('

><h1>Site Analytics</h1></div>
', unsafe_allow_html=True) 13 | st.markdown('

', unsafe_allow_html=True) 14 | 15 | 16 | 17 | def plot_histogram(data, event_type): 18 | x, y = [], [] 19 | for key, value in data[event_type].items(): 20 | x.append(key) 21 | y.append(value) 22 | fig = go.Figure(data=go.Bar(x=x, y=y)) 23 | fig.update_layout(title=f"{event_type.replace('_',' ').title()} Events", 24 | xaxis_title='Page', 25 | yaxis_title='Count') 26 | st.plotly_chart(fig) 27 | 28 | 29 | 30 | # Plot a date range's activity 31 | container1 = st.container() 32 | with container1: 33 | cols1 = st.columns(5) 34 | with cols1[0]: 35 | dates = st.date_input(label='Choose a Date Range Display:', 36 | value=(datetime.datetime.utcnow().date()-datetime.timedelta(days=7), 37 | datetime.datetime.utcnow().date())) 38 | start_date = dates[0].strftime('%Y-%m-%d') 39 | end_date = dates[1].strftime('%Y-%m-%d') 40 | page_event_dict = query_google_analytics('page_events', start_date, end_date) 41 | plot_histogram(page_event_dict, event_type='click') 42 | plot_histogram(page_event_dict, event_type='page_view') -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # job-search-engine 2 | A job search engine and site created for the Relevance & Matching Tech community on Slack (relevancy.slack.com). 3 |
4 | ![alt text](https://github.com/dpalbrecht/job-search-engine/blob/main/images/homepage.png) 5 |


6 | 
 7 | ## How did this site come about?
 8 | Soon after I joined the group and started perusing open roles in the #jobs channel, I realized there was an opportunity to 1) build a quick search engine MVP and 2) contribute to the community. Below are the steps I took to deploy the site:
 9 | 1. Create a new AWS account so you can use Free Tier resources.
 10 | 2. Create an [OpenSearch](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/gsgcreate-domain.html) instance.
 11 |    - When setting up fine-grained access control, create a user with IAM and use it as the master user.
 12 | 3. Create the index and mapping using either the [OpenSearch Dashboard](https://opensearch.org/docs/latest/dashboards/quickstart-dashboards/), the [command line](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/gsgupload-data.html), or a [Jupyter notebook](https://dylancastillo.co/opensearch-python/#create-an-index) (my personal recommendation; a minimal sketch follows this list). For the latter two, you'll need to either pass your AWS credentials in explicitly or install the CLI (`pip install awscli`) and run `aws configure` to store them (_highly_ recommended).
 13 | 4. Build your [Streamlit](https://streamlit.io/) app (or copy what I have above).
 14 | 5. [Deploy on EC2](https://towardsdatascience.com/how-to-deploy-a-streamlit-app-using-an-amazon-free-ec2-instance-416a41f69dc3).
 15 |    - Just as in step 3, for local access you'll need to run `aws configure` so the app can query the index.
 16 | 
 17 |
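For step 3, a minimal sketch of creating the index from a notebook with `opensearch-py` might look like the following. The field names mirror what the app reads and writes in `search_index.py`; the domain endpoint is a placeholder, and the exact mapping the live site uses may differ:

```python
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

# Sign requests with the credentials stored by `aws configure`
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, 'us-west-1')
client = OpenSearch(
    hosts=[{'host': 'your-domain.us-west-1.es.amazonaws.com', 'port': 443}],  # placeholder endpoint
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Create the jobs index with an explicit mapping; field names come from the app's payloads
client.indices.create(
    index='jobs-index',
    body={
        'mappings': {
            'properties': {
                'title':       {'type': 'text'},
                'description': {'type': 'text'},
                'company':     {'type': 'text'},
                'url':         {'type': 'keyword'},
                'poster':      {'type': 'keyword'},
                'email':       {'type': 'keyword'},
                'slack_blurb': {'type': 'text'},
                'eu':          {'type': 'boolean'},
                'created_at':  {'type': 'date', 'format': 'yyyy-MM-dd'},
            }
        }
    },
)
```

To sanity-check the index from the same notebook (or from the EC2 box after running `aws configure` in step 5), you can index one document and run a trimmed-down version of the filtered match query the app issues; the job values below are made up for illustration:

```python
from datetime import datetime

# Index a sample posting; the app uses the job URL as the document id
doc = {
    'company': 'ExampleCo',                        # made-up sample values
    'title': 'Search Engineer',
    'description': 'Work on relevance and matching.',
    'url': 'https://example.com/jobs/123',
    'poster': 'Unknown',
    'email': '',
    'slack_blurb': '',
    'eu': False,
    'created_at': datetime.utcnow().strftime('%Y-%m-%d'),
}
client.index(index='jobs-index', body=doc, id=doc['url'], refresh=True)

# Roughly the shape of the query search_index.query() runs from the Home page
response = client.search(index='jobs-index', body={
    'query': {
        'bool': {
            'filter': {'term': {'eu': False}},
            'must': [{'multi_match': {'query': 'search engineer',
                                      'fields': ['title^1', 'description^1']}}],
        }
    }
})
print(response['hits']['total'])
```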


18 | ## Notable Updates 19 | * [4/23] Get a real domain name by using Amazon Route 53 and PairDomains.com 20 | * [4/23] Enable user to search for "more jobs like this" with a single button click 21 | * [5/23] Track user behavior with Streamlit Analytics by writing page load and click events to S3, and a nightly cron for aggregating them with Google Analytics (7/29). Add a page to visualize activity 22 | * [5/23] Align dev and prod environments by using Docker 23 | * [6/23] Add new page, Scrape Jobs, so developers can more easily pull in jobs to the site 24 | * [7/19] Enable HTTPS -------------------------------------------------------------------------------- /google_analytics.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv; load_dotenv() 2 | import os 3 | import pathlib 4 | from bs4 import BeautifulSoup 5 | import shutil 6 | import streamlit as st 7 | from collections import defaultdict 8 | from google.oauth2 import service_account 9 | from google.analytics.data_v1beta import BetaAnalyticsDataClient 10 | from google.analytics.data_v1beta.types import ( 11 | DateRange, 12 | Dimension, 13 | Metric, 14 | RunReportRequest, 15 | FilterExpression, 16 | Filter 17 | ) 18 | 19 | 20 | 21 | def inject_google_analytics(): 22 | GA_ID = os.environ['GOOGLE_ANALYTICS_ID'] 23 | 24 | GA_JS = f""" 25 | 26 | """+""" 27 | 34 | """ 35 | 36 | # Insert the script in the head tag of the static template inside your virtual 37 | index_path = pathlib.Path(st.__file__).parent / "static" / "index.html" 38 | soup = BeautifulSoup(index_path.read_text(), features="html.parser") 39 | if not soup.find(id=GA_ID): # if cannot find tag 40 | bck_index = index_path.with_suffix('.bck') 41 | if bck_index.exists(): 42 | shutil.copy(bck_index, index_path) # recover from backup 43 | else: 44 | shutil.copy(index_path, bck_index) # keep a backup 45 | html = str(soup) 46 | new_html = html.replace('', '\n' + GA_JS) 47 | index_path.write_text(new_html) 48 | 49 | 50 | def get_data(query): 51 | credentials = service_account.Credentials.from_service_account_file( 52 | 'google_analytics_credentials.json', 53 | scopes=["https://www.googleapis.com/auth/analytics.readonly"], 54 | ) 55 | client = BetaAnalyticsDataClient(credentials=credentials) 56 | response = client.run_report(query) 57 | return response.rows 58 | 59 | 60 | def parse_page_events(response): 61 | data = defaultdict(lambda: defaultdict(lambda: 0)) 62 | for row in response: 63 | event_name, page_title = row.dimension_values 64 | event_count = row.metric_values[0] 65 | data[event_name.value][page_title.value] += int(event_count.value) 66 | return data 67 | 68 | 69 | def parse_link_clicks(response): 70 | data = defaultdict(lambda: 0) 71 | for row in response: 72 | link_name = row.dimension_values[0] 73 | click_count = row.metric_values[0] 74 | data[link_name.value] += int(click_count.value) 75 | return data 76 | 77 | 78 | def query_google_analytics(report_type, start_date, end_date): 79 | if report_type == 'page_events': 80 | query = RunReportRequest( 81 | property=f"properties/{os.environ['GA4_PROPERTY_ID']}", 82 | dimensions=[Dimension(name="eventName"), Dimension(name="pageTitle")], 83 | metrics=[Metric(name="eventCount")], 84 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)], 85 | dimension_filter=FilterExpression( 86 | filter=Filter( 87 | field_name="eventName", 88 | in_list_filter=Filter.InListFilter(values=["page_view","click"]) 89 | ) 90 | ) 91 | ) 92 | return 
parse_page_events(get_data(query)) 93 | elif report_type == 'link_clicks': 94 | query = RunReportRequest( 95 | property=f"properties/{os.environ['GA4_PROPERTY_ID']}", 96 | dimensions=[Dimension(name="customEvent:link_url")], 97 | metrics=[Metric(name="eventCount")], 98 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)] 99 | ) 100 | return parse_link_clicks(get_data(query)) -------------------------------------------------------------------------------- /Home.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import search_index 3 | import css; css.set_page_style('Next Search Job • Find') 4 | from datetime import datetime, timedelta 5 | from streamlit.components.v1 import html 6 | import google_analytics; google_analytics.inject_google_analytics() 7 | 8 | 9 | 10 | # Title and search bar, and format options 11 | if 'query' not in st.session_state: 12 | st.session_state.query = '' 13 | st.markdown('

><h1>Search Relevance & Matching Tech</h1></div>
', unsafe_allow_html=True) 14 | st.markdown("""

Join the Slack Channel



""", unsafe_allow_html=True) 15 | col1, col2, _ = st.columns([1,2,1]) 16 | with col1: 17 | st.markdown('

🔍︍

', unsafe_allow_html=True) 18 | with col2: 19 | query = st.text_input(label="Find jobs...", 20 | value=st.session_state.query, 21 | placeholder='Search through jobs...', 22 | label_visibility='collapsed') 23 | most_recent_flag = st.checkbox(label='Last 30 Days', value=True) 24 | eu_flag = st.checkbox(label='EU') 25 | json_flag = st.checkbox(label='JSON Format') 26 | st.write(f"{search_index.count(most_recent_flag, eu_flag):,} jobs to be exact!") 27 | st.markdown('
', unsafe_allow_html=True) 28 | 29 | 30 | # Update session query from Find Similar Jobs 31 | def update_session_query(new_query): 32 | st.session_state.query = new_query 33 | 34 | 35 | # Get link clicks from the last 14 days 36 | end_date = datetime.utcnow().date() 37 | start_date = end_date - timedelta(days=14) 38 | link_click_dict = google_analytics.query_google_analytics( 39 | 'link_clicks', start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')) 40 | 41 | 42 | # Show query results 43 | if len(query) == 0: 44 | query_results = search_index.blank_query(query, eu_flag, most_recent_flag) 45 | else: 46 | query_results = search_index.query(query, eu_flag, most_recent_flag) 47 | if json_flag: 48 | st.json(query_results['hits']['hits']) 49 | else: 50 | for n, result in enumerate(query_results['hits']['hits'], 1): 51 | days_ago_posted = (datetime.utcnow() - datetime.strptime(result['_source']['created_at'], '%Y-%m-%d')).days 52 | 53 | col1, col2 = st.columns([0.9,0.1]) 54 | with col1: 55 | st.write(f"""
", unsafe_allow_html=True) 61 | with col2: 62 | st.button(f"{link_click_dict.get(result['_source']['url'][:100], 0)} Clicks", 63 | key=result['_source']['url']+'_LINK_CLICKS', 64 | help='Number of times this link has been clicked in the last 14 days.', 65 | disabled=True) 66 | 67 | st.button('Find Similar Jobs', 68 | key=result['_source']['url']+'_FIND_SIMILAR_JOBS', 69 | on_click=update_session_query, 70 | kwargs={'new_query':result['_source']['title']}) 71 | 72 | description_text = "" 73 | if result['_source']['poster'] != 'Unknown': 74 | poster_msg = f"{result['_source']['poster']} posted {days_ago_posted} days ago" 75 | else: 76 | poster_msg = f"Posted {days_ago_posted} days ago" 77 | description_text += f"""
{poster_msg}
""" 78 | if result['_source'].get('email', '') != '': 79 | description_text += f"""
Email the poster at: {result['_source']['email']}
""" 80 | st.markdown(f""" 81 | {description_text} 82 |
{result['_source']['description'][:1000]+'...'}
83 |
84 | """, unsafe_allow_html=True) -------------------------------------------------------------------------------- /pages/1_Post_a_Job.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | import json 4 | sys.path.append('..') 5 | import search_index 6 | import css; css.set_page_style('Next Search Job • Post') 7 | from datetime import datetime 8 | from bs4 import BeautifulSoup 9 | import re 10 | from urllib.parse import urlparse 11 | import requests 12 | 13 | 14 | 15 | def crawl(url): 16 | # From https://colab.research.google.com/drive/1L_s0ey6T-aK65J2wHSZEhoBVW8vmJlkH?usp=sharing#scrollTo=a8yeMg5Tv9Nx 17 | 18 | # Determine whether the host is supported (LinkedIn only at the moment) 19 | parsed = urlparse(url) 20 | if parsed.hostname not in ['linkedin.com', 'www.linkedin.com']: 21 | return {} 22 | 23 | # Scrape the data 24 | html = requests.get(url).content 25 | soup = BeautifulSoup(html) 26 | description = soup.find( 27 | name='div', 28 | attrs={'class':"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden"} 29 | ) 30 | description = description.get_text('\n').strip() 31 | title = soup.find('h1').text 32 | company = soup.find('a', {'href': re.compile('linkedin.com/company/*')}).text 33 | 34 | return { 35 | "url": url, 36 | "company": company, 37 | "title": title, 38 | "poster": '', 39 | "description": description, 40 | } 41 | 42 | 43 | def crawl_and_populate(): 44 | try: 45 | crawled_data = crawl(st.session_state.url) 46 | if crawled_data: 47 | st.session_state.url = crawled_data['url'] 48 | st.session_state.title = crawled_data['title'] 49 | st.session_state.company = crawled_data['company'] 50 | st.session_state.description = crawled_data['description'] 51 | else: 52 | st.warning(f"Only LinkedIn URLs currently supported... But you can still manually add your job below!", icon='🚨') 53 | except: 54 | st.warning(f"Something went wrong... But you can still manually add your job below!", icon='🚨') 55 | 56 | 57 | 58 | # Title and post form 59 | st.markdown('

><h1>Post a Job</h1></div>
', unsafe_allow_html=True) 60 | _, col2, _ = st.columns([1,8,1]) 61 | with col2: 62 | with st.form("job_form"): 63 | url = st.text_input(label='URL', placeholder='* URL', label_visibility='collapsed', key='url') 64 | _ = st.form_submit_button('Auto-Populate (LinkedIn URL Only)', on_click=crawl_and_populate) 65 | payload = { 66 | 'company': st.text_input(label='Company', key='company', 67 | placeholder='* Company', label_visibility='collapsed'), 68 | 'title': st.text_input(label='Job Title', key='title', 69 | placeholder='* Job Title', label_visibility='collapsed'), 70 | 'description': st.text_area(label='Description', placeholder='* Description', 71 | label_visibility='collapsed', key='description'), 72 | 'url': url, 73 | 'poster': st.text_input(label='Your Name', placeholder='Your Name', label_visibility='collapsed'), 74 | 'email': st.text_input(label='Your Email', placeholder='Your Email', label_visibility='collapsed'), 75 | 'slack_blurb': st.text_area(label='Slack Blurb', placeholder="Anything else that you'd like posted to Slack. Say something like 'This job is really great, reach out to me!'", 76 | label_visibility='collapsed', key='slack_blurb'), 77 | 'eu': st.checkbox(label='EU') 78 | } 79 | password = st.text_input(label=' ', placeholder='* Password: What book is on the cover of the Search Relevance Slack channel?', label_visibility='collapsed') 80 | submitted = st.form_submit_button("Post") 81 | 82 | 83 | # Display success/failure 84 | if submitted: 85 | if password.lower() == 'relevant search': 86 | post_payload = True 87 | for name, value in payload.items(): 88 | if (name not in ['poster', 'EU', 'slack_blurb', 'email']) and (value == ''): 89 | post_payload = False 90 | st.warning(f"The '{name}' parameter is required. Can't post job.", icon='🚨') 91 | if post_payload: 92 | if search_index.already_posted_job(payload['url']): 93 | st.warning(f"This job has already been posted. {datetime.utcnow().strftime('%H-%M-%S')}", icon='⚠️') 94 | else: 95 | success = search_index.post(payload) 96 | if success: 97 | st.success(f"Success! {datetime.utcnow().strftime('%H-%M-%S')}", icon='✅') 98 | else: 99 | st.warning(f"Something went wrong! The job was not posted. {datetime.utcnow().strftime('%H-%M-%S')}", icon='🚨') 100 | else: 101 | st.warning(f"Incorrect password! 
{datetime.utcnow().strftime('%H-%M-%S')}", icon='🚨') 102 | -------------------------------------------------------------------------------- /search_index.py: -------------------------------------------------------------------------------- 1 | from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth 2 | import boto3 3 | from datetime import datetime, timedelta 4 | import json 5 | 6 | 7 | 8 | # Get session client 9 | credentials = boto3.Session().get_credentials() 10 | auth = AWSV4SignerAuth(credentials, 'us-west-1') 11 | client = OpenSearch( 12 | hosts = [{'host': 'search-opensearch-jobs-af5xd22qh6zatxk5wbfhnvdje4.us-west-1.es.amazonaws.com', 13 | 'port': 443}], 14 | http_auth = auth, 15 | use_ssl = True, 16 | verify_certs = True, 17 | connection_class = RequestsHttpConnection 18 | ) 19 | index_name = 'jobs-index' 20 | lambda_client = boto3.client( 21 | 'lambda', 22 | region_name="us-west-1", 23 | config=boto3.session.Config(signature_version='s3v4',) 24 | ) 25 | 26 | 27 | def blank_query(user_query, eu_flag, most_recent_flag): 28 | if most_recent_flag: 29 | start_date = (datetime.utcnow().date() - timedelta(days=30)).strftime('%Y-%m-%d') 30 | else: 31 | start_date = '2023-01-01' 32 | query = { 33 | "query": { 34 | "bool" : { 35 | 'filter': {'term': {'eu': eu_flag}}, 36 | "must" : [ 37 | {"match_all": {}}, 38 | { 39 | "range": { 40 | "created_at": { 41 | "gte": start_date, 42 | "lte": datetime.utcnow().date().strftime('%Y-%m-%d'), 43 | "format": "yyyy-MM-dd", 44 | "relation": "within" 45 | } 46 | } 47 | } 48 | ] 49 | } 50 | }, 51 | "size": 50, 52 | "sort": [ 53 | { 54 | "created_at": { 55 | "order": "desc" 56 | } 57 | } 58 | ] 59 | } 60 | response = client.search( 61 | body = query, 62 | index = index_name 63 | ) 64 | return response 65 | 66 | 67 | def query(user_query, eu_flag, most_recent_flag, num_results=50): 68 | if most_recent_flag: 69 | start_date = (datetime.utcnow().date() - timedelta(days=30)).strftime('%Y-%m-%d') 70 | else: 71 | start_date = '2023-01-01' 72 | query = { 73 | 'size': num_results, 74 | 'query': { 75 | 'bool': { 76 | 'filter': {'term': {'eu': eu_flag}}, 77 | 'must': [{ 78 | 'multi_match': { 79 | 'query': user_query, 80 | 'fields': ['title^1','description^1'], 81 | "type": "most_fields", # if we expect search terms to appear in most fields 82 | "operator": "or", 83 | "minimum_should_match": 1, 84 | "tie_breaker": 1.0, # sum of all field scores 85 | "analyzer": "english", 86 | "boost": 1, 87 | "fuzziness": "AUTO", 88 | "fuzzy_transpositions": True, # reduces the number of fuzziness movements for adjacent characters 89 | "lenient": False, # allows data type mismatches 90 | "prefix_length": 0, # number of leading characters that are not considered in fuzziness 91 | "auto_generate_synonyms_phrase_query": True, # enables synonym searches if you have them 92 | "zero_terms_query": "none" # returns no results if query gets reduced to no terms (if all of them are stopwords) 93 | } 94 | }, 95 | { 96 | "range": { 97 | "created_at": { 98 | "gte": start_date, 99 | "lte": datetime.utcnow().date().strftime('%Y-%m-%d'), 100 | "format": "yyyy-MM-dd", 101 | "relation": "within" 102 | } 103 | } 104 | }] 105 | } 106 | } 107 | } 108 | response = client.search( 109 | body = query, 110 | index = index_name 111 | ) 112 | return response 113 | 114 | 115 | def count(most_recent_flag, eu_flag): 116 | if most_recent_flag: 117 | start_date = (datetime.utcnow().date() - timedelta(days=30)).strftime('%Y-%m-%d') 118 | else: 119 | start_date = '2023-01-01' 120 | query = { 121 | 
"query": { 122 | "bool" : { 123 | 'filter': {'term': {'eu': eu_flag}}, 124 | "must" : [ 125 | {"match_all": {}}, 126 | { 127 | "range": { 128 | "created_at": { 129 | "gte": start_date, 130 | "lte": datetime.utcnow().date().strftime('%Y-%m-%d'), 131 | "format": "yyyy-MM-dd", 132 | "relation": "within" 133 | } 134 | } 135 | } 136 | ] 137 | } 138 | } 139 | } 140 | return client.count(body=query, index=index_name)['count'] 141 | 142 | 143 | def post(payload): 144 | payload['url'] = payload['url'].strip('/') 145 | payload['created_at'] = datetime.utcnow().strftime('%Y-%m-%d') 146 | if len(payload['poster'])==0: 147 | payload['poster'] = 'Unknown' 148 | try: 149 | client.index(index=index_name, body=payload, id=payload['url']) 150 | response = lambda_client.invoke(FunctionName='post_url_to_slack', 151 | Payload=json.dumps(payload)) 152 | if response['ResponseMetadata']['HTTPStatusCode'] == 200: 153 | return True 154 | else: 155 | return False 156 | except: 157 | return False 158 | 159 | 160 | def already_posted_job(url): 161 | query = {'query': {'term': {'_id': url.strip('/')}}} 162 | num_docs = client.count(body=query, index=index_name)['count'] 163 | if num_docs == 0: 164 | return False 165 | else: 166 | return True 167 | -------------------------------------------------------------------------------- /pages/3_Scrape_Jobs_🔒.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import json 3 | from serpapi import GoogleSearch 4 | import sys; sys.path.append('..') 5 | import search_index 6 | from datetime import datetime 7 | import css; css.set_page_style('Next Search Job • 🔒') 8 | import boto3 9 | from dotenv import load_dotenv; load_dotenv() 10 | import os 11 | s3_resource = boto3.resource('s3') 12 | s3_client = boto3.client('s3') 13 | 14 | 15 | 16 | def remove_job_from_session_state(job_data): 17 | st.session_state['current_query']['jobs_to_display'] = [job for job in st.session_state['current_query']['jobs_to_display'] 18 | if job['job_id'] != job_data['job_id']] 19 | st.session_state['current_query']['job_ids_to_display'].remove(job['job_id']) 20 | 21 | 22 | def filter_job_data(job): 23 | keys_to_keep = ['company_name','title','description', 24 | 'merged_description','job_url','job_id'] 25 | return {key:job[key] for key in keys_to_keep} 26 | 27 | 28 | def add_job_to_site(query, job_data): 29 | payload = { 30 | 'company': job_data['company_name'], 31 | 'title': job_data['title'], 32 | 'description': job_data['merged_description'], 33 | 'url': job_data['job_url'], 34 | 'poster': '', 35 | 'email': '', 36 | 'slack_blurb': '', 37 | 'eu': False 38 | } 39 | success = search_index.post(payload) 40 | remove_job_from_session_state(job_data) 41 | 42 | 43 | def remove_job_listing(query, job_data): 44 | job_url = job_data['job_url'].replace('/',';') 45 | s3_object = s3_resource.Object('scraped-job-urls', f"blocked/{job_url}") 46 | s3_object.put(Body=json.dumps(job_data)) 47 | remove_job_from_session_state(job_data) 48 | 49 | 50 | def get_blocked_job_urls(): 51 | response = s3_client.list_objects_v2(Bucket='scraped-job-urls', Prefix='blocked') 52 | keys = [r['Key'].replace('blocked/','') for r in response.get('Contents', [])] 53 | return set(keys) 54 | 55 | 56 | def get_job_link(job_id): 57 | params = { 58 | "engine": "google_jobs_listing", 59 | "q": job_id, 60 | "api_key": os.environ['SERPAPI_KEY'] 61 | } 62 | search = GoogleSearch(params) 63 | results = search.get_dict() 64 | apply_options = sorted(results['apply_options'], key=lambda x: 
x['title']) 65 | for apply_option in apply_options: 66 | if apply_option['title'] == "Apply on LinkedIn": 67 | return apply_option['link'] 68 | return results['apply_options'][0]['link'] 69 | 70 | 71 | def get_job_listings(query, search_param_start): 72 | search_params = { 73 | "q": query, 74 | "engine": "google_jobs", 75 | "location_requested": "United States", 76 | "location_used": "United States", 77 | "google_domain": "google.com", 78 | "hl": "en", 79 | "gl": "us", 80 | "ltype": "1", 81 | "start": search_param_start, 82 | "api_key": os.environ['SERPAPI_KEY'] 83 | } 84 | search = GoogleSearch(search_params) 85 | results = search.get_dict() 86 | return results.get('jobs_results', []) 87 | 88 | 89 | def merge_job_description(job): 90 | description = job['description'] + "\n" 91 | for highlight in job['job_highlights']: 92 | if highlight.get('title'): 93 | description += highlight['title'] + "\n" 94 | for item in highlight['items']: 95 | description += item + "\n" 96 | job['merged_description'] = description 97 | return job 98 | 99 | 100 | def display_similar_jobs(job): 101 | query = f"{job['title']} @ {job['company_name']}" 102 | query_results = search_index.query(query, eu_flag=False, most_recent_flag=True, num_results=3) 103 | spaces = ''.join([' ']*10) 104 | st.write('Similar jobs we added in the last 30 days:') 105 | for n, result in enumerate(query_results['hits']['hits'], 1): 106 | st.markdown(f"""{spaces}{n}) {result['_source']['title']} @ {result['_source']['company']}""", 107 | unsafe_allow_html=True) 108 | 109 | 110 | def display_this_job(n, job, query): 111 | st.markdown(f""" 112 |

{n}. 113 | {job['title']} @ {job['company_name']} 114 |

""", unsafe_allow_html=True) 115 | display_similar_jobs(job) 116 | st.button("Add to Site", 117 | key=job['job_id'], 118 | on_click=add_job_to_site, 119 | kwargs={'job_data':job, 'query':query}) 120 | st.markdown(f""" 121 |
Posted {job.get('detected_extensions',{}).get('posted_at','Unknown')}
122 |
{job['merged_description'][:1000]+'...'}
123 | """, unsafe_allow_html=True) 124 | st.button("Don't Show This Job Again", 125 | key=job['job_id']+"-2", 126 | on_click=remove_job_listing, 127 | kwargs={'job_data':job, 'query':query}) 128 | 129 | 130 | def add_new_session_state(query): 131 | st.session_state['queries'][query] = { 132 | 'query': query, 133 | 'job_ids_to_display': set(), 134 | 'jobs_to_display': [], 135 | 'search_param_start': 0 136 | } 137 | 138 | 139 | 140 | # Password check 141 | placeholder = st.empty() 142 | input_password = placeholder.text_input(label="This page is locked. What's the password?", 143 | value='', type='password').lower() 144 | 145 | if st.session_state.get('password') or (input_password == os.environ['STREAMLIT_PW']): 146 | st.session_state['password'] = True 147 | placeholder.empty() 148 | 149 | # Title and search bar 150 | st.markdown('

><h1>Add More Jobs!</h1></div>
', unsafe_allow_html=True) 151 | st.markdown("
(be careful pulling in jobs as some sites have old postings)

", unsafe_allow_html=True) 152 | col1, col2, _ = st.columns([1,2,1]) 153 | with col1: 154 | st.markdown('

🔍︍

', unsafe_allow_html=True) 155 | with col2: 156 | query = st.text_input(label="Find jobs...", 157 | value="search engineer", 158 | placeholder='Search through jobs...', 159 | label_visibility='collapsed') 160 | st.markdown('
', unsafe_allow_html=True) 161 | 162 | 163 | # Display query results 164 | if query: 165 | if 'queries' not in st.session_state: 166 | st.session_state['queries'] = {} 167 | if query not in st.session_state['queries']: 168 | add_new_session_state(query) 169 | st.session_state['current_query'] = st.session_state['queries'][query] 170 | 171 | if len(st.session_state['current_query']['jobs_to_display']) < 10: 172 | blocked_job_urls = get_blocked_job_urls() 173 | 174 | percent_complete = int(len(st.session_state['current_query']['jobs_to_display'])/10) 175 | progress_bar = st.progress(percent_complete, text='Fetching Jobs...') 176 | 177 | while len(st.session_state['current_query']['jobs_to_display']) < 10: 178 | 179 | job_listings = get_job_listings(query, st.session_state['current_query']['search_param_start']) 180 | if len(job_listings) == 0: 181 | break 182 | 183 | for n, job in enumerate(job_listings, 1): 184 | 185 | if job['job_id'] not in st.session_state['current_query']['job_ids_to_display']: 186 | job['job_url'] = get_job_link(job['job_id']) 187 | 188 | # If the job link is already in OpenSearch, don't show it 189 | if (not search_index.already_posted_job(job['job_url'])) \ 190 | and (job['job_url'].replace('/',';') not in blocked_job_urls): 191 | 192 | # Merge description text 193 | job = merge_job_description(job) 194 | 195 | # Keep only necessary data to save cache space 196 | job = filter_job_data(job) 197 | 198 | # Add job to those we want to display 199 | st.session_state['current_query']['jobs_to_display'].append(job) 200 | st.session_state['current_query']['job_ids_to_display'].add(job['job_id']) 201 | 202 | # Update progress bar 203 | percent_complete += 10 204 | progress_bar.progress(percent_complete, text='Fetching Jobs...') 205 | if len(st.session_state['current_query']['jobs_to_display']) == 10: 206 | break 207 | 208 | st.session_state['current_query']['search_param_start'] += 10 209 | 210 | # Display jobs 211 | for n, job in enumerate(st.session_state['current_query']['jobs_to_display'], 1): 212 | display_this_job(n, job, query) 213 | st.markdown("
", unsafe_allow_html=True) 214 | progress_bar.empty() 215 | else: 216 | if input_password != '': 217 | st.warning(f"Incorrect password! {datetime.utcnow().strftime('%H-%M-%S')}", icon='🚨') --------------------------------------------------------------------------------