├── .gitignore
├── images
│   ├── homepage.png
│   ├── serpapi.png
│   └── relevant_search.jpg
├── requirements.txt
├── Dockerfile
├── docker-compose.yml
├── pages
│   ├── 4_Sponsors.py
│   ├── 2_Analytics.py
│   ├── 1_Post_a_Job.py
│   └── 3_Scrape_Jobs_🔒.py
├── Makefile
├── css.py
├── README.md
├── google_analytics.py
├── Home.py
└── search_index.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pem
2 | *.pyc
3 | *.env
4 | *.json
--------------------------------------------------------------------------------
/images/homepage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dpalbrecht/job-search-engine/HEAD/images/homepage.png
--------------------------------------------------------------------------------
/images/serpapi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dpalbrecht/job-search-engine/HEAD/images/serpapi.png
--------------------------------------------------------------------------------
/images/relevant_search.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dpalbrecht/job-search-engine/HEAD/images/relevant_search.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.24.0
2 | altair<5
3 | boto3==1.26.65
4 | opensearch-py==2.1.1
5 | beautifulsoup4==4.11.2
6 | awscli==1.27.142
7 | plotly
8 | google-search-results
9 | python-dotenv
10 | google-auth
11 | google-analytics-data
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8-slim-buster
2 |
3 |
4 | WORKDIR /app
5 | COPY . /app
6 | RUN pip3 install -r /app/requirements.txt
7 |
8 | EXPOSE 8501
9 |
10 | ENTRYPOINT ["streamlit", "run", "Home.py", "--server.port=8501", "--server.address=0.0.0.0"]
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 |
4 | job-search-engine:
5 | image: 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine
6 | logging:
7 | driver: "json-file"
8 | options:
9 | max-size: "10m"
10 | max-file: "1"
11 | ports:
12 | - "8501:8501"
13 | env_file:
14 | - ./.env
--------------------------------------------------------------------------------
/pages/4_Sponsors.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import css; css.set_page_style('Next Search Job • Sponsors')
3 |
4 |
5 |
6 | # Title
7 | st.markdown('<h1>Thank You to Our Sponsors</h1>', unsafe_allow_html=True)
8 | st.markdown('<br>', unsafe_allow_html=True)
9 |
10 |
11 | # Sponsors
12 | cols = st.columns(5)
13 | with cols[0]:
14 | st.image('images/serpapi.png', use_column_width=True)
15 | st.markdown("""SerpApi kindly donated a year's worth of Developer access.""", unsafe_allow_html=True)
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | run-local:
2 | docker run -p 8501:8501 --env-file ./.env --rm job-search-engine
3 |
4 | tag-push:
5 | aws ecr get-login-password | docker login --username AWS --password-stdin 838424036277.dkr.ecr.us-west-1.amazonaws.com
6 | docker tag job-search-engine:latest 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine:latest
7 | docker push 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine:latest
8 |
9 | pull:
10 | aws ecr get-login-password | docker login --username AWS --password-stdin 838424036277.dkr.ecr.us-west-1.amazonaws.com
11 | docker pull 838424036277.dkr.ecr.us-west-1.amazonaws.com/job-search-engine:latest
12 |
13 | run-prod:
14 | docker-compose --env-file ./.env up
15 |
--------------------------------------------------------------------------------
/css.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 |
4 |
5 | def set_page_style(page_title):
6 | st.set_page_config(page_title=page_title, page_icon='images/relevant_search.jpg', layout='wide')
7 | margins_css = """
8 |
28 | """
29 | st.markdown(margins_css, unsafe_allow_html=True)
30 |
--------------------------------------------------------------------------------
/pages/2_Analytics.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import sys
3 | sys.path.append('..')
4 | import css; css.set_page_style('Next Search Job • Analytics')
5 | import plotly.graph_objs as go
6 | import datetime
7 | from google_analytics import query_google_analytics
8 | from dotenv import load_dotenv; load_dotenv()
9 |
10 |
11 |
12 | st.markdown('<h1>Site Analytics</h1>', unsafe_allow_html=True)
13 | st.markdown('<br>', unsafe_allow_html=True)
14 |
15 |
16 |
17 | def plot_histogram(data, event_type):
18 | x, y = [], []
19 | for key, value in data[event_type].items():
20 | x.append(key)
21 | y.append(value)
22 | fig = go.Figure(data=go.Bar(x=x, y=y))
23 | fig.update_layout(title=f"{event_type.replace('_',' ').title()} Events",
24 | xaxis_title='Page',
25 | yaxis_title='Count')
26 | st.plotly_chart(fig)
27 |
28 |
29 |
30 | # Plot a date range's activity
31 | container1 = st.container()
32 | with container1:
33 | cols1 = st.columns(5)
34 | with cols1[0]:
35 | dates = st.date_input(label='Choose a Date Range Display:',
36 | value=(datetime.datetime.utcnow().date()-datetime.timedelta(days=7),
37 | datetime.datetime.utcnow().date()))
38 | start_date = dates[0].strftime('%Y-%m-%d')
39 | end_date = dates[1].strftime('%Y-%m-%d')
40 | page_event_dict = query_google_analytics('page_events', start_date, end_date)
41 | plot_histogram(page_event_dict, event_type='click')
42 | plot_histogram(page_event_dict, event_type='page_view')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # job-search-engine
2 | A job search engine and site created for the Relevance & Matching Tech community on Slack (relevancy.slack.com).
3 |
4 | ![homepage](images/homepage.png)
5 |
6 |
7 | ## How did this site come about?
8 | Soon after I joined the group and started perusing open roles in the #jobs channel, I realized there was an opportunity to 1) build a quick search engine MVP and 2) contribute to the community. Below are the steps I took to deploy the site:
9 | 1. Create a new AWS account, so you can use Free Tier resources.
10 | 2. Create an [OpenSearch](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/gsgcreate-domain.html) instance.
11 | - When setting up fine-grained access control, create a user using IAM and use it as the master user.
12 | 3. Create the index and mapping using either the [OpenSearch Dashboard](https://opensearch.org/docs/latest/dashboards/quickstart-dashboards/), [command line](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/gsgupload-data.html), or a [jupyter notebook](https://dylancastillo.co/opensearch-python/#create-an-index) (my personal recommendation; see the sketch after this list). For the latter two, you'll need to either pass in your AWS credentials explicitly or install the CLI (`pip install awscli`) and run `aws configure` to store them (_highly_ recommended).
13 | 4. Build your [Streamlit](https://streamlit.io/) app (or copy what I have above).
14 | 5. [Deploy on EC2](https://towardsdatascience.com/how-to-deploy-a-streamlit-app-using-an-amazon-free-ec2-instance-416a41f69dc3).
15 | - Just as in step 3, for local access, you'll need to `aws configure` so the app can query the index.
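
If you take the notebook route in step 3, a minimal sketch of creating the index with `opensearch-py` might look like the following. The host is a placeholder and the mapping fields/types are assumptions based on what the app stores, not the exact production mapping:

```python
# Sketch only: create a 'jobs-index' using AWS SigV4 auth.
# The host and the mapping below are illustrative, not the production values.
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, 'us-west-1')
client = OpenSearch(
    hosts=[{'host': 'your-opensearch-domain.us-west-1.es.amazonaws.com', 'port': 443}],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

client.indices.create(
    index='jobs-index',
    body={
        'mappings': {
            'properties': {
                'company':     {'type': 'keyword'},
                'title':       {'type': 'text', 'analyzer': 'english'},
                'description': {'type': 'text', 'analyzer': 'english'},
                'url':         {'type': 'keyword'},
                'poster':      {'type': 'keyword'},
                'email':       {'type': 'keyword'},
                'slack_blurb': {'type': 'text'},
                'eu':          {'type': 'boolean'},
                'created_at':  {'type': 'date', 'format': 'yyyy-MM-dd'},
            }
        }
    },
)
```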
16 |
17 |
18 | ## Notable Updates
19 | * [4/23] Get a real domain name by using Amazon Route 53 and PairDomains.com
20 | * [4/23] Enable user to search for "more jobs like this" with a single button click
21 | * [5/23] Track user behavior with Streamlit Analytics by writing page load and click events to S3, with a nightly cron to aggregate them (switched to Google Analytics on 7/29). Add a page to visualize activity
22 | * [5/23] Align dev and prod environments by using Docker
23 | * [6/23] Add new page, Scrape Jobs, so developers can more easily pull in jobs to the site
24 | * [7/19] Enable HTTPS
--------------------------------------------------------------------------------
/google_analytics.py:
--------------------------------------------------------------------------------
1 | from dotenv import load_dotenv; load_dotenv()
2 | import os
3 | import pathlib
4 | from bs4 import BeautifulSoup
5 | import shutil
6 | import streamlit as st
7 | from collections import defaultdict
8 | from google.oauth2 import service_account
9 | from google.analytics.data_v1beta import BetaAnalyticsDataClient
10 | from google.analytics.data_v1beta.types import (
11 | DateRange,
12 | Dimension,
13 | Metric,
14 | RunReportRequest,
15 | FilterExpression,
16 | Filter
17 | )
18 |
19 |
20 |
21 | def inject_google_analytics():
22 | GA_ID = os.environ['GOOGLE_ANALYTICS_ID']
23 |
24 |     GA_JS = f"""
25 |     <script id="{GA_ID}" async src="https://www.googletagmanager.com/gtag/js?id={GA_ID}"></script>
26 |     """+"""
27 |     <script>
28 |         window.dataLayer = window.dataLayer || [];
29 |         function gtag(){dataLayer.push(arguments);}
30 |         gtag('js', new Date());
31 |     """+f"""
32 |         gtag('config', '{GA_ID}');
33 |     </script>
34 |     """
35 |
36 |     # Insert the script in the head tag of the static template inside your virtual environment
37 | index_path = pathlib.Path(st.__file__).parent / "static" / "index.html"
38 | soup = BeautifulSoup(index_path.read_text(), features="html.parser")
39 | if not soup.find(id=GA_ID): # if cannot find tag
40 | bck_index = index_path.with_suffix('.bck')
41 | if bck_index.exists():
42 | shutil.copy(bck_index, index_path) # recover from backup
43 | else:
44 | shutil.copy(index_path, bck_index) # keep a backup
45 | html = str(soup)
46 |         new_html = html.replace('<head>', '<head>\n' + GA_JS)
47 | index_path.write_text(new_html)
48 |
49 |
50 | def get_data(query):
51 | credentials = service_account.Credentials.from_service_account_file(
52 | 'google_analytics_credentials.json',
53 | scopes=["https://www.googleapis.com/auth/analytics.readonly"],
54 | )
55 | client = BetaAnalyticsDataClient(credentials=credentials)
56 | response = client.run_report(query)
57 | return response.rows
58 |
59 |
60 | def parse_page_events(response):
61 | data = defaultdict(lambda: defaultdict(lambda: 0))
62 | for row in response:
63 | event_name, page_title = row.dimension_values
64 | event_count = row.metric_values[0]
65 | data[event_name.value][page_title.value] += int(event_count.value)
66 | return data
67 |
68 |
69 | def parse_link_clicks(response):
70 | data = defaultdict(lambda: 0)
71 | for row in response:
72 | link_name = row.dimension_values[0]
73 | click_count = row.metric_values[0]
74 | data[link_name.value] += int(click_count.value)
75 | return data
76 |
77 |
78 | def query_google_analytics(report_type, start_date, end_date):
79 | if report_type == 'page_events':
80 | query = RunReportRequest(
81 | property=f"properties/{os.environ['GA4_PROPERTY_ID']}",
82 | dimensions=[Dimension(name="eventName"), Dimension(name="pageTitle")],
83 | metrics=[Metric(name="eventCount")],
84 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)],
85 | dimension_filter=FilterExpression(
86 | filter=Filter(
87 | field_name="eventName",
88 | in_list_filter=Filter.InListFilter(values=["page_view","click"])
89 | )
90 | )
91 | )
92 | return parse_page_events(get_data(query))
93 | elif report_type == 'link_clicks':
94 | query = RunReportRequest(
95 | property=f"properties/{os.environ['GA4_PROPERTY_ID']}",
96 | dimensions=[Dimension(name="customEvent:link_url")],
97 | metrics=[Metric(name="eventCount")],
98 | date_ranges=[DateRange(start_date=start_date, end_date=end_date)]
99 | )
100 | return parse_link_clicks(get_data(query))
--------------------------------------------------------------------------------
/Home.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import search_index
3 | import css; css.set_page_style('Next Search Job • Find')
4 | from datetime import datetime, timedelta
5 | from streamlit.components.v1 import html
6 | import google_analytics; google_analytics.inject_google_analytics()
7 |
8 |
9 |
10 | # Title and search bar, and format options
11 | if 'query' not in st.session_state:
12 | st.session_state.query = ''
13 | st.markdown('<h1>Search Relevance & Matching Tech</h1>', unsafe_allow_html=True)
14 | st.markdown("""
""", unsafe_allow_html=True)
15 | col1, col2, _ = st.columns([1,2,1])
16 | with col1:
17 |     st.markdown('<h1>🔍︍</h1>', unsafe_allow_html=True)
18 | with col2:
19 | query = st.text_input(label="Find jobs...",
20 | value=st.session_state.query,
21 | placeholder='Search through jobs...',
22 | label_visibility='collapsed')
23 | most_recent_flag = st.checkbox(label='Last 30 Days', value=True)
24 | eu_flag = st.checkbox(label='EU')
25 | json_flag = st.checkbox(label='JSON Format')
26 | st.write(f"{search_index.count(most_recent_flag, eu_flag):,} jobs to be exact!")
27 | st.markdown('<br>', unsafe_allow_html=True)
28 |
29 |
30 | # Update session query from Find Similar Jobs
31 | def update_session_query(new_query):
32 | st.session_state.query = new_query
33 |
34 |
35 | # Get link clicks from the last 14 days
36 | end_date = datetime.utcnow().date()
37 | start_date = end_date - timedelta(days=14)
38 | link_click_dict = google_analytics.query_google_analytics(
39 | 'link_clicks', start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))
40 |
41 |
42 | # Show query results
43 | if len(query) == 0:
44 | query_results = search_index.blank_query(query, eu_flag, most_recent_flag)
45 | else:
46 | query_results = search_index.query(query, eu_flag, most_recent_flag)
47 | if json_flag:
48 | st.json(query_results['hits']['hits'])
49 | else:
50 | for n, result in enumerate(query_results['hits']['hits'], 1):
51 | days_ago_posted = (datetime.utcnow() - datetime.strptime(result['_source']['created_at'], '%Y-%m-%d')).days
52 |
53 | col1, col2 = st.columns([0.9,0.1])
54 | with col1:
55 |             st.write(f"""{n}) <a href="{result['_source']['url']}" target="_blank">{result['_source']['title']} @ {result['_source']['company']}</a>""", unsafe_allow_html=True)
61 | with col2:
62 | st.button(f"{link_click_dict.get(result['_source']['url'][:100], 0)} Clicks",
63 | key=result['_source']['url']+'_LINK_CLICKS',
64 | help='Number of times this link has been clicked in the last 14 days.',
65 | disabled=True)
66 |
67 | st.button('Find Similar Jobs',
68 | key=result['_source']['url']+'_FIND_SIMILAR_JOBS',
69 | on_click=update_session_query,
70 | kwargs={'new_query':result['_source']['title']})
71 |
72 | description_text = ""
73 | if result['_source']['poster'] != 'Unknown':
74 | poster_msg = f"{result['_source']['poster']} posted {days_ago_posted} days ago"
75 | else:
76 | poster_msg = f"Posted {days_ago_posted} days ago"
77 |         description_text += f"""{poster_msg}<br>"""
78 | if result['_source'].get('email', '') != '':
79 |             description_text += f"""Email the poster at: {result['_source']['email']}<br>"""
80 | st.markdown(f"""
81 | {description_text}
82 | {result['_source']['description'][:1000]+'...'}
83 |
84 | """, unsafe_allow_html=True)
--------------------------------------------------------------------------------
/pages/1_Post_a_Job.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import sys
3 | import json
4 | sys.path.append('..')
5 | import search_index
6 | import css; css.set_page_style('Next Search Job • Post')
7 | from datetime import datetime
8 | from bs4 import BeautifulSoup
9 | import re
10 | from urllib.parse import urlparse
11 | import requests
12 |
13 |
14 |
15 | def crawl(url):
16 | # From https://colab.research.google.com/drive/1L_s0ey6T-aK65J2wHSZEhoBVW8vmJlkH?usp=sharing#scrollTo=a8yeMg5Tv9Nx
17 |
18 | # Determine whether the host is supported (LinkedIn only at the moment)
19 | parsed = urlparse(url)
20 | if parsed.hostname not in ['linkedin.com', 'www.linkedin.com']:
21 | return {}
22 |
23 | # Scrape the data
24 | html = requests.get(url).content
25 |     soup = BeautifulSoup(html, 'html.parser')
26 | description = soup.find(
27 | name='div',
28 | attrs={'class':"show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden"}
29 | )
30 | description = description.get_text('\n').strip()
31 | title = soup.find('h1').text
32 | company = soup.find('a', {'href': re.compile('linkedin.com/company/*')}).text
33 |
34 | return {
35 | "url": url,
36 | "company": company,
37 | "title": title,
38 | "poster": '',
39 | "description": description,
40 | }
41 |
42 |
43 | def crawl_and_populate():
44 | try:
45 | crawled_data = crawl(st.session_state.url)
46 | if crawled_data:
47 | st.session_state.url = crawled_data['url']
48 | st.session_state.title = crawled_data['title']
49 | st.session_state.company = crawled_data['company']
50 | st.session_state.description = crawled_data['description']
51 | else:
52 | st.warning(f"Only LinkedIn URLs currently supported... But you can still manually add your job below!", icon='🚨')
53 | except:
54 | st.warning(f"Something went wrong... But you can still manually add your job below!", icon='🚨')
55 |
56 |
57 |
58 | # Title and post form
59 | st.markdown('<h1>Post a Job</h1>', unsafe_allow_html=True)
60 | _, col2, _ = st.columns([1,8,1])
61 | with col2:
62 | with st.form("job_form"):
63 | url = st.text_input(label='URL', placeholder='* URL', label_visibility='collapsed', key='url')
64 | _ = st.form_submit_button('Auto-Populate (LinkedIn URL Only)', on_click=crawl_and_populate)
65 | payload = {
66 | 'company': st.text_input(label='Company', key='company',
67 | placeholder='* Company', label_visibility='collapsed'),
68 | 'title': st.text_input(label='Job Title', key='title',
69 | placeholder='* Job Title', label_visibility='collapsed'),
70 | 'description': st.text_area(label='Description', placeholder='* Description',
71 | label_visibility='collapsed', key='description'),
72 | 'url': url,
73 | 'poster': st.text_input(label='Your Name', placeholder='Your Name', label_visibility='collapsed'),
74 | 'email': st.text_input(label='Your Email', placeholder='Your Email', label_visibility='collapsed'),
75 | 'slack_blurb': st.text_area(label='Slack Blurb', placeholder="Anything else that you'd like posted to Slack. Say something like 'This job is really great, reach out to me!'",
76 | label_visibility='collapsed', key='slack_blurb'),
77 | 'eu': st.checkbox(label='EU')
78 | }
79 | password = st.text_input(label=' ', placeholder='* Password: What book is on the cover of the Search Relevance Slack channel?', label_visibility='collapsed')
80 | submitted = st.form_submit_button("Post")
81 |
82 |
83 | # Display success/failure
84 | if submitted:
85 | if password.lower() == 'relevant search':
86 | post_payload = True
87 | for name, value in payload.items():
88 |             if (name not in ['poster', 'eu', 'slack_blurb', 'email']) and (value == ''):
89 | post_payload = False
90 | st.warning(f"The '{name}' parameter is required. Can't post job.", icon='🚨')
91 | if post_payload:
92 | if search_index.already_posted_job(payload['url']):
93 | st.warning(f"This job has already been posted. {datetime.utcnow().strftime('%H-%M-%S')}", icon='⚠️')
94 | else:
95 | success = search_index.post(payload)
96 | if success:
97 | st.success(f"Success! {datetime.utcnow().strftime('%H-%M-%S')}", icon='✅')
98 | else:
99 | st.warning(f"Something went wrong! The job was not posted. {datetime.utcnow().strftime('%H-%M-%S')}", icon='🚨')
100 | else:
101 | st.warning(f"Incorrect password! {datetime.utcnow().strftime('%H-%M-%S')}", icon='🚨')
102 |
--------------------------------------------------------------------------------
/search_index.py:
--------------------------------------------------------------------------------
1 | from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
2 | import boto3
3 | from datetime import datetime, timedelta
4 | import json
5 |
6 |
7 |
8 | # Get session client
9 | credentials = boto3.Session().get_credentials()
10 | auth = AWSV4SignerAuth(credentials, 'us-west-1')
11 | client = OpenSearch(
12 | hosts = [{'host': 'search-opensearch-jobs-af5xd22qh6zatxk5wbfhnvdje4.us-west-1.es.amazonaws.com',
13 | 'port': 443}],
14 | http_auth = auth,
15 | use_ssl = True,
16 | verify_certs = True,
17 | connection_class = RequestsHttpConnection
18 | )
19 | index_name = 'jobs-index'
20 | lambda_client = boto3.client(
21 | 'lambda',
22 | region_name="us-west-1",
23 | config=boto3.session.Config(signature_version='s3v4',)
24 | )
25 |
26 |
27 | def blank_query(user_query, eu_flag, most_recent_flag):
28 | if most_recent_flag:
29 | start_date = (datetime.utcnow().date() - timedelta(days=30)).strftime('%Y-%m-%d')
30 | else:
31 | start_date = '2023-01-01'
32 | query = {
33 | "query": {
34 | "bool" : {
35 | 'filter': {'term': {'eu': eu_flag}},
36 | "must" : [
37 | {"match_all": {}},
38 | {
39 | "range": {
40 | "created_at": {
41 | "gte": start_date,
42 | "lte": datetime.utcnow().date().strftime('%Y-%m-%d'),
43 | "format": "yyyy-MM-dd",
44 | "relation": "within"
45 | }
46 | }
47 | }
48 | ]
49 | }
50 | },
51 | "size": 50,
52 | "sort": [
53 | {
54 | "created_at": {
55 | "order": "desc"
56 | }
57 | }
58 | ]
59 | }
60 | response = client.search(
61 | body = query,
62 | index = index_name
63 | )
64 | return response
65 |
66 |
67 | def query(user_query, eu_flag, most_recent_flag, num_results=50):
68 | if most_recent_flag:
69 | start_date = (datetime.utcnow().date() - timedelta(days=30)).strftime('%Y-%m-%d')
70 | else:
71 | start_date = '2023-01-01'
72 | query = {
73 | 'size': num_results,
74 | 'query': {
75 | 'bool': {
76 | 'filter': {'term': {'eu': eu_flag}},
77 | 'must': [{
78 | 'multi_match': {
79 | 'query': user_query,
80 | 'fields': ['title^1','description^1'],
81 | "type": "most_fields", # if we expect search terms to appear in most fields
82 | "operator": "or",
83 | "minimum_should_match": 1,
84 | "tie_breaker": 1.0, # sum of all field scores
85 | "analyzer": "english",
86 | "boost": 1,
87 | "fuzziness": "AUTO",
88 | "fuzzy_transpositions": True, # reduces the number of fuzziness movements for adjacent characters
89 |                     "lenient": False, # if True, ignores data type mismatches instead of raising an error
90 | "prefix_length": 0, # number of leading characters that are not considered in fuzziness
91 | "auto_generate_synonyms_phrase_query": True, # enables synonym searches if you have them
92 | "zero_terms_query": "none" # returns no results if query gets reduced to no terms (if all of them are stopwords)
93 | }
94 | },
95 | {
96 | "range": {
97 | "created_at": {
98 | "gte": start_date,
99 | "lte": datetime.utcnow().date().strftime('%Y-%m-%d'),
100 | "format": "yyyy-MM-dd",
101 | "relation": "within"
102 | }
103 | }
104 | }]
105 | }
106 | }
107 | }
108 | response = client.search(
109 | body = query,
110 | index = index_name
111 | )
112 | return response
113 |
114 |
115 | def count(most_recent_flag, eu_flag):
116 | if most_recent_flag:
117 | start_date = (datetime.utcnow().date() - timedelta(days=30)).strftime('%Y-%m-%d')
118 | else:
119 | start_date = '2023-01-01'
120 | query = {
121 | "query": {
122 | "bool" : {
123 | 'filter': {'term': {'eu': eu_flag}},
124 | "must" : [
125 | {"match_all": {}},
126 | {
127 | "range": {
128 | "created_at": {
129 | "gte": start_date,
130 | "lte": datetime.utcnow().date().strftime('%Y-%m-%d'),
131 | "format": "yyyy-MM-dd",
132 | "relation": "within"
133 | }
134 | }
135 | }
136 | ]
137 | }
138 | }
139 | }
140 | return client.count(body=query, index=index_name)['count']
141 |
142 |
143 | def post(payload):
144 | payload['url'] = payload['url'].strip('/')
145 | payload['created_at'] = datetime.utcnow().strftime('%Y-%m-%d')
146 | if len(payload['poster'])==0:
147 | payload['poster'] = 'Unknown'
148 | try:
149 | client.index(index=index_name, body=payload, id=payload['url'])
150 | response = lambda_client.invoke(FunctionName='post_url_to_slack',
151 | Payload=json.dumps(payload))
152 | if response['ResponseMetadata']['HTTPStatusCode'] == 200:
153 | return True
154 | else:
155 | return False
156 | except:
157 | return False
158 |
159 |
160 | def already_posted_job(url):
161 | query = {'query': {'term': {'_id': url.strip('/')}}}
162 | num_docs = client.count(body=query, index=index_name)['count']
163 | if num_docs == 0:
164 | return False
165 | else:
166 | return True
167 |
--------------------------------------------------------------------------------
/pages/3_Scrape_Jobs_🔒.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import json
3 | from serpapi import GoogleSearch
4 | import sys; sys.path.append('..')
5 | import search_index
6 | from datetime import datetime
7 | import css; css.set_page_style('Next Search Job • 🔒')
8 | import boto3
9 | from dotenv import load_dotenv; load_dotenv()
10 | import os
11 | s3_resource = boto3.resource('s3')
12 | s3_client = boto3.client('s3')
13 |
14 |
15 |
16 | def remove_job_from_session_state(job_data):
17 | st.session_state['current_query']['jobs_to_display'] = [job for job in st.session_state['current_query']['jobs_to_display']
18 | if job['job_id'] != job_data['job_id']]
19 |     st.session_state['current_query']['job_ids_to_display'].remove(job_data['job_id'])
20 |
21 |
22 | def filter_job_data(job):
23 | keys_to_keep = ['company_name','title','description',
24 | 'merged_description','job_url','job_id']
25 | return {key:job[key] for key in keys_to_keep}
26 |
27 |
28 | def add_job_to_site(query, job_data):
29 | payload = {
30 | 'company': job_data['company_name'],
31 | 'title': job_data['title'],
32 | 'description': job_data['merged_description'],
33 | 'url': job_data['job_url'],
34 | 'poster': '',
35 | 'email': '',
36 | 'slack_blurb': '',
37 | 'eu': False
38 | }
39 | success = search_index.post(payload)
40 | remove_job_from_session_state(job_data)
41 |
42 |
43 | def remove_job_listing(query, job_data):
44 | job_url = job_data['job_url'].replace('/',';')
45 | s3_object = s3_resource.Object('scraped-job-urls', f"blocked/{job_url}")
46 | s3_object.put(Body=json.dumps(job_data))
47 | remove_job_from_session_state(job_data)
48 |
49 |
50 | def get_blocked_job_urls():
51 | response = s3_client.list_objects_v2(Bucket='scraped-job-urls', Prefix='blocked')
52 | keys = [r['Key'].replace('blocked/','') for r in response.get('Contents', [])]
53 | return set(keys)
54 |
55 |
56 | def get_job_link(job_id):
57 | params = {
58 | "engine": "google_jobs_listing",
59 | "q": job_id,
60 | "api_key": os.environ['SERPAPI_KEY']
61 | }
62 | search = GoogleSearch(params)
63 | results = search.get_dict()
64 | apply_options = sorted(results['apply_options'], key=lambda x: x['title'])
65 | for apply_option in apply_options:
66 | if apply_option['title'] == "Apply on LinkedIn":
67 | return apply_option['link']
68 | return results['apply_options'][0]['link']
69 |
70 |
71 | def get_job_listings(query, search_param_start):
72 | search_params = {
73 | "q": query,
74 | "engine": "google_jobs",
75 | "location_requested": "United States",
76 | "location_used": "United States",
77 | "google_domain": "google.com",
78 | "hl": "en",
79 | "gl": "us",
80 | "ltype": "1",
81 | "start": search_param_start,
82 | "api_key": os.environ['SERPAPI_KEY']
83 | }
84 | search = GoogleSearch(search_params)
85 | results = search.get_dict()
86 | return results.get('jobs_results', [])
87 |
88 |
89 | def merge_job_description(job):
90 | description = job['description'] + "\n"
91 | for highlight in job['job_highlights']:
92 | if highlight.get('title'):
93 | description += highlight['title'] + "\n"
94 | for item in highlight['items']:
95 | description += item + "\n"
96 | job['merged_description'] = description
97 | return job
98 |
99 |
100 | def display_similar_jobs(job):
101 | query = f"{job['title']} @ {job['company_name']}"
102 | query_results = search_index.query(query, eu_flag=False, most_recent_flag=True, num_results=3)
103 | spaces = ''.join([' ']*10)
104 | st.write('Similar jobs we added in the last 30 days:')
105 | for n, result in enumerate(query_results['hits']['hits'], 1):
106 | st.markdown(f"""{spaces}{n}) {result['_source']['title']} @ {result['_source']['company']}""",
107 | unsafe_allow_html=True)
108 |
109 |
110 | def display_this_job(n, job, query):
111 |     st.markdown(f"""{n}) <a href="{job['job_url']}" target="_blank">{job['title']} @ {job['company_name']}</a>""", unsafe_allow_html=True)
115 | display_similar_jobs(job)
116 | st.button("Add to Site",
117 | key=job['job_id'],
118 | on_click=add_job_to_site,
119 | kwargs={'job_data':job, 'query':query})
120 | st.markdown(f"""
121 | Posted {job.get('detected_extensions',{}).get('posted_at','Unknown')}
122 | {job['merged_description'][:1000]+'...'}
123 | """, unsafe_allow_html=True)
124 | st.button("Don't Show This Job Again",
125 | key=job['job_id']+"-2",
126 | on_click=remove_job_listing,
127 | kwargs={'job_data':job, 'query':query})
128 |
129 |
130 | def add_new_session_state(query):
131 | st.session_state['queries'][query] = {
132 | 'query': query,
133 | 'job_ids_to_display': set(),
134 | 'jobs_to_display': [],
135 | 'search_param_start': 0
136 | }
137 |
138 |
139 |
140 | # Password check
141 | placeholder = st.empty()
142 | input_password = placeholder.text_input(label="This page is locked. What's the password?",
143 | value='', type='password').lower()
144 |
145 | if st.session_state.get('password') or (input_password == os.environ['STREAMLIT_PW']):
146 | st.session_state['password'] = True
147 | placeholder.empty()
148 |
149 | # Title and search bar
150 |     st.markdown('<h1>Add More Jobs!</h1>', unsafe_allow_html=True)
151 |     st.markdown("<p>(be careful pulling in jobs as some sites have old postings)</p>", unsafe_allow_html=True)
152 | col1, col2, _ = st.columns([1,2,1])
153 | with col1:
154 |         st.markdown('<h1>🔍︍</h1>', unsafe_allow_html=True)
155 | with col2:
156 | query = st.text_input(label="Find jobs...",
157 | value="search engineer",
158 | placeholder='Search through jobs...',
159 | label_visibility='collapsed')
160 |     st.markdown('<br>', unsafe_allow_html=True)
161 |
162 |
163 | # Display query results
164 | if query:
165 | if 'queries' not in st.session_state:
166 | st.session_state['queries'] = {}
167 | if query not in st.session_state['queries']:
168 | add_new_session_state(query)
169 | st.session_state['current_query'] = st.session_state['queries'][query]
170 |
171 | if len(st.session_state['current_query']['jobs_to_display']) < 10:
172 | blocked_job_urls = get_blocked_job_urls()
173 |
174 |         percent_complete = len(st.session_state['current_query']['jobs_to_display'])*10  # each of the 10 target jobs is 10%
175 | progress_bar = st.progress(percent_complete, text='Fetching Jobs...')
176 |
177 | while len(st.session_state['current_query']['jobs_to_display']) < 10:
178 |
179 | job_listings = get_job_listings(query, st.session_state['current_query']['search_param_start'])
180 | if len(job_listings) == 0:
181 | break
182 |
183 | for n, job in enumerate(job_listings, 1):
184 |
185 | if job['job_id'] not in st.session_state['current_query']['job_ids_to_display']:
186 | job['job_url'] = get_job_link(job['job_id'])
187 |
188 | # If the job link is already in OpenSearch, don't show it
189 | if (not search_index.already_posted_job(job['job_url'])) \
190 | and (job['job_url'].replace('/',';') not in blocked_job_urls):
191 |
192 | # Merge description text
193 | job = merge_job_description(job)
194 |
195 | # Keep only necessary data to save cache space
196 | job = filter_job_data(job)
197 |
198 | # Add job to those we want to display
199 | st.session_state['current_query']['jobs_to_display'].append(job)
200 | st.session_state['current_query']['job_ids_to_display'].add(job['job_id'])
201 |
202 | # Update progress bar
203 | percent_complete += 10
204 | progress_bar.progress(percent_complete, text='Fetching Jobs...')
205 | if len(st.session_state['current_query']['jobs_to_display']) == 10:
206 | break
207 |
208 | st.session_state['current_query']['search_param_start'] += 10
209 |
210 | # Display jobs
211 | for n, job in enumerate(st.session_state['current_query']['jobs_to_display'], 1):
212 | display_this_job(n, job, query)
213 |         st.markdown("<hr>", unsafe_allow_html=True)
214 | progress_bar.empty()
215 | else:
216 | if input_password != '':
217 | st.warning(f"Incorrect password! {datetime.utcnow().strftime('%H-%M-%S')}", icon='🚨')
--------------------------------------------------------------------------------