├── .gitignore
├── site
│   ├── icon.png
│   └── index.html
├── Makefile
├── requirements.txt
├── .circleci
│   └── config.yml
├── twitter_utils.py
├── README.md
├── lwn.py
├── sessionize.py
├── devopsdays.py
├── linux_foundation.py
├── models.py
├── main.py
├── mozilla_calendar.py
├── seecfp.py
├── papercall.py
└── LICENSE
/.gitignore: -------------------------------------------------------------------------------- 1 | .envrc 2 | .venv/ 3 | __pycache__/ 4 | --------------------------------------------------------------------------------
/site/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderanger/cfp-scraper/HEAD/site/icon.png --------------------------------------------------------------------------------
/Makefile: -------------------------------------------------------------------------------- 1 | all: sync upload 2 | 3 | upload: 4 | aws s3 sync site s3://cfpcalendar.com/ --acl public-read 5 | 6 | sync: 7 | python main.py 8 | --------------------------------------------------------------------------------
/requirements.txt: -------------------------------------------------------------------------------- 1 | airtable-python-wrapper==0.11.3.post1 2 | beautifulsoup4==4.7.1 3 | certifi==2018.11.29 4 | chardet==3.0.4 5 | dateparser==0.7.0 6 | idna==2.8 7 | oauthlib==3.0.0 8 | PySocks==1.6.8 9 | python-dateutil==2.7.5 10 | pytz==2018.9 11 | regex==2018.11.22 12 | requests==2.21.0 13 | requests-oauthlib==1.2.0 14 | six==1.12.0 15 | soupsieve==1.7.2 16 | tweepy==3.7.0 17 | tzlocal==1.5.1 18 | urllib3==1.24.1 19 | ics==0.4 20 | urlextract==0.8.3 21 | --------------------------------------------------------------------------------
/.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Python CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-python/ for more details 4 | # 5 | version: 2 6 | jobs: 7 | sync: 8 | docker: 9 | - image: circleci/python:3.7.0 10 | steps: 11 | - checkout 12 | 13 | # Download and cache dependencies 14 | - restore_cache: 15 | keys: 16 | - v1-dependencies-{{ checksum "requirements.txt" }} 17 | # fallback to using the latest cache if no exact match is found 18 | - v1-dependencies- 19 | 20 | - run: 21 | name: Install dependencies 22 | command: | 23 | python3 -m venv venv 24 | . venv/bin/activate 25 | pip install -r requirements.txt 26 | 27 | - save_cache: 28 | paths: 29 | - ./venv 30 | key: v1-dependencies-{{ checksum "requirements.txt" }} 31 | 32 | - run: 33 | name: Run sync 34 | command: | 35 | .
venv/bin/activate 36 | python main.py 37 | 38 | workflows: 39 | version: 2 40 | sync: 41 | jobs: 42 | - sync 43 | triggers: 44 | - schedule: 45 | cron: "0 10 * * *" 46 | filters: 47 | branches: 48 | only: 49 | - master 50 | --------------------------------------------------------------------------------
/twitter_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | import tweepy 5 | 6 | auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'], os.environ['TWITTER_CONSUMER_SECRET']) 7 | auth.set_access_token(os.environ['TWITTER_ACCESS_KEY'], os.environ['TWITTER_ACCESS_SECRET']) 8 | 9 | api = tweepy.API(auth) 10 | 11 | _expand_cache = {} 12 | 13 | def expand_url(url): 14 | expanded = _expand_cache.get(url) 15 | if expanded is not None: 16 | return expanded 17 | expanded = requests.head(url, allow_redirects=True).url 18 | _expand_cache[url] = expanded 19 | return expanded 20 | 21 | 22 | def search_for_url(query, total=1000): 23 | max_id = None 24 | last_max_id = None 25 | count = 0 26 | while count < total: 27 | for tweet in api.search(q=query, count=100, max_id=max_id, result_type='recent'): 28 | count += 1 29 | if max_id: 30 | max_id = min(max_id, tweet.id) 31 | else: 32 | max_id = tweet.id 33 | for url in tweet.entities['urls']: 34 | # Twitter only expands its own t.co shortener, so expand everything else ourselves. 35 | truly_expanded_url = url['expanded_url'] 36 | if query not in truly_expanded_url: 37 | truly_expanded_url = expand_url(truly_expanded_url) 38 | if query in truly_expanded_url: 39 | yield truly_expanded_url 40 | # Did we run out of Tweets? 41 | if last_max_id == max_id: 42 | break 43 | last_max_id = max_id 44 | 45 | 46 | if __name__ == '__main__': 47 | for url in search_for_url('sessionize.com'): 48 | print(url) 49 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # CFP-Scraper 2 | 3 | This is the web scraper that powers cfpcalendar.com. 4 | 5 | Data is collected from various sources and written to Airtable. 6 | 7 | ## Running it for yourself 8 | 9 | Things you'll need: 10 | 11 | 1. [Airtable](https://airtable.com/) account 12 | 2. Twitter account 13 | 3. Python 3 + virtualenv 14 | 15 | ### Pre-reqs 16 | 17 | #### Airtable 18 | 19 | 1. Once you've created an Airtable account, use [this link](https://airtable.com/addBaseFromShare/shrYiAKkEEBMuVzcu?utm_source=airtable_shared_application) to copy the template Base to your own workspace. 20 | 21 | 2. From https://airtable.com/account generate your API key and make a note of it. This will be your `AIRTABLE_API_KEY`. 22 | 23 | 3. Go to https://airtable.com/api and select your base (the copy you made in step 1 above). The URL you land on will look like `https://airtable.com/appXXXXXXYYYYY/api/docs` - make a note of the `appXXXXXXYYYYY` part. This will be your `AIRTABLE_BASE_KEY`. 24 | 25 | #### Twitter 26 | 27 | Create API keys for yourself at https://developer.twitter.com/en/apps. 28 | 29 | #### Set up Python virtualenv 30 | 31 | The easiest way to run this is in isolation, using virtualenv. 32 | 33 | 1. Clone the git repo 34 | 35 | git clone git@github.com:coderanger/cfp-scraper.git 36 | 37 | 2. Create the virtualenv 38 | 39 | cd cfp-scraper 40 | virtualenv --python=python3 . 41 | source ./bin/activate.fish 42 | 43 | (Use the `activate` script appropriate for your shell) 44 | 45 | 3.
Install required modules 46 | 47 | pip install -r requirements.txt 48 | 49 | ### Run cfp-scraper 50 | 51 | * Activate the virtualenv 52 | 53 | source ./bin/activate.fish 54 | 55 | (Use the `activate` script appropriate for your shell) 56 | 57 | * Export the credentials obtained above: 58 | 59 | export TWITTER_CONSUMER_KEY=xxxxxx 60 | export TWITTER_CONSUMER_SECRET=xxxxxx 61 | export TWITTER_ACCESS_KEY=xxxxxx 62 | export TWITTER_ACCESS_SECRET=xxxxxx 63 | 64 | export AIRTABLE_API_KEY=xxxxxx 65 | export AIRTABLE_BASE_KEY=xxxxxx 66 | 67 | * Launch: 68 | 69 | python main.py 70 | --------------------------------------------------------------------------------
/lwn.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import date, datetime, time 3 | 4 | import dateparser 5 | import pytz 6 | import requests 7 | from bs4 import BeautifulSoup 8 | 9 | import sessionize 10 | 11 | def get(url): 12 | res = requests.get(url) 13 | return BeautifulSoup(res.text, 'html.parser') 14 | 15 | 16 | def parse_page(root): 17 | for evt_elm in root.select('.CalMEvent a'): 18 | col_index = len(evt_elm.find_parent('td').find_previous_siblings('td')) 19 | date_row = evt_elm.find_parent('tr').find_previous_sibling(lambda elm: elm.name == 'tr' and elm.select('.CalMDate')) 20 | day = date_row.find_all('td')[col_index].text 21 | yield { 22 | 'short_name': evt_elm.text, 23 | 'url': evt_elm['href'], 24 | 'name': evt_elm['title'], 25 | 'day': day 26 | } 27 | 28 | 29 | def find_pages(): 30 | start = date.today() 31 | for i in range(12): 32 | new_month = start.month + i 33 | new_year = start.year 34 | if new_month > 12: 35 | new_month -= 12 36 | new_year += 1 37 | yield f'https://lwn.net/Calendar/Monthly/cfp/{new_year}-{new_month:02d}/', date(new_year, new_month, 1) 38 | 39 | 40 | def parse_pages(): 41 | for url, base_date in find_pages(): 42 | for evt in parse_page(get(url)): 43 | evt['date'] = base_date.replace(day=int(evt['day'])) 44 | yield evt 45 | 46 | 47 | def format_page(raw_evt): 48 | md = re.search(r'^([^(]+) \(([^)]+)\)$', raw_evt['name']) 49 | # Entries that don't match the "Name (Location)" pattern can't be parsed. 50 | if md is None: 51 | return None 52 | name, location = md.group(1, 2) 53 | return { 54 | 'Conference Name': name, 55 | 'Conference URL': raw_evt['url'], 56 | 'Location': location, 57 | 'CFP URL': raw_evt['url'], 58 | 'CFP End Date': datetime.combine(raw_evt['date'], time()), 59 | } 60 | 61 | def scrape(): 62 | for raw_evt in parse_pages(): 63 | evt = format_page(raw_evt) 64 | if evt is None: 65 | continue 66 | if 'papercall.io' in evt['CFP URL']: 67 | continue 68 | if 'events.linuxfoundation.org' in evt['CFP URL']: 69 | continue 70 | if 'sessionize.com' in evt['CFP URL']: 71 | s = sessionize.parse_event(evt['CFP URL']) 72 | if s: 73 | evt.update(s) 74 | yield evt 75 | 76 | if __name__ == '__main__': 77 | for e in scrape(): 78 | print(e) 79 | --------------------------------------------------------------------------------
/sessionize.py: -------------------------------------------------------------------------------- 1 | import dateparser 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | import twitter_utils 6 | 7 | def get(url): 8 | res = requests.get(url) 9 | return BeautifulSoup(res.text, 'html.parser') 10 | 11 | 12 | def find_navy_section(root, label): 13 | for elm in root.select('.text-navy'): 14 | if elm.contents[-1].strip().startswith(label): 15 | return elm.find_parent(lambda e: e.has_attr('class') and 'col-' in ' '.join(e['class'])).find('h2') 16 | 17 | 18 | def parse_event(url): 19 | root = get(url) 20 | 21 | if
root.find('span', string='Speaker Profile'): 22 | return None 23 | 24 | if 'Log in' in root.find('title').string: 25 | return None 26 | 27 | if '@ Sessionize.com' not in root.find('title').string: 28 | return None 29 | 30 | data = { 31 | 'Conference Name': root.select('.ibox-title h4')[0].string, 32 | 'CFP URL': url, 33 | } 34 | 35 | elm = find_navy_section(root, 'location') 36 | if elm: 37 | data['Location'] = elm.select('.block')[-1].string 38 | 39 | elm = find_navy_section(root, 'website') 40 | if elm: 41 | data['Conference URL'] = elm.find('a')['href'] 42 | 43 | elm = find_navy_section(root, 'event date') 44 | if elm: 45 | data['Conference Start Date'] = data['Conference End Date'] = dateparser.parse(elm.string).date() 46 | 47 | elm = find_navy_section(root, 'event starts') 48 | if elm: 49 | data['Conference Start Date'] = dateparser.parse(elm.string).date() 50 | 51 | elm = find_navy_section(root, 'event ends') 52 | if elm: 53 | data['Conference End Date'] = dateparser.parse(elm.string).date() 54 | 55 | # Find the UTC version of the CFP end date. 56 | elm = root.find(class_='js-closedate') 57 | if not elm: 58 | raise ValueError(f'js-closedate not found in {url}') 59 | utc_cfp_end_date = dateparser.parse(elm['data-date']).replace(tzinfo=None) 60 | data['CFP End Date'] = utc_cfp_end_date 61 | 62 | elm = find_navy_section(root, 'CfS closes at') 63 | if not elm: 64 | raise ValueError(f'CfS closes at not found in {url}') 65 | time = elm.parent.select('.text-navy')[0].string[13:] 66 | parsed = dateparser.parse(f'{elm.string} {time}') 67 | utc_offset = parsed - utc_cfp_end_date 68 | 69 | 70 | elm = find_navy_section(root, 'CfS opens at') 71 | if elm: 72 | time = elm.parent.select('.text-navy')[0].string[13:] 73 | date = elm.string 74 | parsed = dateparser.parse(f'{date} {time}') 75 | data['CFP Start Date'] = (parsed - utc_offset).date() 76 | 77 | return data 78 | 79 | 80 | def find_events(): 81 | seen_urls = set() 82 | for url in twitter_utils.search_for_url('sessionize.com'): 83 | # Strip the query params and lowercase it.
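# e.g. a hypothetical 'https://sessionize.com/my-event/?s=twitter' would become 'https://sessionize.com/my-event'.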
84 | clean_url = url.split('?')[0].lower().rstrip('/') 85 | if clean_url in seen_urls: 86 | continue 87 | if '/api/' in clean_url: 88 | continue 89 | evt = parse_event(clean_url) 90 | if evt is not None: 91 | yield evt 92 | seen_urls.add(clean_url) 93 | 94 | 95 | def scrape(): 96 | yield from find_events() 97 | 98 | 99 | if __name__ == '__main__': 100 | for d in find_events(): 101 | print(d) 102 | --------------------------------------------------------------------------------
/devopsdays.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import dateparser 4 | import requests 5 | from bs4 import BeautifulSoup 6 | 7 | def get(url): 8 | res = requests.get(url) 9 | return BeautifulSoup(res.text, 'html.parser') 10 | 11 | 12 | def parse_events(): 13 | root = get('https://www.devopsdays.org/events/') 14 | for elm in root.select('.col-md-12 .row')[1].find_all('a'): 15 | yield elm['href'] 16 | 17 | 18 | def parse_open_cfps(): 19 | root = get('https://www.devopsdays.org/speaking/') 20 | for row in root.select('table.sortable tbody tr'): 21 | yield { 22 | 'Location': row.find('a').string, 23 | 'Conference URL': 'https://www.devopsdays.org' + row.find('a')['href'], 24 | 'CFP End Date': dateparser.parse(row.find_all('td')[1].string.strip()), 25 | 'Conference Start Date': dateparser.parse(row.find_all('td')[2].string.strip()).date(), 26 | } 27 | 28 | 29 | def parse_event(url): 30 | root = get(url+'welcome/') 31 | 32 | cfp_nav = None 33 | for nav in root.select('.nav-link'): 34 | nav_text = str(nav.string).lower() 35 | if 'propose' in nav_text or 'cfp' in nav_text: 36 | cfp_nav = nav 37 | break 38 | if cfp_nav is None: 39 | propose_elm = root.find('strong', string='Propose') 40 | if propose_elm: 41 | cfp_nav = propose_elm.parent.next_sibling.find('a') 42 | if cfp_nav is None: 43 | return None 44 | cfp_url = cfp_nav['href'] 45 | if cfp_url.startswith('/'): 46 | cfp_url = f'https://www.devopsdays.org{cfp_url}' 47 | 48 | 49 | dates_elm = root.find('strong', string='Dates') 50 | if dates_elm: 51 | dates = dates_elm.parent.next_sibling.string.split('-') 52 | event_end = dateparser.parse(dates[-1]).date() 53 | else: 54 | dates = root.select('.welcome-page-date')[0].contents[0] 55 | # Looks like "April 9 - 10, 2019" 56 | md = re.match(r'^(\S+) ([ 0-9-]+), (\d+)$', dates) 57 | if md: 58 | month, days, year = md.group(1, 2, 3) 59 | if '-' in days: 60 | start_day, end_day = days.split('-') 61 | else: 62 | start_day = end_day = days 63 | event_end = dateparser.parse(f'{month} {end_day}, {year}').date() 64 | if int(start_day) > int(end_day): 65 | event_end = event_end.replace(year=event_end.year + event_end.month // 12, month=event_end.month % 12 + 1)  # the range crosses a month boundary; December rolls into January 66 | else: 67 | raise ValueError(f'Unable to find end date in {url}') 68 | 69 | name_elm = root.select('.welcome-page') 70 | if not name_elm: 71 | name_elm = root.select('title') 72 | name_parts = name_elm[0].string.split() 73 | name_parts[0] = name_parts[0].capitalize() 74 | name = ' '.join(name_parts) 75 | 76 | return { 77 | 'Conference Name': name, 78 | 'CFP URL': cfp_url, 79 | 'Conference End Date': event_end, 80 | 'Tags': ['devops', 'devopsdays'], 81 | } 82 | 83 | 84 | def scrape(): 85 | for data in parse_open_cfps(): 86 | evt_data = parse_event(data['Conference URL']) 87 | if evt_data is None: 88 | continue 89 | data.update(evt_data) 90 | # Papercall is already handled.
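# The papercall scraper already collects those events with fuller data, so drop them here to avoid duplicates.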
91 | if 'papercall.io' in data['CFP URL']: 92 | continue 93 | yield data 94 | 95 | if __name__ == '__main__': 96 | # print(parse_event('https://www.devopsdays.org/events/2019-indianapolis/')) 97 | # for d in parse_open_cfps(): 98 | # print(d) 99 | for d in scrape(): 100 | print(d) 101 | --------------------------------------------------------------------------------
/linux_foundation.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urljoin 3 | 4 | import dateparser 5 | import pytz 6 | import requests 7 | from bs4 import BeautifulSoup 8 | 9 | def get(url): 10 | res = requests.get(url) 11 | return BeautifulSoup(res.text, 'html.parser') 12 | 13 | 14 | def parse_date(raw_date): 15 | md = re.search(r'^(\w+) (\d+), (\d+)$', raw_date) 16 | if md: 17 | d = dateparser.parse(raw_date).date() 18 | return (d, d) 19 | md = re.search(r'^(\w+) (\d+) - (\d+), (\d+)$', raw_date) 20 | if md: 21 | return ( 22 | dateparser.parse(f'{md.group(1)} {md.group(2)}, {md.group(4)}').date(), 23 | dateparser.parse(f'{md.group(1)} {md.group(3)}, {md.group(4)}').date(), 24 | ) 25 | md = re.search(r'^(\w+) (\d+) - (\w+) (\d+), (\d+)$', raw_date) 26 | if md: 27 | return ( 28 | dateparser.parse(f'{md.group(1)} {md.group(2)}, {md.group(5)}').date(), 29 | dateparser.parse(f'{md.group(3)} {md.group(4)}, {md.group(5)}').date(), 30 | ) 31 | raise ValueError(f'Unable to parse {raw_date}') 32 | 33 | 34 | def parse_events_page(): 35 | root = get('https://events.linuxfoundation.org/') 36 | 37 | for elm in root.select('.single-event-wrap'): 38 | raw_date, location = [e.string for e in elm.find_all('h3')] 39 | start_date, end_date = parse_date(raw_date) 40 | yield { 41 | 'Conference URL': elm.find('span', string=re.compile(r'(?i:(learn more)|(view the website))')).parent['href'], 42 | 'Conference Start Date': start_date, 43 | 'Conference End Date': end_date, 44 | 'Location': location, 45 | } 46 | 47 | 48 | def fetch_smapply_json(): 49 | has_next = True 50 | page = 1 51 | # Cap at ten pages as a safety limit in case the has_next flag never clears. 52 | while has_next and page < 10: 53 | data = requests.get(f'https://linuxfoundation.smapply.io/prog/ds/?page={page}&base_query=all').json() 54 | has_next = data['has_next'] 55 | page += 1 56 | yield from data['results'] 57 | 58 | 59 | def parse_smapply_json(): 60 | for data in fetch_smapply_json(): 61 | if not data['startdate']: 62 | # Malformed data.
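# Some SMApply listings ship without a startdate; there is nothing useful to sync for those.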
63 | continue 64 | yield { 65 | 'Conference Name': data['name'], 66 | 'CFP Start Date': dateparser.parse(data['startdate']).astimezone(pytz.utc).date(), 67 | 'CFP End Date': dateparser.parse(data['deadline']).astimezone(pytz.utc), 68 | 'CFP URL': 'https://linuxfoundation.smapply.io{}'.format(data['listing_url']), 69 | } 70 | 71 | 72 | def possible_cfp_links(evt): 73 | evt_page = get(evt['Conference URL']) 74 | for elm in evt_page.find_all('a'): 75 | if elm.has_attr('href') and ('cfp' in elm['href'] or 'program' in elm['href']): 76 | yield urljoin(evt['Conference URL'], elm['href']) 77 | 78 | 79 | def correlate_event(evt, json_data): 80 | for url in possible_cfp_links(evt): 81 | page = requests.get(url).text 82 | for d in json_data: 83 | if d['CFP URL'].rstrip('/') in page: 84 | out = {} 85 | out.update(evt) 86 | out.update(d) 87 | return out 88 | 89 | 90 | def scrape(): 91 | smapply_json = list(parse_smapply_json()) 92 | 93 | for evt in parse_events_page(): 94 | out = correlate_event(evt, smapply_json) 95 | if out is not None: 96 | yield out 97 | 98 | 99 | if __name__ == '__main__': 100 | for d in scrape(): 101 | # print(d) 102 | pass 103 | --------------------------------------------------------------------------------
/site/index.html: -------------------------------------------------------------------------------- [static page; markup elided. The surviving text is the page title "CFP Calendar" and an "iCal link" in the body.] --------------------------------------------------------------------------------
/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | import airtable 5 | import dateparser 6 | 7 | 8 | class AirtableModel(dict): 9 | class AirtableProperty: 10 | def __get__(_self, _instance, owner): 11 | if not hasattr(owner, '_db'): 12 | if not owner.table_name: 13 | raise ValueError(f'{owner} does not define table_name') 14 | owner._db = airtable.Airtable(os.environ['AIRTABLE_BASE_KEY'], owner.table_name) 15 | return owner._db 16 | 17 | table_name = None 18 | db = AirtableProperty() 19 | 20 | def __init__(self, airtable_id=None, **fields): 21 | self.airtable_id = airtable_id 22 | super().__init__(fields) 23 | 24 | @classmethod 25 | def fetch(cls, **query): 26 | if len(query) != 1: 27 | raise ValueError(f'Invalid fetch query: {query}') 28 | key, value = list(query.items())[0] 29 | key = key.replace('_', ' ') 30 | record = cls.db.match(key, value) 31 | return cls(airtable_id=record.get('id'), **record.get('fields', {})) 32 | 33 | @classmethod 34 | def fetch_all(cls): 35 | for page in cls.db.get_iter(): 36 | for record in page: 37 | yield cls(airtable_id=record.get('id'), **record.get('fields', {})) 38 | 39 | def save(self): 40 | if self.airtable_id: 41 | self.db.update(self.airtable_id, self) 42 | else: 43 | record = self.db.insert(self) 44 | self.airtable_id = record['id'] 45 | 46 | 47 | def datetime_lt(a, b): 48 | if isinstance(a, (str, bytes)): 49 | a = dateparser.parse(a) 50 | if isinstance(b, (str, bytes)): 51 | b = dateparser.parse(b) 52 | return a.replace(tzinfo=None) < b.replace(tzinfo=None) 53 | 54 | 55 | class Conference(AirtableModel): 56 | table_name = 'Conferences' 57 | 58 | def __str__(self): 59 | label = self.get('Conference Name') 60 | if not label: 61 | label = self['CFP URL'] 62 | return f'Conference: {label}' 63 | 64 | def save(self): 65 | # If we didn't get a CFP Start Date, backfill one.
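# If the CFP has already closed, reuse its end date so the range stays sensible; otherwise fall back to today.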
66 | if 'CFP Start Date' not in self: 67 | if self.get('CFP End Date') and datetime_lt(self['CFP End Date'], datetime.now()): 68 | d = self['CFP End Date'] 69 | if isinstance(d, (str, bytes)): 70 | d = str(dateparser.parse(d).date()) 71 | self['CFP Start Date'] = d 72 | else: 73 | self['CFP Start Date'] = str(datetime.utcnow().date()) 74 | 75 | # Clear computed fields. 76 | end_date_only = self.pop('CFP End Date (Only)', None) 77 | 78 | # Handle the tags value. 79 | tags = self.pop('Tags', []) 80 | try: 81 | super().save() 82 | finally: 83 | # Restore them after the save. 84 | self['Tags'] = tags 85 | self['CFP End Date (Only)'] = end_date_only 86 | # Update any new tags. 87 | for t in tags: 88 | tag = Tag.fetch(Tag=t) 89 | if self.airtable_id not in tag.get('Conference', []): 90 | tag['Tag'] = t 91 | tag.setdefault('Conference', []) 92 | tag['Conference'].append(self.airtable_id) 93 | tag.save() 94 | # Remove any old tags. 95 | for t in self.db.get(self.airtable_id)['fields'].get('Tags', []): 96 | if t not in tags: 97 | tag = Tag.fetch(Tag=t) 98 | if self.airtable_id in tag.get('Conference', []): 99 | tag['Conference'].remove(self.airtable_id) 100 | tag.save() 101 | 102 | 103 | class Tag(AirtableModel): 104 | table_name = 'Conference Tags' 105 | --------------------------------------------------------------------------------
/main.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import pytz 4 | from datetime import date, datetime, timedelta 5 | 6 | import devopsdays 7 | import papercall 8 | import models 9 | import sessionize 10 | import linux_foundation 11 | import seecfp 12 | import lwn 13 | import mozilla_calendar 14 | 15 | def scrape_all(): 16 | print('Scraping Papercall') 17 | yield from papercall.scrape() 18 | print('Scraping Devopsdays') 19 | yield from devopsdays.scrape() 20 | print('Scraping Sessionize') 21 | yield from sessionize.scrape() 22 | print('Scraping Linux Foundation') 23 | yield from linux_foundation.scrape() 24 | print('Scraping SeeCFP') 25 | yield from seecfp.scrape() 26 | print('Scraping LWN CFP Calendar') 27 | yield from lwn.scrape() 28 | print('Scraping Mozilla Calendar') 29 | yield from mozilla_calendar.scrape() 30 | 31 | 32 | def sync_record(existing, fields): 33 | # Convert any needed fields: 34 | for key, value in fields.items(): 35 | if isinstance(value, datetime): 36 | if value.tzinfo: 37 | value = pytz.UTC.normalize(value).replace(tzinfo=None) 38 | fields[key] = value.replace(microsecond=0).isoformat() + '.000Z' 39 | elif isinstance(value, date): 40 | fields[key] = value.isoformat() 41 | if not fields.get('Conference Start Date'): 42 | fields.pop('Conference Start Date', None) 43 | if not fields.get('Conference End Date'): 44 | fields.pop('Conference End Date', None) 45 | if not fields.get('Tags'): 46 | fields.pop('Tags', None) 47 | 48 | # No existing version, create it. 49 | if existing is None: 50 | conf = models.Conference(**fields) 51 | print(f'Creating {conf}') 52 | conf.save() 53 | return conf 54 | else: 55 | # Check if a save is needed. 56 | do_update = False 57 | for key, value in fields.items(): 58 | existing_value = existing.get(key) 59 | # Special case for tags: they need to be sorted before comparing. 60 | if key == 'Tags' and value and existing_value: 61 | if sorted(value) != sorted(existing_value): 62 | print('{} {} {}'.format(key, repr(value), repr(existing_value))) 63 | do_update = True 64 | break 65 | else: 66 | continue 67 | 68 | # Special case: None and '' count as equal.
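# Airtable omits empty fields from records entirely, so a scraped '' against a missing value is not a real change.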
69 | if value == '' and existing_value is None: 70 | continue 71 | 72 | if value != existing_value: 73 | print('Field changed {}: was {} now {}'.format(key, repr(existing_value), repr(value))) 74 | do_update = True 75 | break 76 | if do_update: 77 | print(f'Updating {existing}') 78 | existing.update(fields) 79 | existing.save() 80 | elif os.environ.get('CI'): 81 | print(f'Scraped {existing}') 82 | return existing 83 | 84 | 85 | def sync_all(): 86 | # Fetch all the conferences into a local cache. 87 | conferences = {} 88 | for conf in models.Conference.fetch_all(): 89 | conferences[conf['CFP URL']] = conf 90 | 91 | # Run the scrapes and syncs. 92 | for fields in scrape_all(): 93 | # Try to filter out meetups. 94 | if 'meetup' in fields.get('Conference Name', '').lower() or 'meetup' in fields.get('Conference URL', '').lower(): 95 | continue 96 | if fields.get('Conference Start Date') and fields.get('Conference End Date') and fields['Conference End Date'] - fields['Conference Start Date'] > timedelta(days=14): 97 | continue 98 | 99 | conf = sync_record(conferences.get(fields['CFP URL']), fields) 100 | conferences[conf['CFP URL']] = conf 101 | 102 | 103 | def main(): 104 | sync_all() 105 | 106 | 107 | if __name__ == '__main__': 108 | main() 109 | --------------------------------------------------------------------------------
/mozilla_calendar.py: -------------------------------------------------------------------------------- 1 | # https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics 2 | import re 3 | from datetime import datetime 4 | 5 | import dateparser 6 | import requests 7 | import ics 8 | from urlextract import URLExtract 9 | 10 | import sessionize 11 | 12 | FLAG_A = ord('🇦') 13 | FLAG_Z = FLAG_A + 25  # regional indicator symbols run A through Z, 26 code points 14 | FLAG_OFFSET = FLAG_A - ord('A') 15 | URL_EXTRACTOR = URLExtract() 16 | 17 | 18 | def fetch_cal(): 19 | url = 'https://calendar.google.com/calendar/ical/mozilla.com_tptb36ac7eijerilfnf6c1onfo%40group.calendar.google.com/public/basic.ics' 20 | return ics.Calendar(requests.get(url).text) 21 | 22 | 23 | def convert_flags(s): 24 | ords = [ord(c) for c in s] 25 | return ''.join(chr(c - FLAG_OFFSET) if FLAG_A <= c <= FLAG_Z else chr(c) for c in ords) 26 | 27 | 28 | def parse_event_url(evt): 29 | links = URL_EXTRACTOR.find_urls(evt.description) 30 | if links: 31 | return links[0] 32 | 33 | 34 | def parse_date(raw_date, relative_to): 35 | s = {'PREFER_DATES_FROM': 'future', 'RELATIVE_BASE': relative_to.replace(tzinfo=None)} 36 | 37 | md = re.search(r'^(\w+) (\d+)\s*-\s*(\w+) (\d+)(.*)$', raw_date) 38 | if md: 39 | return ( 40 | dateparser.parse(f'{md.group(1)} {md.group(2)}', settings=s).date(), 41 | dateparser.parse(f'{md.group(3)} {md.group(4)}', settings=s).date(), 42 | md.group(5), 43 | ) 44 | md = re.search(r'^(\w+) (\d+)\s*-\s*(\d+)(.*)$', raw_date) 45 | if md: 46 | return ( 47 | dateparser.parse(f'{md.group(1)} {md.group(2)}', settings=s).date(), 48 | dateparser.parse(f'{md.group(1)} {md.group(3)}', settings=s).date(), 49 | md.group(4), 50 | ) 51 | md = re.search(r'^(\w+) (\d+)(.*)$', raw_date) 52 | if md: 53 | d = dateparser.parse(f'{md.group(1)} {md.group(2)}', settings=s).date() 54 | return (d, d, md.group(3)) 55 | return (None, None, raw_date) 56 | 57 | 58 | def parse_event_name(label, relative_to): 59 | label = convert_flags(label) 60 | md = re.search(r'^(.*) \((.*?)\)$', label) 61 | if not md: 62 | return { 63 | 'Conference Name': label.strip(), 64 | } 65 | name, dates_and_location = md.group(1, 2) 66 | # Try
to filter out the word CFP. 67 | name = name.replace('CFP', '') 68 | name = re.sub(r'(^| ):( |$)', ' ', name) 69 | name = re.sub(r'\s+', ' ', name).strip() 70 | # Parse dates. 71 | start_date, end_date, location = parse_date(dates_and_location, relative_to) 72 | # Clean up the location. 73 | location = location.lstrip(',').strip() 74 | evt = { 75 | 'Conference Name': name, 76 | } 77 | if start_date: 78 | evt['Conference Start Date'] = start_date 79 | if end_date: 80 | evt['Conference End Date'] = end_date 81 | if location: 82 | evt['Location'] = location 83 | return evt 84 | 85 | 86 | def parse_events(cal): 87 | # Skip anything that closed more than a year ago. 88 | now = datetime.utcnow() 89 | cutoff = now.replace(year=now.year-1, tzinfo=None) 90 | 91 | for evt in cal.events: 92 | if evt.begin.datetime.replace(tzinfo=None) < cutoff: 93 | continue 94 | data = parse_event_name(evt.name, evt.begin.datetime) 95 | if evt.location: 96 | data['Location'] = evt.location 97 | data['CFP End Date'] = evt.begin.datetime.replace(tzinfo=None) 98 | url = parse_event_url(evt) 99 | if url: 100 | data['Conference URL'] = data['CFP URL'] = url 101 | yield data 102 | 103 | 104 | def scrape(): 105 | for evt in parse_events(fetch_cal()): 106 | if evt is None or 'CFP URL' not in evt: 107 | continue 108 | if 'papercall.io' in evt['CFP URL']: 109 | continue 110 | if 'sessionize.com' in evt['CFP URL']: 111 | s = sessionize.parse_event(evt['CFP URL']) 112 | if s: 113 | evt.update(s) 114 | yield evt 115 | 116 | 117 | if __name__ == '__main__': 118 | for e in scrape(): 119 | print(e) 120 | -------------------------------------------------------------------------------- /seecfp.py: -------------------------------------------------------------------------------- 1 | import dateparser 2 | import pytz 3 | import requests 4 | 5 | import sessionize 6 | 7 | URL = ''' 8 | https://airtable.com/v0.3/view/viw1YoXQzG3f7Ty7D/readSharedViewData?stringifiedObjectParams=%7B%7D&requestId=reqcNhwt4DFJWjr0u&accessPolicy=%7B%22allowedAction 9 | s%22%3A%5B%7B%22modelClassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viw1YoXQzG3f7Ty7D%22%2C%22action%22%3A%22readSharedViewData%22%7D%2C%7B%22modelClas 10 | sName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viw1YoXQzG3f7Ty7D%22%2C%22action%22%3A%22getMetadataForPrinting%22%7D%2C%7B%22modelClassName%22%3A%22row%22% 11 | 2C%22modelIdSelector%22%3A%22rows+*%5BdisplayedInView%3Dviw1YoXQzG3f7Ty7D%5D%22%2C%22action%22%3A%22createBoxDocumentSession%22%7D%2C%7B%22modelClassName%22%3A 12 | %22row%22%2C%22modelIdSelector%22%3A%22rows+*%5BdisplayedInView%3Dviw1YoXQzG3f7Ty7D%5D%22%2C%22action%22%3A%22createDocumentPreviewSession%22%7D%2C%7B%22modelC 13 | lassName%22%3A%22view%22%2C%22modelIdSelector%22%3A%22viw1YoXQzG3f7Ty7D%22%2C%22action%22%3A%22downloadCsv%22%7D%2C%7B%22modelClassName%22%3A%22view%22%2C%22mo 14 | delIdSelector%22%3A%22viw1YoXQzG3f7Ty7D%22%2C%22action%22%3A%22downloadICal%22%7D%2C%7B%22modelClassName%22%3A%22row%22%2C%22modelIdSelector%22%3A%22rows+*%5Bd 15 | isplayedInView%3Dviw1YoXQzG3f7Ty7D%5D%22%2C%22action%22%3A%22downloadAttachment%22%7D%5D%2C%22shareId%22%3A%22shrBMFY4CSpSRGmAs%22%2C%22applicationId%22%3A%22a 16 | ppl4CwxGoKNDk2ek%22%2C%22sessionId%22%3A%22sestt1hvhA5QXmrdz%22%2C%22generationNumber%22%3A0%2C%22signature%22%3A%22562e2ea38b121c78fada55b507c41695cb9991bfe74 17 | e1231c9be2406c3e589ee%22%7D 18 | '''.replace('\n', '') 19 | HEADERS = {'x-airtable-application-id': 'appl4CwxGoKNDk2ek', 'X-Requested-With': 'XMLHttpRequest', 'x-time-zone': 'UTC', 'x-user-locale': 
'en'} 20 | 21 | def get_data(): 22 | r = requests.get(URL, headers=HEADERS) 23 | if r.status_code != 200: 24 | raise requests.HTTPError(f'Error retrieving Airtable data {r.status_code}: {r.text}') 25 | return r.json()['data'] 26 | 27 | 28 | def convert_columns(data): 29 | col_map = {} 30 | for d in data['columns']: 31 | col_map[d['id']] = d['name'] 32 | 33 | for d in data['rows']: 34 | row_data = {} 35 | for k, v in d['cellValuesByColumnId'].items(): 36 | k = col_map[k] 37 | if k == 'Country': 38 | v = v[0]['foreignRowDisplayName'] 39 | row_data[k] = v 40 | yield row_data 41 | 42 | def format_data(row): 43 | # {'Link to the call for paper': 'http://www.jbcnconf.com/2019/', 'Submission Deadline': '2019-04-01T00:00:00.000Z', 'Country': [{'foreignRowId': 'recIhNTuv0pD1lSSN', 'foreignRowDisplayName': 'Spain'}], 'City': 'Barcelona', 'Conference Start': '2019-05-27T00:00:00.000Z', 'Conference End': '2019-05-29T00:00:00.000Z', 'Name': 'JBCNConf-2019', 'Continent': {'valuesByForeignRowId': {'recIhNTuv0pD1lSSN': ['Europe']}, 'foreignRowIdOrder': ['recIhNTuv0pD1lSSN']}, 'Days left': 69, 'When': 'May'} 44 | location = row['Country'] 45 | if 'City' in row: 46 | location = '{}, {}'.format(row['City'], row['Country']) 47 | return { 48 | 'CFP URL': row['Link to the call for paper'], 49 | 'Conference URL': row['Link to the call for paper'], # Only one link in the data, so use it for both. 50 | 'CFP End Date': dateparser.parse(row['Submission Deadline']).astimezone(pytz.utc).replace(tzinfo=None), 51 | 'Location': location, 52 | 'Conference Start Date': dateparser.parse(row['Conference Start']).date(), 53 | 'Conference End Date': dateparser.parse(row['Conference End']).date(), 54 | 'Conference Name': row['Name'] 55 | } 56 | 57 | 58 | def scrape(): 59 | for raw_row in convert_columns(get_data()): 60 | row = format_data(raw_row) 61 | if row is None: 62 | continue 63 | if 'papercall.io' in row['CFP URL']: 64 | continue 65 | if 'sessionize.com' in row['CFP URL']: 66 | s = sessionize.parse_event(row['CFP URL']) 67 | if s: 68 | row.update(s) 69 | yield row 70 | 71 | 72 | if __name__ == '__main__': 73 | for row in scrape(): 74 | print(row) 75 | 76 | 77 | # 'Country': [{'foreignRowId': 'recOUw0MItMcZxNQe', 'foreignRowDisplayName': 'Slovakia'}] 78 | --------------------------------------------------------------------------------
/papercall.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | import dateparser 4 | import dateparser.search 5 | import requests 6 | from bs4 import BeautifulSoup 7 | 8 | URL = 'https://www.papercall.io/events?open-cfps=true&page={page}' 9 | 10 | def get(page): 11 | res = requests.get(URL.format(page=page)) 12 | return BeautifulSoup(res.text, 'html.parser') 13 | 14 | 15 | def maybe_int(s): 16 | try: 17 | return int(s) 18 | except (TypeError, ValueError):  # elm.string can be None 19 | return 0 20 | 21 | 22 | def num_pages(): 23 | pagination = get(1).find(class_='pagination') 24 | return max(maybe_int(elm.string) for elm in pagination.find_all('a')) 25 | 26 | 27 |
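# For reference, each dict yielded by parse_page() below looks roughly like (values hypothetical): {'Conference Name': 'Ignite Black Tech Conference', 'Location': 'Atlanta, GA', 'CFP URL': 'https://www.papercall.io/ibtechcon2019', 'CFP End Date': datetime(2019, 1, 27, 23, 1), 'Tags': ['Blockchain', 'Fintech'], ...}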
28 | # Sample event block from the listing page (markup elided; only its text content follows): 29 | #   Ignite Black Tech Conference - iBTechCon2019 - Atlanta, GA 30 | #   Event Dates: February 16, 2019, February 16, 2019 31 | #   CFP closes at January 27, 2019 23:01 UTC (Local) 32 | #   Tags: Blockchain, Fintech, Healthcare it, Energy, Entertainment, Artificial intelligence, Virtual reality, Gaming, Cryptocurrency. cloud services, Cybersecurity, Machine learning, Big data, Data analytics, Erp management, Intermediate coding, Advanced coding, Biotech, Automation, Mechanical engineering, Advanced robotics, Unmanned systems demo, Saas, Cleantech, System engineering, Industry insight 82 | 83 |
def parse_page(root): 84 | for event in root.select('.event-list-detail'): 85 | title_line = event.select('.event__title a')[-1] 86 | title_parts = title_line.string.split(' - ', 1) 87 | if len(title_parts) == 1: 88 | title = title_parts[0] 89 | location = '' 90 | else: 91 | title = title_parts[0] 92 | location = title_parts[1] 93 | try: 94 | url = event.select('.fa-external-link')[0]['title'] 95 | except IndexError: 96 | url = '' 97 | cfp_close_label = event.find(lambda elm: elm.name == 'strong' and elm.string and 'CFP closes at' in elm.string) 98 | if not cfp_close_label: 99 | # Without a close date there is nothing to track; skip it. 100 | continue 101 | cfp_close = dateparser.parse(cfp_close_label.parent.find_next_sibling('td').string.strip()) 102 | start_date = end_date = None 103 | dates = event.find(lambda elm: elm.name == 'strong' and elm.string and 'Event Dates' in elm.string) 104 | if dates: 105 | dates = dates.next_sibling.string.strip() 106 | if dates: 107 | parsed_dates = [d for _, d in dateparser.search.search_dates(dates)] 108 | if parsed_dates: 109 | start_date = parsed_dates[0].date() 110 | end_date = parsed_dates[-1].date() 111 | tags = [t.string for t in event.select('a[href^="/events?keywords=tags"]')] 112 | yield { 113 | 'CFP URL': title_line['href'], 114 | 'Conference Name': title, 115 | 'Location': location, 116 | 'Conference URL': url, 117 | 'Conference Start Date': start_date, 118 | 'Conference End Date': end_date, 119 | 'CFP End Date': cfp_close, 120 | 'Tags': tags, 121 | } 122 | 123 | 124 | def parse_all(): 125 | count = num_pages() 126 | for n in range(count): 127 | yield from parse_page(get(n+1)) 128 | 129 | 130 | def format_all(out): 131 | writer = csv.DictWriter(out, dialect='excel-tab', fieldnames=[ 132 | 'title', 'url', 'location', 'start_date', 'end_date', 133 | 'cfp_open', 'cfp_close', 'cfp_url', 'tags', 134 | ]) 135 | writer.writeheader() 136 | for event in parse_all(): 137 | writer.writerow({ 138 | 'title': event['Conference Name'], 139 | 'url': event['Conference URL'], 140 | 'location': event['Location'], 141 | 'start_date': event['Conference Start Date'], 142 | 'end_date': event['Conference End Date'], 143 | 'cfp_open': True, 144 | 'cfp_close': event['CFP End Date'], 145 | 'cfp_url': event['CFP URL'], 146 | 'tags': ', '.join(event['Tags']), 147 | }) 148 | 149 | 150 | def scrape(): 151 | yield from parse_all() 152 | 153 | 154 | if __name__ == '__main__': 155 | import pprint 156 | for event in parse_all(): 157 | pprint.pprint(event) 158 | --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License.
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------