├── .gitignore
├── requirements.txt
├── .travis.yml
├── README.md
└── platforms
    ├── ticket-leap
    │   ├── scrape_ticketleap.py
    │   └── json_converter.py
    └── scrape_eventbrite.py

/.gitignore:
--------------------------------------------------------------------------------
.cache/*
platforms/__pycache__/*
node_modules/*
.coverage
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.10.0
beautifulsoup4==4.5.1
lxml
pytest-cov
pytest
coveralls
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.7"
install:
  - pip install -r requirements.txt
script:
  - py.test platforms/*.py
  - py.test --cov
after_success:
  - coveralls
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Event-Collect

[![Build Status](https://travis-ci.org/fossasia/event-collect.svg?branch=master)](https://travis-ci.org/fossasia/event-collect)

Scrapes event listings from event websites and converts them to the Open Event format.

## EventBrite Scraper
Given a search query, scrapes the EventBrite search results and downloads the data of each event as JSON files using [Loklak's API](https://github.com/loklak/loklak_server/blob/development/docs/parsers.md#event-brite-crawler).

Usage:
```python scrape_eventbrite.py [SEARCH QUERY]```

To install all required Python dependencies:
```pip install -r requirements.txt```

Running that command will install:
```
requests==2.10.0
beautifulsoup4==4.5.1
lxml
```
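
For illustration (the query and the event folder name below are made up), a run such as `python scrape_eventbrite.py concert` creates one folder per event found, named after the event's URL slug, each holding the Open Event JSON files; the TicketLeap scraper under `platforms/ticket-leap/` writes the same set of files per event:

```
concert/
└── example-event-title/
    ├── event.json
    ├── organizers.json
    ├── microlocations.json
    ├── forms.json
    ├── session_types.json
    ├── sessions.json
    ├── sponsors.json
    ├── speakers.json
    └── tracks.json
```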
--------------------------------------------------------------------------------
/platforms/ticket-leap/scrape_ticketleap.py:
--------------------------------------------------------------------------------
import sys
import requests
import os
import json
import json_converter
import unicodedata
from bs4 import BeautifulSoup

files = ['event.json', 'organizers.json', 'microlocations.json', 'forms.json',
         'session_types.json', 'sessions.json', 'sponsors.json', 'speakers.json', 'tracks.json']

# Writes the collected data into the corresponding files
def writeToJson(path, data):
    for index in range(len(files)):
        json_data = json.dumps(data[index], indent=4)
        output_file = open('{}/{}'.format(path, files[index]), 'w')
        output_file.write(json_data)
        output_file.close()

# Returns a BeautifulSoup object containing the search results
def getSoup(scrape_url):
    raw_html = requests.get(scrape_url)
    try:
        raw_html.raise_for_status()
    except Exception as exc:
        print('{} {}'.format('Houston, we have a problem:', exc))
        sys.exit()
    normalized_data = unicodedata.normalize('NFKD', raw_html.text).replace('\n', '')
    normalized_data = normalized_data.replace(u'\u201c', '"').replace(u'\u201d', '"').replace(u'\u2019', "'")
    normalized_data = normalized_data.replace(u'\u2013', '').replace(u'\u2014', ' ')
    return BeautifulSoup(normalized_data, 'lxml')

# Picks the links to each event's description and stores them in an array
def getEventsArray(soup):
    links = soup.select('.event h3 a')
    if len(links) < 1:
        print('Sorry, no events matching your query!')
        sys.exit()
    print('Yay! We found events!')
    events = []
    for event in links:
        events.append(event['href'])
    return events

def getEventsData(query, events):
    for url in events:
        ''' Uses the URL of the event page to get the event title, which will later
        name the folder containing the data
        '''
        start_index = url.index('.com') + 5
        event_title = url[start_index:]
        # Creates a folder for each search result
        path = '{}/{}'.format(query, event_title)
        if not os.path.exists(path):
            os.mkdir(path)
        print('{} {}'.format('Writing event data to', path))
        data = createJson(url, path)
        writeToJson(path, data)

def createJson(url, path):
    collectedData = [{'creator': {'id': '1', 'email': ''}, 'privacy': 'public'}]
    # The event details are spread across several pages, so the data is extracted from all of them
    # Collects data from the main page
    soup = getSoup(url)
    json_converter.addHome(collectedData, soup, url)
    json_converter.addOrganizer(collectedData, soup, url)
    json_converter.addImage(collectedData, soup, path)
    # Collects data from the details page (event description, social links)
    soup = getSoup('{}{}'.format(url, 'details'))
    json_converter.addAbout(collectedData, soup)
    # Collects location data
    soup = getSoup('{}{}'.format(url, 'get-there'))
    json_converter.addLocation(collectedData, soup)
    # Adds the empty fields to match the Open Event format
    json_converter.addEmpty(collectedData)
    return collectedData

def collect(query):
    # There are fewer than 5000 events, so setting the page size to 5000 collects all search results
    scrape_url = '{}{}'.format('https://ticketleap.com/events/?page_size=5000&q=', query)
    # Gets the search page content
    soup = getSoup(scrape_url)
    # Gets the event links
    events = getEventsArray(soup)
    # Gets the event information and writes it into folders to match the Open Event format
    getEventsData(query, events)
    print('Done! :-)')

if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Query words are separated by '+' in the search results URL
        query = '+'.join(sys.argv[1:])
        if not os.path.exists(query):
            os.mkdir(query)
        collect(query)
    else:
        print('Please specify a query: python scrape_ticketleap.py [query]')
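
# Illustrative run (hypothetical query): `python scrape_ticketleap.py jazz festival`
# joins the arguments into the query 'jazz+festival', creates a 'jazz+festival/'
# folder, and fills it with one sub-folder per matching event, each holding the
# nine Open Event JSON files listed in `files` at the top of this module.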
--------------------------------------------------------------------------------
/platforms/scrape_eventbrite.py:
--------------------------------------------------------------------------------
import os, sys, json, requests
from bs4 import BeautifulSoup

LOKLAK_API_ENDPOINT = "http://loklak.org/api/eventbritecrawler.json"
SEARCH_URL = "https://www.eventbrite.com/d/worldwide/%s/?crt=regular&sort=best"


def makeSoup(url):
    '''
    Utility function to create a BeautifulSoup object from the HTML content of the given web page URL.

    :param url: URL of the web page
    '''
    raw_html = requests.get(url).text
    return BeautifulSoup(raw_html, 'lxml')


def clearSoup(soup):
    '''
    Utility function to find the search results div and return it

    :param soup: BeautifulSoup object of the web page
    '''
    results = soup.find('div', class_='js-event-list-container l-mar-stack l-mar-top-2')
    return results


def getEvents(soup):
    '''
    Generate a list of event URLs

    :param soup: BeautifulSoup object of the search result page from eventbrite.com
    '''
    events = []

    # find the results div object
    results = clearSoup(soup)

    # save all event URLs to the events list
    for event in results.findAll('a'):
        href = event['href']
        if href.startswith('https'):
            events.append(href)

    return events


def scrapeEvents(events, query):
    '''
    Get event data using the Loklak API and save each event as JSON files
    in a directory having the same name as the query

    :param events: List of event URLs
    :param query: search query, used as the name of the output directory
    '''

    # creating a folder named after the query (if it doesn't exist)
    if not os.path.exists(query):
        os.mkdir(query)

    # setting the directory name
    dir_name = query

    print("Following events were found:")
    # generating JSON data and saving it for each event
    for url in events:
        json_data = requests.get(LOKLAK_API_ENDPOINT, params={'url': url}).json()['data']
        event_path = dir_name + '/' + getEventTitle(url)
        # creating a folder named after the event
        if not os.path.exists(event_path):
            os.mkdir(event_path)
        print(event_path)
        writeOut(json_data, event_path)


def getEventTitle(event_url):
    '''
    Utility function to generate a title from the event URL

    :param event_url: URL of the event
    '''
    title = event_url.split('/')[-1].split('?')[0].split('-tickets')[0]
    return title


def writeOut(data, event_path):
    '''
    Utility function to write data to the given file path

    :param data: data to be written to the files
    :param event_path: path to the event directory
    '''
    file_names = ['/event.json', '/organizers.json', '/microlocations.json', '/forms.json',
                  '/session_types.json', '/sessions.json', '/sponsors.json', '/speakers.json', '/tracks.json']

    for x in range(len(file_names)):
        with open(event_path + file_names[x], 'w+') as f:
            json.dump(data[x], f, indent=4)


def eventCollector(query):
    '''
    Main function which calls the other functions to collect and save events

    :param query: event query
    '''
    # generate the EventBrite (EB) URL for the given search query
    EB_URL = SEARCH_URL % query
    # create a BeautifulSoup object for the EB results page
    soup = makeSoup(EB_URL)
    # get a list of event URLs
    events = getEvents(soup)
    # get event data and save it as JSON files
    scrapeEvents(events, query)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        query = sys.argv[1]
        eventCollector(query)
    else:
        print("No query specified.")
        print("Usage: python scrape_eventbrite.py [query]")
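
# Example of the Loklak request made in scrapeEvents() (hypothetical event URL):
#   requests.get(LOKLAK_API_ENDPOINT,
#                params={'url': 'https://www.eventbrite.com/e/example-event-tickets-000'})
# The 'data' field of the JSON response is expected to be a list of nine objects,
# which writeOut() saves in order under the file names listed there.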
--------------------------------------------------------------------------------
/platforms/ticket-leap/json_converter.py:
--------------------------------------------------------------------------------
import requests
import os
import datetime

''' Collects data from the main page. Since some fields may not always exist,
try-except/if-else statements were included in order to prevent errors.
'''
def addHome(data, soup, url):
    data[0]['ticket_url'] = '{}{}'.format(url, 'dates')
    data[0]['name'] = soup.select('h1 span')[0].getText().strip()
    ''' Getting the start and end dates is a bit more complicated and depends
    on the page structure, thus there's a separate function for it
    '''
    getEventDates(data, soup)
    try:
        data[0]['location_name'] = soup.select('.venue span')[1].getText()
    except Exception:
        data[0]['location_name'] = ''
    if len(soup.select('.all-dates-in-past')):
        data[0]['state'] = 'Finished'
    else:
        data[0]['state'] = 'Unfinished'

def getEventDates(data, soup):
    date_info = soup.select('.mobile-only .date-range-date')
    # A single date element means the event takes place over a single day
    if len(date_info) == 1:
        start_date = end_date = date_info[0].getText()
        start_time = soup.select('.date-range-time-start')[0].getText()
        end_time = soup.select('.date-range-time-end')[0].getText()
    else:
        start_date = date_info[0].getText()
        end_date = date_info[1].getText()
        start_time = soup.select('.date-range-start .date-range-time')[0].getText()
        end_time = soup.select('.date-range-end .date-range-time')[0].getText()
    start = '{} {}'.format(start_date, start_time)
    end = '{} {}'.format(end_date, end_time)
    start = datetime.datetime.strptime(start, '%a, %b %d %Y %I:%M %p').strftime('%Y-%m-%dT%H:%M:%S')
    end = datetime.datetime.strptime(end, '%a, %b %d %Y %I:%M %p').strftime('%Y-%m-%dT%H:%M:%S')
    data[0]['start_time'] = start
    data[0]['end_time'] = end
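
# Illustrative example for getEventDates (hypothetical page values): a date of
# 'Sat, Jun 18 2016' with times '10:00 AM' and '6:00 PM' is parsed with
# '%a, %b %d %Y %I:%M %p' and written out as start_time '2016-06-18T10:00:00'
# and end_time '2016-06-18T18:00:00'.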

def addImage(data, soup, path):
    # Creates the 'img' folder
    image_dir = '{}{}/'.format(path, 'img')
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)
    image_file = '{}{}'.format(image_dir, 'background.jpg')
    # The background image URL is stored in a meta tag
    backgroundURL = soup.select('meta')[9]['content']
    # The image is downloaded using the requests library
    response = requests.get(backgroundURL)
    with open(image_file, "wb") as output_file:
        output_file.write(response.content)
    # Its path is added to the JSON structure
    data[0]['background_image'] = 'img/background.jpg'

def addOrganizer(data, soup, url):
    organizer = {}
    organizer['organizer_contact_info'] = '{}{}'.format(url, 'contact')
    end_index = url.index('.com') + 4
    organizer['organizer_link'] = url[:end_index]
    organizer['organizer_profile_link'] = ''
    org_name = soup.select('.top h3')[0].getText()
    # Gets the organizer name from the string 'By org_name (other events)'
    data[0]['organizer_name'] = organizer['organizer_name'] = org_name[3:len(org_name) - 15]
    data.append(organizer)

# Adds the event description and the social media links
def addAbout(data, soup):
    data[0]['description'] = soup.select('.event-description')[0].getText().strip()
    data[0]['social_links'] = []
    social_links = soup.select('.social a')
    if len(social_links) < 1:
        return
    index = 0
    for site in social_links:
        index += 1
        social_media = {}
        social_media['id'] = index
        social_media['name'] = site.select('p')[0].getText()
        ''' The email link is a relative one, so the 'organizer_contact_info' (absolute)
        link is used instead, since they coincide
        '''
        if social_media['name'] == 'Email Us':
            social_media['link'] = data[1]['organizer_contact_info']
        else:
            social_media['link'] = site['href']
        data[0]['social_links'].append(social_media)

# The latitude and longitude of the event location are kept in meta tags
def addLocation(data, soup):
    data[0]['latitude'] = soup.select('meta')[10]['content']
    data[0]['longitude'] = soup.select('meta')[11]['content']

# There is no event data matching the fields below, so they are left empty
def addEmpty(data):
    data[0]['type'] = data[0]['topic'] = ''
    data.append({'microlocations': []})
    data.append({'customForms': []})
    data.append({'sessionTypes': []})
    data.append({'sessions': []})
    data.append({'sponsors': []})
    data.append({'speakers': []})
    data.append({'tracks': []})
--------------------------------------------------------------------------------