├── .gitignore
├── requirements.txt
├── .travis.yml
├── README.md
└── platforms
    ├── ticket-leap
    │   ├── scrape_ticketleap.py
    │   └── json_converter.py
    └── scrape_eventbrite.py

/.gitignore:
--------------------------------------------------------------------------------
.cache/*
platforms/__pycache__/*
node_modules/*
.coverage
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.10.0
beautifulsoup4==4.5.1
lxml
pytest-cov
pytest
coveralls
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.7"
install:
  - pip install -r requirements.txt
script:
  - py.test platforms/*.py
  - py.test --cov
after_success:
  - coveralls
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Event-Collect

[![Build Status](https://travis-ci.org/fossasia/event-collect.svg?branch=master)](https://travis-ci.org/fossasia/event-collect)

Scrapes event listings from event websites and converts them to the Open Event format.

## EventBrite Scraper
Given a search query, scrapes the EventBrite search results and downloads the data of each event as JSON files using [Loklak's API](https://github.com/loklak/loklak_server/blob/development/docs/parsers.md#event-brite-crawler).

Usage:
```python scrape_eventbrite.py [SEARCH QUERY]```

To install all required Python dependencies:
```pip install -r requirements.txt```

Running that command will install:
```
requests==2.10.0
beautifulsoup4==4.5.1
lxml
```
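
For illustration (the query and the event folder name below are made up), a run such as `python scrape_eventbrite.py concert` creates one folder per event found, named after the event's URL slug, each holding the Open Event JSON files; the TicketLeap scraper under `platforms/ticket-leap/` writes the same set of files per event:

```
concert/
└── example-event-title/
    ├── event.json
    ├── organizers.json
    ├── microlocations.json
    ├── forms.json
    ├── session_types.json
    ├── sessions.json
    ├── sponsors.json
    ├── speakers.json
    └── tracks.json
```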
--------------------------------------------------------------------------------
/platforms/ticket-leap/scrape_ticketleap.py:
--------------------------------------------------------------------------------
import sys
import requests
import os
import json
import json_converter
import unicodedata
from bs4 import BeautifulSoup

files = ['event.json', 'organizers.json', 'microlocations.json', 'forms.json',
         'session_types.json', 'sessions.json', 'sponsors.json', 'speakers.json', 'tracks.json']

# Writes the collected data into the corresponding files
def writeToJson(path, data):
    for index in range(len(files)):
        json_data = json.dumps(data[index], indent=4)
        output_file = open('{}/{}'.format(path, files[index]), 'w')
        output_file.write(json_data)
        output_file.close()

# Returns a BeautifulSoup object containing the search results
def getSoup(scrape_url):
    raw_html = requests.get(scrape_url)
    try:
        raw_html.raise_for_status()
    except Exception as exc:
        print('{} {}'.format('Houston, we have a problem:', exc))
        sys.exit()
    normalized_data = unicodedata.normalize('NFKD', raw_html.text).replace('\n', '')
    normalized_data = normalized_data.replace(u'\u201c', '"').replace(u'\u201d', '"').replace(u'\u2019', "'")
    normalized_data = normalized_data.replace(u'\u2013', '').replace(u'\u2014', ' ')
    return BeautifulSoup(normalized_data, 'lxml')

# Picks the links to each event's description and stores them in an array
def getEventsArray(soup):
    links = soup.select('.event h3 a')
    if len(links) < 1:
        print('Sorry, no events matching your query!')
        sys.exit()
    print('Yay! We found events!')
    events = []
    for event in links:
        events.append(event['href'])
    return events

def getEventsData(query, events):
    for url in events:
        ''' Uses the URL of the event page to get the event title, which will later
        name the folder containing the data
        '''
        start_index = url.index('.com') + 5
        event_title = url[start_index:]
        # Creates a folder for each search result
        path = '{}/{}'.format(query, event_title)
        if not os.path.exists(path):
            os.mkdir(path)
        print('{} {}'.format('Writing event data to', path))
        data = createJson(url, path)
        writeToJson(path, data)

def createJson(url, path):
    collectedData = [{'creator': {'id': '1', 'email': ''}, 'privacy': 'public'}]
    # The event details are spread across several pages, so the data is extracted from all of them
    # Collects data from the main page
    soup = getSoup(url)
    json_converter.addHome(collectedData, soup, url)
    json_converter.addOrganizer(collectedData, soup, url)
    json_converter.addImage(collectedData, soup, path)
    # Collects data from the details page (event description, social links)
    soup = getSoup('{}{}'.format(url, 'details'))
    json_converter.addAbout(collectedData, soup)
    # Collects location data
    soup = getSoup('{}{}'.format(url, 'get-there'))
    json_converter.addLocation(collectedData, soup)
    # Adds the empty fields to match the Open Event format
    json_converter.addEmpty(collectedData)
    return collectedData

def collect(query):
    # There are fewer than 5000 events, so setting the page size to 5000 collects all search results
    scrape_url = '{}{}'.format('https://ticketleap.com/events/?page_size=5000&q=', query)
    # Gets the search page content
    soup = getSoup(scrape_url)
    # Gets the event links
    events = getEventsArray(soup)
    # Gets the event information and writes it into folders to match the Open Event format
    getEventsData(query, events)
    print('Done! :-)')

if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Query words are separated by '+' in the search results URL
        query = '+'.join(sys.argv[1:])
        if not os.path.exists(query):
            os.mkdir(query)
        collect(query)
    else:
        print('Please specify a query: python scrape_ticketleap.py [query]')
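
# Illustrative run (hypothetical query): `python scrape_ticketleap.py jazz festival`
# joins the arguments into the query 'jazz+festival', creates a 'jazz+festival/'
# folder, and fills it with one sub-folder per matching event, each holding the
# nine Open Event JSON files listed in `files` at the top of this module.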
--------------------------------------------------------------------------------
/platforms/scrape_eventbrite.py:
--------------------------------------------------------------------------------
import os, sys, json, requests
from bs4 import BeautifulSoup

LOKLAK_API_ENDPOINT = "http://loklak.org/api/eventbritecrawler.json"
SEARCH_URL = "https://www.eventbrite.com/d/worldwide/%s/?crt=regular&sort=best"


def makeSoup(url):
    '''
    Utility function to create a BeautifulSoup object from the HTML content of the given web page URL.

    :param url: URL of the web page
    '''
    raw_html = requests.get(url).text
    return BeautifulSoup(raw_html, 'lxml')


def clearSoup(soup):
    '''
    Utility function to find the search results div and return it

    :param soup: BeautifulSoup object of the web page
    '''
    results = soup.find('div', class_='js-event-list-container l-mar-stack l-mar-top-2')
    return results


def getEvents(soup):
    '''
    Generate a list of event URLs

    :param soup: BeautifulSoup object of the search result page from eventbrite.com
    '''
    events = []

    # find the results div object
    results = clearSoup(soup)

    # save all event URLs to the events list
    for event in results.findAll('a'):
        href = event['href']
        if href.startswith('https'):
            events.append(href)

    return events


def scrapeEvents(events, query):
    '''
    Get event data using the Loklak API and save each event as JSON files
    in a directory having the same name as the query

    :param events: List of event URLs
    :param query: search query, used as the name of the output directory
    '''

    # creating a folder named after the query (if it doesn't exist)
    if not os.path.exists(query):
        os.mkdir(query)

    # setting the directory name
    dir_name = query

    print("Following events were found:")
    # generating JSON data and saving it for each event
    for url in events:
        json_data = requests.get(LOKLAK_API_ENDPOINT, params={'url': url}).json()['data']
        event_path = dir_name + '/' + getEventTitle(url)
        # creating a folder named after the event
        if not os.path.exists(event_path):
            os.mkdir(event_path)
        print(event_path)
        writeOut(json_data, event_path)


def getEventTitle(event_url):
    '''
    Utility function to generate a title from the event URL

    :param event_url: URL of the event
    '''
    title = event_url.split('/')[-1].split('?')[0].split('-tickets')[0]
    return title


def writeOut(data, event_path):
    '''
    Utility function to write data to the given file path

    :param data: data to be written to the files
    :param event_path: path to the event directory
    '''
    file_names = ['/event.json', '/organizers.json', '/microlocations.json', '/forms.json',
                  '/session_types.json', '/sessions.json', '/sponsors.json', '/speakers.json', '/tracks.json']

    for x in range(len(file_names)):
        with open(event_path + file_names[x], 'w+') as f:
            json.dump(data[x], f, indent=4)


def eventCollector(query):
    '''
    Main function which calls the other functions to collect and save events

    :param query: event query
    '''
    # generate the EventBrite (EB) URL for the given search query
    EB_URL = SEARCH_URL % query
    # create a BeautifulSoup object for the EB results page
    soup = makeSoup(EB_URL)
    # get a list of event URLs
    events = getEvents(soup)
    # get event data and save it as JSON files
    scrapeEvents(events, query)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        query = sys.argv[1]
        eventCollector(query)
    else:
        print("No query specified.")
        print("Usage: python scrape_eventbrite.py [query]")
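
# Example of the Loklak request made in scrapeEvents() (hypothetical event URL):
#   requests.get(LOKLAK_API_ENDPOINT,
#                params={'url': 'https://www.eventbrite.com/e/example-event-tickets-000'})
# The 'data' field of the JSON response is expected to be a list of nine objects,
# which writeOut() saves in order under the file names listed there.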
--------------------------------------------------------------------------------
/platforms/ticket-leap/json_converter.py:
--------------------------------------------------------------------------------
import requests
import os
import datetime

''' Collects data from the main page. Since some fields may not always exist,
try-except/if-else statements were included in order to prevent errors.
'''
def addHome(data, soup, url):
    data[0]['ticket_url'] = '{}{}'.format(url, 'dates')
    data[0]['name'] = soup.select('h1 span')[0].getText().strip()
    ''' Getting the start and end dates is a bit more complicated and depends
    on the page structure, thus there's a separate function for it
    '''
    getEventDates(data, soup)
    try:
        data[0]['location_name'] = soup.select('.venue span')[1].getText()
    except Exception:
        data[0]['location_name'] = ''
    if len(soup.select('.all-dates-in-past')):
        data[0]['state'] = 'Finished'
    else:
        data[0]['state'] = 'Unfinished'

def getEventDates(data, soup):
    date_info = soup.select('.mobile-only .date-range-date')
    # A single date element means the event takes place over a single day
    if len(date_info) == 1:
        start_date = end_date = date_info[0].getText()
        start_time = soup.select('.date-range-time-start')[0].getText()
        end_time = soup.select('.date-range-time-end')[0].getText()
    else:
        start_date = date_info[0].getText()
        end_date = date_info[1].getText()
        start_time = soup.select('.date-range-start .date-range-time')[0].getText()
        end_time = soup.select('.date-range-end .date-range-time')[0].getText()
    start = '{} {}'.format(start_date, start_time)
    end = '{} {}'.format(end_date, end_time)
    start = datetime.datetime.strptime(start, '%a, %b %d %Y %I:%M %p').strftime('%Y-%m-%dT%H:%M:%S')
    end = datetime.datetime.strptime(end, '%a, %b %d %Y %I:%M %p').strftime('%Y-%m-%dT%H:%M:%S')
    data[0]['start_time'] = start
    data[0]['end_time'] = end
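
# Illustrative example for getEventDates (hypothetical page values): a date of
# 'Sat, Jun 18 2016' with times '10:00 AM' and '6:00 PM' is parsed with
# '%a, %b %d %Y %I:%M %p' and written out as start_time '2016-06-18T10:00:00'
# and end_time '2016-06-18T18:00:00'.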

def addImage(data, soup, path):
    # Creates the 'img' folder
    image_dir = '{}{}/'.format(path, 'img')
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)
    image_file = '{}{}'.format(image_dir, 'background.jpg')
    # The background image URL is stored in a meta tag
    backgroundURL = soup.select('meta')[9]['content']
    # The image is downloaded using the requests library
    response = requests.get(backgroundURL)
    with open(image_file, "wb") as output_file:
        output_file.write(response.content)
    # Its path is added to the JSON structure
    data[0]['background_image'] = 'img/background.jpg'

def addOrganizer(data, soup, url):
    organizer = {}
    organizer['organizer_contact_info'] = '{}{}'.format(url, 'contact')
    end_index = url.index('.com') + 4
    organizer['organizer_link'] = url[:end_index]
    organizer['organizer_profile_link'] = ''
    org_name = soup.select('.top h3')[0].getText()
    # Gets the organizer name from the string 'By org_name (other events)'
    data[0]['organizer_name'] = organizer['organizer_name'] = org_name[3:len(org_name) - 15]
    data.append(organizer)

# Adds the event description and the social media links
def addAbout(data, soup):
    data[0]['description'] = soup.select('.event-description')[0].getText().strip()
    data[0]['social_links'] = []
    social_links = soup.select('.social a')
    if len(social_links) < 1:
        return
    index = 0
    for site in social_links:
        index += 1
        social_media = {}
        social_media['id'] = index
        social_media['name'] = site.select('p')[0].getText()
        ''' The email link is a relative one, so the 'organizer_contact_info' (absolute)
        link is used instead, since they coincide
        '''
        if social_media['name'] == 'Email Us':
            social_media['link'] = data[1]['organizer_contact_info']
        else:
            social_media['link'] = site['href']
        data[0]['social_links'].append(social_media)

# The latitude and longitude of the event location are kept in meta tags
def addLocation(data, soup):
    data[0]['latitude'] = soup.select('meta')[10]['content']
    data[0]['longitude'] = soup.select('meta')[11]['content']

# There is no event data matching the fields below, so they are left empty
def addEmpty(data):
    data[0]['type'] = data[0]['topic'] = ''
    data.append({'microlocations': []})
    data.append({'customForms': []})
    data.append({'sessionTypes': []})
    data.append({'sessions': []})
    data.append({'sponsors': []})
    data.append({'speakers': []})
    data.append({'tracks': []})
--------------------------------------------------------------------------------