├── bin
│   └── .gitkeep
├── results
│   └── .gitkeep
├── src
│   └── trouver_une_fresque_scraper
│       ├── __init__.py
│       ├── db
│       │   ├── __init__.py
│       │   ├── records.py
│       │   ├── main.py
│       │   └── etl.py
│       ├── apis
│       │   ├── __init__.py
│       │   ├── main.py
│       │   ├── ics_test.py
│       │   ├── mobilite.py
│       │   ├── glorieuses.py
│       │   └── ics.py
│       ├── scraper
│       │   ├── __init__.py
│       │   ├── main.py
│       │   ├── helloasso.py
│       │   ├── fec.py
│       │   ├── fdc.py
│       │   ├── glide.py
│       │   ├── billetweb.py
│       │   └── eventbrite.py
│       ├── utils
│       │   ├── utils.py
│       │   ├── language.py
│       │   ├── keywords.py
│       │   ├── errors.py
│       │   ├── language_test.py
│       │   ├── date_and_time_test.py
│       │   ├── location.py
│       │   └── date_and_time.py
│       └── scrape.py
├── .github
│   └── FUNDING.yml
├── .flox
│   ├── .gitignore
│   ├── env.json
│   ├── .gitattributes
│   └── env
│       └── manifest.toml
├── push_to_db.py
├── config.json.dist
├── tests
│   └── scrape_test.py
├── loop.sh
├── .pre-commit-config.yaml
├── .gitignore
├── pyproject.toml
├── CONTRIBUTING.md
├── countries
│   ├── uk.json
│   ├── ch.json
│   └── fr.json
├── supabase
│   └── tables.sql
├── compare.py
├── TUTORIAL.md
├── TUTORIAL_OSM.md
├── README.md
└── WORKSHOPS.md

/bin/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
liberapay: trouver-une-fresque
--------------------------------------------------------------------------------
/.flox/.gitignore:
--------------------------------------------------------------------------------
run/
cache/
lib/
log/
!env/
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/db/__init__.py:
--------------------------------------------------------------------------------
from .main import main
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/apis/__init__.py:
--------------------------------------------------------------------------------
from .main import main
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/scraper/__init__.py:
--------------------------------------------------------------------------------
from .main import main
--------------------------------------------------------------------------------
/.flox/env.json:
--------------------------------------------------------------------------------
{
  "name": "trouver-une-fresque-scraper",
  "version": 1
}
--------------------------------------------------------------------------------
/.flox/.gitattributes:
--------------------------------------------------------------------------------
env/manifest.lock linguist-generated=true linguist-language=JSON
--------------------------------------------------------------------------------
/push_to_db.py:
--------------------------------------------------------------------------------
from trouver_une_fresque_scraper.db import main

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
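push_to_db.py is a thin wrapper around trouver_une_fresque_scraper.db.main, which parses the command line itself. Assuming a populated config.json, a typical run looks like this (the results filename is just an example, reused from CONTRIBUTING.md):

```console
python push_to_db.py --input results/events_20240130_121930.json
```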
/config.json.dist:
--------------------------------------------------------------------------------
{
    "webdriver": "",
    "host": "",
    "port": "",
    "user": "",
    "psw": "",
    "database": "",
    "timezone": "Europe/Paris"
}
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/utils.py:
--------------------------------------------------------------------------------
import json


def get_config(key=None):
    """Load config.json and return either the whole config dict or a single key."""
    with open("config.json", "r") as config_file:
        credentials = json.load(config_file)
    if key is not None:
        return credentials.get(key)
    return credentials
--------------------------------------------------------------------------------
/tests/scrape_test.py:
--------------------------------------------------------------------------------
from trouver_une_fresque_scraper.apis import ics_test
from trouver_une_fresque_scraper.utils import date_and_time_test
from trouver_une_fresque_scraper.utils import language_test


if __name__ == "__main__":
    ics_test.run_tests()
    date_and_time_test.run_tests()
    language_test.run_tests()
--------------------------------------------------------------------------------
/loop.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env zsh
while true
do
    python -m trouver_une_fresque_scraper.scrape --skip-dirty-check
    if [ $? != 0 ]; then  # the command failed (returned a non-zero exit code)
        echo "Command failed, retrying..."
        sleep 5  # wait for 5 seconds before retrying
    else
        break  # the command succeeded, exit the loop
    fi
done
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        args: ["--line-length", "100"]
        # It is recommended to specify the latest version of Python
        # supported by your project here, or alternatively use
        # pre-commit's default_language_version, see
        # https://pre-commit.com/#top_level-default_language_version
        language_version: python3.13
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pdf
*.sql
*.json
bin
supabase
results
*.log
.venv
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# pre-commit
.pre-commit-config.yaml
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "trouver-une-fresque"
version = "0.1.0"
description = "Detection of workshops raising awareness of climate and social issues"
readme = "README.md"
requires-python = ">=3.12"

dependencies = [
    "geopy>=2.4.1",
    "ics>=0.7.2",
    "numpy>=2.1.3",
    "pandas>=2.2.3",
    "psycopg[binary,pool]>=3.2.3",
    "python-dateutil>=2.9.0.post0",
    "requests>=2.32.3",
    "selenium>=4.26.1",
    "tabulate>=0.9.0",
    "langdetect",
    "pre-commit",
]

[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["src"]
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/apis/main.py:
--------------------------------------------------------------------------------
import pandas as pd

from trouver_une_fresque_scraper.apis.ics import get_ics_data
from trouver_une_fresque_scraper.apis.glorieuses import get_glorieuses_data
from trouver_une_fresque_scraper.apis.mobilite import get_mobilite_data

APIS_FNS = {
    "hook.eu1.make.com": get_glorieuses_data,
    "calendar.google.com/calendar/ical": get_ics_data,
    "framagenda.org/remote.php/dav": get_ics_data,
    "app.fresquedelamobilite.org": get_mobilite_data,
}


def main(apis):
    records = []

    for sourcek in APIS_FNS:
        for api in apis:
            if sourcek in api["url"]:
                records += APIS_FNS[sourcek](api)

    return pd.DataFrame(records)
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing

Before contributing to this project, please make sure that your git config is correct:

```console
git config --global user.name "John Doe"
git config --global user.email johndoe@example.com
```

Follow the installation instructions [here](https://pre-commit.com/#install) to install `pre-commit`. Then run `pre-commit install` to set up the git hooks, which run the Black code formatter.

```console
pre-commit install
pre-commit run --all-files
```

If you change scraping logic, make sure to run the `compare.py` utility to compare the number of records scraped with and without your proposed modification.

```console
python compare.py results/events_20240125_194439.json results/events_20240130_121930.json
```
--------------------------------------------------------------------------------
/countries/uk.json:
--------------------------------------------------------------------------------
[
    {
        "name": "Circular Economy Collage",
        "url": "https://www.eventbrite.fr/o/fresque-de-leconomie-circulaire-68155531313",
        "type": "scraper",
        "id": 300
    },
    {
        "name": "Planetary Boundaries Fresco",
        "url": "https://1erdegre.glide.page/dl/3b1bc8",
        "type": "scraper",
        "filter": "Fresque des frontières planétaires",
        "id": 500
    },
    {
        "name": "Digital Collage",
        "url": "https://www.billetweb.fr/multi_event.php?multi=11442",
        "type": "scraper",
        "iframe": "eventu84999",
        "id": 3
    },
    {
        "name": "Digital Collage",
        "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique",
        "type": "scraper",
        "iframe": "event41180",
        "id": 3
    },
    {
        "name": "Biodiversity Collage",
        "url": "https://www.billetweb.fr/multi_event.php?user=82762",
        "type": "scraper",
        "iframe": "event17309",
        "id": 2
    }
]
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/apis/ics_test.py:
--------------------------------------------------------------------------------
import logging

from trouver_une_fresque_scraper.apis import ics


def run_tests():
    long_url = "https://www.eventbrite.com/e/2tonnes-world-workshop-in-basel-switzerland-tickets-1116862910029?aff=odcleoeventsincollection&keep_tld=1"
    test_cases = [
        ("text_url", long_url, long_url),
        (
            "html_with_extra_text",
            'Tickets here: <a href="http://result">registration</a>. Come and have fun!',
            "http://result",
        ),
        ("text_and_url", "Lien d'inscription : http://result.org", "http://result.org"),
        (
            "more_text_and_url",
            "Fresque du sol animée en ligne.\nInscription obligatoire https://www.billetweb.fr/fresque-du-sol-en-ligne11\nContact si besoin noone@nowhere.fr.",
            "https://www.billetweb.fr/fresque-du-sol-en-ligne11",
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual = ics.get_ticketing_url_from_description(test_case[1])
        if actual == test_case[2]:
            logging.info("Result matches")
        else:
            logging.error(f"{test_case[0]}: expected {test_case[2]} but got {actual}")
--------------------------------------------------------------------------------
/supabase/tables.sql:
--------------------------------------------------------------------------------
create table "private"."events_future" (
    "id" character varying,
    "workshop_type" bigint,
    "title" text,
    "description" text,
    "online" boolean,
    "training" boolean,
    "sold_out" boolean,
    "kids" boolean,
    "start_date" timestamptz,
    "end_date" timestamptz,
    "zip_code" character varying,
    "latitude" character varying,
    "longitude" character varying,
    "source_link" character varying,
    "tickets_link" character varying,
    "country_code" character varying,
    "department" character varying,
    "city" character varying,
    "address" character varying,
    "location_name" character varying,
    "full_location" character varying,
    "language_code" character varying,
    "scrape_date" timestamp with time zone,
    "most_recent" boolean default false
);

create table "private"."events_scraped" (
    like "private"."events_future"
);

create view "public"."events" as (
    select * from "private"."events_future"
    union all
    select * from "private"."events_scraped" where most_recent = true
);

alter table "private"."events_future" enable row level security;
alter table "private"."events_scraped" enable row level security;
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/language.py:
--------------------------------------------------------------------------------
import logging

from trouver_une_fresque_scraper.utils.errors import FreskLanguageNotRecognized
from langdetect import detect


LANGUAGE_STRINGS = {
    "Allemand": "de",
    "Anglais": "en",
    "Deutsch": "de",
    "Englisch": "en",
    "English": "en",
    "Französisch": "fr",
    "Français": "fr",
    "French": "fr",
    "German": "de",
    "Indonesian": "id",
    "Italien": "it",
    "Italian": "it",
    "Spanish": "es",
    "Russian": "ru",
}


def detect_language_code(title, description):
    """
    Returns the language code of the language specified in the title if any,
    otherwise auto-detects from title and description.
    """
    title_upper = title.upper()
    for language_string, language_code in LANGUAGE_STRINGS.items():
        if language_string.upper() in title_upper:
            return language_code
    language_code = detect(title + description)
    if language_code in LANGUAGE_STRINGS.values():
        return language_code
    logging.warning(f"Unexpected language code: {language_code}.")
    return None


def get_language_code(language_text):
    """
    Returns the ISO 639-1 language code given a human-readable string such as "Français" or "English".
    """
    language_code = LANGUAGE_STRINGS.get(language_text)
    if not language_code:
        raise FreskLanguageNotRecognized(language_text)
    return language_code
--------------------------------------------------------------------------------
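As an illustration (strings adapted from the test cases in language_test.py), detection first looks for an explicit language name in the title and only falls back to langdetect when none matches:

```python
from trouver_une_fresque_scraper.utils.language import detect_language_code, get_language_code

# Title contains the keyword "English": returned directly, no auto-detection.
detect_language_code("Biodiversity Collage (English) - AMSTERDAM", "")  # -> "en"

# No language name in the title: langdetect runs on title + description.
detect_language_code(
    "CHILE - PROVIDENCIA",
    "El Mural de la Biodiversidad es un taller lúdico y colaborativo...",
)  # -> "es"

# Strict lookup for a human-readable label; unknown labels raise FreskLanguageNotRecognized.
get_language_code("Français")  # -> "fr"
```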
/src/trouver_une_fresque_scraper/db/records.py:
--------------------------------------------------------------------------------
import pandas as pd

from zoneinfo import ZoneInfo
from trouver_une_fresque_scraper.utils.utils import get_config


def get_record_dict(
    uuid,
    ids,
    title,
    start_datetime,
    end_datetime,
    full_location,
    location_name,
    address,
    city,
    department,
    zip_code,
    country_code,
    latitude,
    longitude,
    language_code,
    online,
    training,
    sold_out,
    kids,
    event_link,
    tickets_link,
    description,
):
    timezone = get_config("timezone")
    origin_tz = ZoneInfo(timezone)

    return {
        "id": uuid,
        "workshop_type": ids,
        "title": title,
        "start_date": start_datetime.replace(tzinfo=origin_tz).isoformat(),
        "end_date": end_datetime.replace(tzinfo=origin_tz).isoformat(),
        "full_location": full_location,
        "location_name": location_name.strip(),
        "address": address.strip(),
        "city": city.strip(),
        "department": department,
        "zip_code": zip_code,
        "country_code": country_code,
        "latitude": latitude,
        "longitude": longitude,
        "language_code": (
            language_code.strip() if language_code and language_code.strip() else "fr"
        ),
        "online": online,
        "training": training,
        "sold_out": sold_out,
        "kids": kids,
        "source_link": event_link,
        "tickets_link": tickets_link,
        "description": description,
        "scrape_date": pd.to_datetime("now", utc=True).tz_convert(timezone).isoformat(),
    }
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/db/main.py:
--------------------------------------------------------------------------------
import json
import argparse
import pandas as pd
import psycopg

from psycopg.conninfo import make_conninfo

from trouver_une_fresque_scraper.db.etl import etl, insert, truncate
from trouver_une_fresque_scraper.utils.utils import get_config


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--full-etl",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="perform the full ETL cycle, including truncating future events (disable with --no-full-etl)",
    )
    parser.add_argument(
        "--truncate-first",
        action="store_true",
        default=False,
        help="truncate db before inserting again (only with --no-full-etl)",
    )
    parser.add_argument(
        "--input",
        type=str,
        help="input json file to be inserted in db",
        required=True,
    )
    args = parser.parse_args()

    if args.full_etl and args.truncate_first:
        raise ValueError("--truncate-first only makes sense together with --no-full-etl")

    credentials = get_config()
    host = credentials["host"]
    port = credentials["port"]
    user = credentials["user"]
    psw = credentials["psw"]
    database = credentials["database"]

    with psycopg.connect(
        make_conninfo(dbname=database, user=user, password=psw, host=host, port=port)
    ) as conn:
        with open(args.input, "r") as input_file:
            input_records = json.load(input_file)
        df = pd.DataFrame.from_dict(pd.json_normalize(input_records), orient="columns")
        print(df)

        if args.full_etl:
            etl(conn, df)
        else:
            if args.truncate_first:
                truncate(conn, "private.events_future")
            insert(conn, df, "private.events_future")
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/keywords.py:
--------------------------------------------------------------------------------
def is_training(input_string):
    training_list = [
        "formation",
        "briefing",
        "animateur",
        "animation",
        "permanence",
        "training",
        "return of experience",
        "retex",
    ]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in training_list)


def is_online(input_string):
    online_list = ["online", "en ligne", "distanciel", "en linea"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in online_list)


def is_for_kids(input_string):
    kids_list = ["kids", "junior", "jeunes"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in kids_list)


def has_external_tickets(input_string):
    external_tickets = [
        "inscriptions uniquement",
        "inscription uniquement",
        "inscriptions via",
        "inscription via",
    ]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in external_tickets)


def is_plenary(input_string):
    plenary = ["plénière"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in plenary)


def is_sold_out(input_string):
    sold_out = ["complet"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in sold_out)


def is_gift_card(input_string):
    gift = ["cadeau", "don"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in gift)


def is_canceled(input_string):
    canceled = ["annulé"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in canceled)
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/errors.py:
--------------------------------------------------------------------------------
class FreskError(Exception):
    pass


class FreskDateNotFound(FreskError):
    def __init__(self):
        self.message = "Date not found."
        super().__init__(self.message)


class FreskDateBadFormat(FreskError):
    def __init__(self, input_str: str):
        self.message = f"Date has a bad format, unhandled by TuF (input: {input_str})."
        super().__init__(self.message)


class FreskDateDifferentTimezone(FreskError):
    def __init__(self, input_str: str):
        self.message = f"Date has a different timezone, unhandled by TuF (input: {input_str})."
        super().__init__(self.message)


class FreskAddressNotFound(FreskError):
    def __init__(self, input_str: str):
        self.message = f"Address not found (input: {input_str})."
        super().__init__(self.message)


class FreskAddressBadFormat(FreskError):
    def __init__(self, address: str, input_str: str, attribute: str):
        self.message = f'Address "{address}" has a bad {attribute} format, unhandled by TuF (input: {input_str}).'
        super().__init__(self.message)


class FreskAddressIncomplete(FreskError):
    def __init__(self, address: str, input_str: str, missing_attribute: str):
        self.message = (
            f'Address "{address}" has a missing attribute {missing_attribute} (input: {input_str}).'
        )
        super().__init__(self.message)


class FreskDepartmentNotFound(FreskError):
    def __init__(self, department: str):
        self.message = f"Department {department} not recognized."
        super().__init__(self.message)


class FreskCountryNotSupported(FreskError):
    def __init__(self, address: str, input_str: str):
        self.message = (
            f'Address "{address}" is not located in a supported country (input: {input_str}).'
        )
        super().__init__(self.message)


class FreskLanguageNotRecognized(FreskError):
    def __init__(self, language_text: str):
        self.message = f'Language "{language_text}" is not recognized.'
        super().__init__(self.message)
--------------------------------------------------------------------------------
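The keyword helpers are plain substring checks on the lowercased input; a few illustrative calls (the titles are made up, but the trigger words come from the lists above):

```python
from trouver_une_fresque_scraper.utils import keywords

keywords.is_training("Formation à l'animation de la Fresque du Climat")  # True, matches "formation"
keywords.is_online("Fresque de l'Eau - atelier en ligne")                # True, matches "en ligne"
keywords.is_for_kids("Fresque du Climat Junior")                         # True, matches "junior"
keywords.is_sold_out("COMPLET - Fresque de la Mobilité")                 # True, matches "complet"
```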
/src/trouver_une_fresque_scraper/scraper/main.py:
--------------------------------------------------------------------------------
import os
import pandas as pd

from trouver_une_fresque_scraper.scraper.fdc import get_fdc_data
from trouver_une_fresque_scraper.scraper.fec import get_fec_data
from trouver_une_fresque_scraper.scraper.billetweb import get_billetweb_data
from trouver_une_fresque_scraper.scraper.eventbrite import get_eventbrite_data
from trouver_une_fresque_scraper.scraper.glide import get_glide_data
from trouver_une_fresque_scraper.scraper.helloasso import get_helloasso_data
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service
from trouver_une_fresque_scraper.utils.utils import get_config

SCRAPER_FNS = {
    "billetweb.fr": get_billetweb_data,
    "climatefresk.org": get_fdc_data,
    "eventbrite.fr": get_eventbrite_data,
    "fresqueduclimat.org": get_fdc_data,
    "lafresquedeleconomiecirculaire.com": get_fec_data,
    "1erdegre.glide.page": get_glide_data,
    "helloasso.com": get_helloasso_data,
}


def get_webdriver_executable():
    webdriver_path = get_config("webdriver")
    if not webdriver_path and "WEBDRIVER_PATH" in os.environ:
        webdriver_path = os.environ["WEBDRIVER_PATH"]
    return webdriver_path


def main(scrapers, headless=False):
    records = []

    # geckodriver
    service = Service(executable_path=get_webdriver_executable())

    # Firefox
    options = FirefoxOptions()
    if "BROWSER_PATH" in os.environ:
        options.binary_location = os.environ["BROWSER_PATH"]
    options.set_preference("intl.accept_languages", "en-us")

    if headless:
        options.add_argument("-headless")

    sorted_workshops = {}

    # Make sure that we have a scraper available for each fresk entry
    for sourcek, fn_value in SCRAPER_FNS.items():
        for workshop in scrapers:
            if sourcek in workshop["url"]:
                # Organize fresks by values in SCRAPER_FNS
                if fn_value not in sorted_workshops:
                    sorted_workshops[fn_value] = []
                sorted_workshops[fn_value].append(workshop)

    for fn_key, sourcev in sorted_workshops.items():
        records += fn_key(sourcev, service=service, options=options)

    return pd.DataFrame(records)
--------------------------------------------------------------------------------
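Entries from countries/*.json are routed to a scraper by substring-matching their url against SCRAPER_FNS. A minimal sketch, assuming config.json points at a geckodriver binary (or WEBDRIVER_PATH is set); the workshop entry is copied from countries/ch.json:

```python
from trouver_une_fresque_scraper.scraper import main as scrape_main

workshops = [
    {
        "name": "Fresque du Numérique",
        "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique",
        "type": "scraper",
        "iframe": "event41180",
        "id": 3,
    }
]
# "billetweb.fr" appears in the URL, so this entry is handled by get_billetweb_data.
df = scrape_main(workshops, headless=True)
print(df.head())
```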
/src/trouver_une_fresque_scraper/utils/language_test.py:
--------------------------------------------------------------------------------
import logging


from trouver_une_fresque_scraper.utils import language


def run_tests():
    test_cases = [
        (
            "FdB es",
            "CHILE - PROVIDENCIA",
            "El Mural de la Biodiversidad es un taller lúdico y colaborativo que permite sensibilizar sobre la importancia de la biodiversidad y las causas y consecuencias de su erosión. Durante este taller, descubrirás cómo funcionan los ecosistemas, cómo los humanos interactuamos con la biodiversidad y por qué la biodiversidad es crucial para el bienestar del ser humano.",
            "es",
        ),
        (
            "FdB en",
            "Biodiversity Collage (NL) - AMSTERDAM",
            "The Biodiversity Collage is a fun and collaborative workshop that aims to raise awareness about the importance of biodiversity. With a set of cards based on the IPBES reports, you will:",
            "en",
        ),
        (
            "FdB ru",
            "ONLINE BIODIVERSITY COLLAGE WORKSHOP (RU) - with Ivan Ivanovich (CET)",
            "Workshop in Russian Коллаж биоразнообразия — это увлекательный командный воркшоп, который помогает разобраться, почему биоразнообразие критически важно для жизни на Земле и что грозит нашей планете и людям на ней в случае его утраты. В формате совместной работы участники узнают:",
            "ru",
        ),
        (
            "FdN it",
            "ONLINE DIGITAL COLLAGE WORKSHOPS IN ITALIAN - Sessione online con Mario Rossi e Corrado Romano",
            "Il Digital Collage è un workshop ludico e collaborativo. L'obiettivo del workshop è di sensibilizzare e formare i partecipanti sui problemi ambientali e sociali delle tecnologie digitali. Il workshop si propone anche di delineare soluzioni per una maggiore sostenibilità nelle tecnologie digitali e quindi ad aprire discussioni tra i partecipanti sull'argomento.",
            "it",
        ),
        (
            "PlanetC de",
            "Zuerich, Planet C (German)",
            'Registration',
            "de",
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual = language.detect_language_code(test_case[1], test_case[2])
        if actual == test_case[3]:
            logging.info("Result matches")
        else:
            logging.error(f"{test_case[0]}: expected {test_case[3]} but got {actual}")
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/db/etl.py:
--------------------------------------------------------------------------------
import psycopg


def update_most_recent(conn, table):
    query = f"""
    WITH MissingRows AS (
        SELECT S."id", S."workshop_type", MAX(S."scrape_date") AS max_scrape_date
        FROM {table} S
        LEFT JOIN private.events_future F
        ON S."id" = F."id" AND S."workshop_type" = F."workshop_type"
        WHERE F."id" IS NULL
        GROUP BY S."id", S."workshop_type"
    )
    UPDATE {table} S
    SET "most_recent" = TRUE
    FROM MissingRows M
    WHERE S."id" = M."id" AND S."workshop_type" = M."workshop_type" AND S."scrape_date" = M.max_scrape_date AND S."start_date" < current_timestamp;
    """
    cursor = conn.cursor()
    print(query)
    try:
        cursor.execute(query)
        conn.commit()
    except (Exception, psycopg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()


def insert(conn, df, table, most_recent=False):
    df["most_recent"] = most_recent
    tuples = [tuple(x) for x in df.to_numpy()]
    cols = ",".join(list(df.columns))
    # One %s placeholder per column, instead of a hard-coded list of 24
    placeholders = ",".join(["%s"] * len(df.columns))

    print(list(df.columns))

    # SQL query to execute
    cursor = conn.cursor()
    try:
        cursor.executemany(
            "INSERT INTO %s(%s) VALUES (%s)" % (table, cols, placeholders),
            tuples,
            returning=True,
        )
        conn.commit()
    except (Exception, psycopg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()


def truncate(conn, table):
    query = "TRUNCATE TABLE %s" % table
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        conn.commit()
    except (Exception, psycopg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()


def etl(conn, df):
    df = df.astype(str)

    # Insert all events into the historical table. most_recent starts as False;
    # the call to `update_most_recent()` below may flip it.
    insert(conn, df, "private.events_scraped", most_recent=False)

    # Delete all future events before inserting them again, so that they are
    # updated
    truncate(conn, "private.events_future")
    insert(conn, df, "private.events_future", most_recent=True)

    update_most_recent(conn, "private.events_scraped")
--------------------------------------------------------------------------------
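Downstream, the public.events view defined in supabase/tables.sql unions the refreshed future events with the most recent historical scrape of past ones. An illustrative query (the city value is arbitrary):

```sql
-- Upcoming workshops in one city, ordered by start date.
select title, start_date, city, tickets_link
from public.events
where city = 'Nantes'
  and start_date > now()
order by start_date;
```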
/.flox/env/manifest.toml:
--------------------------------------------------------------------------------
version = 1

[install]
python.pkg-path = "python3"
python.version = "3.13"
uv.pkg-path = "uv"
zlib.pkg-path = "zlib"
gcc-unwrapped.pkg-path = "gcc-unwrapped"
firefox-esr.pkg-path = "firefox-esr"
geckodriver.pkg-path = "geckodriver"
postgresql.pkg-path = "postgresql"


# helper tools
gum.pkg-path = "gum"
coreutils.pkg-path = "coreutils"


[vars]

[hook]
on-activate = '''
  # Dynamically set WEBDRIVER_PATH required for the scraper
  export WEBDRIVER_PATH="$(which geckodriver)"
  export BROWSER_PATH="$(which firefox-esr)"

  # Flox stuff
  export FLOX_PYTHON_UV_CACHE_DIR="$FLOX_ENV_CACHE/python-uv"
  mkdir -p "$FLOX_PYTHON_UV_CACHE_DIR"

  export FLOX_PYTHON_UV_VENV_PATH="$FLOX_PYTHON_UV_CACHE_DIR/venv"
  export FLOX_PYTHON_UV_VENV_INTERPRETER="$(cat "$FLOX_PYTHON_UV_CACHE_DIR/venv.interpreter" 2> /dev/null || echo false )"
  export FLOX_PYTHON_UV_INTERPRETER="$(realpath $(which python3))"

  # Make sure any tools are not attempting to use the Python interpreter from any
  # existing virtual environment.
  unset VIRTUAL_ENV

  export UV_PROJECT_ENVIRONMENT="$FLOX_PYTHON_UV_VENV_PATH"

  function indent() {
    echo -e '{{ Foreground "#cccccc" " │ "}}' | \
      gum format -t template --theme=auto
  }

  function with_spinner() {
    if [[ "$FLOX_ENVS_TESTING" == "1" ]]; then
      bash -c "$1"
    else
      echo
      gum spin \
        --show-error \
        --spinner line \
        --spinner.foreground="#cccccc" \
        --title " >>> $2 ..." \
        --title.foreground="#cccccc" \
        -- bash -c "$1"
      echo -en "\033[2A\033[K"
    fi
  }

  function ensure_venv() {
    uv venv -p "$FLOX_PYTHON_UV_INTERPRETER" "$FLOX_PYTHON_UV_VENV_PATH"
    source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate"
  }
  export -f ensure_venv

  function install_packages() {
    uv sync
  }
  export -f install_packages



  indent && echo
  indent && echo

  if [ "$FLOX_PYTHON_UV_VENV_INTERPRETER" != "$FLOX_PYTHON_UV_INTERPRETER" ]; then
    with_spinner ensure_venv "Creating virtual environment"
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ Virtual environment was created.\" }}\n" \
      | gum format -t template
  else
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ Virtual environment already exists.\" }}\n" \
      | gum format -t template
  fi

  indent && echo

  if [ -f pyproject.toml ]; then
    with_spinner install_packages "Installing Python packages"
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ Python packages installed.\" }}\n" \
      | gum format -t template
  else
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ No need to install Python packages.\" }}\n" \
      | gum format -t template
  fi

  indent && echo

'''

[profile]
bash = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate"
'''
fish = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate.fish"
'''
tcsh = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate.csh"
'''
zsh = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate"
'''

[options]
systems = [
  "aarch64-darwin",
  "aarch64-linux",
  "x86_64-darwin",
  "x86_64-linux",
]
--------------------------------------------------------------------------------
/compare.py:
--------------------------------------------------------------------------------
import json
import sys
import logging
from tabulate import tabulate

workshop_types = {
    0: "FresqueNouveauxRecits",
    1: "FresqueOceane",
    2: "FresqueBiodiversite",
    3: "FresqueNumerique",
    4: "FresqueAgriAlim",
    5: "FresqueAlimentation",
    6: "FresqueConstruction",
    7: "FresqueMobilite",
    8: "FresqueSexisme",
    9: "OGRE",
    10: "AtelierInventonsNosViesBasCarbone",
    11: "FresqueDeLeau",
    12: "FutursProches",
    13: "FresqueDiversite",
    14: "FresqueDuTextile",
    15: "FresqueDesDechets",
    16: "PuzzleClimat",
    17: "FresqueDeLaFinance",
    18: "FresqueDeLaRSE",
    19: "AtelierDesTransitionsUrbaines",
    100: "2tonnes",
    101: "CompteGouttes",
    102: "FresqueDuBénévolat",
    103: "FresqueDuPlastique",
    200: "FresqueClimat",
    300: "FresqueEcoCirculaire",
    500: "FresqueFrontieresPlanetaires",
    501: "HorizonsDecarbones",
    600: "2030Glorieuses",
    700: "FresqueDeLaRénovation",
    701: "FresqueDeLEnergie",
    702: "FresqueDesPossibles",
    703: "FresqueDeLaCommunication",
    704: "Zoofresque",
    705: "NotreTour",
    800: "PlanetCPlayAgain?",
    801: "FresqueDuSol",
}


def get_json(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON in file: {file_path}")
        return []


def count_workshop_types(data):
    records = {}
    for record in data:
        if record["workshop_type"] in records:
            records[record["workshop_type"]] += 1
        else:
            records[record["workshop_type"]] = 1
    return records


def display_workshop_types(counts):
    for workshop_type, count in counts.items():
        logging.info(f"{workshop_types[workshop_type]}: {count} events")
    logging.info("---------")


def display_table_workshop_types(counts1, counts2):
    table = []
    for workshop_id, workshop_type in workshop_types.items():
        count1 = counts1.get(workshop_id, 0)
        count2 = counts2.get(workshop_id, 0)
        table.append([workshop_type, count1, count2, count2 - count1])
    return table


def main():
    logging.basicConfig(level=logging.INFO)

    # Check if the correct number of arguments is provided
    if len(sys.argv) != 3:
        logging.error("Usage: python compare.py <results_file1.json> <results_file2.json>")
        sys.exit(1)

    # Get file paths from command-line arguments
    file1_path = sys.argv[1]
    file2_path = sys.argv[2]

    # Count entries in each file
    json1 = get_json(file1_path)
    json2 = get_json(file2_path)

    records1 = count_workshop_types(json1)
    records2 = count_workshop_types(json2)

    # display_workshop_types(records1)
    # display_workshop_types(records2)

    headers = ["Workshop", file1_path, file2_path, "Delta"]
    table = display_table_workshop_types(records1, records2)
    totals1 = sum(row[1] for row in table)
    totals2 = sum(row[2] for row in table)
    table.append(["====Totals====", totals1, totals2, totals2 - totals1])
    print(tabulate(table, headers, tablefmt="fancy_grid"))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
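compare.py prints a tabulate table with one row per workshop type plus a totals row. The excerpt below is purely illustrative: the counts are invented, most rows are omitted, and the header paths are shortened for display (real headers show the two input file paths in full):

```console
$ python compare.py results/events_20240125_194439.json results/events_20240130_121930.json
╒════════════════╤═══════════════╤═══════════════╤═════════╕
│ Workshop       │ before.json   │ after.json    │   Delta │
╞════════════════╪═══════════════╪═══════════════╪═════════╡
│ FresqueClimat  │ 312           │ 327           │      15 │
├────────────────┼───────────────┼───────────────┼─────────┤
│ FresqueOceane  │ 41            │ 38            │      -3 │
├────────────────┼───────────────┼───────────────┼─────────┤
│ ====Totals==== │ 1450          │ 1471          │      21 │
╘════════════════╧═══════════════╧═══════════════╧═════════╛
```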
/TUTORIAL.md:
--------------------------------------------------------------------------------
# Tutorial for fresque organizers

This project collects the data of your events published online. If an event does not appear on the Trouver une Fresque platform, please check the points below. If one of the conditions is not met, please update your event before contacting us.

Jump to the section matching the platform on which your event is published. In any case, be patient: events are refreshed once every 4 days on average. If you post an event, it will not appear on Trouver une Fresque immediately!

- [Billetweb.fr](#billetwebfr)
- [Eventbrite.fr](#eventbritefr)
- [Fresqueduclimat.org](#fresqueduclimatorg)
- [Lafresquedeleconomiecirculaire.com](#lafresquedeleconomiecirculairecom)
- [Glide.page](#glidepage)

If, 1) after checking all the points listed, and 2) after waiting 4 days, your event still does not appear on the Trouver une Fresque platform, please read the [Opening an issue](#opening-an-issue) section at the bottom of this page.

## Billetweb.fr

### Date

Does the date, including at least the day and the start time, appear under the page title?

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### My workshop uses an external registration channel (email, phone, etc.) but appears as sold out!

If you do not use the Billetweb checkout module but an external channel to handle registrations (email, phone, etc.), please display the following wording in the checkout module: "Inscriptions uniquement via [...]" or "Inscriptions uniquement par [...]", adapted to your case, for example "Inscriptions uniquement par téléphone au 06xxxxxxxx".

### My workshop is aimed at juniors but is not flagged as such!

The keyword "junior" must appear in the workshop title.

## Eventbrite.fr

### Event page

Exactly ONE event must appear on the event page. This means that the "Select a date" or "Select a time" button must not appear on the page. A "Get tickets" button on the event page is a prerequisite for being listed.

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

## Fresqueduclimat.org

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### My workshop is aimed at juniors but is not flagged as such!

The keyword "junior" must appear in the workshop description.

## Lafresquedeleconomiecirculaire.com

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### Different timezone

Events that do not take place in France are not supported yet.

## Glide.page

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

## iCalendar / iCal / ICS

The format is supported by a large number of tools, including those listed on https://fr.wikipedia.org/wiki/ICalendar. Here is how to get the ICS link for two of them:

* [Framagenda](https://framagenda.org/): click the menu to the right of the calendar name, click the menu to the right of "Partager le lien", then choose the "Copier le lien pour s'abonner" option. Example: `https://framagenda.org/remote.php/dav/public-calendars/KwNwGA232xD38CnN/?export`.
* [Google Calendar](https://calendar.google.com): click the menu to the right of the calendar name, choose "Settings and sharing", then scroll down to "Public address in iCal format". Example: `https://calendar.google.com/calendar/ical/2fe1be9f8d5c073969bccaba14133699b71305877304056bee924ee0ef128977%40group.calendar.google.com/public/basic.ics`.

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### Registration link

The description must contain the registration link. If the description contains several links, links to well-known video-conferencing platforms are ignored. If several links still remain, the description must be in HTML format and exactly one of the links must contain the word "Inscriptions" or a close synonym.

## Opening an issue

If, 1) after checking all the points listed in the section for your publishing platform, and 2) after waiting 4 days, your event still does not appear on the Trouver une Fresque platform, please contact us by filling in [this form](https://github.com/trouver-une-fresque/trouver-une-fresque/issues/new). A [GitHub](https://github.com/signup) account is required.
--------------------------------------------------------------------------------
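For orientation, each entry in countries/*.json describes one event source. The field semantics below are inferred from the scraper code (scraper/main.py, apis/main.py, compare.py) rather than documented anywhere, so treat them as a best-effort reading:

```json
{
    "name": "Fresque du Climat (ateliers)",
    "url": "https://climatefresk.org/fr-ch/inscription-atelier/grand-public/",
    "type": "scraper",
    "id": 200
}
```

`type` selects the pipeline ("scraper" entries are dispatched through SCRAPER_FNS by substring-matching `url`, "api" entries through APIS_FNS), `id` is the workshop_type shared with the database and compare.py, and optional fields such as `iframe`, `filter`, and `language_code` are consumed by the individual scrapers.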
/countries/ch.json:
--------------------------------------------------------------------------------
[
    {
        "name": "Fresque des Nouveaux Récits",
        "url": "https://www.billetweb.fr/pro/fdnr",
        "type": "scraper",
        "iframe": "event21569",
        "id": 0
    },
    {
        "name": "Fresque Océane",
        "url": "https://www.billetweb.fr/pro/billetteriefo",
        "type": "scraper",
        "iframe": "event15247",
        "id": 1
    },
    {
        "name": "Fresque de la Biodiversité",
        "url": "https://www.billetweb.fr/multi_event.php?user=82762",
        "type": "scraper",
        "iframe": "event17309",
        "id": 2
    },
    {
        "name": "Fresque du Numérique",
        "url": "https://www.billetweb.fr/shop.php?event=suisse-atelier-fresque-du-numerique&color=5190f5&page=1&margin=no_margin",
        "type": "scraper",
        "iframe": "eventu84999",
        "id": 3
    },
    {
        "name": "Fresque du Numérique",
        "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique",
        "type": "scraper",
        "iframe": "event41180",
        "id": 3
    },
    {
        "name": "Fresque Agri'Alim",
        "url": "https://www.billetweb.fr/pro/fresqueagrialim",
        "type": "scraper",
        "iframe": "event11421",
        "id": 4
    },
    {
        "name": "Fresque de l'Alimentation",
        "url": "https://www.billetweb.fr/pro/fresquealimentation",
        "type": "scraper",
        "iframe": "event11155",
        "id": 5
    },
    {
        "name": "Fresque de la Construction",
        "url": "https://www.billetweb.fr/pro/fresquedelaconstruction",
        "type": "scraper",
        "iframe": "event11574",
        "id": 6
    },
    {
        "name": "Fresque de la Mobilité",
        "url": "https://www.billetweb.fr/pro/fresquedelamobilite",
        "type": "scraper",
        "iframe": "event11698",
        "id": 7
    },
    {
        "name": "Fresque du Sexisme",
        "url": "https://www.billetweb.fr/pro/fresque-du-sexisme",
        "type": "scraper",
        "iframe": "event27112",
        "id": 8
    },
    {
        "name": "Atelier OGRE",
        "url": "https://www.billetweb.fr/pro/atelierogre",
        "type": "scraper",
        "iframe": "event13026",
        "id": 9
    },
    {
        "name": "Fresque de l'Eau",
        "url": "https://www.billetweb.fr/multi_event.php?user=138110",
        "type": "scraper",
        "iframe": "eventu138110",
        "id": 11
    },
    {
        "name": "Fresque du Textile",
        "url": "https://www.billetweb.fr/multi_event.php?user=166793",
        "type": "scraper",
        "iframe": "event27458",
        "filter": "textile",
        "id": 14
    },
    {
        "name": "Fresque des Déchets",
        "url": "https://calendar.google.com/calendar/ical/greendonut.info%40gmail.com/public/basic.ics",
        "type": "api",
        "id": 15
    },
    {
        "name": "Puzzle Climat",
        "url": "https://www.billetweb.fr/multi_event.php?user=121600",
        "type": "scraper",
        "iframe": "event21038",
        "id": 16
    },
    {
        "name": "Fresque de la RSE",
        "url": "https://www.billetweb.fr/pro/fresque",
        "type": "scraper",
        "iframe": "event35904",
        "id": 18
    },
    {
        "name": "2tonnes",
        "url": "https://www.eventbrite.com/cc/ateliers-grand-public-en-presentiel-hors-france-2157189",
        "type": "scraper",
        "id": 100
    },
    {
        "name": "Fresque du Plastique",
        "url": "https://www.eventbrite.fr/o/la-fresque-du-plastique-45763194553",
        "type": "scraper",
        "id": 103
    },
    {
        "name": "Fresque du Climat (ateliers)",
        "url": "https://climatefresk.org/fr-ch/inscription-atelier/grand-public/",
        "type": "scraper",
        "id": 200
    },
    {
        "name": "Climate Fresk (workshops)",
        "url": "https://climatefresk.org/de-ch/workshop-anmeldung/offentlichkeit/",
        "type": "scraper",
        "id": 200
    },
    {
        "name": "Fresque de l'Economie Circulaire",
        "url": "https://www.billetweb.fr/pro/lafresquedeleconomiecirculaire",
        "language_code": "fr",
        "type": "scraper",
        "iframe": "event41148",
        "id": 300
    },
    {
        "name": "Circular Economy Collage",
        "url": "https://www.eventbrite.fr/o/fresque-de-leconomie-circulaire-68155531313",
        "type": "scraper",
        "id": 300
    },
    {
        "name": "Fresque des Frontières Planétaires (ateliers)",
        "url": "https://1erdegre.glide.page/dl/3b1bc8",
        "type": "scraper",
        "filter": "Fresque des frontières planétaires",
        "id": 500
    },
    {
        "name": "Planet C Play Again?",
        "url": "https://calendar.google.com/calendar/ical/2fe1be9f8d5c073969bccaba14133699b71305877304056bee924ee0ef128977%40group.calendar.google.com/public/basic.ics",
        "type": "api",
        "id": 800
    },
    {
        "name": "Notre Tour",
        "url": "https://www.helloasso.com/associations/mush",
        "type": "scraper",
        "id": 705
    },
    {
        "name": "Fresque du Sol",
        "url": "https://framagenda.org/remote.php/dav/public-calendars/KwNwGA232xD38CnN/?export",
        "type": "api",
        "id": 801
    }
]
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/date_and_time_test.py:
--------------------------------------------------------------------------------
from datetime import datetime
import logging
from attrs import define


from trouver_une_fresque_scraper.utils import date_and_time


def run_get_dates_tests():
    # tuple fields:
    # 1. Test case name or ID
    # 2. Input date string
    # 3. Expected output start datetime
    # 4. Expected output end datetime
    test_cases = [
        (
            "BilletWeb: one hour",
            "Thu Oct 19, 2023 from 01:00 PM to 02:00 PM",
            datetime(2023, 10, 19, 13, 0),
            datetime(2023, 10, 19, 14, 0),
        ),
        (
            "BilletWeb: multiple months",
            "Thu Oct 19, 2023 at 01:00 PM to Sat Feb 24, 2024 at 02:00 PM",
            datetime(2023, 10, 19, 13, 0),
            datetime(2024, 2, 24, 14, 0),
        ),
        (
            "BilletWeb: single date and time",
            "March 7, 2025 at 10:00 AM",
            datetime(2025, 3, 7, 10, 0),
            datetime(2025, 3, 7, 13, 0),
        ),
        (
            "EventBrite",
            "ven. 11 avr. 2025 14:00 - 17:30 CEST",
            datetime(2025, 4, 11, 14, 0),
            datetime(2025, 4, 11, 17, 30),
        ),
        (
            "FdC French",
            "16 mai 2025, de 18h30 à 21h30 (heure de Paris)",
            datetime(2025, 5, 16, 18, 30),
            datetime(2025, 5, 16, 21, 30),
        ),
        (
            "FdC English: June 3",
            "June 03, 2025, from 05:30pm to 09:30pm (Paris time)",
            datetime(2025, 6, 3, 17, 30),
            datetime(2025, 6, 3, 21, 30),
        ),
        (
            "FdC English: October 28",
            "October 28, 2025, from 09:00am to 12:00pm (Zürich time)",
            datetime(2025, 10, 28, 9, 0),
            datetime(2025, 10, 28, 12, 0),
        ),
        (
            "FEC",
            "03 mars 2025, 14:00 – 17:00 UTC+1",
            datetime(2025, 3, 3, 14, 0),
            datetime(2025, 3, 3, 17, 0),
        ),
        (
            "Glide",
            "mercredi 12 février 2025 de 19h00 à 22h00",
            datetime(2025, 2, 12, 19, 0),
            datetime(2025, 2, 12, 22, 0),
        ),
        (
            "HelloAsso",
            "Le 12 février 2025, de 18h à 20h",
            datetime(2025, 2, 12, 18, 0),
            datetime(2025, 2, 12, 20, 0),
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual_start_time, actual_end_time = date_and_time.get_dates(test_case[1])
        if actual_start_time != test_case[2]:
            logging.error(f"{test_case[0]}: expected {test_case[2]} but got {actual_start_time}")
        if actual_end_time != test_case[3]:
            logging.error(f"{test_case[0]}: expected {test_case[3]} but got {actual_end_time}")


@define
class MockWebDriverElement:
    text: str
    dt: str | None

    def get_attribute(self, ignored: str) -> str | None:
        return self.dt


def run_get_dates_from_element_tests():
    # tuple fields:
    # 1. Test case name or ID
    # 2. Input date string
    # 3. Expected output start datetime
    # 4. Expected output end datetime
    test_cases = [
        (
            "BilletWeb: no datetime, fallback on text parsing",
            None,
            "Thu Oct 19, 2023 from 01:00 PM to 02:00 PM",
            datetime(2023, 10, 19, 13, 0),
            datetime(2023, 10, 19, 14, 0),
        ),
        (
            "EventBrite: morning",
            "2025-12-05",
            "déc. 5 de 8am à 11am UTC",
            datetime(2025, 12, 5, 8, 0),
            datetime(2025, 12, 5, 11, 0),
        ),
        (
            "EventBrite: evening",
            "2025-12-12",
            "déc. 12 de 6pm à 9pm UTC+1",
            datetime(2025, 12, 12, 18, 0),
            datetime(2025, 12, 12, 21, 0),
        ),
        (
            "EventBrite: afternoon in German",
            "2024-12-16",
            "Dez. 16 von 5nachm. bis 8nachm. UTC",
            datetime(2024, 12, 16, 17, 0),
            datetime(2024, 12, 16, 20, 0),
        ),
        (
            "EventBrite: afternoon with minutes in German",
            "2024-12-03",
            "Dez. 3 von 5:30nachm. bis 8:30nachm. MEZ",
            datetime(2024, 12, 3, 17, 30),
            datetime(2024, 12, 3, 20, 30),
        ),
        (
            "EventBrite: PM adds 12 to the hours only from 1 PM onwards",
            "2025-12-14",
            "déc. 14 de 9:30am à 12:30pm UTC+1",
            datetime(2025, 12, 14, 9, 30),
            datetime(2025, 12, 14, 12, 30),
        ),
        (
            "EventBrite: start and end minutes differ",
            "2026-01-21",
            "janv. 21 de 9am à 12:30pm UTC+1",
            datetime(2026, 1, 21, 9, 0),
            datetime(2026, 1, 21, 12, 30),
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual_start_time, actual_end_time = date_and_time.get_dates_from_element(
            MockWebDriverElement(dt=test_case[1], text=test_case[2])
        )
        if actual_start_time != test_case[3]:
            logging.error(f"{test_case[0]}: expected {test_case[3]} but got {actual_start_time}")
        if actual_end_time != test_case[4]:
            logging.error(f"{test_case[0]}: expected {test_case[4]} but got {actual_end_time}")


def run_tests():
    run_get_dates_tests()
    run_get_dates_from_element_tests()
--------------------------------------------------------------------------------
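The test files are plain scripts rather than pytest suites: tests/scrape_test.py simply calls each module's run_tests() in turn. Since none of them call logging.basicConfig, the logging.info progress messages are suppressed by default and only logging.error output reaches stderr, so a silent run is a passing run:

```console
python tests/scrape_test.py
```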
f"https://app.fresquedelamobilite.org/atelier_details/{id_key}" 73 | 74 | ################################################################ 75 | # Parse start and end dates 76 | ################################################################ 77 | try: 78 | # Convert time strings to datetime objects 79 | event_start_datetime = datetime.strptime(date_key, "%Y-%m-%dT%H:%M:%S.%fZ") 80 | except Exception as e: 81 | logging.info(f"Rejecting record: bad date format {e}") 82 | continue 83 | 84 | try: 85 | event_end_datetime = event_start_datetime + timedelta(minutes=duration_key) 86 | except Exception as e: 87 | logging.info(f"Rejecting record: bad duration format {e}") 88 | continue 89 | 90 | ################################################################ 91 | # Location data 92 | ################################################################ 93 | full_location = "" 94 | location_name = "" 95 | address = "" 96 | city = "" 97 | department = "" 98 | longitude = "" 99 | latitude = "" 100 | zip_code = "" 101 | country_code = "" 102 | 103 | if not online: 104 | try: 105 | address_dict = get_address(address_key) 106 | ( 107 | location_name, 108 | address, 109 | city, 110 | department, 111 | zip_code, 112 | country_code, 113 | latitude, 114 | longitude, 115 | ) = address_dict.values() 116 | except json.JSONDecodeError: 117 | logging.info("Rejecting record: error while parsing API response") 118 | continue 119 | except FreskError as error: 120 | logging.info(f"Rejecting record: {error}.") 121 | continue 122 | 123 | ################################################################ 124 | # Building final object 125 | ################################################################ 126 | record = get_record_dict( 127 | f"{source['id']}-{event_id}", 128 | source["id"], 129 | title, 130 | event_start_datetime, 131 | event_end_datetime, 132 | full_location, 133 | location_name, 134 | address, 135 | city, 136 | department, 137 | zip_code, 138 | country_code, 139 | latitude, 140 | longitude, 141 | detect_language_code(title, description), 142 | online, 143 | is_training(type_key), 144 | sold_out, 145 | is_for_kids(perimetre_key), 146 | source_link, 147 | tickets_link, 148 | description, 149 | ) 150 | 151 | records.append(record) 152 | logging.info(f"Successfully API record\n{json.dumps(record, indent=4)}") 153 | 154 | return records 155 | -------------------------------------------------------------------------------- /TUTORIAL_OSM.md: -------------------------------------------------------------------------------- 1 | # Tutoriel OpenStreetMap (OSM) à destination des organisateurs de fresques 2 | 3 | La validité des adresses figurant sur les billeteries des fresques est vérifiée via les données OpenStreetMap (OSM), un projet collaboratif de cartographie en ligne qui vise à constituer une base de données géographiques libre du monde. 4 | 5 | Si votre atelier n'apparaît pas sur Trouver une Fresque, il y a de grandes chances que l'adresse renseignée soit invalide, ou non encore connue d'OSM. Voici un diagnostic à effectuer pour corriger le tir. 6 | 7 | - Rendez-vous sur [OpenStreetMap.org](https://www.openstreetmap.org). 8 | - Dans le champ de recherche en haut à gauche, copiez-collez l'adresse que vous avez renseignée sur votre atelier. Par exemple: `L'Epicerie d'ADDA, 18 Rue de Savenay, 44000 Nantes, France`. 9 | - Si il n'y a pas de résultat, votre adresse n'est pas reconnue par OSM et c'est la raison pour laquelle votre atelier n'apparaît pas sur notre plateforme. 
10 | 
11 | Merci de parcourir les sections suivantes dans l'ordre pour comprendre comment corriger votre adresse.
12 | 
13 | ## 1) Format de l'adresse
14 | 
15 | La première chose à vérifier est que votre adresse utilise un format classique, sans informations additionnelles qui devraient figurer ailleurs. Le nom du lieu est une information utile.
16 | 
17 | | Mauvais format | Correction |
18 | |----------|----------|
19 | | Chez moi, 1560 Rue Maurice Jacob, Lyon, France | 1560 Rue Maurice Jacob, Lyon, France |
20 | | Le Grand Bain, 20 Allée de la Maison Rouge, Nantes - Accessible aux PMR | Le Grand Bain, 20 Allée de la Maison Rouge, Nantes |
21 | | La Capsule, 1er étage, Bâtiment le Churchill, 3 rue du Président Rooselvelt 51100 Reims | La Capsule, Bâtiment le Churchill, 3 rue du Président Rooselvelt 51100 Reims |
22 | | La Ruche près du petit ruisseau, 24 Rue de l'Est 75020 Paris | La Ruche près du petit ruisseau, 24 Rue de l'Est 75020 Paris |
23 | | 84 Av. de Grammont, 84 Avenue de Grammont 37000 Tours | 84 Avenue de Grammont 37000 Tours |
24 | | La Galerie du Zéro Déchet, entrée Place Dulcie September, 5 Rue Fénelon, 44000 Nantes, France | La Galerie du Zéro Déchet, 5 Rue Fénelon, 44000 Nantes, France |
25 | 
26 | ## 2) bis/ter
27 | 
28 | Si votre adresse contient une particule bis ou ter, merci de formater votre adresse comme suit :
29 | 
30 | | Mauvais format | Correction |
31 | |----------|----------|
32 | | Mille club, 5T Rue Paul Serusier, Morlaix, France | Mille club, 5 ter Rue Paul Serusier, Morlaix, France |
33 | | Le Grand Bain, 20B Allée de la Maison Rouge, Nantes | Le Grand Bain, 20 bis Allée de la Maison Rouge, Nantes |
34 | 
35 | ## 3) Abréviations
36 | 
37 | Si votre adresse contient des abréviations, essayez d'utiliser le(s) mot(s) complet(s).
38 | 
39 | | Mauvais format | Correction |
40 | |----------|----------|
41 | | Palais du travail, 9 Pl. du Dr Lazare Goujon, 69100 Villeurbanne, France | Palais du travail, 9 Place du Docteur Lazare Goujon, 69100 Villeurbanne, France |
42 | | Melting Coop, 229 Cr Emile Zola, 69100 Villeurbanne, France | Melting Coop, 229 Cours Emile Zola, 69100 Villeurbanne, France |
43 | 
44 | ## 4) Nom du lieu
45 | 
46 | Peut-être que le nom du lieu n'est pas rattaché à l'adresse sur OSM. Pour le vérifier, tapez votre adresse sans le nom. Par exemple, si votre adresse est `Melting Coop, 229 Cours Emile Zinzolin, 69100 Villeurbanne, France`, tapez plutôt `229 Cours Emile Zinzolin, 69100 Villeurbanne, France`.
47 | 
48 | - Si vous n'obtenez pas de résultat, l'adresse (sans le nom du lieu) n'est pas répertoriée sur OpenStreetMap. Naviguez manuellement à l'adresse en vous déplaçant sur la carte pour récupérer l'adresse telle qu'elle apparaît dans OSM. Dans notre cas, on se rendra compte que l'adresse correcte est `229 Cours Emile Zola, 69100 Villeurbanne, France`.
49 | 
50 | - Si vous obtenez un résultat, deux cas de figure:
51 | 
52 |   - Soit, en naviguant manuellement sur la carte, le lieu où l'atelier est organisé est bien répertorié. Dans ce cas, il faut:
53 | 
54 |     - Si le nom du lieu n'a pas la bonne orthographe par rapport à la carte, ajustez le nom du lieu pour le faire correspondre à l'information de la carte.
55 | 
56 |     - Si l'orthographe du lieu est correcte dans votre adresse par rapport à la carte, il faut [rattacher une adresse à ce lieu](#rattacher-une-adresse-à-un-lieu-existant).
57 | 
58 |   - Soit, en naviguant manuellement sur la carte, le lieu où l'atelier est organisé n'apparaît pas.
Dans ce cas, il faut [ajouter un nouveau lieu et lui rattacher son adresse](#créer-un-lieu). 59 | 60 | ### Rattacher une adresse à un lieu existant 61 | 62 | Suivre les étapes suivantes: 63 | 64 | - Créer un [compte sur OpenStreetMap](https://www.openstreetmap.org/user/new). 65 | 66 | - Naviguer manuellement sur la carte jusqu'au lieu auquel une adresse doit être rattachée. 67 | 68 | - Cliquer sur "Modifier" en haut à gauche. 69 | 70 | - Sur la carte, cliquer sur la petite icône du lieu à modifier. 71 | 72 | - Dans le panneau latéral qui s'ouvre à gauche, ajouter l'adresse du lieu. Vous pouvez en profiter pour enrichir les données OSM avec des données supplémentaires comme le numéro de téléphone, le site web, etc. 73 | 74 | - Une fois les attributs renseignés, cliquer sur "Sauvegarder" en haut à droite. 75 | 76 | - Écrire un message décrivant vos modifications. Par exemple : "Ajout de l'adresse à un lieu existant". 77 | 78 | - Cliquer sur "Envoyer". 79 | 80 | Merci d'avoir contribué à OpenStreetMap ! 81 | 82 | Attendre une dizaine de minutes, et relire ce tutoriel depuis le début :) 83 | 84 | ### Créer un lieu 85 | 86 | Suivre les étapes suivantes: 87 | 88 | - Créer un [compte sur OpenStreetMap](https://www.openstreetmap.org/user/new). 89 | 90 | - Naviguer manuellement sur la carte jusqu'à l'endroit où le lieu doit être ajouté. 91 | 92 | - Cliquer sur "Point" en haut au centre. 93 | 94 | - Cliquer sur le bâtiment où le lieu doit être ajouté. 95 | 96 | - Dans le panneau latéral qui s'ouvre à gauche, choisir un type pour le lieu. Par exemple, `Café`, `Restaurant`, `Espace de coworking`, ou `Centre communautaire` pour un tiers-lieu. Ajouter ensuite le nom et l'adresse du lieu. Vous pouvez en profiter pour enrichir les données OSM avec des données supplémentaires comme le numéro de téléphone, le site web, etc. 97 | 98 | - Une fois les attributs renseignés, cliquer sur "Sauvegarder" en haut à droite. 99 | 100 | - Écrire un message décrivant vos modifications. Par exemple : "Ajout d'un lieu". 101 | 102 | - Cliquer sur "Envoyer". 103 | 104 | Merci d'avoir contribué à OpenStreetMap ! 
105 | 106 | Attendre une dizaine de minutes, et relire ce tutoriel depuis le début :) 107 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/apis/glorieuses.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import time 4 | import logging 5 | 6 | from datetime import datetime 7 | 8 | from trouver_une_fresque_scraper.db.records import get_record_dict 9 | from trouver_une_fresque_scraper.utils.errors import FreskError 10 | from trouver_une_fresque_scraper.utils.keywords import * 11 | from trouver_une_fresque_scraper.utils.language import detect_language_code 12 | from trouver_une_fresque_scraper.utils.location import get_address 13 | 14 | 15 | def get_glorieuses_data(source): 16 | logging.info("Getting data from Glorieuses API") 17 | 18 | json_records = [] 19 | records = [] 20 | 21 | try: 22 | response = requests.get(source["url"]) 23 | # Check if the request was successful (status code 200) 24 | if response.status_code == 200: 25 | json_records = response.json() 26 | else: 27 | logging.info(f"Request failed with status code: {response.status_code}") 28 | except requests.RequestException as e: 29 | logging.info(f"An error occurred: {e}") 30 | 31 | for json_record in json_records: 32 | time.sleep(1.5) 33 | logging.info("") 34 | 35 | ################################################################ 36 | # Get event id 37 | ################################################################ 38 | event_id = json_record["RECORD_ID()"] 39 | 40 | ################################################################ 41 | # Get event title 42 | ################################################################ 43 | title = json_record["Label event"] 44 | 45 | ################################################################ 46 | # Parse start and end dates 47 | ################################################################ 48 | event_start_time = json_record["Date"] 49 | 50 | try: 51 | # Convert time strings to datetime objects 52 | event_start_datetime = datetime.strptime(event_start_time, "%Y-%m-%dT%H:%M:%S.%fZ") 53 | except Exception as e: 54 | logging.info(f"Rejecting record: bad date format {e}") 55 | continue 56 | 57 | event_end_time = json_record["Date fin"] 58 | 59 | try: 60 | # Convert time strings to datetime objects 61 | event_end_datetime = datetime.strptime(event_end_time, "%Y-%m-%dT%H:%M:%S.%fZ") 62 | except Exception as e: 63 | logging.info(f"Rejecting record: bad date format {e}") 64 | continue 65 | 66 | ########################################################### 67 | # Is it an online event? 
68 | ################################################################ 69 | if "Format" in json_record and json_record["Format"] is not None: 70 | online = is_online(json_record["Format"]) 71 | else: 72 | logging.info(f"Rejecting record: no workshop format provided") 73 | continue 74 | 75 | ################################################################ 76 | # Location data 77 | ################################################################ 78 | full_location = "" 79 | location_name = "" 80 | address = "" 81 | city = "" 82 | department = "" 83 | longitude = "" 84 | latitude = "" 85 | zip_code = "" 86 | country_code = "" 87 | 88 | if not online: 89 | address = json_record["Adresse"] 90 | if not address: 91 | logging.info("Rejecting record: no address provided") 92 | continue 93 | 94 | city = json_record["Ville"] 95 | full_location = f"{address}, {city}" 96 | 97 | try: 98 | address_dict = get_address(full_location) 99 | ( 100 | location_name, 101 | address, 102 | city, 103 | department, 104 | zip_code, 105 | country_code, 106 | latitude, 107 | longitude, 108 | ) = address_dict.values() 109 | except json.JSONDecodeError: 110 | logging.info("Rejecting record: error while parsing API response") 111 | continue 112 | except FreskError as error: 113 | logging.info(f"Rejecting record: {error}.") 114 | continue 115 | 116 | ################################################################ 117 | # Description 118 | ################################################################ 119 | description = json_record["Label event"] 120 | 121 | ################################################################ 122 | # Training? 123 | ################################################################ 124 | training = is_training(json_record["Type"]) 125 | 126 | ################################################################ 127 | # Is it full? 128 | ################################################################ 129 | sold_out = False 130 | 131 | ################################################################ 132 | # Is it suited for kids? 
133 |         ################################################################
134 |         kids = False
135 | 
136 |         ################################################################
137 |         # Parse tickets link
138 |         ################################################################
139 |         tickets_link = json_record["Lien billeterie"]
140 |         source_link = tickets_link
141 | 
142 |         ################################################################
143 |         # Building final object
144 |         ################################################################
145 |         record = get_record_dict(
146 |             f"{source['id']}-{event_id}",
147 |             source["id"],
148 |             title,
149 |             event_start_datetime,
150 |             event_end_datetime,
151 |             full_location,
152 |             location_name,
153 |             address,
154 |             city,
155 |             department,
156 |             zip_code,
157 |             country_code,
158 |             latitude,
159 |             longitude,
160 |             source.get("language_code", detect_language_code(title, description)),
161 |             online,
162 |             training,
163 |             sold_out,
164 |             kids,
165 |             source_link,
166 |             tickets_link,
167 |             description,
168 |         )
169 | 
170 |         records.append(record)
171 |         logging.info(f"Successfully parsed API record\n{json.dumps(record, indent=4)}")
172 | 
173 |     return records
174 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # trouver-une-fresque-scraper
2 | 
3 | Le scraper de Trouver une Fresque est un outil open source permettant de détecter les ateliers disponibles dans votre département.
4 | 
5 | Les données sont extraites des billetteries officielles via la technique du scraping. La validité des adresses est vérifiée en utilisant les données d'OpenStreetMap.
6 | 
7 | Si vous utilisez ce code, merci de respecter la [charte de Nominatim](https://operations.osmfoundation.org/policies/nominatim/).
8 | 
9 | ## 🌍 Organisateurs: signaler un problème
10 | 
11 | Si vous êtes l'organisateur d'un atelier Fresque et que votre évènement n'apparaît pas sur la plateforme Trouver une Fresque, merci de lire le [tutoriel à destination des organisateurs de fresques](https://github.com/trouver-une-fresque/trouver-une-fresque/blob/main/TUTORIAL.md).
12 | 
13 | Ouvrez une [issue Github](https://github.com/thomas-bouvier/trouver-une-fresque/issues/new) si vous souhaitez signaler un problème non couvert dans le tutoriel, ou suggérer l'intégration d'un nouvel atelier.
14 | 
15 | Les ateliers actuellement supportés sont listés sur la [feuille de route](WORKSHOPS.md).
16 | 
17 | ## 🤖 Développeurs: installation
18 | 
19 | Le scraping est effectué en utilisant Selenium, qui s'appuie sur geckodriver pour afficher les données à récupérer. Notre outil peut être installé sur un Raspberry Pi sans problème.
20 | 
21 | ### Avec `flox` (méthode recommandée)
22 | 
23 | Flox est un gestionnaire de paquets multiplateforme qui vise à permettre la reproductibilité, la robustesse, la portabilité et la stabilité des systèmes d'information. Cette approche permet d'installer les paquets Python et dépendances système en une seule fois.
24 | 
25 | Suivez les instructions pour installer Flox sur votre système [ici](https://flox.dev/docs/install-flox/). Tout est prêt ! Utilisez la commande `flox activate` dans ce dossier pour commencer à développer.
26 | 
27 | Vérifiez que tout fonctionne:
28 | 
29 | ```console
30 | python -c "import trouver_une_fresque_scraper as m; print(m.__file__)"
31 | ```
32 | 
33 | ### Manuellement avec `uv`
34 | 
35 | Cette méthode d'installation n'est pas recommandée.
Préférez l'utilisation de Flox, qui vous facilitera la tâche et garantira d'avoir toutes les dépendances nécessaires pour lancer le scraper.
36 | 
37 | Téléchargez la version la plus récente de [geckodriver](https://github.com/mozilla/geckodriver/releases), puis extrayez le binaire `geckodriver` dans un dossier `bin/` (ou n'importe où sur votre système).
38 | 
39 | Les librairies suivantes doivent être installées sur votre système:
40 | 
41 | ```console
42 | apt install firefox-esr libpq-dev python3-dev
43 | ```
44 | 
45 | Enfin, suivez les instructions pour installer `uv` [ici](https://docs.astral.sh/uv/getting-started/installation/) et créez un environnement Python:
46 | 
47 | ```console
48 | uv venv .venv --python 3.13
49 | ```
50 | 
51 | Activez l'environnement:
52 | 
53 | ```console
54 | source .venv/bin/activate
55 | ```
56 | 
57 | Installez le scraper avec:
58 | 
59 | ```console
60 | uv sync
61 | ```
62 | 
63 | Vérifiez que tout fonctionne:
64 | 
65 | ```console
66 | python -c "import trouver_une_fresque_scraper as m; print(m.__file__)"
67 | ```
68 | 
69 | ## 🤖 Développeurs: utilisation
70 | 
71 | Avant de contribuer au projet, assurez-vous d'avoir lu le document [CONTRIBUTING.md](./CONTRIBUTING.md).
72 | 
73 | ### Configuration
74 | 
75 | Renommez le fichier de configuration `config.json.dist` en `config.json` et renseignez les champs.
76 | 
77 | ```json
78 | {
79 |     "webdriver": "",
80 |     "host" : "",
81 |     "port" : "",
82 |     "user" : "",
83 |     "psw" : "",
84 |     "database": "",
85 |     "timezone": "Europe/Paris"
86 | }
87 | ```
88 | 
89 | Le champ `webdriver` n'est à renseigner, avec le chemin vers le binaire `geckodriver`, que dans le cas d'une installation sans Flox (c'est-à-dire manuelle avec `uv`).
90 | 
91 | 
92 | ### Lancer le scraping
93 | 
94 | ```console
95 | python -m trouver_une_fresque_scraper.scrape
96 | # ou
97 | python -m trouver_une_fresque_scraper.scrape --headless --country ch --skip-dirty-check
98 | ```
99 | 
100 | À la fin du scraping, un fichier JSON nommé avec le format `events_20230814_153752.json` est créé dans le dossier `results/`.
101 | 
102 | L'option `--headless` exécute le scraping en mode headless, et `--push-to-db` pousse les résultats du fichier json de sortie dans la base de données en utilisant les identifiants définis dans `config.json`.
103 | 
104 | ### Base de données
105 | 
106 | Nous utilisons [Supabase](https://supabase.com/docs/guides/cli/local-development) pour persister les données scrapées, une alternative open source à Firebase qui fournit une base de données Postgres gratuitement.
107 | 
108 | Connectez-vous à la CLI, puis initialisez et démarrez la base de données. Au démarrage, si le fichier `supabase/seed.sql` est présent, ses instructions `INSERT` sont exécutées pour remplir la base avec des données de test.
109 | 
110 | ```console
111 | supabase login
112 | supabase init
113 | supabase start
114 | ```
115 | 
116 | Le fichier `supabase/tables.sql` contient les instructions SQL permettant de créer les tables nécessaires.
117 | 
118 | Pour pousser des données dans la base, utilisez la commande suivante :
119 | 
120 | ```console
121 | python push_to_db.py --input results/output.json
122 | ```
123 | 
124 | Cette commande effectue les actions suivantes :
125 | 
126 | - Tous les évènements sont insérés dans la table historique `events_scraped`, avec `most_recent=False` ; l'appel à `update_most_recent()` décrit ci-dessous pourra repasser cet attribut à `True`.
127 | - Tous les évènements de `events_future` sont supprimés avant d'être réinsérés, afin d'être mis à jour, avec `most_recent=True`.
128 | - L'attribut `most_recent` des évènements de `events_scraped` est repassé à `True` lorsque les conditions suivantes sont réunies :
129 |     - Une requête identifie les lignes de la table `events_scraped` qui n'ont pas de correspondance dans la table `events_future`.
130 |     - Pour ces lignes, elle détermine la `scrape_date` la plus récente pour chaque couple (`id`, `workshop_type`).
131 |     - Elle passe alors la colonne `most_recent` à `TRUE` pour ces lignes, mais uniquement si la `start_date` de l'évènement est déjà passée.
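 | 
 | Pour fixer les idées, la logique de `update_most_recent()` décrite ci-dessus peut se traduire par une requête de ce type. Il s'agit d'une esquisse hypothétique, et non du code réel du projet : les noms de colonnes (`id`, `workshop_type`, `scrape_date`, `start_date`, `most_recent`) sont supposés d'après la description ci-dessus, et le schéma exact figure dans `supabase/tables.sql`.
 | 
 | ```python
 | import psycopg
 | 
 | UPDATE_MOST_RECENT = """
 | UPDATE events_scraped AS es
 | SET most_recent = TRUE
 | FROM (
 |     SELECT s.id, s.workshop_type, MAX(s.scrape_date) AS last_scrape
 |     FROM events_scraped AS s
 |     WHERE NOT EXISTS (
 |         SELECT 1 FROM events_future AS f
 |         WHERE f.id = s.id AND f.workshop_type = s.workshop_type
 |     )
 |     GROUP BY s.id, s.workshop_type
 | ) AS latest
 | WHERE es.id = latest.id
 |   AND es.workshop_type = latest.workshop_type
 |   AND es.scrape_date = latest.last_scrape
 |   AND es.start_date < NOW();
 | """
 | 
 | def update_most_recent(conn: psycopg.Connection) -> None:
 |     # Repasse most_recent à TRUE pour la ligne la plus récente de chaque
 |     # évènement absent de events_future, si sa start_date est déjà passée.
 |     with conn.cursor() as cur:
 |         cur.execute(UPDATE_MOST_RECENT)
 | ```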
132 | 
133 | ### Lancer les tests
134 | 
135 | ```console
136 | cd tests
137 | python scrape_test.py
138 | ```
139 | 
140 | ## Comment contribuer
141 | 
142 | Pour proposer une modification, un ajout, ou décrire un bug sur l'outil de détection, vous pouvez ouvrir une [issue](https://github.com/thomas-bouvier/trouver-une-fresque/issues/new) ou une [Pull Request](https://github.com/thomas-bouvier/trouver-une-fresque/pulls) avec vos modifications.
143 | 
144 | Avant de développer, merci d'installer le hook git en suivant les instructions listées dans le fichier [CONTRIBUTING](https://github.com/trouver-une-fresque/trouver-une-fresque/blob/main/CONTRIBUTING.md). Pour le code en Python, veillez à respecter le standard PEP8 avant de soumettre une Pull Request. La plupart des IDEs et éditeurs de code modernes proposent des outils permettant de mettre en page votre code en suivant ce standard automatiquement.
145 | 
--------------------------------------------------------------------------------
/WORKSHOPS.md:
--------------------------------------------------------------------------------
1 | # Ateliers supportés et feuille de route
2 | 
3 | Développer et maintenir Trouver une Fresque nous prend beaucoup de temps et d'énergie, et l'infrastructure n'est pas gratuite non plus. Il nous reste encore beaucoup d'ateliers à intégrer, en France et à l'international.
N'hésitez pas à nous soutenir via un petit [don récurrent](https://fr.liberapay.com/trouver-une-fresque/), merci :)
4 | 
5 | ## Ateliers supportés
6 | 
7 | ### Country `fr`
8 | 
9 | | Atelier | Lien | Source | Supporté |
10 | | ------------- |:-------------:| :-----:| :-----:|
11 | | Fresque du Climat | https://fresqueduclimat.org/participer-a-un-atelier-grand-public | Scraping fdc | OK |
12 | | Atelier 2tonnes | https://www.eventbrite.fr/o/2-tonnes-29470123869 | Scraping Eventbrite | OK |
13 | | Fresque de la Biodiversité | https://www.fresquedelabiodiversite.org/#participer | Scraping Billetweb | OK |
14 | | Fresque Océane | https://www.billetweb.fr/pro/billetteriefo | Scraping Billetweb | OK |
15 | | Fresque Agri'Alim | https://www.billetweb.fr/pro/fresqueagrialim | Scraping Billetweb | OK |
16 | | Fresque du Numérique | https://www.fresquedunumerique.org/#participer | Scraping Billetweb | OK |
17 | | Fresque des Nouveaux Récits | https://www.billetweb.fr/pro/fdnr | Scraping Billetweb | OK |
18 | | Fresque de la Mobilité | https://www.billetweb.fr/pro/fresquedelamobilite | Scraping Billetweb | OK |
19 | | Fresque de l'Alimentation | https://www.billetweb.fr/pro/fresquealimentation | Scraping Billetweb | OK |
20 | | Fresque de la Construction | https://www.billetweb.fr/pro/fresquedelaconstruction | Scraping Billetweb | OK |
21 | | Fresque du Sexisme | https://www.billetweb.fr/pro/fresque-du-sexisme | Scraping Billetweb | OK |
22 | | Atelier OGRE | https://www.billetweb.fr/pro/atelierogre | Scraping Billetweb | OK |
23 | | Fresque Nos Vies Bas Carbone | https://www.billetweb.fr/multi_event.php?user=132897 | Scraping Billetweb | OK |
24 | | Fresque de l'Eau | https://www.billetweb.fr/multi_event.php?user=138110 | Scraping Billetweb | OK |
25 | | Atelier futurs proches | https://www.billetweb.fr/pro/futursproches | Scraping Billetweb | OK |
26 | | Fresque de la Diversité | https://www.billetweb.fr/multi_event.php?user=168799 | Scraping Billetweb | OK |
27 | | Fresque de l'Économie Circulaire | https://www.billetweb.fr/multi_event.php?user=246258 | Scraping Billetweb | OK |
28 | | Fresque du Textile | https://www.billetweb.fr/multi_event.php?user=166793 | Scraping Billetweb | OK |
29 | | Fresque des Déchets | https://www.billetweb.fr/multi_event.php?user=166793 | Scraping Billetweb | OK |
30 | | Fresque des Frontières Planétaires | https://1erdegre.glide.page/dl/6471c6 | Scraping Glide Pages | OK |
31 | | Fresque de la Finance | https://www.billetweb.fr/pro/fresquedelafinance | Scraping Billetweb | OK |
32 | | Puzzle Climat | https://www.puzzleclimat.org/ | Scraping Billetweb | OK |
33 | | Atelier Horizons Décarbonés | https://1erdegre.glide.page/dl/6471c6 | Scraping Glide Pages | OK |
34 | | 2030 Glorieuses | https://www.2030glorieuses.org/event | API | OK |
35 | | Fresque de la RSE | https://www.billetweb.fr/multi_event.php?user=139214 | Scraping Billetweb | OK |
36 | | Atelier des Transitions Urbaines | https://www.billetweb.fr/multi_event.php?user=216884 | Scraping Billetweb | OK |
37 | | Fresque de la Rénovation | https://www.helloasso.com/associations/fresque-de-la-renovation/ | Scraping HelloAsso | OK |
38 | | Fresque de l'Energie | https://www.helloasso.com/associations/la-fresque-de-l-energie | Scraping HelloAsso | OK |
39 | | Fresque des Possibles | https://www.helloasso.com/associations/le-lieu-dit | Scraping HelloAsso | OK |
40 | | Fresque de la Communication | https://www.helloasso.com/associations/la-fresque-de-la-communication | Scraping HelloAsso | OK |
41 | | Zoofresque | https://www.helloasso.com/associations/ajas-association-justice-animaux-savoie | Scraping HelloAsso | OK |
42 | | Atelier Compte-Gouttes | https://www.eventbrite.com/o/atelier-compte-gouttes-73003088333 | Scraping Eventbrite | OK |
43 | | Fresque du Bénévolat | https://www.eventbrite.fr/o/jeveuxaidergouvfr-77010082313 | Scraping Eventbrite | OK |
44 | | Fresque du Plastique | https://www.eventbrite.fr/o/la-fresque-du-plastique-45763194553 | Scraping Eventbrite | OK |
45 | | Cyber Fresque | https://www.eventbrite.fr/o/senscyb-89802295343 | Scraping Eventbrite | OK |
46 | | Fresque du Sol | https://fresquedusol.com/comment-participer/dates-a-venir/ | Calendrier ICS | OK |
47 | | Notre Tour | https://www.helloasso.com/associations/mush | Scraping HelloAsso | OK |
48 | | Planet C Play Again? | https://planetc.org/ | Calendrier ICS | OK |
49 | | Fresque de la Transition Energétique | https://fresques-tilleul.glide.page/dl/6471c6 | Scraping Glide Pages | Prévu, priorité 1 |
50 | | Pitch Climat | https://www.billetweb.fr/multi_event.php?user=186116 | Scraping Billetweb | Prévu, priorité 1 |
51 | | Fresque de l'Equité | https://www.fresquedelequite.fr/ | Scraping custom | Prévu, priorité 1 |
52 | | Fresque du Mouvement | https://www.eventbrite.fr/o/la-fresque-du-mouvement-108241184341 | Scraping Eventbrite | Prévu, priorité 1 |
53 | | Fresque de l'Environnement | | API | Prévu, priorité 2 |
54 | | Fresque de la Forêt | https://all4trees.org/agir/fresque-foret/evenements | Scraping site custom | Prévu, priorité 2 |
55 | | Atelier Découverte de la Renaissance Écologique | https://renaissanceecologique.org/ | Scraping site custom | Prévu, priorité 2 |
56 | | Atelier Éco-challenge Little Big Impact | https://www.billetweb.fr/pro/lbi-quiz-sedd | Scraping Billetweb | Prévu, priorité 2 |
57 | | Fresque de l'Attention | https://www.billetweb.fr/pro/fresquedelattention | Scraping Billetweb | Prévu, priorité 2 |
58 | | Fresque des Écrans | https://colori.fr/la-fresque-des-ecrans | Scraping custom | Prévu, priorité 2 |
59 | | Fresque de l'Éco-conception | https://www.lafresquedelecoconception.fr/infos-pratiques | Scraping site custom | Prévu, priorité 2 |
60 | | Atelier L'éco-naissance | https://www.eventbrite.fr/o/leco-naissance-62237583643 | Scraping Eventbrite | En réflexion |
61 | | Fresque de l'Emploi Durable | https://www.helloasso.com/associations/solidarites-nouvelles-face-au-chomage-snc | Scraping HelloAsso | En réflexion |
62 | | Atelier Marche du Temps Profond | https://www.helloasso.com/associations/ecotopia | Scraping HelloAsso | En réflexion |
63 | | Fresque des Entreprises Inclusives | https://www.helloasso.com/associations/tous-tes-possibles/evenements/fresque-des-entreprises-inclusives | Scraping HelloAsso | En réflexion |
64 | 
65 | ## Initiatives locales
66 | 
67 | Nous avons pris connaissance d'initiatives locales organisées au niveau du département. Nous réfléchissons à un moyen d'intégrer ces ateliers. Le souci est que certains ateliers pourraient être dupliqués par rapport aux billetteries officielles.
68 | 
69 | | Département | Lien | Source | Supporté |
70 | | ------------- |:-------------:| :-----:| :-----:|
71 | | Atelier Déclics | https://www.helloasso.com/associations/objective-zero | Scraping HelloAsso | Non |
72 | | Isère | https://enjeuxcommuns.fr/les-prochains-ateliers-en-isere/ | Airtable | Non |
73 | | Bas-Rhin/Haut-Rhin | https://hoplatransition.org/index.php/nos-evenements/ | Framagenda | Non |
74 | | Réunion | https://fresques.re/event-directory/ | Custom | Non |
75 | | Marseille | https://fada.earth/ | Airtable | Non |
76 | 
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/scrape.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import subprocess
5 | import sys
6 | import pandas as pd
7 | import psycopg
8 | 
9 | from datetime import datetime
10 | from pathlib import Path
11 | from psycopg.conninfo import make_conninfo
12 | 
13 | from trouver_une_fresque_scraper.apis import main as main_apis
14 | from trouver_une_fresque_scraper.scraper import main as main_scraper
 | # These two imports were missing, causing a NameError when --push-to-db is
 | # used below (etl is expected to live in db/etl.py).
 | from trouver_une_fresque_scraper.db.etl import etl
 | from trouver_une_fresque_scraper.utils.utils import get_config
15 | 
16 | 
17 | def configure_logging(log_file_path, error_log_file_path):
18 |     """
19 |     Configures the logging system to write all levels of messages to both a file and the console,
20 |     and errors to a separate file.
21 | 
22 |     :param log_file_path: The path to the log file for all levels of messages.
23 |     :param error_log_file_path: The path to the log file for error messages only.
24 |     """
25 |     # Ensure the directories exist
26 |     log_file_path.parent.mkdir(parents=True, exist_ok=True)
27 |     error_log_file_path.parent.mkdir(parents=True, exist_ok=True)
28 | 
29 |     # Create a logger
30 |     logger = logging.getLogger()
31 |     logger.setLevel(logging.INFO)
32 | 
33 |     # Create a file handler for all levels of messages
34 |     file_handler = logging.FileHandler(log_file_path)
35 |     file_handler.setLevel(logging.INFO)
36 |     file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
37 | 
38 |     # Create a stream handler for all levels of messages
39 |     stream_handler = logging.StreamHandler()
40 |     stream_handler.setLevel(logging.INFO)
41 |     stream_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
42 | 
43 |     # Create a file handler for error messages only
44 |     error_file_handler = logging.FileHandler(error_log_file_path)
45 |     error_file_handler.setLevel(logging.ERROR)
46 |     error_file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
47 | 
48 |     # Add handlers to the logger
49 |     logger.addHandler(file_handler)
50 |     logger.addHandler(stream_handler)
51 |     logger.addHandler(error_file_handler)
52 | 
53 | 
54 | def is_git_repository_dirty():
55 |     # Check if the repository is dirty (check=True makes a failing git command
 |     # raise CalledProcessError, so the except block below is actually reachable)
56 |     try:
57 |         result = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True, check=True)
58 |         return bool(result.stdout.strip())
59 |     except subprocess.CalledProcessError as e:
60 |         logging.error(f"Error checking git status: {e}")
61 |         sys.exit(1)
62 | 
63 | 
64 | def get_git_commit_hash():
65 |     # Get the current commit hash
66 |     try:
67 |         result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True)
68 |         return result.stdout.strip()
69 |     except subprocess.CalledProcessError as e:
70 |         logging.error(f"Error getting git commit hash: {e}")
71 |         sys.exit(1)
72 | 
73 | 
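 | # Each entry in countries/<country>.json is a dict validated below. An
 | # illustrative example, taken from countries/fr.json ("iframe" and "filter"
 | # are optional, scraper-specific keys on top of the required ones):
 | #
 | #     {
 | #         "name": "Fresque Océane",
 | #         "url": "https://www.billetweb.fr/pro/billetteriefo",
 | #         "type": "scraper",
 | #         "iframe": "event15247",
 | #         "id": 1
 | #     }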
logging.error(f"Failed to decode JSON: {e}") 79 | raise 80 | except Exception as e: 81 | logging.error(f"An unexpected error occurred: {e}") 82 | raise 83 | 84 | # Validate the data structure 85 | for d in data: 86 | if not isinstance(d, dict): 87 | logging.error(f"Invalid data structure: expected a dictionary, got {type(d).__name__}") 88 | raise 89 | 90 | required_keys = ["name", "id", "url", "type"] 91 | for key in required_keys: 92 | if key not in d: 93 | logging.error(f"Missing required key '{key}' in data: {d}") 94 | raise 95 | 96 | scrapers, apis = [], [] 97 | for d in data: 98 | if d["type"] == "scraper": 99 | scrapers.append(d) 100 | elif d["type"] == "api": 101 | apis.append(d) 102 | 103 | return scrapers, apis 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument( 109 | "--country", 110 | default="fr", 111 | help="run the scraper for the given json containing data sources", 112 | ) 113 | parser.add_argument( 114 | "--headless", 115 | action="store_true", 116 | default=False, 117 | help="run scraping in headless mode", 118 | ) 119 | parser.add_argument( 120 | "--push-to-db", 121 | action="store_true", 122 | default=False, 123 | help="push the scraped results to db", 124 | ) 125 | parser.add_argument( 126 | "--skip-dirty-check", 127 | action="store_true", 128 | default=False, 129 | help="skips checking that the git repository is clean", 130 | ) 131 | args = parser.parse_args() 132 | 133 | # This scraper should be run from a clean state to ensure reproducibility 134 | dirty = is_git_repository_dirty() 135 | if dirty and not args.skip_dirty_check: 136 | logging.warning("The git repository is dirty. Consider a clean state for reproducibility.") 137 | user_input = input("Do you want to continue? 
(y/n): ").strip().lower() 138 | if user_input != "y": 139 | logging.error("Operation cancelled.") 140 | sys.exit(0) 141 | 142 | # Validate the source file 143 | source_path = Path(f"countries/{args.country}.json") 144 | try: 145 | with open(source_path, "r") as file: 146 | content = file.read() 147 | except FileNotFoundError: 148 | logging.info(f"Source file {source_path} does not exist.") 149 | raise 150 | 151 | # Parse the sources 152 | scrapers, apis = get_sources(content) 153 | 154 | # Build the results path for this run 155 | dt = datetime.now() 156 | scraping_time = dt.strftime("%Y%m%d_%H%M%S") 157 | results_path = Path(f"results/{args.country}/{scraping_time}") 158 | results_path.mkdir(parents=True, exist_ok=True) 159 | commit_hash = get_git_commit_hash() 160 | with open(f"{results_path}/commit_hash.txt", "w") as file: 161 | file.write(commit_hash) 162 | if dirty: 163 | file.write("\n" + "dirty" + "\n") 164 | 165 | # Logging 166 | log_path = results_path / Path("log.txt") 167 | errors_path = results_path / Path("error_log.txt") 168 | configure_logging(log_path, errors_path) 169 | 170 | # Launch the scraper 171 | df1 = main_scraper(scrapers, headless=args.headless) 172 | df2 = main_apis(apis) 173 | df_merged = pd.concat([df1, df2]) 174 | 175 | dt = datetime.now() 176 | insert_time = dt.strftime("%Y%m%d_%H%M%S") 177 | with open(results_path / Path(f"events_{insert_time}.json"), "w", encoding="UTF-8") as file: 178 | df_merged.to_json(file, orient="records", force_ascii=False, indent=2) 179 | 180 | # Push the resulting json file to the database 181 | if args.push_to_db: 182 | logging.info("Pushing scraped results into db...") 183 | credentials = get_config() 184 | host = credentials["host"] 185 | port = credentials["port"] 186 | user = credentials["user"] 187 | psw = credentials["psw"] 188 | database = credentials["database"] 189 | 190 | with psycopg.connect( 191 | make_conninfo(dbname=database, user=user, password=psw, host=host, port=port) 192 | ) as conn: 193 | etl(conn, df_merged) 194 | 195 | logging.info("Done") 196 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/apis/ics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import json 4 | import pytz 5 | import re 6 | import requests 7 | import logging 8 | 9 | from trouver_une_fresque_scraper.db.records import get_record_dict 10 | from ics import Calendar 11 | import re 12 | from trouver_une_fresque_scraper.utils.errors import FreskError 13 | from trouver_une_fresque_scraper.utils.language import detect_language_code 14 | from trouver_une_fresque_scraper.utils.location import get_address 15 | import xml.etree.ElementTree as ET 16 | 17 | 18 | # from https://regexr.com/37i6s 19 | REGEX_URL = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)" 20 | 21 | IGNORABLE_DOMAINS = [ 22 | "https://meet.google.com", 23 | "https://support.google.com", 24 | "https://us02web.zoom.us", 25 | ] 26 | 27 | TICKETING_TEXT = ["billetterie", "registration", "ticket", "inscription"] 28 | 29 | 30 | # Returns a ticketing URL extracted from a description in plain text or formatted as HTML. 
31 | def get_ticketing_url_from_description(description):
32 |     # list of tuples: (URL, anchor text if HTML document otherwise same URL)
33 |     links = []
34 | 
35 |     try:
36 |         # try as HTML document
37 |         root = ET.fromstring(description)
38 |         for elem in root.findall(".//a[@href]"):
39 |             links.append((elem.get("href"), elem.text))
40 |     except ET.ParseError:
41 |         # fall back to plain text
42 |         for url in re.findall(REGEX_URL, description):
43 |             links.append((url, url))
44 | 
45 |     def should_link_be_kept(link):
46 |         url = link[0]
47 |         for domain in IGNORABLE_DOMAINS:
48 |             if url.startswith(domain):
49 |                 return False
50 |         return True
51 | 
52 |     links = list(filter(should_link_be_kept, links))
53 |     if len(links) == 1:
54 |         return links[0][0]
55 | 
56 |     def does_text_look_like_registration(link):
 |         # The anchor text may be None for an empty <a> tag; lower-case it so it
 |         # can actually match the lower-case TICKETING_TEXT keywords (the
 |         # original .upper() call could never match them).
57 |         lower_text = (link[1] or "").lower()
58 |         for text in TICKETING_TEXT:
59 |             if lower_text.find(text) > -1:
60 |                 return True
61 |         return False
62 | 
63 |     links = list(filter(does_text_look_like_registration, links))
64 |     if len(links) == 1:
65 |         return links[0][0]
66 | 
67 |     return None
68 | 
69 | 
70 | def get_ics_data(source):
71 |     logging.info(f"Getting iCalendar data from {source['url']}")
72 | 
73 |     calendar = None
74 |     records = []
75 | 
76 |     try:
77 |         response = requests.get(source["url"])
78 |         # Check if the request was successful (status code 200).
79 |         if response.status_code == 200:
80 |             # Remove VALARMs which incorrectly crash the ics library.
81 |             text = re.sub("BEGIN:VALARM.*END:VALARM", "", response.text, flags=re.DOTALL)
82 |             calendar = Calendar(text)
83 |         else:
84 |             logging.info(f"Request failed with status code: {response.status_code}")
85 |     except requests.RequestException as e:
86 |         logging.info(f"An error occurred: {e}")
87 | 
88 |     if not calendar:
89 |         return records
90 | 
91 |     for event in calendar.events:
92 |         logging.info(f"Processing event {event.name}")
93 | 
94 |         ################################################################
95 |         # Kick out event early if it is in the past
96 |         ################################################################
97 |         event_start_datetime = event.begin
98 |         event_end_datetime = event.end
99 |         if event_start_datetime < pytz.UTC.localize(datetime.datetime.now()):
100 |             logging.info("Rejecting record: start time before now.")
101 |             continue
102 | 
103 |         ################################################################
104 |         # Get basic event metadata
105 |         ################################################################
106 |         event_id = event.uid
107 |         title = event.name
108 |         description = event.description
109 | 
110 |         ################################################################
111 |         # Location data, or online
112 |         ################################################################
113 |         full_location = ""
114 |         location_name = ""
115 |         address = ""
116 |         city = ""
117 |         department = ""
118 |         longitude = ""
119 |         latitude = ""
120 |         zip_code = ""
121 |         country_code = ""
122 | 
123 |         online = event.location is None
124 |         if not online:
125 |             location = event.location.lstrip()
126 |             for domain in IGNORABLE_DOMAINS:
127 |                 if location.startswith(domain):
128 |                     online = True
129 |                     break
130 | 
131 |         if not online:
132 |             try:
133 |                 full_location = event.location
134 |                 address_dict = get_address(full_location.split("\n", 1).pop())
135 |                 (
136 |                     location_name,
137 |                     address,
138 |                     city,
139 |                     department,
140 |                     zip_code,
141 |                     country_code,
142 |                     latitude,
143 |                     longitude,
144 |                 ) = address_dict.values()
145 |             except FreskError as error:
146 |                 logging.info(f"Rejecting record: 
{error}.") 147 | continue 148 | 149 | ################################################################ 150 | # Infer more event metadata 151 | ################################################################ 152 | title_upper = title.upper() 153 | training = "FORMATION" in title_upper or "TRAINING" in title_upper 154 | sold_out = False 155 | kids = False 156 | 157 | ################################################################ 158 | # Get tickets link: try URL else extract from description 159 | ################################################################ 160 | tickets_link = event.url 161 | if not tickets_link and event.description: 162 | tickets_link = get_ticketing_url_from_description(event.description) 163 | if not tickets_link: 164 | logging.warning(f"Rejecting record {event_id}: no ticket link extracted.") 165 | continue 166 | source_link = tickets_link 167 | 168 | ################################################################ 169 | # Building final object 170 | ################################################################ 171 | record = get_record_dict( 172 | f"{source['id']}-{event_id}", 173 | source["id"], 174 | title, 175 | event_start_datetime, 176 | event_end_datetime, 177 | full_location, 178 | location_name, 179 | address, 180 | city, 181 | department, 182 | zip_code, 183 | country_code, 184 | latitude, 185 | longitude, 186 | source.get("language_code", detect_language_code(title, description)), 187 | online, 188 | training, 189 | sold_out, 190 | kids, 191 | source_link, 192 | tickets_link, 193 | description, 194 | ) 195 | 196 | records.append(record) 197 | logging.info(f"Successfully got record\n{json.dumps(record, indent=4)}") 198 | 199 | logging.info(f"Got {len(records)} records.") 200 | return records 201 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/utils/location.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 4 | from functools import lru_cache 5 | from trouver_une_fresque_scraper.utils.errors import * 6 | 7 | from geopy.geocoders import Nominatim 8 | 9 | geolocator = Nominatim(user_agent="trouver-une-fresque", timeout=10) 10 | 11 | departments = { 12 | "01": "Ain", 13 | "02": "Aisne", 14 | "03": "Allier", 15 | "04": "Alpes-de-Haute-Provence", 16 | "05": "Hautes-Alpes", 17 | "06": "Alpes-Maritimes", 18 | "07": "Ardèche", 19 | "08": "Ardennes", 20 | "09": "Ariège", 21 | "10": "Aube", 22 | "11": "Aude", 23 | "12": "Aveyron", 24 | "13": "Bouches-du-Rhône", 25 | "14": "Calvados", 26 | "15": "Cantal", 27 | "16": "Charente", 28 | "17": "Charente-Maritime", 29 | "18": "Cher", 30 | "19": "Corrèze", 31 | "2A": "Corse-du-Sud", 32 | "2B": "Haute-Corse", 33 | "21": "Côte-d'Or", 34 | "22": "Côtes-d'Armor", 35 | "23": "Creuse", 36 | "24": "Dordogne", 37 | "25": "Doubs", 38 | "26": "Drôme", 39 | "27": "Eure", 40 | "28": "Eure-et-Loir", 41 | "29": "Finistère", 42 | "30": "Gard", 43 | "31": "Haute-Garonne", 44 | "32": "Gers", 45 | "33": "Gironde", 46 | "34": "Hérault", 47 | "35": "Ille-et-Vilaine", 48 | "36": "Indre", 49 | "37": "Indre-et-Loire", 50 | "38": "Isère", 51 | "39": "Jura", 52 | "40": "Landes", 53 | "41": "Loir-et-Cher", 54 | "42": "Loire", 55 | "43": "Haute-Loire", 56 | "44": "Loire-Atlantique", 57 | "45": "Loiret", 58 | "46": "Lot", 59 | "47": "Lot-et-Garonne", 60 | "48": "Lozère", 61 | "49": "Maine-et-Loire", 62 | "50": "Manche", 63 | "51": "Marne", 64 | "52": "Haute-Marne", 65 | "53": "Mayenne", 66 | "54": 
"Meurthe-et-Moselle", 67 | "55": "Meuse", 68 | "56": "Morbihan", 69 | "57": "Moselle", 70 | "58": "Nièvre", 71 | "59": "Nord", 72 | "60": "Oise", 73 | "61": "Orne", 74 | "62": "Pas-de-Calais", 75 | "63": "Puy-de-Dôme", 76 | "64": "Pyrénées-Atlantiques", 77 | "65": "Hautes-Pyrénées", 78 | "66": "Pyrénées-Orientales", 79 | "67": "Bas-Rhin", 80 | "68": "Haut-Rhin", 81 | "69": "Rhône", 82 | "70": "Haute-Saône", 83 | "71": "Saône-et-Loire", 84 | "72": "Sarthe", 85 | "73": "Savoie", 86 | "74": "Haute-Savoie", 87 | "75": "Paris", 88 | "76": "Seine-Maritime", 89 | "77": "Seine-et-Marne", 90 | "78": "Yvelines", 91 | "79": "Deux-Sèvres", 92 | "80": "Somme", 93 | "81": "Tarn", 94 | "82": "Tarn-et-Garonne", 95 | "83": "Var", 96 | "84": "Vaucluse", 97 | "85": "Vendée", 98 | "86": "Vienne", 99 | "87": "Haute-Vienne", 100 | "88": "Vosges", 101 | "89": "Yonne", 102 | "90": "Territoire de Belfort", 103 | "91": "Essonne", 104 | "92": "Hauts-de-Seine", 105 | "93": "Seine-Saint-Denis", 106 | "94": "Val-de-Marne", 107 | "95": "Val-d'Oise", 108 | "971": "Guadeloupe", 109 | "972": "Martinique", 110 | "973": "Guyane", 111 | "974": "La Réunion", 112 | "976": "Mayotte", 113 | } 114 | 115 | cache = {} 116 | 117 | 118 | @lru_cache(maxsize=None) 119 | def geocode_location_string(location_string): 120 | """ 121 | Requests Nomatim to geocode an input string. All results are cached and 122 | reused thanks to the @lru_cache decorator. 123 | """ 124 | logging.info(f"Calling geocoder: {location_string}") 125 | return geolocator.geocode(location_string, addressdetails=True) 126 | 127 | 128 | def get_address(full_location): 129 | """ 130 | Gets structured location data from an input string, tries substrings if 131 | relevant, verifies that the result is sufficiently precise (address or park 132 | level) and returns a dictionnary with the address properties. 
133 | """ 134 | try: 135 | if not full_location: 136 | raise FreskAddressNotFound("") 137 | 138 | location = geocode_location_string(full_location) 139 | if location is None: 140 | full_location = re.sub(r"\(.*\)", "", full_location) 141 | location = geocode_location_string(full_location) 142 | if location is None: 143 | if "," in full_location: 144 | location = geocode_location_string(full_location.split(",", 1)[1]) 145 | if location is None: 146 | lines = full_location.splitlines(keepends=True) 147 | if len(lines) > 1: 148 | location = geocode_location_string("".join(lines[1:])) 149 | if location is None: 150 | raise FreskAddressNotFound(full_location) 151 | 152 | address = location.raw["address"] 153 | 154 | if address["country_code"] != "fr" and address["country_code"] != "ch" and address["country_code"] != "gb": 155 | raise FreskCountryNotSupported(address, full_location) 156 | 157 | house_number = "" 158 | if "house_number" in address.keys(): 159 | house_number = f"{address['house_number']} " 160 | 161 | road = "" 162 | if "road" in address.keys(): 163 | road = address["road"] 164 | elif "square" in address.keys(): 165 | road = address["square"] 166 | elif "park" in address.keys(): 167 | road = address["park"] 168 | else: 169 | raise FreskAddressBadFormat(address, full_location, "road") 170 | 171 | city = None 172 | if "city" in address.keys(): 173 | city = address["city"] 174 | elif "town" in address.keys(): 175 | city = address["town"] 176 | elif "village" in address.keys(): 177 | city = address["village"] 178 | else: 179 | raise FreskAddressBadFormat(address, full_location, "city") 180 | 181 | # Trying to infer the "department" code 182 | num_department = None 183 | if address["country_code"] == "fr": 184 | department = None 185 | if "state_district" in address.keys(): 186 | department = address["state_district"] 187 | elif "county" in address.keys(): 188 | department = address["county"] 189 | elif "city_district" in address.keys(): 190 | department = address["city_district"] 191 | elif "state" in address.keys(): 192 | department = address["state"] 193 | else: 194 | raise FreskAddressBadFormat(address, full_location, "department") 195 | try: 196 | num_department = department_to_num(department) 197 | except FreskError: 198 | raise 199 | if address["country_code"] == "ch": 200 | # Swiss department "numbers" are ISO codes from https://en.wikipedia.org/wiki/ISO_3166-2:CH. 
201 | if "ISO3166-2-lvl4" in address.keys(): 202 | canton = address["ISO3166-2-lvl4"] 203 | if not canton.startswith("CH-"): 204 | raise FreskAddressBadFormat(address, full_location, "department") 205 | num_department = canton[3:] 206 | else: 207 | raise FreskAddressBadFormat(address, full_location, "department") 208 | 209 | # Missing fields 210 | if "postcode" not in address: 211 | raise FreskAddressIncomplete(address, full_location, "postcode") 212 | 213 | except FreskError as e: 214 | logging.error(f"get_address: {e}") 215 | raise 216 | 217 | return { 218 | "location_name": location.raw["name"], 219 | "address": f"{house_number}{road}", 220 | "city": city, 221 | "department": num_department, 222 | "zip_code": address["postcode"], 223 | "country_code": address["country_code"], 224 | "latitude": location.raw["lat"], 225 | "longitude": location.raw["lon"], 226 | } 227 | 228 | 229 | def department_to_num(department): 230 | for k, v in departments.items(): 231 | if v == department: 232 | return k 233 | raise FreskDepartmentNotFound(f"Department number.") 234 | 235 | -------------------------------------------------------------------------------- /countries/fr.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "Fresque des Nouveaux Récits", 4 | "url": "https://www.billetweb.fr/pro/fdnr", 5 | "type": "scraper", 6 | "iframe": "event21569", 7 | "id": 0 8 | }, 9 | { 10 | "name": "Fresque Océane", 11 | "url": "https://www.billetweb.fr/pro/billetteriefo", 12 | "type": "scraper", 13 | "iframe": "event15247", 14 | "id": 1 15 | }, 16 | { 17 | "name": "Fresque de la Biodiversité", 18 | "url": "https://www.billetweb.fr/multi_event.php?user=82762", 19 | "type": "scraper", 20 | "iframe": "event17309", 21 | "id": 2 22 | }, 23 | { 24 | "name": "Fresque du Numérique", 25 | "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique", 26 | "type": "scraper", 27 | "iframe": "event41180", 28 | "id": 3 29 | }, 30 | { 31 | "name": "Fresque Agri'Alim", 32 | "url": "https://www.billetweb.fr/pro/fresqueagrialim", 33 | "type": "scraper", 34 | "iframe": "event11421", 35 | "id": 4 36 | }, 37 | { 38 | "name": "Fresque de l'Alimentation", 39 | "url": "https://www.billetweb.fr/pro/fresquealimentation", 40 | "type": "scraper", 41 | "iframe": "event11155", 42 | "id": 5 43 | }, 44 | { 45 | "name": "Fresque de la Construction", 46 | "url": "https://www.billetweb.fr/pro/fresquedelaconstruction", 47 | "type": "scraper", 48 | "iframe": "event11574", 49 | "id": 6 50 | }, 51 | { 52 | "name": "Fresque de la Mobilité", 53 | "url": "https://app.fresquedelamobilite.org/", 54 | "type": "api", 55 | "id": 7 56 | }, 57 | { 58 | "name": "Fresque du Sexisme", 59 | "url": "https://www.billetweb.fr/pro/fresque-du-sexisme", 60 | "type": "scraper", 61 | "iframe": "event27112", 62 | "id": 8 63 | }, 64 | { 65 | "name": "Atelier OGRE", 66 | "url": "https://www.billetweb.fr/pro/atelierogre", 67 | "type": "scraper", 68 | "iframe": "event13026", 69 | "id": 9 70 | }, 71 | { 72 | "name": "Atelier Nos vies bas carbone", 73 | "url": "https://www.billetweb.fr/multi_event.php?user=132897", 74 | "type": "scraper", 75 | "iframe": "event22230", 76 | "id": 10 77 | }, 78 | { 79 | "name": "Fresque de l'Eau", 80 | "url": "https://www.billetweb.fr/multi_event.php?user=138110", 81 | "type": "scraper", 82 | "iframe": "eventu138110", 83 | "id": 11 84 | }, 85 | { 86 | "name": "futurs proches", 87 | "url": "https://www.billetweb.fr/pro/futursproches", 88 | "type": "scraper", 89 | "iframe": "event14893", 
90 | "id": 12 91 | }, 92 | { 93 | "name": "Fresque de la Diversité", 94 | "url": "https://www.billetweb.fr/multi_event.php?user=168799", 95 | "type": "scraper", 96 | "iframe": "event38362", 97 | "id": 13 98 | }, 99 | { 100 | "name": "Fresque du Textile", 101 | "url": "https://www.billetweb.fr/multi_event.php?user=166793", 102 | "type": "scraper", 103 | "iframe": "event27458", 104 | "filter": "textile", 105 | "id": 14 106 | }, 107 | { 108 | "name": "Fresque des Déchets", 109 | "url": "https://www.billetweb.fr/multi_event.php?user=166793", 110 | "type": "scraper", 111 | "iframe": "event27458", 112 | "filter": "dechet", 113 | "id": 15 114 | }, 115 | { 116 | "name": "Puzzle Climat", 117 | "url": "https://www.billetweb.fr/multi_event.php?user=121600", 118 | "type": "scraper", 119 | "iframe": "event21038", 120 | "id": 16 121 | }, 122 | { 123 | "name": "Fresque de la Finance", 124 | "url": "https://www.billetweb.fr/pro/fresquedelafinance", 125 | "type": "scraper", 126 | "iframe": "event34683", 127 | "id": 17 128 | }, 129 | { 130 | "name": "Fresque de la RSE", 131 | "url": "https://www.billetweb.fr/pro/fresque", 132 | "type": "scraper", 133 | "iframe": "event35904", 134 | "id": 18 135 | }, 136 | { 137 | "name": "Atelier des Transitions Urbaines", 138 | "url": "https://www.billetweb.fr/multi_event.php?user=216884", 139 | "type": "scraper", 140 | "iframe": "event38980", 141 | "id": 19 142 | }, 143 | { 144 | "name": "2tonnes", 145 | "url": "https://www.eventbrite.fr/o/2-tonnes-29470123869", 146 | "type": "scraper", 147 | "id": 100 148 | }, 149 | { 150 | "name": "Atelier Compte-Gouttes", 151 | "url": "https://www.eventbrite.fr/o/atelier-compte-gouttes-73003088333", 152 | "type": "scraper", 153 | "id": 101 154 | }, 155 | { 156 | "name": "Fresque du Bénévolat", 157 | "url": "https://www.eventbrite.fr/o/jeveuxaidergouvfr-77010082313", 158 | "type": "scraper", 159 | "id": 102 160 | }, 161 | { 162 | "name": "Fresque du Plastique", 163 | "url": "https://www.eventbrite.fr/o/la-fresque-du-plastique-45763194553", 164 | "type": "scraper", 165 | "id": 103 166 | }, 167 | { 168 | "name": "Cyber Fresque", 169 | "url": "https://www.eventbrite.fr/o/senscyb-89802295343", 170 | "type": "scraper", 171 | "id": 104 172 | }, 173 | { 174 | "name": "Fresque du Climat (ateliers)", 175 | "url": "https://fresqueduclimat.org/inscription-atelier/grand-public/", 176 | "type": "scraper", 177 | "id": 200 178 | }, 179 | { 180 | "name": "Fresque du Climat (formations)", 181 | "url": "https://fresqueduclimat.org/inscription-formation/grand-public/", 182 | "type": "scraper", 183 | "id": 200 184 | }, 185 | { 186 | "name": "Fresque de l'Economie Circulaire", 187 | "url": "https://www.billetweb.fr/multi_event.php?user=246258", 188 | "type": "scraper", 189 | "iframe": "event41148", 190 | "id": 300 191 | }, 192 | { 193 | "name": "Fresque des Frontières Planétaires (ateliers)", 194 | "url": "https://1erdegre.glide.page/dl/3b1bc8", 195 | "type": "scraper", 196 | "id": 500, 197 | "filter": "Fresque des frontières planétaires" 198 | }, 199 | { 200 | "name": "Fresque des Frontières Planétaires (formations)", 201 | "url": "https://1erdegre.glide.page/dl/dcc150", 202 | "type": "scraper", 203 | "id": 500, 204 | "filter": "Fresque des frontières planétaires" 205 | }, 206 | { 207 | "name": "Horizons Décarbonés (ateliers)", 208 | "url": "https://1erdegre.glide.page/dl/3b1bc8", 209 | "type": "scraper", 210 | "id": 501, 211 | "filter": "Horizons Décarbonés" 212 | }, 213 | { 214 | "name": "Horizons Décarbonés (formations)", 215 | "url": 
"https://1erdegre.glide.page/dl/dcc150", 216 | "type": "scraper", 217 | "id": 501, 218 | "filter": "Horizons Décarbonés" 219 | }, 220 | { 221 | "name": "30 Glorieuses", 222 | "url": "https://hook.eu1.make.com/koqwhb0igq5air3aysx58rsjeld1uacl", 223 | "type": "api", 224 | "id": 600 225 | }, 226 | { 227 | "name": "Fresque de la Rénovation", 228 | "url": "https://www.helloasso.com/associations/fresque-de-la-renovation", 229 | "type": "scraper", 230 | "id": 700 231 | }, 232 | { 233 | "name": "Fresque de l'Energie", 234 | "url": "https://www.helloasso.com/associations/la-fresque-de-l-energie", 235 | "type": "scraper", 236 | "id": 701 237 | }, 238 | { 239 | "name": "Fresque des Possibles", 240 | "url": "https://www.helloasso.com/associations/le-lieu-dit", 241 | "type": "scraper", 242 | "id": 702 243 | }, 244 | { 245 | "name": "Fresque de la Communication", 246 | "url": "https://www.helloasso.com/associations/la-fresque-de-la-communication", 247 | "type": "scraper", 248 | "id": 703 249 | }, 250 | { 251 | "name": "Zoofresque", 252 | "url": "https://www.helloasso.com/associations/ajas-association-justice-animaux-savoie", 253 | "type": "scraper", 254 | "id": 704 255 | }, 256 | { 257 | "name": "Notre Tour", 258 | "url": "https://www.helloasso.com/associations/mush", 259 | "type": "scraper", 260 | "id": 705 261 | }, 262 | { 263 | "name": "Fresque du Sol", 264 | "url": "https://framagenda.org/remote.php/dav/public-calendars/KwNwGA232xD38CnN/?export", 265 | "type": "api", 266 | "id": 801 267 | } 268 | ] 269 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/helloasso.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import time 4 | import logging 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import ( 15 | FreskError, 16 | FreskDateNotFound, 17 | FreskDateBadFormat, 18 | ) 19 | from trouver_une_fresque_scraper.utils.keywords import * 20 | from trouver_une_fresque_scraper.utils.language import detect_language_code 21 | from trouver_une_fresque_scraper.utils.location import get_address 22 | 23 | 24 | def scroll_to_bottom(driver): 25 | while True: 26 | logging.info("Scrolling to the bottom...") 27 | try: 28 | time.sleep(2) 29 | next_button = WebDriverWait(driver, 10).until( 30 | EC.element_to_be_clickable( 31 | ( 32 | By.CSS_SELECTOR, 33 | 'button[data-hook="load-more-button"]', 34 | ) 35 | ) 36 | ) 37 | desired_y = (next_button.size["height"] / 2) + next_button.location["y"] 38 | window_h = driver.execute_script("return window.innerHeight") 39 | window_y = driver.execute_script("return window.pageYOffset") 40 | current_y = (window_h / 2) + window_y 41 | scroll_y_by = desired_y - current_y 42 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 43 | time.sleep(2) 44 | next_button.click() 45 | except TimeoutException: 46 | break 47 | 48 | 49 | def get_helloasso_data(sources, service, options): 50 | logging.info("Scraping data from helloasso.com") 51 | 52 | driver = webdriver.Firefox(service=service, 
options=options) 53 | 54 | records = [] 55 | 56 | for page in sources: 57 | logging.info(f"==================\nProcessing page {page}") 58 | driver.get(page["url"]) 59 | driver.implicitly_wait(5) 60 | time.sleep(3) 61 | 62 | # Scroll to bottom to load all events 63 | desired_y = 2300 64 | window_h = driver.execute_script("return window.innerHeight") 65 | window_y = driver.execute_script("return window.pageYOffset") 66 | current_y = (window_h / 2) + window_y 67 | scroll_y_by = desired_y - current_y 68 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 69 | time.sleep(5) 70 | 71 | try: 72 | button = driver.find_element( 73 | By.XPATH, 74 | '//button[@data-ux="Explore_OrganizationPublicPage_Actions_ActionEvent_ShowAllActions"]', 75 | ) 76 | button.click() 77 | except NoSuchElementException: 78 | pass 79 | 80 | ele = driver.find_elements(By.CSS_SELECTOR, "a.ActionLink-Event") 81 | links = [e.get_attribute("href") for e in ele] 82 | num_el = len(ele) 83 | logging.info(f"Found {num_el} elements") 84 | 85 | for link in links: 86 | logging.info(f"\n-> Processing {link} ...") 87 | driver.get(link) 88 | driver.implicitly_wait(3) 89 | 90 | ################################################################ 91 | # Parse event id 92 | ################################################################ 93 | uuid = link.split("/")[-1] 94 | if not uuid: 95 | logging.info("Rejecting record: UUID not found") 96 | continue 97 | 98 | ################################################################ 99 | # Parse event title 100 | ################################################################ 101 | title_el = driver.find_element( 102 | by=By.TAG_NAME, 103 | value="h1", 104 | ) 105 | title = title_el.text 106 | 107 | ################################################################ 108 | # Parse start and end dates 109 | ################################################################ 110 | try: 111 | date_info_el = driver.find_element( 112 | by=By.CSS_SELECTOR, 113 | value="span.CampaignHeader--Date", 114 | ) 115 | event_time = date_info_el.text 116 | except NoSuchElementException as error: 117 | logging.info(f"Reject record: {error}") 118 | continue 119 | 120 | try: 121 | event_start_datetime, event_end_datetime = get_dates(event_time) 122 | except Exception as e: 123 | logging.info(f"Rejecting record: {e}") 124 | continue 125 | 126 | ################################################################ 127 | # Is it an online event? 
128 | ################################################################ 129 | online = is_online(title) 130 | 131 | ################################################################ 132 | # Location data 133 | ################################################################ 134 | full_location = "" 135 | location_name = "" 136 | address = "" 137 | city = "" 138 | department = "" 139 | longitude = "" 140 | latitude = "" 141 | zip_code = "" 142 | country_code = "" 143 | 144 | if not online: 145 | try: 146 | location_el = driver.find_element( 147 | By.CSS_SELECTOR, "section.CardAddress--Location" 148 | ) 149 | except NoSuchElementException: 150 | logging.info("Rejecting record: no location") 151 | continue 152 | 153 | full_location = location_el.text 154 | 155 | try: 156 | address_dict = get_address(full_location) 157 | ( 158 | location_name, 159 | address, 160 | city, 161 | department, 162 | zip_code, 163 | country_code, 164 | latitude, 165 | longitude, 166 | ) = address_dict.values() 167 | except FreskError as error: 168 | logging.info(f"Rejecting record: {error}.") 169 | continue 170 | 171 | ################################################################ 172 | # Description 173 | ################################################################ 174 | try: 175 | description_el = driver.find_element( 176 | By.CSS_SELECTOR, "div.CampaignHeader--Description" 177 | ) 178 | except NoSuchElementException: 179 | logging.info(f"Rejecting record: no description") 180 | continue 181 | 182 | description = description_el.text 183 | 184 | ################################################################ 185 | # Training? 186 | ################################################################ 187 | training = is_training(title) 188 | 189 | ################################################################ 190 | # Is it full? 191 | ################################################################ 192 | sold_out = False 193 | 194 | ################################################################ 195 | # Is it suited for kids? 
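# Note: kid-friendliness is likewise inferred from keywords in the title
# (utils.keywords.is_for_kids); unlike some of the other scrapers in this
# project, trainings are not explicitly excluded here.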
196 | ################################################################ 197 | kids = is_for_kids(title) 198 | 199 | ################################################################ 200 | # Parse tickets link 201 | ################################################################ 202 | tickets_link = link 203 | 204 | ################################################################ 205 | # Building final object 206 | ################################################################ 207 | record = get_record_dict( 208 | f"{page['id']}-{uuid}", 209 | page["id"], 210 | title, 211 | event_start_datetime, 212 | event_end_datetime, 213 | full_location, 214 | location_name, 215 | address, 216 | city, 217 | department, 218 | zip_code, 219 | country_code, 220 | latitude, 221 | longitude, 222 | page.get( 223 | "language_code", 224 | detect_language_code(title, description), 225 | ), 226 | online, 227 | training, 228 | sold_out, 229 | kids, 230 | link, 231 | link, 232 | description, 233 | ) 234 | 235 | records.append(record) 236 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 237 | 238 | driver.quit() 239 | 240 | return records 241 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/fec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import logging 4 | 5 | from selenium import webdriver 6 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 7 | from selenium.webdriver.common.by import By 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | from selenium.webdriver.support import expected_conditions as EC 10 | 11 | from trouver_une_fresque_scraper.db.records import get_record_dict 12 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 13 | from trouver_une_fresque_scraper.utils.errors import ( 14 | FreskError, 15 | FreskDateBadFormat, 16 | FreskDateNotFound, 17 | FreskDateDifferentTimezone, 18 | ) 19 | from trouver_une_fresque_scraper.utils.keywords import * 20 | from trouver_une_fresque_scraper.utils.language import detect_language_code 21 | from trouver_une_fresque_scraper.utils.location import get_address 22 | 23 | 24 | def scroll_to_bottom(driver): 25 | while True: 26 | logging.info("Scrolling to the bottom...") 27 | try: 28 | time.sleep(2) 29 | next_button = WebDriverWait(driver, 10).until( 30 | EC.element_to_be_clickable( 31 | ( 32 | By.CSS_SELECTOR, 33 | 'button[data-hook="load-more-button"]', 34 | ) 35 | ) 36 | ) 37 | desired_y = (next_button.size["height"] / 2) + next_button.location["y"] 38 | window_h = driver.execute_script("return window.innerHeight") 39 | window_y = driver.execute_script("return window.pageYOffset") 40 | current_y = (window_h / 2) + window_y 41 | scroll_y_by = desired_y - current_y 42 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 43 | time.sleep(2) 44 | next_button.click() 45 | except TimeoutException: 46 | break 47 | 48 | 49 | def get_fec_data(sources, service, options): 50 | logging.info("Scraping data from lafresquedeleconomiecirculaire.com") 51 | 52 | driver = webdriver.Firefox(service=service, options=options) 53 | 54 | records = [] 55 | 56 | for page in sources: 57 | logging.info("========================") 58 | driver.get(page["url"]) 59 | driver.implicitly_wait(2) 60 | 61 | # Scroll to bottom to load all events 62 | scroll_to_bottom(driver) 63 | driver.execute_script("window.scrollTo(0, 0);") 64 | 65 | ele = 
driver.find_elements( 66 | By.CSS_SELECTOR, 'li[data-hook="events-card"] a[data-hook="title"]' 67 | ) 68 | links = [e.get_attribute("href") for e in ele] 69 | 70 | # Only events published on lafresquedeleconomiecirculaire.com can be extracted 71 | links = [l for l in links if "lafresquedeleconomiecirculaire.com" in l] 72 | 73 | for link in links: 74 | logging.info(f"\n-> Processing {link} ...") 75 | driver.get(link) 76 | driver.implicitly_wait(3) 77 | time.sleep(5) 78 | 79 | ################################################################ 80 | # Parse event id 81 | ################################################################ 82 | # The event id is the path segment following /event-details/ 83 | uuid = link.split("/event-details/")[-1] 84 | if not uuid: 85 | logging.info("Rejecting record: UUID not found") 86 | continue 87 | 88 | ################################################################ 89 | # Parse event title 90 | ################################################################ 91 | title_el = driver.find_element( 92 | by=By.TAG_NAME, 93 | value="h1", 94 | ) 95 | title = title_el.text 96 | 97 | ################################################################ 98 | # Parse start and end dates 99 | ################################################################ 100 | try: 101 | date_info_el = driver.find_element( 102 | by=By.CSS_SELECTOR, 103 | value='p[data-hook="event-full-date"]', 104 | ) 105 | event_time = date_info_el.text 106 | except NoSuchElementException: 107 | raise FreskDateNotFound 108 | 109 | try: 110 | event_start_datetime, event_end_datetime = get_dates(event_time) 111 | except FreskDateBadFormat as error: 112 | logging.info(f"Rejecting record: {error}") 113 | continue 114 | 115 | ################################################################ 116 | # Is it an online event?
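# Note: the event page carries the venue text in the 'event-full-location'
# hook; an event is flagged online when keywords recognized by
# utils.keywords.is_online appear in that text.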
117 | ################################################################ 118 | online = False 119 | try: 120 | online_el = driver.find_element( 121 | By.CSS_SELECTOR, 'p[data-hook="event-full-location"]' 122 | ) 123 | if is_online(online_el.text): 124 | online = True 125 | except NoSuchElementException: 126 | pass 127 | 128 | ################################################################ 129 | # Location data 130 | ################################################################ 131 | full_location = "" 132 | location_name = "" 133 | address = "" 134 | city = "" 135 | department = "" 136 | longitude = "" 137 | latitude = "" 138 | zip_code = "" 139 | country_code = "" 140 | 141 | if not online: 142 | location_el = driver.find_element( 143 | By.CSS_SELECTOR, 'p[data-hook="event-full-location"]' 144 | ) 145 | full_location = location_el.text 146 | 147 | try: 148 | address_dict = get_address(full_location) 149 | ( 150 | location_name, 151 | address, 152 | city, 153 | department, 154 | zip_code, 155 | country_code, 156 | latitude, 157 | longitude, 158 | ) = address_dict.values() 159 | except FreskError as error: 160 | logging.info(f"Rejecting record: {error}.") 161 | continue 162 | 163 | ################################################################ 164 | # Description 165 | ################################################################ 166 | driver.execute_script("window.scrollBy(0, document.body.scrollHeight);") 167 | 168 | # Click on "show more" button 169 | try: 170 | show_more_el = driver.find_element( 171 | By.CSS_SELECTOR, 'button[data-hook="about-section-button"]' 172 | ) 173 | show_more_el.click() 174 | except NoSuchElementException: 175 | pass 176 | 177 | try: 178 | description_el = driver.find_element( 179 | By.CSS_SELECTOR, 'div[data-hook="about-section-text"]' 180 | ) 181 | except NoSuchElementException: 182 | try: 183 | description_el = driver.find_element( 184 | By.CSS_SELECTOR, 'div[data-hook="about-section"]' 185 | ) 186 | except NoSuchElementException: 187 | logging.info(f"Rejecting record: no description") 188 | continue 189 | 190 | description = description_el.text 191 | 192 | ################################################################ 193 | # Training? 194 | ################################################################ 195 | training = is_training(title) 196 | 197 | ################################################################ 198 | # Is it full? 199 | ################################################################ 200 | sold_out = True 201 | try: 202 | _ = driver.find_element( 203 | by=By.CSS_SELECTOR, 204 | value='div[data-hook="event-sold-out"]', 205 | ) 206 | except NoSuchElementException: 207 | sold_out = False 208 | 209 | ################################################################ 210 | # Is it suited for kids? 
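# Note: based on title keywords only (utils.keywords.is_for_kids).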
211 | ################################################################ 212 | kids = is_for_kids(title) 213 | 214 | ################################################################ 215 | # Parse tickets link 216 | ################################################################ 217 | tickets_link = link 218 | 219 | ################################################################ 220 | # Building final object 221 | ################################################################ 222 | record = get_record_dict( 223 | f"{page['id']}-{uuid}", 224 | page["id"], 225 | title, 226 | event_start_datetime, 227 | event_end_datetime, 228 | full_location, 229 | location_name, 230 | address, 231 | city, 232 | department, 233 | zip_code, 234 | country_code, 235 | latitude, 236 | longitude, 237 | page.get( 238 | "language_code", 239 | detect_language_code(title, description), 240 | ), 241 | online, 242 | training, 243 | sold_out, 244 | kids, 245 | link, 246 | tickets_link, 247 | description, 248 | ) 249 | 250 | records.append(record) 251 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 252 | 253 | driver.quit() 254 | 255 | return records 256 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/fdc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import time 4 | import logging 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import ( 15 | FreskError, 16 | FreskDateBadFormat, 17 | FreskLanguageNotRecognized, 18 | ) 19 | from trouver_une_fresque_scraper.utils.keywords import * 20 | from trouver_une_fresque_scraper.utils.language import get_language_code 21 | from trouver_une_fresque_scraper.utils.location import get_address 22 | 23 | 24 | def get_fdc_data(sources, service, options): 25 | logging.info("Scraping data from fresqueduclimat.org") 26 | 27 | driver = webdriver.Firefox(service=service, options=options) 28 | 29 | records = [] 30 | 31 | for page in sources: 32 | logging.info("========================") 33 | driver.get(page["url"]) 34 | driver.implicitly_wait(2) 35 | 36 | wait = WebDriverWait(driver, 10) 37 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 38 | driver.switch_to.frame(iframe) 39 | 40 | while True: 41 | ele = driver.find_elements(By.CSS_SELECTOR, "a.link-dark") 42 | links = [e.get_attribute("href") for e in ele] 43 | 44 | for link in links: 45 | logging.info(f"\n-> Processing {link} ...") 46 | driver.get(link) 47 | driver.implicitly_wait(3) 48 | 49 | ################################################################ 50 | # Parse event id 51 | ################################################################ 52 | # Define the regex pattern for UUIDs 53 | uuid_pattern = ( 54 | r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}" 55 | ) 56 | uuids = re.findall(uuid_pattern, link) 57 | if not uuids: 58 | logging.info("Rejecting record: UUID not found") 59 | driver.back() 60 | wait = WebDriverWait(driver, 10) 61 | iframe = 
wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 62 | driver.switch_to.frame(iframe) 63 | continue 64 | 65 | ################################################################ 66 | # Parse event title 67 | ################################################################ 68 | title_el = driver.find_element( 69 | by=By.TAG_NAME, 70 | value="h3", 71 | ) 72 | title = title_el.text 73 | 74 | ################################################################ 75 | # Parse start and end dates 76 | ################################################################ 77 | clock_icon = driver.find_element(By.CLASS_NAME, "fa-clock") 78 | parent_div = clock_icon.find_element(By.XPATH, "..") 79 | event_time = parent_div.text 80 | 81 | try: 82 | event_start_datetime, event_end_datetime = get_dates(event_time) 83 | except FreskDateBadFormat as error: 84 | logging.info(f"Reject record: {error}") 85 | driver.back() 86 | wait = WebDriverWait(driver, 10) 87 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 88 | driver.switch_to.frame(iframe) 89 | continue 90 | 91 | ################################################################ 92 | # Workshop language 93 | ################################################################ 94 | language_code = None 95 | try: 96 | globe_in_event = driver.find_element( 97 | By.XPATH, '//div[contains(@class, "mb-3")]/i[contains(@class, "fa-globe")]' 98 | ) 99 | parent = globe_in_event.find_element(By.XPATH, "..") 100 | language_code = get_language_code(parent.text) 101 | except FreskLanguageNotRecognized as e: 102 | logging.warning(f"Unable to parse workshop language: {e}") 103 | language_code = None 104 | except NoSuchElementException: 105 | logging.warning("Unable to find workshop language on the page.") 106 | language_code = None 107 | 108 | ################################################################ 109 | # Is it an online event? 
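# Note: fresqueduclimat.org marks remote workshops with a video icon
# (CSS class fa-video); its absence is treated as an in-person event.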
110 | ################################################################ 111 | online = True 112 | try: 113 | driver.find_element(By.CLASS_NAME, "fa-video") 114 | except NoSuchElementException: 115 | online = False 116 | 117 | ################################################################ 118 | # Location data 119 | ################################################################ 120 | full_location = "" 121 | location_name = "" 122 | address = "" 123 | city = "" 124 | department = "" 125 | longitude = "" 126 | latitude = "" 127 | zip_code = "" 128 | country_code = "" 129 | 130 | if not online: 131 | pin_icon = driver.find_element(By.CLASS_NAME, "fa-map-pin") 132 | parent_div = pin_icon.find_element(By.XPATH, "..") 133 | full_location = parent_div.text 134 | 135 | try: 136 | logging.info(f"Full location: {full_location}") 137 | address_dict = get_address(full_location) 138 | ( 139 | location_name, 140 | address, 141 | city, 142 | department, 143 | zip_code, 144 | country_code, 145 | latitude, 146 | longitude, 147 | ) = address_dict.values() 148 | except FreskError as error: 149 | logging.info(f"Rejecting record: {error}.") 150 | driver.back() 151 | wait = WebDriverWait(driver, 10) 152 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 153 | driver.switch_to.frame(iframe) 154 | continue 155 | 156 | ################################################################ 157 | # Description 158 | ################################################################ 159 | description_title_el = driver.find_element( 160 | By.XPATH, "//strong[text()='Description']" 161 | ) 162 | parent_description_el = description_title_el.find_element(By.XPATH, "..") 163 | description = parent_description_el.text 164 | 165 | ################################################################ 166 | # Training? 167 | ################################################################ 168 | training = is_training(title) 169 | 170 | ################################################################ 171 | # Is it full? 172 | ################################################################ 173 | user_icon = driver.find_element(By.CLASS_NAME, "fa-user") 174 | parent_container = user_icon.find_element(By.XPATH, "../..") 175 | sold_out = is_sold_out(parent_container.text) 176 | 177 | ################################################################ 178 | # Is it suited for kids? 
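# Note: unlike most scrapers in this project, the keyword search here
# runs on the description rather than the title, and trainings are
# explicitly excluded.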
179 | ################################################################ 180 | kids = is_for_kids(description) and not training 181 | 182 | ################################################################ 183 | # Parse tickets link 184 | ################################################################ 185 | user_icon = driver.find_element(By.CLASS_NAME, "fa-user") 186 | parent_link = user_icon.find_element(By.XPATH, "..") 187 | tickets_link = parent_link.get_attribute("href") 188 | 189 | ################################################################ 190 | # Building final object 191 | ################################################################ 192 | record = get_record_dict( 193 | f"{page['id']}-{uuids[0]}", 194 | page["id"], 195 | title, 196 | event_start_datetime, 197 | event_end_datetime, 198 | full_location, 199 | location_name, 200 | address, 201 | city, 202 | department, 203 | zip_code, 204 | country_code, 205 | latitude, 206 | longitude, 207 | language_code, 208 | online, 209 | training, 210 | sold_out, 211 | kids, 212 | link, 213 | tickets_link, 214 | description, 215 | ) 216 | 217 | records.append(record) 218 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 219 | 220 | driver.back() 221 | wait = WebDriverWait(driver, 10) 222 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 223 | driver.switch_to.frame(iframe) 224 | 225 | try: 226 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 227 | driver.implicitly_wait(2) 228 | time.sleep(2) 229 | next_button = WebDriverWait(driver, 10).until( 230 | EC.element_to_be_clickable( 231 | ( 232 | By.XPATH, 233 | "//a[@class='page-link' and contains(text(), 'Suivant')]", 234 | ) 235 | ) 236 | ) 237 | next_button.location_once_scrolled_into_view 238 | time.sleep(2) 239 | next_button.click() 240 | time.sleep(10) 241 | except TimeoutException: 242 | break 243 | 244 | driver.quit() 245 | 246 | return records 247 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/glide.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import json 4 | import logging 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import FreskError 15 | from trouver_une_fresque_scraper.utils.keywords import * 16 | from trouver_une_fresque_scraper.utils.language import detect_language_code 17 | from trouver_une_fresque_scraper.utils.location import get_address 18 | 19 | 20 | def get_glide_data(sources, service, options): 21 | logging.info("Scraping data from glide.page") 22 | 23 | driver = webdriver.Firefox(service=service, options=options) 24 | 25 | records = [] 26 | 27 | for page in sources: 28 | logging.info(f"==================\nProcessing page {page}") 29 | driver.get(page["url"]) 30 | driver.implicitly_wait(10) 31 | time.sleep(20) 32 | 33 | tab_button_element = driver.find_element( 34 | By.XPATH, 35 | f"//div[contains(@class, 'button-text') and text()='{page['filter']}']", 36 | ) 37 | 
tab_button_element.click() 38 | 39 | # Maybe there are multiple pages, so we loop. 40 | while True: 41 | time.sleep(5) 42 | ele = driver.find_elements( 43 | By.XPATH, 44 | "//div[contains(@class, 'collection-item') and @role='button']", 45 | ) 46 | num_el = len(ele) 47 | logging.info(f"Found {num_el} elements") 48 | 49 | for i in range(num_el): 50 | time.sleep(5) 51 | ele = driver.find_elements( 52 | By.XPATH, 53 | "//div[contains(@class, 'collection-item') and @role='button']", 54 | ) 55 | 56 | # The following is ugly, but necessary as elements are loaded dynamically in JS. 57 | # We have to make sure that all elements are loaded before proceeding. 58 | max_tries = 10 59 | count = 0 60 | while len(ele) != num_el: 61 | driver.refresh() 62 | time.sleep(5) 63 | ele = driver.find_elements( 64 | By.XPATH, 65 | "//div[contains(@class, 'collection-item') and @role='button']", 66 | ) 67 | 68 | count += 1 69 | if count == max_tries: 70 | raise RuntimeError( 71 | f"Cannot load the {num_el} JS elements after {count} tries." 72 | ) 73 | 74 | el = ele[i] 75 | el.click() 76 | 77 | time.sleep(5) 78 | link = driver.current_url 79 | logging.info(f"\n-> Processing {link} ...") 80 | driver.implicitly_wait(3) 81 | 82 | ################################################################ 83 | # Is it canceled? 84 | ################################################################ 85 | try: 86 | # Attempt to find the div element by its id 87 | large_title_el = driver.find_element(By.CSS_SELECTOR, "h2.headlineMedium") 88 | large_title = large_title_el.text 89 | if is_canceled(large_title): 90 | logging.info("Rejecting record: canceled") 91 | driver.back() 92 | continue 93 | except NoSuchElementException: 94 | pass 95 | 96 | ################################################################ 97 | # Parse event id 98 | ################################################################ 99 | uuid = link.split("/")[-1] 100 | if not uuid: 101 | logging.info("Rejecting record: UUID not found") 102 | driver.back() 103 | continue 104 | 105 | ################################################################ 106 | # Parse event title 107 | ################################################################ 108 | title_el = driver.find_element(by=By.CSS_SELECTOR, value="h2.headlineSmall") 109 | title = title_el.text 110 | 111 | ################################################################ 112 | # Parse start and end dates 113 | ################################################################ 114 | time_el = driver.find_element( 115 | by=By.XPATH, 116 | value="//li/div[contains(text(), 'Date')]", 117 | ) 118 | parent_el = time_el.find_element(by=By.XPATH, value="..") 119 | event_time_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 120 | event_time = event_time_el.text.lower() 121 | 122 | try: 123 | event_start_datetime, event_end_datetime = get_dates(event_time) 124 | except Exception as e: 125 | logging.info(f"Rejecting record: {e}") 126 | driver.back() 127 | continue 128 | 129 | ################################################################ 130 | # Is it an online event? 
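# Note: the Glide detail view renders fields as label/value pairs; the
# value next to the 'Format' label is matched against online keywords.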
131 | ################################################################ 132 | time_label_el = driver.find_element( 133 | by=By.XPATH, 134 | value="//li/div[contains(text(), 'Format')]", 135 | ) 136 | parent_el = time_label_el.find_element(by=By.XPATH, value="..") 137 | online_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 138 | online = is_online(online_el.text) 139 | 140 | ################################################################ 141 | # Location data 142 | ################################################################ 143 | full_location = "" 144 | location_name = "" 145 | address = "" 146 | city = "" 147 | department = "" 148 | longitude = "" 149 | latitude = "" 150 | zip_code = "" 151 | country_code = "" 152 | 153 | if not online: 154 | try: 155 | address_label_el = driver.find_element( 156 | by=By.XPATH, 157 | value="//li/div[contains(text(), 'Adresse')]", 158 | ) 159 | parent_el = address_label_el.find_element(by=By.XPATH, value="..") 160 | address_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 161 | except Exception: 162 | logging.info("Rejecting record: empty address") 163 | driver.back() 164 | continue 165 | 166 | full_location = address_el.text 167 | 168 | try: 169 | address_dict = get_address(full_location) 170 | ( 171 | location_name, 172 | address, 173 | city, 174 | department, 175 | zip_code, 176 | country_code, 177 | latitude, 178 | longitude, 179 | ) = address_dict.values() 180 | except FreskError as error: 181 | logging.info(f"Rejecting record: {error}.") 182 | driver.back() 183 | continue 184 | 185 | ################################################################ 186 | # Description 187 | ################################################################ 188 | description_label_el = driver.find_element( 189 | by=By.XPATH, 190 | value="//li/div[contains(text(), 'Description')]", 191 | ) 192 | parent_el = description_label_el.find_element(by=By.XPATH, value="..") 193 | description_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 194 | description = description_el.text 195 | 196 | ################################################################ 197 | # Training? 198 | ################################################################ 199 | training = is_training(title) 200 | 201 | ################################################################ 202 | # Is it full? 203 | ################################################################ 204 | attendees_label_el = driver.find_element( 205 | by=By.XPATH, 206 | value="//li/div[contains(text(), 'participant')]", 207 | ) 208 | parent_el = attendees_label_el.find_element(by=By.XPATH, value="..") 209 | attendees_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 210 | attendees = attendees_el.text 211 | 212 | sold_out = attendees.split("/")[0] == attendees.split("/")[1] 213 | 214 | ################################################################ 215 | # Is it suited for kids? 
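# Note: no audience information is scraped from Glide pages, so this is
# hard-coded to False.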
216 | ################################################################ 217 | kids = False 218 | 219 | ################################################################ 220 | # Building final object 221 | ################################################################ 222 | record = get_record_dict( 223 | f"{page['id']}-{uuid}", 224 | page["id"], 225 | title, 226 | event_start_datetime, 227 | event_end_datetime, 228 | full_location, 229 | location_name, 230 | address, 231 | city, 232 | department, 233 | zip_code, 234 | country_code, 235 | latitude, 236 | longitude, 237 | page.get( 238 | "language_code", 239 | detect_language_code(title, description), 240 | ), 241 | online, 242 | training, 243 | sold_out, 244 | kids, 245 | link, 246 | link, 247 | description, 248 | ) 249 | 250 | records.append(record) 251 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 252 | 253 | driver.back() 254 | 255 | try: 256 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 257 | driver.implicitly_wait(2) 258 | time.sleep(2) 259 | next_button = WebDriverWait(driver, 10).until( 260 | EC.element_to_be_clickable( 261 | ( 262 | By.XPATH, 263 | "//button[@aria-label='Next']", 264 | ) 265 | ) 266 | ) 267 | next_button.location_once_scrolled_into_view 268 | time.sleep(2) 269 | next_button.click() 270 | time.sleep(2) 271 | except TimeoutException: 272 | break 273 | 274 | driver.quit() 275 | 276 | return records 277 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/billetweb.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import logging 4 | from datetime import timedelta 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import FreskError 15 | from trouver_une_fresque_scraper.utils.keywords import * 16 | from trouver_une_fresque_scraper.utils.language import detect_language_code 17 | from trouver_une_fresque_scraper.utils.location import get_address 18 | 19 | 20 | def get_billetweb_data(sources, service, options): 21 | logging.info("Scraping data from www.billetweb.fr") 22 | 23 | driver = webdriver.Firefox(service=service, options=options) 24 | wait = WebDriverWait(driver, 10) 25 | 26 | records = [] 27 | 28 | for page in sources: 29 | logging.info(f"==================\nProcessing page {page}") 30 | driver.get(page["url"]) 31 | 32 | try: 33 | wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, page["iframe"]))) 34 | except TimeoutException: 35 | logging.info("Rejecting record: iframe not found") 36 | continue 37 | 38 | wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete") 39 | ele = driver.find_elements(By.CSS_SELECTOR, "a.naviguate") 40 | links = [e.get_attribute("href") for e in ele] 41 | 42 | for link in links: 43 | logging.info(f"------------------\nProcessing event {link}") 44 | driver.get(link) 45 | wait.until( 46 | lambda driver: driver.execute_script("return document.readyState") == "complete" 47 | ) 48 | 49 | # Useful for 
different workshops sharing the same event link 50 | if "filter" in page: 51 | if page["filter"] not in link: 52 | logging.info( 53 | "Rejecting record: expected filter keyword not present in current link" 54 | ) 55 | continue 56 | 57 | # Description 58 | try: 59 | driver.find_element(By.ID, "more_info").click() 60 | except Exception: 61 | pass # normal case: the description has no "more info" section 62 | 63 | try: 64 | description = driver.find_element(by=By.CSS_SELECTOR, value="#description").text 65 | except Exception: 66 | logging.info("Rejecting record: no description") 67 | continue 68 | 69 | # Parse event id 70 | event_id_match = re.search(r"/([^/]+?)&", link) 71 | if not event_id_match: 72 | logging.info("Rejecting record: event_id not found") 73 | continue 74 | event_id = event_id_match.group(1) 75 | # Parse main title 76 | try: 77 | main_title = driver.find_element( 78 | by=By.CSS_SELECTOR, value="#event_title > div.event_name" 79 | ).text 80 | except NoSuchElementException: 81 | main_title = driver.find_element( 82 | by=By.CSS_SELECTOR, 83 | value="#description_block > div.event_title > div.event_name", 84 | ).text 85 | 86 | # Location data 87 | try: 88 | try: 89 | main_full_location = driver.find_element( 90 | by=By.CSS_SELECTOR, value="div.location_summary" 91 | ).text 92 | except NoSuchElementException: 93 | main_full_location = driver.find_element( 94 | by=By.CSS_SELECTOR, 95 | value="#page_block_location > div.location > div.location_info > div.address > a", 96 | ).text 97 | except Exception: 98 | main_full_location = "" 99 | 100 | event_info = [] 101 | 102 | # Retrieve sessions if they exist 103 | wait.until( 104 | EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#shop_block iframe")) 105 | ) 106 | wait.until( 107 | lambda driver: driver.execute_script("return document.readyState") == "complete" 108 | ) 109 | back_links = driver.find_elements(By.CSS_SELECTOR, ".back_header_link.summarizable") 110 | if back_links: 111 | # Multi-time event with a single date: we land directly on the basket, so go back to the sessions list 112 | driver.get(back_links[0].get_attribute("href")) 113 | wait.until( 114 | lambda driver: driver.execute_script("return document.readyState") == "complete" 115 | ) 116 | sessions = driver.find_elements(By.CSS_SELECTOR, "a.sesssion_href") 117 | sessions_links = [ 118 | s.get_attribute("href") for s in sessions 119 | ] # No sessions for Mono-time 120 | driver.switch_to.parent_frame() 121 | 122 | ################################################################ 123 | # Multi-time management 124 | ################################################################ 125 | for sessions_link in sessions_links: 126 | driver.get(sessions_link) 127 | wait.until( 128 | lambda driver: driver.execute_script("return document.readyState") == "complete" 129 | ) 130 | context = driver.find_element(By.CSS_SELECTOR, "#context_title").text 131 | 132 | # Parse title, dates, location 133 | if match := re.match( 134 | r"\s*((?P<title>.*) : )?(?P<event_time>.*)(\n\s*(?P<full_location>.*))?", 135 | context, 136 | ): 137 | if not match.group("title"): 138 | sub_title = main_title 139 | elif "atelier" in match.group("title").lower(): 140 | sub_title = match.group("title") 141 | else: 142 | sub_title = main_title + " - " + match.group("title") 143 | 144 | event_time = match.group("event_time") 145 | sub_full_location = ( 146 | match.group("full_location") 147 | if match.group("full_location") 148 | else main_full_location 149 | ) 150 | else: 151 | raise RuntimeError(f"Cannot parse session context: {context}") 152 | 153 | # Is it full?
154 | try: 155 | # The presence of div.block indicates that the event is sold out, 156 | # except if the text below is displayed. 157 | empty = driver.find_element(By.CSS_SELECTOR, "div.block") 158 | sold_out = not has_external_tickets(empty.text) 159 | except NoSuchElementException: 160 | sold_out = False 161 | 162 | # Parse session id 163 | session_id = re.search(r"&session=(\d+)", sessions_link).group(1) 164 | uuid = f"{event_id}-{session_id}" 165 | 166 | event_info.append( 167 | [sub_title, event_time, sub_full_location, sold_out, sessions_link, uuid] 168 | ) 169 | 170 | ################################################################ 171 | # Mono-time management 172 | ################################################################ 173 | if not sessions_links: 174 | # Parse start and end dates 175 | try: 176 | event_time = driver.find_element( 177 | by=By.CSS_SELECTOR, 178 | value="#event_title > div.event_start_time > span.text", 179 | ).text 180 | except NoSuchElementException: 181 | event_time = driver.find_element( 182 | by=By.CSS_SELECTOR, 183 | value="#description_block > div.event_title > span > a > div.event_start_time", 184 | ).text 185 | 186 | # Is it full? 187 | try: 188 | wait.until( 189 | EC.frame_to_be_available_and_switch_to_it( 190 | (By.CSS_SELECTOR, "#shop_block iframe") 191 | ) 192 | ) 193 | wait.until( 194 | lambda driver: driver.execute_script("return document.readyState") 195 | == "complete" 196 | ) 197 | 198 | # The presence of div.block indicates that the event is sold out, 199 | # except if the text below is displayed. 200 | empty = driver.find_element(By.CSS_SELECTOR, "div.block") 201 | sold_out = not has_external_tickets(empty.text) 202 | except NoSuchElementException: 203 | sold_out = False 204 | finally: 205 | driver.switch_to.parent_frame() 206 | 207 | event_info.append( 208 | [main_title, event_time, main_full_location, sold_out, link, event_id] 209 | ) 210 | 211 | ################################################################ 212 | # Session loop 213 | ################################################################ 214 | for index, (title, event_time, full_location, sold_out, ticket_link, uuid) in enumerate( 215 | event_info 216 | ): 217 | logging.info( 218 | f"\n-> Processing session {index+1}/{len(event_info)} {ticket_link} ..." 219 | ) 220 | if is_gift_card(title): 221 | logging.info("Rejecting record: gift card") 222 | continue 223 | 224 | ################################################################ 225 | # Date and time 226 | ################################################################ 227 | try: 228 | event_start_datetime, event_end_datetime = get_dates(event_time) 229 | except Exception as e: 230 | logging.info(f"Rejecting record: {e}") 231 | continue 232 | 233 | if event_end_datetime - event_start_datetime > timedelta(days=1): 234 | logging.info(f"Rejecting record: event is too long: {event_time}") 235 | continue 236 | 237 | # Is it an online event? 
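# Note: Billetweb adds an " Online event" button label to titles of
# remote events (see the comment below); keyword matching on the title
# or the location string flags these, and the suffix is then stripped.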
238 | online = is_online(title) or is_online(full_location) 239 | title = title.replace(" Online event", "") # Button added by billetweb 240 | 241 | ################################################################ 242 | # Location data 243 | ################################################################ 244 | location_name = ( 245 | address 246 | ) = city = department = longitude = latitude = zip_code = country_code = "" 247 | if not online: 248 | try: 249 | address_dict = get_address(full_location) 250 | ( 251 | location_name, 252 | address, 253 | city, 254 | department, 255 | zip_code, 256 | country_code, 257 | latitude, 258 | longitude, 259 | ) = address_dict.values() 260 | except FreskError as error: 261 | logging.info(f"Rejecting record: {error}.") 262 | continue 263 | 264 | # Training? 265 | training = is_training(title) 266 | 267 | # Is it suited for kids? 268 | kids = is_for_kids(title) and not training # no trainings for kids 269 | 270 | # Building final object 271 | record = get_record_dict( 272 | f"{page['id']}-{uuid}", 273 | page["id"], 274 | title, 275 | event_start_datetime, 276 | event_end_datetime, 277 | full_location, 278 | location_name, 279 | address, 280 | city, 281 | department, 282 | zip_code, 283 | country_code, 284 | latitude, 285 | longitude, 286 | page.get( 287 | "language_code", 288 | detect_language_code(title, description), 289 | ), 290 | online, 291 | training, 292 | sold_out, 293 | kids, 294 | link, 295 | ticket_link, 296 | description, 297 | ) 298 | records.append(record) 299 | logging.info(f"Successfully scraped:\n{json.dumps(record, indent=4)}") 300 | 301 | driver.quit() 302 | 303 | return records 304 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/utils/date_and_time.py: -------------------------------------------------------------------------------- 1 | import re 2 | import traceback 3 | import logging 4 | 5 | from datetime import datetime, timedelta 6 | from dateutil.parser import parse 7 | 8 | from trouver_une_fresque_scraper.utils.errors import ( 9 | FreskError, 10 | FreskDateBadFormat, 11 | FreskDateDifferentTimezone, 12 | ) 13 | 14 | DEFAULT_DURATION = 3 15 | CURRENT_YEAR = 2025 16 | 17 | FRENCH_SHORT_DAYS = { 18 | "lun": 1, 19 | "mar": 2, 20 | "mer": 3, 21 | "jeu": 4, 22 | "ven": 5, 23 | "sam": 6, 24 | "dim": 7, 25 | } 26 | 27 | FRENCH_DAYS = { 28 | "lundi": 1, 29 | "mardi": 2, 30 | "mercredi": 3, 31 | "jeudi": 4, 32 | "vendredi": 5, 33 | "samedi": 6, 34 | "dimanche": 7, 35 | } 36 | 37 | FRENCH_SHORT_MONTHS = { 38 | "janv": 1, 39 | "févr": 2, 40 | "mars": 3, 41 | "avr": 4, 42 | "mai": 5, 43 | "juin": 6, 44 | "juil": 7, 45 | "août": 8, 46 | "sept": 9, 47 | "oct": 10, 48 | "nov": 11, 49 | "déc": 12, 50 | } 51 | 52 | FRENCH_MONTHS = { 53 | "janvier": 1, 54 | "février": 2, 55 | "mars": 3, 56 | "avril": 4, 57 | "mai": 5, 58 | "juin": 6, 59 | "juillet": 7, 60 | "août": 8, 61 | "septembre": 9, 62 | "octobre": 10, 63 | "novembre": 11, 64 | "décembre": 12, 65 | } 66 | 67 | 68 | def get_dates(event_time): 69 | try: 70 | # =================== 71 | # FdC English 72 | 73 | # June 03, 2025, from 05:30pm to 09:30pm (Paris time) 74 | if match := re.match( 75 | r"(?P<date>\w+\s\d{2},\s\d{4})" 76 | r",\sfrom\s" 77 | r"(?P<start_time>\d{2}:\d{2}[ap]m)" 78 | r"\sto\s" 79 | r"(?P<end_time>\d{2}:\d{2}[ap]m)" 80 | r"\s\(.*\stime\)", 81 | event_time, 82 | ): 83 | event_start_datetime = parse(f"{match.group('date')} {match.group('start_time')}") 84 | event_end_datetime = parse(f"{match.group('date')} 
{match.group('end_time')}") 85 | return event_start_datetime, event_end_datetime 86 | 87 | # =================== 88 | # Billetweb 89 | 90 | # Thu Oct 19, 2023 from 01:00 PM to 02:00 PM 91 | if match := re.match( 92 | r"(?P<date>.*)\s" r"from\s" r"(?P<start_time>.*)\s" r"to\s" r"(?P<end_time>.*)", 93 | event_time, 94 | ): 95 | event_start_datetime = parse(f"{match.group('date')} {match.group('start_time')}") 96 | event_end_datetime = parse(f"{match.group('date')} {match.group('end_time')}") 97 | return event_start_datetime, event_end_datetime 98 | 99 | # =================== 100 | # Billetweb 101 | 102 | # Thu Oct 19, 2023 at 01:00 PM to Sat Feb 24, 2024 at 02:00 PM 103 | elif match := re.match( 104 | r"(?P<start_date>.*)\s" 105 | r"at\s" 106 | r"(?P<start_time>.*)\s" 107 | r"to\s" 108 | r"(?P<end_date>.*)\s" 109 | r"at\s" 110 | r"(?P<end_time>.*)", 111 | event_time, 112 | ): 113 | event_start_datetime = parse(f"{match.group('start_date')} {match.group('start_time')}") 114 | event_end_datetime = parse(f"{match.group('end_date')} {match.group('end_time')}") 115 | return event_start_datetime, event_end_datetime 116 | 117 | # =================== 118 | # Billetweb 119 | 120 | # Thu Oct 19, 2023 at 01:00 PM 121 | # March 7, 2025 at 10:00 AM 122 | elif match := re.match(r"(?P<date>.*)\s" r"at\s" r"(?P<time>.*)", event_time): 123 | event_start_datetime = parse(f"{match.group('date')} {match.group('time')}") 124 | event_end_datetime = event_start_datetime + timedelta(hours=DEFAULT_DURATION) 125 | return event_start_datetime, event_end_datetime 126 | 127 | # =================== 128 | # Eventbrite 129 | 130 | # ven. 11 avr. 2025 14:00 - 17:30 CEST 131 | elif match := re.match( 132 | rf"(?P<day_of_week>{'|'.join(FRENCH_SHORT_DAYS.keys())})\.?\s" 133 | r"(?P<day>\d{1,2})\s" 134 | rf"(?P<month>{'|'.join(FRENCH_SHORT_MONTHS.keys())})\.?\s" 135 | r"(?P<year>\d{4})\s" 136 | r"(?P<start_time>\d{2}:\d{2})\s" 137 | r"-\s" 138 | r"(?P<end_time>\d{2}:\d{2})\s" 139 | r"(?P<timezone>.*)", 140 | event_time, 141 | ): 142 | event_start_datetime = datetime( 143 | int(match.group("year")), 144 | FRENCH_SHORT_MONTHS[match.group("month")], 145 | int(match.group("day")), 146 | int(match.group("start_time").split(":")[0]), 147 | int(match.group("start_time").split(":")[1]), 148 | ) 149 | event_end_datetime = datetime( 150 | int(match.group("year")), 151 | FRENCH_SHORT_MONTHS[match.group("month")], 152 | int(match.group("day")), 153 | int(match.group("end_time").split(":")[0]), 154 | int(match.group("end_time").split(":")[1]), 155 | ) 156 | return event_start_datetime, event_end_datetime 157 | 158 | # =================== 159 | # FdC French 160 | 161 | # 16 mai 2025, de 18h30 à 21h30 (heure de Paris) 162 | elif match := re.match( 163 | r"(?P<day>\d{1,2})\s" 164 | rf"(?P<month>{'|'.join(FRENCH_MONTHS.keys())})\s" 165 | r"(?P<year>\d{4}),\s" 166 | r"de\s" 167 | r"(?P<start_time>\d{1,2}h\d{2})\s" 168 | r"à\s" 169 | r"(?P<end_time>\d{1,2}h\d{2})", 170 | event_time, 171 | ): 172 | # Construct the datetime objects 173 | event_start_datetime = datetime( 174 | int(match.group("year")), 175 | FRENCH_MONTHS[match.group("month")], 176 | int(match.group("day")), 177 | int(match.group("start_time").split("h")[0]), 178 | int(match.group("start_time").split("h")[1]), 179 | ) 180 | event_end_datetime = datetime( 181 | int(match.group("year")), 182 | FRENCH_MONTHS[match.group("month")], 183 | int(match.group("day")), 184 | int(match.group("end_time").split("h")[0]), 185 | int(match.group("end_time").split("h")[1]), 186 | ) 187 | return 
event_start_datetime, event_end_datetime 188 | 189 | # =================== 190 | # FEC 191 | 192 | # 03 mars 2025, 14:00 – 17:00 UTC+1 193 | elif match := re.match( 194 | rf"((?P<day_of_week>{'|'.join(FRENCH_SHORT_DAYS.keys())})\.?\s)?" 195 | r"(?P<day>\d{1,2})\s" 196 | rf"(?P<month>{'|'.join(FRENCH_SHORT_MONTHS.keys())})\.?\s" 197 | r"(?P<year>\d{4})?,\s" 198 | r"(?P<start_time>\d{2}:\d{2})\s" 199 | r"–\s" 200 | r"(?P<end_time>\d{2}:\d{2})" 201 | r"(\sUTC(?P<timezone>.*))?", 202 | event_time, 203 | ): 204 | timezone = match.group("timezone") 205 | if timezone and timezone not in ("+1", "+2"): 206 | raise FreskDateDifferentTimezone(event_time) 207 | 208 | event_start_datetime = datetime( 209 | int(match.group("year")), 210 | FRENCH_SHORT_MONTHS[match.group("month")], 211 | int(match.group("day")), 212 | int(match.group("start_time").split(":")[0]), 213 | int(match.group("start_time").split(":")[1]), 214 | ) 215 | event_end_datetime = datetime( 216 | int(match.group("year")), 217 | FRENCH_SHORT_MONTHS[match.group("month")], 218 | int(match.group("day")), 219 | int(match.group("end_time").split(":")[0]), 220 | int(match.group("end_time").split(":")[1]), 221 | ) 222 | return event_start_datetime, event_end_datetime 223 | 224 | # =================== 225 | # Glide 226 | 227 | # mercredi 12 février 2025 de 19h00 à 22h00 228 | elif match := re.match( 229 | rf"((?P<day_of_week>{'|'.join(FRENCH_DAYS.keys())})\s)?" 230 | r"(?P<day>\d{1,2})\s" 231 | rf"(?P<month>{'|'.join(FRENCH_MONTHS)})\s" 232 | r"(?P<year>\d{4})\s" 233 | r"de\s" 234 | r"(?P<start_time>\d{1,2}h\d{2})\s" 235 | r"à\s" 236 | r"(?P<end_time>\d{1,2}h\d{2})", 237 | event_time, 238 | ): 239 | event_start_datetime = datetime( 240 | int(match.group("year")), 241 | FRENCH_MONTHS[match.group("month")], 242 | int(match.group("day")), 243 | int(match.group("start_time").split("h")[0]), 244 | int(match.group("start_time").split("h")[1]), 245 | ) 246 | event_end_datetime = datetime( 247 | int(match.group("year")), 248 | FRENCH_MONTHS[match.group("month")], 249 | int(match.group("day")), 250 | int(match.group("end_time").split("h")[0]), 251 | int(match.group("end_time").split("h")[1]), 252 | ) 253 | return event_start_datetime, event_end_datetime 254 | 255 | # =================== 256 | # HelloAsso 257 | 258 | # Le 12 février 2025, de 18h à 20h 259 | elif match := re.match( 260 | r"Le\s" 261 | r"(?P<day>\d{1,2})\s" 262 | rf"(?P<month>{'|'.join(FRENCH_MONTHS)})\s" 263 | r"(?P<year>\d{4}),\s" 264 | r"de\s" 265 | r"(?P<start_time>\d{1,2}h\d{0,2})\s" 266 | r"à\s" 267 | r"(?P<end_time>\d{1,2}h\d{0,2})", 268 | event_time, 269 | ): 270 | start_parts = match.group("start_time").split("h") 271 | event_start_datetime = datetime( 272 | int(match.group("year")), 273 | FRENCH_MONTHS[match.group("month")], 274 | int(match.group("day")), 275 | int(start_parts[0]), 276 | (int(start_parts[1]) if len(start_parts) > 1 and len(start_parts[1]) else 0), 277 | ) 278 | end_parts = match.group("end_time").split("h") 279 | event_end_datetime = datetime( 280 | int(match.group("year")), 281 | FRENCH_MONTHS[match.group("month")], 282 | int(match.group("day")), 283 | int(end_parts[0]), 284 | int(end_parts[1]) if len(end_parts) > 1 and len(end_parts[1]) else 0, 285 | ) 286 | return event_start_datetime, event_end_datetime 287 | 288 | else: 289 | raise FreskDateBadFormat(event_time) 290 | 291 | except Exception as e: 292 | if not isinstance(e, FreskError): 293 | traceback.print_exc() 294 | logging.error(f"get_dates: {event_time}") 295 | raise FreskDateBadFormat(event_time) 296 | 
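# Illustrative examples of inputs handled by get_dates above (a sketch,
# not an exhaustive list; returned datetimes are naive, in the event's
# local time):
#
#   get_dates("Thu Oct 19, 2023 from 01:00 PM to 02:00 PM")
#   -> (datetime(2023, 10, 19, 13, 0), datetime(2023, 10, 19, 14, 0))
#
#   get_dates("16 mai 2025, de 18h30 à 21h30 (heure de Paris)")
#   -> (datetime(2025, 5, 16, 18, 30), datetime(2025, 5, 16, 21, 30))
#
# Any input matching none of the patterns raises FreskDateBadFormat.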
297 | 298 | def get_dates_from_element(el): 299 | """Returns start and end datetime objects extracted from the element. 300 | 301 | If the element has a "datetime" attribute, it provides the date and the hours are parsed from the element text; otherwise we fall back on get_dates to parse everything from the text. 302 | 303 | Raises FreskDateDifferentTimezone, FreskDateBadFormat, or any exception thrown by get_dates when parsing fails. 304 | """ 305 | event_day = el.get_attribute("datetime") 306 | event_time = el.text 307 | 308 | try: 309 | # Leverage the datetime attribute if present. 310 | # datetime: 2025-12-05 311 | # text: déc. 5 de 9am à 12pm UTC+1 312 | if event_day: 313 | day_match = re.match(r"(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})", event_day) 314 | 315 | def PATTERN_TIME(hour_name, minute_name, pm_name): 316 | return ( 317 | r"(?P<" 318 | + hour_name 319 | + r">\d{1,2})(?P<" 320 | + minute_name 321 | + r">:\d{2})?(?P<" 322 | + pm_name 323 | + r">(am|pm|vorm\.|nachm\.))" 324 | ) 325 | 326 | def ParseTime(match_object, hour_name, minute_name, pm_name): 327 | hour = int(match_object.group(hour_name)) 328 | PATTERN_PM = ["pm", "nachm."] 329 | if match_object.group(pm_name) in PATTERN_PM and hour < 12: 330 | hour += 12 331 | 332 | minute = 0 333 | match_minute = match_object.group(minute_name) 334 | if match_minute: 335 | minute = int(match_minute[1:]) 336 | 337 | return hour, minute 338 | 339 | # TODO: add proper support for timezone. 340 | # We use re.search so that the date text at the beginning of the string is skipped. 341 | hour_match = re.search( 342 | r"(de|von)\s" 343 | + PATTERN_TIME("start_hour", "start_minute", "start_am_or_pm") 344 | + r"\s" 345 | + r"(à|bis)\s" 346 | + PATTERN_TIME("end_hour", "end_minute", "end_am_or_pm") 347 | + r"\s" 348 | + r"((UTC|MEZ)(?P<timezone>.*))", 349 | event_time, 350 | ) 351 | if day_match and hour_match: 352 | timezone = hour_match.group("timezone") 353 | if timezone and timezone not in ("+1", "+2"): 354 | raise FreskDateDifferentTimezone(event_time) 355 | dt = datetime( 356 | int(day_match.group("year")), 357 | int(day_match.group("month")), 358 | int(day_match.group("day")), 359 | ) 360 | start_hour, start_minute = ParseTime( 361 | hour_match, "start_hour", "start_minute", "start_am_or_pm" 362 | ) 363 | end_hour, end_minute = ParseTime( 364 | hour_match, "end_hour", "end_minute", "end_am_or_pm" 365 | ) 366 | return datetime(dt.year, dt.month, dt.day, start_hour, start_minute), datetime( 367 | dt.year, dt.month, dt.day, end_hour, end_minute 368 | ) 369 | 370 | return get_dates(event_time) 371 | 372 | except Exception as e: 373 | if not isinstance(e, FreskError): 374 | traceback.print_exc() 375 | logging.error(f"get_dates_from_element: {event_time}") 376 | raise FreskDateBadFormat(event_time) 377 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/eventbrite.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import json 4 | import logging 5 | import re 6 | 7 | from selenium import webdriver 8 | from selenium.common.exceptions import ( 9 | StaleElementReferenceException, 10 | NoSuchElementException, 11 | TimeoutException, 12 | ) 13 | from selenium.webdriver.common.by import By 14 | from selenium.webdriver.support.ui import WebDriverWait 15 | from selenium.webdriver.support import expected_conditions as EC 16 | 17 | from trouver_une_fresque_scraper.db.records import get_record_dict 18 | from
trouver_une_fresque_scraper.utils.date_and_time import get_dates_from_element 19 | from trouver_une_fresque_scraper.utils.errors import ( 20 | FreskError, 21 | FreskDateBadFormat, 22 | FreskDateNotFound, 23 | ) 24 | from trouver_une_fresque_scraper.utils.keywords import * 25 | from trouver_une_fresque_scraper.utils.language import detect_language_code 26 | from trouver_une_fresque_scraper.utils.location import get_address 27 | 28 | 29 | def delete_cookies_overlay(driver): 30 | try: 31 | transcend_element = WebDriverWait(driver, 10).until( 32 | EC.presence_of_element_located((By.CSS_SELECTOR, "#transcend-consent-manager")) 33 | ) 34 | 35 | # Use JavaScript to remove the transcend-consent-manager element 36 | script = """ 37 | var element = arguments[0]; 38 | element.parentNode.removeChild(element); 39 | """ 40 | driver.execute_script(script, transcend_element) 41 | except Exception as e: 42 | logging.info(f"Transcend consent manager element couldn't be removed: {e}") 43 | 44 | 45 | def scroll_to_bottom(driver): 46 | more_content = True 47 | while more_content: 48 | logging.info("Scrolling to the bottom...") 49 | try: 50 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 51 | time.sleep(5) # Give the page some time to load new content 52 | 53 | # Function to safely click the next button 54 | def click_next_button(): 55 | try: 56 | next_button = WebDriverWait(driver, 10).until( 57 | EC.element_to_be_clickable( 58 | ( 59 | By.CSS_SELECTOR, 60 | "div.organizer-profile__section--content div.organizer-profile__show-more > button", 61 | ) 62 | ) 63 | ) 64 | 65 | desired_y = (next_button.size["height"] / 2) + next_button.location["y"] 66 | window_h = driver.execute_script("return window.innerHeight") 67 | window_y = driver.execute_script("return window.pageYOffset") 68 | current_y = (window_h / 2) + window_y 69 | scroll_y_by = desired_y - current_y 70 | 71 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 72 | next_button.click() 73 | 74 | except StaleElementReferenceException: 75 | click_next_button() # Retry if the element is stale 76 | 77 | click_next_button() 78 | 79 | except TimeoutException: 80 | more_content = False 81 | 82 | 83 | def get_eventbrite_data(sources, service, options): 84 | logging.info("Scraping data from eventbrite.fr") 85 | 86 | driver = webdriver.Firefox(service=service, options=options) 87 | 88 | records = [] 89 | 90 | for page in sources: 91 | logging.info(f"==================\nProcessing page {page}") 92 | driver.get(page["url"]) 93 | driver.implicitly_wait(5) 94 | 95 | # Scroll to bottom to load all events 96 | scroll_to_bottom(driver) 97 | driver.execute_script("window.scrollTo(0, 0);") 98 | 99 | elements = [] 100 | future_events = driver.find_element( 101 | By.CSS_SELECTOR, 'div[data-testid="organizer-profile__future-events"]' 102 | ) 103 | event_card_divs = future_events.find_elements(By.CSS_SELECTOR, "div.event-card") 104 | 105 | logging.info(f"Found {len(event_card_divs)} events") 106 | 107 | for event_card_div in event_card_divs: 108 | link_elements = event_card_div.find_elements(By.CSS_SELECTOR, "a.event-card-link") 109 | elements.extend(link_elements) 110 | 111 | links = [] 112 | for link_element in elements: 113 | href = link_element.get_attribute("href") 114 | if href: 115 | links.append(href) 116 | links = np.unique(links) 117 | 118 | for link in links: 119 | logging.info(f"\n-> Processing {link} ...") 120 | driver.get(link) 121 | delete_cookies_overlay(driver) 122 | driver.implicitly_wait(3) 123 | time.sleep(3) 
# Pages are quite long to load 124 | 125 | ################################################################ 126 | # Has it expired? 127 | ################################################################ 128 | try: 129 | badge = driver.find_element( 130 | By.XPATH, '//div[@data-testid="enhancedExpiredEventsBadge"]' 131 | ) 132 | # If the element has children elements, it is enabled 133 | try: 134 | if badge.find_elements(By.XPATH, "./*"): 135 | logging.info("Rejecting record: event expired") 136 | continue 137 | except StaleElementReferenceException: 138 | if driver.find_element( 139 | By.XPATH, '//div[@data-testid="enhancedExpiredEventsBadge"]' 140 | ).find_elements(By.XPATH, "./*"): 141 | logging.info("Rejecting record: event expired") 142 | continue 143 | 144 | except NoSuchElementException: 145 | pass 146 | 147 | try: 148 | badge = driver.find_element(By.CSS_SELECTOR, "div.enhanced-expired-badge") 149 | logging.info("Rejecting record: event expired") 150 | continue 151 | except NoSuchElementException: 152 | pass 153 | 154 | ################################################################ 155 | # Is it full? 156 | ################################################################ 157 | sold_out = False 158 | try: 159 | badge = driver.find_element(By.XPATH, '//div[@data-testid="salesEndedMessage"]') 160 | # If the element has children elements, it is enabled 161 | sold_out = bool(badge.find_elements(By.XPATH, "./*")) 162 | except NoSuchElementException: 163 | pass 164 | 165 | if sold_out: 166 | # We reject sold out events as the Eventbrite UX hides 167 | # relevant info in this case (which looks like an awful practice) 168 | logging.info("Rejecting record: sold out") 169 | continue 170 | 171 | ################################################################ 172 | # Parse event title 173 | ################################################################ 174 | title_el = driver.find_element( 175 | by=By.TAG_NAME, 176 | value="h1", 177 | ) 178 | title = title_el.text 179 | 180 | if is_plenary(title): 181 | logging.info("Rejecting record: plénière") 182 | continue 183 | 184 | ########################################################### 185 | # Is it an online event? 
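# Note: the title is checked first; if that is inconclusive, the short
# location line under the date is scanned for online keywords as well.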
            ################################################################
            # Is it an online event?
            ################################################################
            online = is_online(title)
            if not online:
                try:
                    short_location_el = driver.find_element(
                        By.CSS_SELECTOR, "span.start-date-and-location__location"
                    )
                    online = is_online(short_location_el.text)
                except NoSuchElementException:
                    pass

            ################################################################
            # Location data
            ################################################################
            full_location = ""
            location_name = ""
            address = ""
            city = ""
            department = ""
            longitude = ""
            latitude = ""
            zip_code = ""
            country_code = ""

            if not online:
                try:
                    full_location_el = driver.find_element(
                        By.CSS_SELECTOR, 'div[class^="Location-module__addressWrapper___"]'
                    )
                except NoSuchElementException:
                    logging.error(
                        f"Location element not found for offline event {link}.",
                    )
                    continue
                full_location = full_location_el.text.replace("\n", ", ")

                try:
                    address_dict = get_address(full_location)
                    (
                        location_name,
                        address,
                        city,
                        department,
                        zip_code,
                        country_code,
                        latitude,
                        longitude,
                    ) = address_dict.values()
                except FreskError as error:
                    logging.info(f"Rejecting record: {error}.")
                    continue

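            # Note on the unpacking above: reading address_dict.values() into
            # eight names relies on dict insertion order (guaranteed since
            # Python 3.7) matching this exact field order in get_address(). A
            # hedged, order-independent alternative would access keys
            # explicitly (key names assumed here, to be checked against
            # get_address()):
            #
            #     location_name = address_dict["location_name"]
            #     city = address_dict["city"]
            #     ...and so on for the remaining fields.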
            ################################################################
            # Description
            ################################################################
            try:
                description_el = driver.find_element(By.CSS_SELECTOR, "div.event-description")
                description = description_el.text
            except NoSuchElementException:
                logging.info("Rejecting record: description not found.")
                continue

            ################################################################
            # Training?
            ################################################################
            training = is_training(title)

            ################################################################
            # Is it suited for kids?
            ################################################################
            kids = False

            ################################################################
            # Multiple events
            ################################################################
            event_info = []

            try:
                date_time_div = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.select-date-and-time"))
                )
                if date_time_div:
                    driver.execute_script("window.scrollBy(0, arguments[0]);", 800)

                    li_elements = date_time_div.find_elements(
                        By.CSS_SELECTOR, "li:not([data-heap-id])"
                    )
                    for li in li_elements:
                        clickable_li = WebDriverWait(driver, 10).until(
                            EC.element_to_be_clickable(li)
                        )
                        clickable_li.click()

                        ################################################################
                        # Dates
                        ################################################################
                        try:
                            date_info_el = driver.find_element(
                                by=By.CSS_SELECTOR,
                                value="time.start-date-and-location__date",
                            )
                        except NoSuchElementException:
                            raise FreskDateNotFound

                        try:
                            event_start_datetime, event_end_datetime = get_dates_from_element(
                                date_info_el
                            )
                        except FreskDateBadFormat as error:
                            logging.info(f"Rejecting record: {error}")
                            continue

                        ################################################################
                        # Parse tickets link
                        ################################################################
                        tickets_link = driver.current_url

                        ################################################################
                        # Parse event id
                        ################################################################
                        uuid = re.search(r"/e/([^/?]+)", tickets_link).group(1)

                        # Selenium clicks on "sold out" cards (li elements), but this
                        # has no effect. Worse, it appends the previous non-sold-out
                        # event a second time. Such cases can be detected by scanning
                        # the event ids collected so far.
                        already_scanned = False
                        for event in event_info:
                            if uuid == event[0]:
                                already_scanned = True

                        if not already_scanned:
                            event_info.append(
                                [
                                    uuid,
                                    event_start_datetime,
                                    event_end_datetime,
                                    tickets_link,
                                ]
                            )

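                        # A hedged alternative to the linear scan above (sketch
                        # only): track seen ids in a set kept next to event_info,
                        # e.g. `seen_uuids = set()` before the loop, then:
                        #
                        #     if uuid not in seen_uuids:
                        #         seen_uuids.add(uuid)
                        #         event_info.append([uuid, ...])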
            # A timeout means there is only a single event on this page.
            except TimeoutException:
                ################################################################
                # Single event with multiple dates (a "collection").
                ################################################################
                try:
                    check_availability_btn = driver.find_element(
                        by=By.CSS_SELECTOR, value="button.check-availability-btn__button"
                    )
                    # TODO: add support for this.
                    logging.error(f"Eventbrite collection not supported in event {link}.")
                    continue
                except NoSuchElementException:
                    pass

                ################################################################
                # Dates
                ################################################################
                try:
                    date_info_el = driver.find_element(
                        by=By.CSS_SELECTOR,
                        value="time.start-date-and-location__date",
                    )
                except NoSuchElementException:
                    raise FreskDateNotFound

                try:
                    event_start_datetime, event_end_datetime = get_dates_from_element(date_info_el)
                except FreskDateBadFormat as error:
                    logging.info(f"Rejecting record: {error}")
                    continue

                ################################################################
                # Parse tickets link
                ################################################################
                tickets_link = driver.current_url

                ################################################################
                # Parse event id
                ################################################################
                uuid = re.search(r"/e/([^/?]+)", tickets_link).group(1)

                event_info.append([uuid, event_start_datetime, event_end_datetime, tickets_link])

            ################################################################
            # Session loop
            ################################################################
            for index, (
                uuid,
                event_start_datetime,
                event_end_datetime,
                link,
            ) in enumerate(event_info):
                record = get_record_dict(
                    f"{page['id']}-{uuid}",
                    page["id"],
                    title,
                    event_start_datetime,
                    event_end_datetime,
                    full_location,
                    location_name,
                    address,
                    city,
                    department,
                    zip_code,
                    country_code,
                    latitude,
                    longitude,
                    page.get(
                        "language_code",
                        detect_language_code(title, description),
                    ),
                    online,
                    training,
                    sold_out,
                    kids,
                    link,
                    link,
                    description,
                )
                records.append(record)
                logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}")

    driver.quit()

    return records

--------------------------------------------------------------------------------