├── bin
│   └── .gitkeep
├── results
│   └── .gitkeep
├── src
│   └── trouver_une_fresque_scraper
│       ├── __init__.py
│       ├── db
│       │   ├── __init__.py
│       │   ├── records.py
│       │   ├── main.py
│       │   └── etl.py
│       ├── apis
│       │   ├── __init__.py
│       │   ├── main.py
│       │   ├── ics_test.py
│       │   ├── mobilite.py
│       │   ├── glorieuses.py
│       │   └── ics.py
│       ├── scraper
│       │   ├── __init__.py
│       │   ├── main.py
│       │   ├── helloasso.py
│       │   ├── fec.py
│       │   ├── fdc.py
│       │   ├── glide.py
│       │   ├── billetweb.py
│       │   └── eventbrite.py
│       ├── utils
│       │   ├── utils.py
│       │   ├── language.py
│       │   ├── keywords.py
│       │   ├── errors.py
│       │   ├── language_test.py
│       │   ├── date_and_time_test.py
│       │   ├── location.py
│       │   └── date_and_time.py
│       └── scrape.py
├── .github
│   └── FUNDING.yml
├── .flox
│   ├── .gitignore
│   ├── env.json
│   ├── .gitattributes
│   └── env
│       └── manifest.toml
├── push_to_db.py
├── config.json.dist
├── tests
│   └── scrape_test.py
├── loop.sh
├── .pre-commit-config.yaml
├── .gitignore
├── pyproject.toml
├── CONTRIBUTING.md
├── countries
│   ├── uk.json
│   ├── ch.json
│   └── fr.json
├── supabase
│   └── tables.sql
├── compare.py
├── TUTORIAL.md
├── TUTORIAL_OSM.md
├── README.md
└── WORKSHOPS.md

/bin/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
liberapay: trouver-une-fresque
--------------------------------------------------------------------------------
/.flox/.gitignore:
--------------------------------------------------------------------------------
run/
cache/
lib/
log/
!env/
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/db/__init__.py:
--------------------------------------------------------------------------------
from .main import main
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/apis/__init__.py:
--------------------------------------------------------------------------------
from .main import main
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/scraper/__init__.py:
--------------------------------------------------------------------------------
from .main import main
--------------------------------------------------------------------------------
/.flox/env.json:
--------------------------------------------------------------------------------
{
  "name": "trouver-une-fresque-scraper",
  "version": 1
}
--------------------------------------------------------------------------------
/.flox/.gitattributes:
--------------------------------------------------------------------------------
env/manifest.lock linguist-generated=true linguist-language=JSON
--------------------------------------------------------------------------------
/push_to_db.py:
--------------------------------------------------------------------------------
from trouver_une_fresque_scraper.db import main

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
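push_to_db.py is a thin wrapper around trouver_une_fresque_scraper.db.main, which parses the command line itself. Assuming a populated config.json, a typical run looks like this (the results filename is just an example, reused from CONTRIBUTING.md):

```console
python push_to_db.py --input results/events_20240130_121930.json
```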
/config.json.dist:
--------------------------------------------------------------------------------
{
    "webdriver": "",
    "host": "",
    "port": "",
    "user": "",
    "psw": "",
    "database": "",
    "timezone": "Europe/Paris"
}
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/utils.py:
--------------------------------------------------------------------------------
import json


def get_config(key=None):
    """Load config.json and return either the whole config dict or a single key."""
    with open("config.json", "r") as config_file:
        credentials = json.load(config_file)
    if key is not None:
        return credentials.get(key)
    return credentials
--------------------------------------------------------------------------------
/tests/scrape_test.py:
--------------------------------------------------------------------------------
from trouver_une_fresque_scraper.apis import ics_test
from trouver_une_fresque_scraper.utils import date_and_time_test
from trouver_une_fresque_scraper.utils import language_test


if __name__ == "__main__":
    ics_test.run_tests()
    date_and_time_test.run_tests()
    language_test.run_tests()
--------------------------------------------------------------------------------
/loop.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env zsh
while true
do
    python -m trouver_une_fresque_scraper.scrape --skip-dirty-check
    if [ $? != 0 ]; then  # the command failed (returned a non-zero exit code)
        echo "Command failed, retrying..."
        sleep 5  # wait for 5 seconds before retrying
    else
        break  # the command succeeded, exit the loop
    fi
done
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        args: ["--line-length", "100"]
        # It is recommended to specify the latest version of Python
        # supported by your project here, or alternatively use
        # pre-commit's default_language_version, see
        # https://pre-commit.com/#top_level-default_language_version
        language_version: python3.13
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pdf
*.sql
*.json
bin
supabase
results
*.log
.venv
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# pre-commit
.pre-commit-config.yaml
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "trouver-une-fresque"
version = "0.1.0"
description = "Detection of workshops raising awareness of climate and social issues"
readme = "README.md"
requires-python = ">=3.12"

dependencies = [
    "geopy>=2.4.1",
    "ics>=0.7.2",
    "numpy>=2.1.3",
    "pandas>=2.2.3",
    "psycopg[binary,pool]>=3.2.3",
    "python-dateutil>=2.9.0.post0",
    "requests>=2.32.3",
    "selenium>=4.26.1",
    "tabulate>=0.9.0",
    "langdetect",
    "pre-commit",
]

[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["src"]
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/apis/main.py:
--------------------------------------------------------------------------------
import pandas as pd

from trouver_une_fresque_scraper.apis.ics import get_ics_data
from trouver_une_fresque_scraper.apis.glorieuses import get_glorieuses_data
from trouver_une_fresque_scraper.apis.mobilite import get_mobilite_data

APIS_FNS = {
    "hook.eu1.make.com": get_glorieuses_data,
    "calendar.google.com/calendar/ical": get_ics_data,
    "framagenda.org/remote.php/dav": get_ics_data,
    "app.fresquedelamobilite.org": get_mobilite_data,
}


def main(apis):
    records = []

    for sourcek in APIS_FNS:
        for api in apis:
            if sourcek in api["url"]:
                records += APIS_FNS[sourcek](api)

    return pd.DataFrame(records)
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing

Before contributing to this project, please make sure that your git config is correct:

```console
git config --global user.name "John Doe"
git config --global user.email johndoe@example.com
```

Follow the installation instructions [here](https://pre-commit.com/#install) to install `pre-commit`. Then run `pre-commit install` to set up the git hooks, which run the Black code formatter.

```console
pre-commit install
pre-commit run --all-files
```

If you change scraping logic, make sure to run the `compare.py` utility to compare the number of records scraped with and without your proposed modification.

```console
python compare.py results/events_20240125_194439.json results/events_20240130_121930.json
```
--------------------------------------------------------------------------------
/countries/uk.json:
--------------------------------------------------------------------------------
[
    {
        "name": "Circular Economy Collage",
        "url": "https://www.eventbrite.fr/o/fresque-de-leconomie-circulaire-68155531313",
        "type": "scraper",
        "id": 300
    },
    {
        "name": "Planetary Boundaries Fresco",
        "url": "https://1erdegre.glide.page/dl/3b1bc8",
        "type": "scraper",
        "filter": "Fresque des frontières planétaires",
        "id": 500
    },
    {
        "name": "Digital Collage",
        "url": "https://www.billetweb.fr/multi_event.php?multi=11442",
        "type": "scraper",
        "iframe": "eventu84999",
        "id": 3
    },
    {
        "name": "Digital Collage",
        "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique",
        "type": "scraper",
        "iframe": "event41180",
        "id": 3
    },
    {
        "name": "Biodiversity Collage",
        "url": "https://www.billetweb.fr/multi_event.php?user=82762",
        "type": "scraper",
        "iframe": "event17309",
        "id": 2
    }
]
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/apis/ics_test.py:
--------------------------------------------------------------------------------
import logging

from trouver_une_fresque_scraper.apis import ics


def run_tests():
    long_url = "https://www.eventbrite.com/e/2tonnes-world-workshop-in-basel-switzerland-tickets-1116862910029?aff=odcleoeventsincollection&keep_tld=1"
    test_cases = [
        ("text_url", long_url, long_url),
        (
            "html_with_extra_text",
            'Tickets here: <a href="http://result">registration</a>. Come and have fun!',
            "http://result",
        ),
        ("text_and_url", "Lien d'inscription : http://result.org", "http://result.org"),
        (
            "more_text_and_url",
            "Fresque du sol animée en ligne.\nInscription obligatoire https://www.billetweb.fr/fresque-du-sol-en-ligne11\nContact si besoin noone@nowhere.fr.",
            "https://www.billetweb.fr/fresque-du-sol-en-ligne11",
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual = ics.get_ticketing_url_from_description(test_case[1])
        if actual == test_case[2]:
            logging.info("Result matches")
        else:
            logging.error(f"{test_case[0]}: expected {test_case[2]} but got {actual}")
--------------------------------------------------------------------------------
/supabase/tables.sql:
--------------------------------------------------------------------------------
create table "private"."events_future" (
    "id" character varying,
    "workshop_type" bigint,
    "title" text,
    "description" text,
    "online" boolean,
    "training" boolean,
    "sold_out" boolean,
    "kids" boolean,
    "start_date" timestamptz,
    "end_date" timestamptz,
    "zip_code" character varying,
    "latitude" character varying,
    "longitude" character varying,
    "source_link" character varying,
    "tickets_link" character varying,
    "country_code" character varying,
    "department" character varying,
    "city" character varying,
    "address" character varying,
    "location_name" character varying,
    "full_location" character varying,
    "language_code" character varying,
    "scrape_date" timestamp with time zone,
    "most_recent" boolean default false
);

create table "private"."events_scraped" (
    like "private"."events_future"
);

create view "public"."events" as (
    select * from "private"."events_future"
    union all
    select * from "private"."events_scraped" where most_recent = true
);

alter table "private"."events_future" enable row level security;
alter table "private"."events_scraped" enable row level security;
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/language.py:
--------------------------------------------------------------------------------
import logging

from trouver_une_fresque_scraper.utils.errors import FreskLanguageNotRecognized
from langdetect import detect


LANGUAGE_STRINGS = {
    "Allemand": "de",
    "Anglais": "en",
    "Deutsch": "de",
    "Englisch": "en",
    "English": "en",
    "Französisch": "fr",
    "Français": "fr",
    "French": "fr",
    "German": "de",
    "Indonesian": "id",
    "Italien": "it",
    "Italian": "it",
    "Spanish": "es",
    "Russian": "ru",
}


def detect_language_code(title, description):
    """
    Returns the language code of the language specified in the title if any,
    otherwise auto-detects from title and description.
    """
    title_upper = title.upper()
    for language_string, language_code in LANGUAGE_STRINGS.items():
        if language_string.upper() in title_upper:
            return language_code
    language_code = detect(title + description)
    if language_code in LANGUAGE_STRINGS.values():
        return language_code
    logging.warning(f"Unexpected language code: {language_code}.")
    return None


def get_language_code(language_text):
    """
    Returns the ISO 639-1 language code given a human-readable string such as "Français" or "English".
    """
    language_code = LANGUAGE_STRINGS.get(language_text)
    if not language_code:
        raise FreskLanguageNotRecognized(language_text)
    return language_code
--------------------------------------------------------------------------------
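As an illustration (strings adapted from the test cases in language_test.py), detection first looks for an explicit language name in the title and only falls back to langdetect when none matches:

```python
from trouver_une_fresque_scraper.utils.language import detect_language_code, get_language_code

# Title contains the keyword "English": returned directly, no auto-detection.
detect_language_code("Biodiversity Collage (English) - AMSTERDAM", "")  # -> "en"

# No language name in the title: langdetect runs on title + description.
detect_language_code(
    "CHILE - PROVIDENCIA",
    "El Mural de la Biodiversidad es un taller lúdico y colaborativo...",
)  # -> "es"

# Strict lookup for a human-readable label; unknown labels raise FreskLanguageNotRecognized.
get_language_code("Français")  # -> "fr"
```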
/src/trouver_une_fresque_scraper/db/records.py:
--------------------------------------------------------------------------------
import pandas as pd

from zoneinfo import ZoneInfo
from trouver_une_fresque_scraper.utils.utils import get_config


def get_record_dict(
    uuid,
    ids,
    title,
    start_datetime,
    end_datetime,
    full_location,
    location_name,
    address,
    city,
    department,
    zip_code,
    country_code,
    latitude,
    longitude,
    language_code,
    online,
    training,
    sold_out,
    kids,
    event_link,
    tickets_link,
    description,
):
    timezone = get_config("timezone")
    origin_tz = ZoneInfo(timezone)

    return {
        "id": uuid,
        "workshop_type": ids,
        "title": title,
        "start_date": start_datetime.replace(tzinfo=origin_tz).isoformat(),
        "end_date": end_datetime.replace(tzinfo=origin_tz).isoformat(),
        "full_location": full_location,
        "location_name": location_name.strip(),
        "address": address.strip(),
        "city": city.strip(),
        "department": department,
        "zip_code": zip_code,
        "country_code": country_code,
        "latitude": latitude,
        "longitude": longitude,
        "language_code": (
            language_code.strip() if language_code and language_code.strip() else "fr"
        ),
        "online": online,
        "training": training,
        "sold_out": sold_out,
        "kids": kids,
        "source_link": event_link,
        "tickets_link": tickets_link,
        "description": description,
        "scrape_date": pd.to_datetime("now", utc=True).tz_convert(timezone).isoformat(),
    }
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/db/main.py:
--------------------------------------------------------------------------------
import json
import argparse
import pandas as pd
import psycopg

from psycopg.conninfo import make_conninfo

from trouver_une_fresque_scraper.db.etl import etl, insert, truncate
from trouver_une_fresque_scraper.utils.utils import get_config


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--full-etl",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="perform the full ETL cycle, including truncating future events (disable with --no-full-etl)",
    )
    parser.add_argument(
        "--truncate-first",
        action="store_true",
        default=False,
        help="truncate db before inserting again (only with --no-full-etl)",
    )
    parser.add_argument(
        "--input",
        type=str,
        help="input json file to be inserted in db",
        required=True,
    )
    args = parser.parse_args()

    if args.full_etl and args.truncate_first:
        raise ValueError("--truncate-first only makes sense together with --no-full-etl")

    credentials = get_config()
    host = credentials["host"]
    port = credentials["port"]
    user = credentials["user"]
    psw = credentials["psw"]
    database = credentials["database"]

    with psycopg.connect(
        make_conninfo(dbname=database, user=user, password=psw, host=host, port=port)
    ) as conn:
        with open(args.input, "r") as input_file:
            input_records = json.load(input_file)
        df = pd.DataFrame.from_dict(pd.json_normalize(input_records), orient="columns")
        print(df)

        if args.full_etl:
            etl(conn, df)
        else:
            if args.truncate_first:
                truncate(conn, "private.events_future")
            insert(conn, df, "private.events_future")
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/keywords.py:
--------------------------------------------------------------------------------
def is_training(input_string):
    training_list = [
        "formation",
        "briefing",
        "animateur",
        "animation",
        "permanence",
        "training",
        "return of experience",
        "retex",
    ]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in training_list)


def is_online(input_string):
    online_list = ["online", "en ligne", "distanciel", "en linea"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in online_list)


def is_for_kids(input_string):
    kids_list = ["kids", "junior", "jeunes"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in kids_list)


def has_external_tickets(input_string):
    external_tickets = [
        "inscriptions uniquement",
        "inscription uniquement",
        "inscriptions via",
        "inscription via",
    ]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in external_tickets)


def is_plenary(input_string):
    plenary = ["plénière"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in plenary)


def is_sold_out(input_string):
    sold_out = ["complet"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in sold_out)


def is_gift_card(input_string):
    gift = ["cadeau", "don"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in gift)


def is_canceled(input_string):
    canceled = ["annulé"]
    input_string = input_string.lower()
    return any(word.lower() in input_string for word in canceled)
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/errors.py:
--------------------------------------------------------------------------------
class FreskError(Exception):
    pass


class FreskDateNotFound(FreskError):
    def __init__(self):
        self.message = "Date not found."
        super().__init__(self.message)


class FreskDateBadFormat(FreskError):
    def __init__(self, input_str: str):
        self.message = f"Date has a bad format, unhandled by TuF (input: {input_str})."
        super().__init__(self.message)


class FreskDateDifferentTimezone(FreskError):
    def __init__(self, input_str: str):
        self.message = f"Date has a different timezone, unhandled by TuF (input: {input_str})."
        super().__init__(self.message)


class FreskAddressNotFound(FreskError):
    def __init__(self, input_str: str):
        self.message = f"Address not found (input: {input_str})."
        super().__init__(self.message)


class FreskAddressBadFormat(FreskError):
    def __init__(self, address: str, input_str: str, attribute: str):
        self.message = f'Address "{address}" has a bad {attribute} format, unhandled by TuF (input: {input_str}).'
        super().__init__(self.message)


class FreskAddressIncomplete(FreskError):
    def __init__(self, address: str, input_str: str, missing_attribute: str):
        self.message = (
            f'Address "{address}" has a missing attribute {missing_attribute} (input: {input_str}).'
        )
        super().__init__(self.message)


class FreskDepartmentNotFound(FreskError):
    def __init__(self, department: str):
        self.message = f"Department {department} not recognized."
        super().__init__(self.message)


class FreskCountryNotSupported(FreskError):
    def __init__(self, address: str, input_str: str):
        self.message = (
            f'Address "{address}" is not located in a supported country (input: {input_str}).'
        )
        super().__init__(self.message)


class FreskLanguageNotRecognized(FreskError):
    def __init__(self, language_text: str):
        self.message = f'Language "{language_text}" is not recognized.'
        super().__init__(self.message)
--------------------------------------------------------------------------------
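The keyword helpers are plain substring checks on the lowercased input; a few illustrative calls (the titles are made up, but the trigger words come from the lists above):

```python
from trouver_une_fresque_scraper.utils import keywords

keywords.is_training("Formation à l'animation de la Fresque du Climat")  # True, matches "formation"
keywords.is_online("Fresque de l'Eau - atelier en ligne")                # True, matches "en ligne"
keywords.is_for_kids("Fresque du Climat Junior")                         # True, matches "junior"
keywords.is_sold_out("COMPLET - Fresque de la Mobilité")                 # True, matches "complet"
```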
/src/trouver_une_fresque_scraper/scraper/main.py:
--------------------------------------------------------------------------------
import os
import pandas as pd

from trouver_une_fresque_scraper.scraper.fdc import get_fdc_data
from trouver_une_fresque_scraper.scraper.fec import get_fec_data
from trouver_une_fresque_scraper.scraper.billetweb import get_billetweb_data
from trouver_une_fresque_scraper.scraper.eventbrite import get_eventbrite_data
from trouver_une_fresque_scraper.scraper.glide import get_glide_data
from trouver_une_fresque_scraper.scraper.helloasso import get_helloasso_data
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service
from trouver_une_fresque_scraper.utils.utils import get_config

SCRAPER_FNS = {
    "billetweb.fr": get_billetweb_data,
    "climatefresk.org": get_fdc_data,
    "eventbrite.fr": get_eventbrite_data,
    "fresqueduclimat.org": get_fdc_data,
    "lafresquedeleconomiecirculaire.com": get_fec_data,
    "1erdegre.glide.page": get_glide_data,
    "helloasso.com": get_helloasso_data,
}


def get_webdriver_executable():
    webdriver_path = get_config("webdriver")
    if not webdriver_path and "WEBDRIVER_PATH" in os.environ:
        webdriver_path = os.environ["WEBDRIVER_PATH"]
    return webdriver_path


def main(scrapers, headless=False):
    records = []

    # geckodriver
    service = Service(executable_path=get_webdriver_executable())

    # Firefox
    options = FirefoxOptions()
    if "BROWSER_PATH" in os.environ:
        options.binary_location = os.environ["BROWSER_PATH"]
    options.set_preference("intl.accept_languages", "en-us")

    if headless:
        options.add_argument("-headless")

    sorted_workshops = {}

    # Make sure that we have a scraper available for each fresk entry
    for sourcek, fn_value in SCRAPER_FNS.items():
        for workshop in scrapers:
            if sourcek in workshop["url"]:
                # Organize fresks by values in SCRAPER_FNS
                if fn_value not in sorted_workshops:
                    sorted_workshops[fn_value] = []
                sorted_workshops[fn_value].append(workshop)

    for fn_key, sourcev in sorted_workshops.items():
        records += fn_key(sourcev, service=service, options=options)

    return pd.DataFrame(records)
--------------------------------------------------------------------------------
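Entries from countries/*.json are routed to a scraper by substring-matching their url against SCRAPER_FNS. A minimal sketch, assuming config.json points at a geckodriver binary (or WEBDRIVER_PATH is set); the workshop entry is copied from countries/ch.json:

```python
from trouver_une_fresque_scraper.scraper import main as scrape_main

workshops = [
    {
        "name": "Fresque du Numérique",
        "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique",
        "type": "scraper",
        "iframe": "event41180",
        "id": 3,
    }
]
# "billetweb.fr" appears in the URL, so this entry is handled by get_billetweb_data.
df = scrape_main(workshops, headless=True)
print(df.head())
```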
/src/trouver_une_fresque_scraper/utils/language_test.py:
--------------------------------------------------------------------------------
import logging


from trouver_une_fresque_scraper.utils import language


def run_tests():
    test_cases = [
        (
            "FdB es",
            "CHILE - PROVIDENCIA",
            "El Mural de la Biodiversidad es un taller lúdico y colaborativo que permite sensibilizar sobre la importancia de la biodiversidad y las causas y consecuencias de su erosión. Durante este taller, descubrirás cómo funcionan los ecosistemas, cómo los humanos interactuamos con la biodiversidad y por qué la biodiversidad es crucial para el bienestar del ser humano.",
            "es",
        ),
        (
            "FdB en",
            "Biodiversity Collage (NL) - AMSTERDAM",
            "The Biodiversity Collage is a fun and collaborative workshop that aims to raise awareness about the importance of biodiversity. With a set of cards based on the IPBES reports, you will:",
            "en",
        ),
        (
            "FdB ru",
            "ONLINE BIODIVERSITY COLLAGE WORKSHOP (RU) - with Ivan Ivanovich (CET)",
            "Workshop in Russian Коллаж биоразнообразия — это увлекательный командный воркшоп, который помогает разобраться, почему биоразнообразие критически важно для жизни на Земле и что грозит нашей планете и людям на ней в случае его утраты. В формате совместной работы участники узнают:",
            "ru",
        ),
        (
            "FdN it",
            "ONLINE DIGITAL COLLAGE WORKSHOPS IN ITALIAN - Sessione online con Mario Rossi e Corrado Romano",
            "Il Digital Collage è un workshop ludico e collaborativo. L'obiettivo del workshop è di sensibilizzare e formare i partecipanti sui problemi ambientali e sociali delle tecnologie digitali. Il workshop si propone anche di delineare soluzioni per una maggiore sostenibilità nelle tecnologie digitali e quindi ad aprire discussioni tra i partecipanti sull'argomento.",
            "it",
        ),
        (
            "PlanetC de",
            "Zuerich, Planet C (German)",
            'Registration',
            "de",
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual = language.detect_language_code(test_case[1], test_case[2])
        if actual == test_case[3]:
            logging.info("Result matches")
        else:
            logging.error(f"{test_case[0]}: expected {test_case[3]} but got {actual}")
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/db/etl.py:
--------------------------------------------------------------------------------
import psycopg


def update_most_recent(conn, table):
    query = f"""
    WITH MissingRows AS (
        SELECT S."id", S."workshop_type", MAX(S."scrape_date") AS max_scrape_date
        FROM {table} S
        LEFT JOIN private.events_future F
        ON S."id" = F."id" AND S."workshop_type" = F."workshop_type"
        WHERE F."id" IS NULL
        GROUP BY S."id", S."workshop_type"
    )
    UPDATE {table} S
    SET "most_recent" = TRUE
    FROM MissingRows M
    WHERE S."id" = M."id" AND S."workshop_type" = M."workshop_type" AND S."scrape_date" = M.max_scrape_date AND S."start_date" < current_timestamp;
    """
    cursor = conn.cursor()
    print(query)
    try:
        cursor.execute(query)
        conn.commit()
    except (Exception, psycopg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()


def insert(conn, df, table, most_recent=False):
    df["most_recent"] = most_recent
    tuples = [tuple(x) for x in df.to_numpy()]
    cols = ",".join(list(df.columns))
    # One %s placeholder per column, instead of a hard-coded list of 24
    placeholders = ",".join(["%s"] * len(df.columns))

    print(list(df.columns))

    # SQL query to execute
    cursor = conn.cursor()
    try:
        cursor.executemany(
            "INSERT INTO %s(%s) VALUES (%s)" % (table, cols, placeholders),
            tuples,
            returning=True,
        )
        conn.commit()
    except (Exception, psycopg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()


def truncate(conn, table):
    query = "TRUNCATE TABLE %s" % table
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        conn.commit()
    except (Exception, psycopg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    cursor.close()


def etl(conn, df):
    df = df.astype(str)

    # Insert all events into the historical table. most_recent starts as False;
    # the call to `update_most_recent()` below may flip it.
    insert(conn, df, "private.events_scraped", most_recent=False)

    # Delete all future events before inserting them again, so that they are
    # updated
    truncate(conn, "private.events_future")
    insert(conn, df, "private.events_future", most_recent=True)

    update_most_recent(conn, "private.events_scraped")
--------------------------------------------------------------------------------
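Downstream, the public.events view defined in supabase/tables.sql unions the refreshed future events with the most recent historical scrape of past ones. An illustrative query (the city value is arbitrary):

```sql
-- Upcoming workshops in one city, ordered by start date.
select title, start_date, city, tickets_link
from public.events
where city = 'Nantes'
  and start_date > now()
order by start_date;
```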
/.flox/env/manifest.toml:
--------------------------------------------------------------------------------
version = 1

[install]
python.pkg-path = "python3"
python.version = "3.13"
uv.pkg-path = "uv"
zlib.pkg-path = "zlib"
gcc-unwrapped.pkg-path = "gcc-unwrapped"
firefox-esr.pkg-path = "firefox-esr"
geckodriver.pkg-path = "geckodriver"
postgresql.pkg-path = "postgresql"


# helper tools
gum.pkg-path = "gum"
coreutils.pkg-path = "coreutils"


[vars]

[hook]
on-activate = '''
  # Dynamically set WEBDRIVER_PATH required for the scraper
  export WEBDRIVER_PATH="$(which geckodriver)"
  export BROWSER_PATH="$(which firefox-esr)"

  # Flox stuff
  export FLOX_PYTHON_UV_CACHE_DIR="$FLOX_ENV_CACHE/python-uv"
  mkdir -p "$FLOX_PYTHON_UV_CACHE_DIR"

  export FLOX_PYTHON_UV_VENV_PATH="$FLOX_PYTHON_UV_CACHE_DIR/venv"
  export FLOX_PYTHON_UV_VENV_INTERPRETER="$(cat "$FLOX_PYTHON_UV_CACHE_DIR/venv.interpreter" 2> /dev/null || echo false )"
  export FLOX_PYTHON_UV_INTERPRETER="$(realpath $(which python3))"

  # Make sure any tools are not attempting to use the Python interpreter from any
  # existing virtual environment.
  unset VIRTUAL_ENV

  export UV_PROJECT_ENVIRONMENT="$FLOX_PYTHON_UV_VENV_PATH"

  function indent() {
    echo -e '{{ Foreground "#cccccc" " │ "}}' | \
      gum format -t template --theme=auto
  }

  function with_spinner() {
    if [[ "$FLOX_ENVS_TESTING" == "1" ]]; then
      bash -c "$1"
    else
      echo
      gum spin \
        --show-error \
        --spinner line \
        --spinner.foreground="#cccccc" \
        --title " >>> $2 ..." \
        --title.foreground="#cccccc" \
        -- bash -c "$1"
      echo -en "\033[2A\033[K"
    fi
  }

  function ensure_venv() {
    uv venv -p "$FLOX_PYTHON_UV_INTERPRETER" "$FLOX_PYTHON_UV_VENV_PATH"
    source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate"
  }
  export -f ensure_venv

  function install_packages() {
    uv sync
  }
  export -f install_packages



  indent && echo
  indent && echo

  if [ "$FLOX_PYTHON_UV_VENV_INTERPRETER" != "$FLOX_PYTHON_UV_INTERPRETER" ]; then
    with_spinner ensure_venv "Creating virtual environment"
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ Virtual environment was created.\" }}\n" \
      | gum format -t template
  else
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ Virtual environment already exists.\" }}\n" \
      | gum format -t template
  fi

  indent && echo

  if [ -f pyproject.toml ]; then
    with_spinner install_packages "Installing Python packages"
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ Python packages installed.\" }}\n" \
      | gum format -t template
  else
    indent && echo -e "{{ Foreground \"#ffffff\" \"✅ No need to install Python packages.\" }}\n" \
      | gum format -t template
  fi

  indent && echo

'''

[profile]
bash = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate"
'''
fish = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate.fish"
'''
tcsh = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate.csh"
'''
zsh = '''
  source "$FLOX_PYTHON_UV_VENV_PATH/bin/activate"
'''

[options]
systems = [
  "aarch64-darwin",
  "aarch64-linux",
  "x86_64-darwin",
  "x86_64-linux",
]
--------------------------------------------------------------------------------
/compare.py:
--------------------------------------------------------------------------------
import json
import sys
import logging
from tabulate import tabulate

workshop_types = {
    0: "FresqueNouveauxRecits",
    1: "FresqueOceane",
    2: "FresqueBiodiversite",
    3: "FresqueNumerique",
    4: "FresqueAgriAlim",
    5: "FresqueAlimentation",
    6: "FresqueConstruction",
    7: "FresqueMobilite",
    8: "FresqueSexisme",
    9: "OGRE",
    10: "AtelierInventonsNosViesBasCarbone",
    11: "FresqueDeLeau",
    12: "FutursProches",
    13: "FresqueDiversite",
    14: "FresqueDuTextile",
    15: "FresqueDesDechets",
    16: "PuzzleClimat",
    17: "FresqueDeLaFinance",
    18: "FresqueDeLaRSE",
    19: "AtelierDesTransitionsUrbaines",
    100: "2tonnes",
    101: "CompteGouttes",
    102: "FresqueDuBénévolat",
    103: "FresqueDuPlastique",
    200: "FresqueClimat",
    300: "FresqueEcoCirculaire",
    500: "FresqueFrontieresPlanetaires",
    501: "HorizonsDecarbones",
    600: "2030Glorieuses",
    700: "FresqueDeLaRénovation",
    701: "FresqueDeLEnergie",
    702: "FresqueDesPossibles",
    703: "FresqueDeLaCommunication",
    704: "Zoofresque",
    705: "NotreTour",
    800: "PlanetCPlayAgain?",
    801: "FresqueDuSol",
}


def get_json(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON in file: {file_path}")
        return []


def count_workshop_types(data):
    records = {}
    for record in data:
        if record["workshop_type"] in records:
            records[record["workshop_type"]] += 1
        else:
            records[record["workshop_type"]] = 1
    return records


def display_workshop_types(counts):
    for workshop_type, count in counts.items():
        logging.info(f"{workshop_types[workshop_type]}: {count} events")
    logging.info("---------")


def display_table_workshop_types(counts1, counts2):
    table = []
    for workshop_id, workshop_type in workshop_types.items():
        count1 = counts1.get(workshop_id, 0)
        count2 = counts2.get(workshop_id, 0)
        table.append([workshop_type, count1, count2, count2 - count1])
    return table


def main():
    logging.basicConfig(level=logging.INFO)

    # Check if the correct number of arguments is provided
    if len(sys.argv) != 3:
        logging.error("Usage: python compare.py <results_file1.json> <results_file2.json>")
        sys.exit(1)

    # Get file paths from command-line arguments
    file1_path = sys.argv[1]
    file2_path = sys.argv[2]

    # Count entries in each file
    json1 = get_json(file1_path)
    json2 = get_json(file2_path)

    records1 = count_workshop_types(json1)
    records2 = count_workshop_types(json2)

    # display_workshop_types(records1)
    # display_workshop_types(records2)

    headers = ["Workshop", file1_path, file2_path, "Delta"]
    table = display_table_workshop_types(records1, records2)
    totals1 = sum(row[1] for row in table)
    totals2 = sum(row[2] for row in table)
    table.append(["====Totals====", totals1, totals2, totals2 - totals1])
    print(tabulate(table, headers, tablefmt="fancy_grid"))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
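compare.py prints a tabulate table with one row per workshop type plus a totals row. The excerpt below is purely illustrative: the counts are invented, most rows are omitted, and the header paths are shortened for display (real headers show the two input file paths in full):

```console
$ python compare.py results/events_20240125_194439.json results/events_20240130_121930.json
╒════════════════╤═══════════════╤═══════════════╤═════════╕
│ Workshop       │ before.json   │ after.json    │   Delta │
╞════════════════╪═══════════════╪═══════════════╪═════════╡
│ FresqueClimat  │ 312           │ 327           │      15 │
├────────────────┼───────────────┼───────────────┼─────────┤
│ FresqueOceane  │ 41            │ 38            │      -3 │
├────────────────┼───────────────┼───────────────┼─────────┤
│ ====Totals==== │ 1450          │ 1471          │      21 │
╘════════════════╧═══════════════╧═══════════════╧═════════╛
```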
/TUTORIAL.md:
--------------------------------------------------------------------------------
# Tutorial for fresque organizers

This project collects the data of your events published online. If an event does not appear on the Trouver une Fresque platform, please check the points below. If one of the conditions is not met, please update your event before contacting us.

Jump to the section matching the platform on which your event is published. In any case, be patient: events are refreshed once every 4 days on average. If you post an event, it will not appear on Trouver une Fresque immediately!

- [Billetweb.fr](#billetwebfr)
- [Eventbrite.fr](#eventbritefr)
- [Fresqueduclimat.org](#fresqueduclimatorg)
- [Lafresquedeleconomiecirculaire.com](#lafresquedeleconomiecirculairecom)
- [Glide.page](#glidepage)

If, 1) after checking all the points listed, and 2) after waiting 4 days, your event still does not appear on the Trouver une Fresque platform, please read the [Opening an issue](#opening-an-issue) section at the bottom of this page.

## Billetweb.fr

### Date

Does the date, including at least the day and the start time, appear under the page title?

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### My workshop uses an external registration channel (email, phone, etc.) but appears as sold out!

If you do not use the Billetweb checkout module but an external channel to handle registrations (email, phone, etc.), please display the following wording in the checkout module: "Inscriptions uniquement via [...]" or "Inscriptions uniquement par [...]", adapted to your case, for example "Inscriptions uniquement par téléphone au 06xxxxxxxx".

### My workshop is aimed at juniors but is not flagged as such!

The keyword "junior" must appear in the workshop title.

## Eventbrite.fr

### Event page

Exactly ONE event must appear on the event page. This means that the "Select a date" or "Select a time" button must not appear on the page. A "Get tickets" button on the event page is a prerequisite for being listed.

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

## Fresqueduclimat.org

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### My workshop is aimed at juniors but is not flagged as such!

The keyword "junior" must appear in the workshop description.

## Lafresquedeleconomiecirculaire.com

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### Different timezone

Events that do not take place in France are not supported yet.

## Glide.page

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

## iCalendar / iCal / ICS

The format is supported by a large number of tools, including those listed on https://fr.wikipedia.org/wiki/ICalendar. Here is how to get the ICS link for two of them:

* [Framagenda](https://framagenda.org/): click the menu to the right of the calendar name, click the menu to the right of "Partager le lien", then choose the "Copier le lien pour s'abonner" option. Example: `https://framagenda.org/remote.php/dav/public-calendars/KwNwGA232xD38CnN/?export`.
* [Google Calendar](https://calendar.google.com): click the menu to the right of the calendar name, choose "Settings and sharing", then scroll down to "Public address in iCal format". Example: `https://calendar.google.com/calendar/ical/2fe1be9f8d5c073969bccaba14133699b71305877304056bee924ee0ef128977%40group.calendar.google.com/public/basic.ics`.

### Address

See the [OpenStreetMap tutorial](TUTORIAL_OSM.md).

### Registration link

The description must contain the registration link. If the description contains several links, links to well-known video-conferencing platforms are ignored. If several links still remain, the description must be in HTML format and exactly one of the links must contain the word "Inscriptions" or a close synonym.

## Opening an issue

If, 1) after checking all the points listed in the section for your publishing platform, and 2) after waiting 4 days, your event still does not appear on the Trouver une Fresque platform, please contact us by filling in [this form](https://github.com/trouver-une-fresque/trouver-une-fresque/issues/new). A [GitHub](https://github.com/signup) account is required.
--------------------------------------------------------------------------------
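For orientation, each entry in countries/*.json describes one event source. The field semantics below are inferred from the scraper code (scraper/main.py, apis/main.py, compare.py) rather than documented anywhere, so treat them as a best-effort reading:

```json
{
    "name": "Fresque du Climat (ateliers)",
    "url": "https://climatefresk.org/fr-ch/inscription-atelier/grand-public/",
    "type": "scraper",
    "id": 200
}
```

`type` selects the pipeline ("scraper" entries are dispatched through SCRAPER_FNS by substring-matching `url`, "api" entries through APIS_FNS), `id` is the workshop_type shared with the database and compare.py, and optional fields such as `iframe`, `filter`, and `language_code` are consumed by the individual scrapers.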
/countries/ch.json:
--------------------------------------------------------------------------------
[
    {
        "name": "Fresque des Nouveaux Récits",
        "url": "https://www.billetweb.fr/pro/fdnr",
        "type": "scraper",
        "iframe": "event21569",
        "id": 0
    },
    {
        "name": "Fresque Océane",
        "url": "https://www.billetweb.fr/pro/billetteriefo",
        "type": "scraper",
        "iframe": "event15247",
        "id": 1
    },
    {
        "name": "Fresque de la Biodiversité",
        "url": "https://www.billetweb.fr/multi_event.php?user=82762",
        "type": "scraper",
        "iframe": "event17309",
        "id": 2
    },
    {
        "name": "Fresque du Numérique",
        "url": "https://www.billetweb.fr/shop.php?event=suisse-atelier-fresque-du-numerique&color=5190f5&page=1&margin=no_margin",
        "type": "scraper",
        "iframe": "eventu84999",
        "id": 3
    },
    {
        "name": "Fresque du Numérique",
        "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique",
        "type": "scraper",
        "iframe": "event41180",
        "id": 3
    },
    {
        "name": "Fresque Agri'Alim",
        "url": "https://www.billetweb.fr/pro/fresqueagrialim",
        "type": "scraper",
        "iframe": "event11421",
        "id": 4
    },
    {
        "name": "Fresque de l'Alimentation",
        "url": "https://www.billetweb.fr/pro/fresquealimentation",
        "type": "scraper",
        "iframe": "event11155",
        "id": 5
    },
    {
        "name": "Fresque de la Construction",
        "url": "https://www.billetweb.fr/pro/fresquedelaconstruction",
        "type": "scraper",
        "iframe": "event11574",
        "id": 6
    },
    {
        "name": "Fresque de la Mobilité",
        "url": "https://www.billetweb.fr/pro/fresquedelamobilite",
        "type": "scraper",
        "iframe": "event11698",
        "id": 7
    },
    {
        "name": "Fresque du Sexisme",
        "url": "https://www.billetweb.fr/pro/fresque-du-sexisme",
        "type": "scraper",
        "iframe": "event27112",
        "id": 8
    },
    {
        "name": "Atelier OGRE",
        "url": "https://www.billetweb.fr/pro/atelierogre",
        "type": "scraper",
        "iframe": "event13026",
        "id": 9
    },
    {
        "name": "Fresque de l'Eau",
        "url": "https://www.billetweb.fr/multi_event.php?user=138110",
        "type": "scraper",
        "iframe": "eventu138110",
        "id": 11
    },
    {
        "name": "Fresque du Textile",
        "url": "https://www.billetweb.fr/multi_event.php?user=166793",
        "type": "scraper",
        "iframe": "event27458",
        "filter": "textile",
        "id": 14
    },
    {
        "name": "Fresque des Déchets",
        "url": "https://calendar.google.com/calendar/ical/greendonut.info%40gmail.com/public/basic.ics",
        "type": "api",
        "id": 15
    },
    {
        "name": "Puzzle Climat",
        "url": "https://www.billetweb.fr/multi_event.php?user=121600",
        "type": "scraper",
        "iframe": "event21038",
        "id": 16
    },
    {
        "name": "Fresque de la RSE",
        "url": "https://www.billetweb.fr/pro/fresque",
        "type": "scraper",
        "iframe": "event35904",
        "id": 18
    },
    {
        "name": "2tonnes",
        "url": "https://www.eventbrite.com/cc/ateliers-grand-public-en-presentiel-hors-france-2157189",
        "type": "scraper",
        "id": 100
    },
    {
        "name": "Fresque du Plastique",
        "url": "https://www.eventbrite.fr/o/la-fresque-du-plastique-45763194553",
        "type": "scraper",
        "id": 103
    },
    {
        "name": "Fresque du Climat (ateliers)",
        "url": "https://climatefresk.org/fr-ch/inscription-atelier/grand-public/",
        "type": "scraper",
        "id": 200
    },
    {
        "name": "Climate Fresk (workshops)",
        "url": "https://climatefresk.org/de-ch/workshop-anmeldung/offentlichkeit/",
        "type": "scraper",
        "id": 200
    },
    {
        "name": "Fresque de l'Economie Circulaire",
        "url": "https://www.billetweb.fr/pro/lafresquedeleconomiecirculaire",
        "language_code": "fr",
        "type": "scraper",
        "iframe": "event41148",
        "id": 300
    },
    {
        "name": "Circular Economy Collage",
        "url": "https://www.eventbrite.fr/o/fresque-de-leconomie-circulaire-68155531313",
        "type": "scraper",
        "id": 300
    },
    {
        "name": "Fresque des Frontières Planétaires (ateliers)",
        "url": "https://1erdegre.glide.page/dl/3b1bc8",
        "type": "scraper",
        "filter": "Fresque des frontières planétaires",
        "id": 500
    },
    {
        "name": "Planet C Play Again?",
        "url": "https://calendar.google.com/calendar/ical/2fe1be9f8d5c073969bccaba14133699b71305877304056bee924ee0ef128977%40group.calendar.google.com/public/basic.ics",
        "type": "api",
        "id": 800
    },
    {
        "name": "Notre Tour",
        "url": "https://www.helloasso.com/associations/mush",
        "type": "scraper",
        "id": 705
    },
    {
        "name": "Fresque du Sol",
        "url": "https://framagenda.org/remote.php/dav/public-calendars/KwNwGA232xD38CnN/?export",
        "type": "api",
        "id": 801
    }
]
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/utils/date_and_time_test.py:
--------------------------------------------------------------------------------
from datetime import datetime
import logging
from attrs import define


from trouver_une_fresque_scraper.utils import date_and_time


def run_get_dates_tests():
    # tuple fields:
    # 1. Test case name or ID
    # 2. Input date string
    # 3. Expected output start datetime
    # 4. Expected output end datetime
    test_cases = [
        (
            "BilletWeb: one hour",
            "Thu Oct 19, 2023 from 01:00 PM to 02:00 PM",
            datetime(2023, 10, 19, 13, 0),
            datetime(2023, 10, 19, 14, 0),
        ),
        (
            "BilletWeb: multiple months",
            "Thu Oct 19, 2023 at 01:00 PM to Sat Feb 24, 2024 at 02:00 PM",
            datetime(2023, 10, 19, 13, 0),
            datetime(2024, 2, 24, 14, 0),
        ),
        (
            "BilletWeb: single date and time",
            "March 7, 2025 at 10:00 AM",
            datetime(2025, 3, 7, 10, 0),
            datetime(2025, 3, 7, 13, 0),
        ),
        (
            "EventBrite",
            "ven. 11 avr. 2025 14:00 - 17:30 CEST",
            datetime(2025, 4, 11, 14, 0),
            datetime(2025, 4, 11, 17, 30),
        ),
        (
            "FdC French",
            "16 mai 2025, de 18h30 à 21h30 (heure de Paris)",
            datetime(2025, 5, 16, 18, 30),
            datetime(2025, 5, 16, 21, 30),
        ),
        (
            "FdC English: June 3",
            "June 03, 2025, from 05:30pm to 09:30pm (Paris time)",
            datetime(2025, 6, 3, 17, 30),
            datetime(2025, 6, 3, 21, 30),
        ),
        (
            "FdC English: October 28",
            "October 28, 2025, from 09:00am to 12:00pm (Zürich time)",
            datetime(2025, 10, 28, 9, 0),
            datetime(2025, 10, 28, 12, 0),
        ),
        (
            "FEC",
            "03 mars 2025, 14:00 – 17:00 UTC+1",
            datetime(2025, 3, 3, 14, 0),
            datetime(2025, 3, 3, 17, 0),
        ),
        (
            "Glide",
            "mercredi 12 février 2025 de 19h00 à 22h00",
            datetime(2025, 2, 12, 19, 0),
            datetime(2025, 2, 12, 22, 0),
        ),
        (
            "HelloAsso",
            "Le 12 février 2025, de 18h à 20h",
            datetime(2025, 2, 12, 18, 0),
            datetime(2025, 2, 12, 20, 0),
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual_start_time, actual_end_time = date_and_time.get_dates(test_case[1])
        if actual_start_time != test_case[2]:
            logging.error(f"{test_case[0]}: expected {test_case[2]} but got {actual_start_time}")
        if actual_end_time != test_case[3]:
            logging.error(f"{test_case[0]}: expected {test_case[3]} but got {actual_end_time}")


@define
class MockWebDriverElement:
    text: str
    dt: str | None

    def get_attribute(self, ignored: str) -> str | None:
        return self.dt


def run_get_dates_from_element_tests():
    # tuple fields:
    # 1. Test case name or ID
    # 2. Input date string
    # 3. Expected output start datetime
    # 4. Expected output end datetime
    test_cases = [
        (
            "BilletWeb: no datetime, fallback on text parsing",
            None,
            "Thu Oct 19, 2023 from 01:00 PM to 02:00 PM",
            datetime(2023, 10, 19, 13, 0),
            datetime(2023, 10, 19, 14, 0),
        ),
        (
            "EventBrite: morning",
            "2025-12-05",
            "déc. 5 de 8am à 11am UTC",
            datetime(2025, 12, 5, 8, 0),
            datetime(2025, 12, 5, 11, 0),
        ),
        (
            "EventBrite: evening",
            "2025-12-12",
            "déc. 12 de 6pm à 9pm UTC+1",
            datetime(2025, 12, 12, 18, 0),
            datetime(2025, 12, 12, 21, 0),
        ),
        (
            "EventBrite: afternoon in German",
            "2024-12-16",
            "Dez. 16 von 5nachm. bis 8nachm. UTC",
            datetime(2024, 12, 16, 17, 0),
            datetime(2024, 12, 16, 20, 0),
        ),
        (
            "EventBrite: afternoon with minutes in German",
            "2024-12-03",
            "Dez. 3 von 5:30nachm. bis 8:30nachm. MEZ",
            datetime(2024, 12, 3, 17, 30),
            datetime(2024, 12, 3, 20, 30),
        ),
        (
            "EventBrite: PM adds 12 to the hours only from 1 PM onwards",
            "2025-12-14",
            "déc. 14 de 9:30am à 12:30pm UTC+1",
            datetime(2025, 12, 14, 9, 30),
            datetime(2025, 12, 14, 12, 30),
        ),
        (
            "EventBrite: start and end minutes differ",
            "2026-01-21",
            "janv. 21 de 9am à 12:30pm UTC+1",
            datetime(2026, 1, 21, 9, 0),
            datetime(2026, 1, 21, 12, 30),
        ),
    ]
    for test_case in test_cases:
        logging.info(f"Running {test_case[0]}")
        actual_start_time, actual_end_time = date_and_time.get_dates_from_element(
            MockWebDriverElement(dt=test_case[1], text=test_case[2])
        )
        if actual_start_time != test_case[3]:
            logging.error(f"{test_case[0]}: expected {test_case[3]} but got {actual_start_time}")
        if actual_end_time != test_case[4]:
            logging.error(f"{test_case[0]}: expected {test_case[4]} but got {actual_end_time}")


def run_tests():
    run_get_dates_tests()
    run_get_dates_from_element_tests()
--------------------------------------------------------------------------------
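The test files are plain scripts rather than pytest suites: tests/scrape_test.py simply calls each module's run_tests() in turn. Since none of them call logging.basicConfig, the logging.info progress messages are suppressed by default and only logging.error output reaches stderr, so a silent run is a passing run:

```console
python tests/scrape_test.py
```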
f"https://app.fresquedelamobilite.org/atelier_details/{id_key}" 73 | 74 | ################################################################ 75 | # Parse start and end dates 76 | ################################################################ 77 | try: 78 | # Convert time strings to datetime objects 79 | event_start_datetime = datetime.strptime(date_key, "%Y-%m-%dT%H:%M:%S.%fZ") 80 | except Exception as e: 81 | logging.info(f"Rejecting record: bad date format {e}") 82 | continue 83 | 84 | try: 85 | event_end_datetime = event_start_datetime + timedelta(minutes=duration_key) 86 | except Exception as e: 87 | logging.info(f"Rejecting record: bad duration format {e}") 88 | continue 89 | 90 | ################################################################ 91 | # Location data 92 | ################################################################ 93 | full_location = "" 94 | location_name = "" 95 | address = "" 96 | city = "" 97 | department = "" 98 | longitude = "" 99 | latitude = "" 100 | zip_code = "" 101 | country_code = "" 102 | 103 | if not online: 104 | try: 105 | address_dict = get_address(address_key) 106 | ( 107 | location_name, 108 | address, 109 | city, 110 | department, 111 | zip_code, 112 | country_code, 113 | latitude, 114 | longitude, 115 | ) = address_dict.values() 116 | except json.JSONDecodeError: 117 | logging.info("Rejecting record: error while parsing API response") 118 | continue 119 | except FreskError as error: 120 | logging.info(f"Rejecting record: {error}.") 121 | continue 122 | 123 | ################################################################ 124 | # Building final object 125 | ################################################################ 126 | record = get_record_dict( 127 | f"{source['id']}-{event_id}", 128 | source["id"], 129 | title, 130 | event_start_datetime, 131 | event_end_datetime, 132 | full_location, 133 | location_name, 134 | address, 135 | city, 136 | department, 137 | zip_code, 138 | country_code, 139 | latitude, 140 | longitude, 141 | detect_language_code(title, description), 142 | online, 143 | is_training(type_key), 144 | sold_out, 145 | is_for_kids(perimetre_key), 146 | source_link, 147 | tickets_link, 148 | description, 149 | ) 150 | 151 | records.append(record) 152 | logging.info(f"Successfully API record\n{json.dumps(record, indent=4)}") 153 | 154 | return records 155 | -------------------------------------------------------------------------------- /TUTORIAL_OSM.md: -------------------------------------------------------------------------------- 1 | # Tutoriel OpenStreetMap (OSM) à destination des organisateurs de fresques 2 | 3 | La validité des adresses figurant sur les billeteries des fresques est vérifiée via les données OpenStreetMap (OSM), un projet collaboratif de cartographie en ligne qui vise à constituer une base de données géographiques libre du monde. 4 | 5 | Si votre atelier n'apparaît pas sur Trouver une Fresque, il y a de grandes chances que l'adresse renseignée soit invalide, ou non encore connue d'OSM. Voici un diagnostic à effectuer pour corriger le tir. 6 | 7 | - Rendez-vous sur [OpenStreetMap.org](https://www.openstreetmap.org). 8 | - Dans le champ de recherche en haut à gauche, copiez-collez l'adresse que vous avez renseignée sur votre atelier. Par exemple: `L'Epicerie d'ADDA, 18 Rue de Savenay, 44000 Nantes, France`. 9 | - Si il n'y a pas de résultat, votre adresse n'est pas reconnue par OSM et c'est la raison pour laquelle votre atelier n'apparaît pas sur notre plateforme. 
10 | 
11 | Merci de parcourir les sections suivantes dans l'ordre pour comprendre comment corriger votre adresse.
12 | 
13 | ## 1) Format de l'adresse
14 | 
15 | La première chose à vérifier est que votre adresse utilise un format classique, sans informations additionnelles qui devraient figurer ailleurs. Le nom du lieu est une information utile.
16 | 
17 | | Mauvais format | Correction |
18 | |----------|----------|
19 | | Chez moi, 1560 Rue Maurice Jacob, Lyon, France | 1560 Rue Maurice Jacob, Lyon, France |
20 | | Le Grand Bain, 20 Allée de la Maison Rouge, Nantes - Accessible aux PMR | Le Grand Bain, 20 Allée de la Maison Rouge, Nantes |
21 | | La Capsule, 1er étage, Bâtiment le Churchill, 3 rue du Président Rooselvelt 51100 Reims | La Capsule, Bâtiment le Churchill, 3 rue du Président Rooselvelt 51100 Reims |
22 | | La Ruche près du petit ruisseau, 24 Rue de l'Est 75020 Paris | La Ruche près du petit ruisseau, 24 Rue de l'Est 75020 Paris |
23 | | 84 Av. de Grammont, 84 Avenue de Grammont 37000 Tours | 84 Avenue de Grammont 37000 Tours |
24 | | La Galerie du Zéro Déchet, entrée Place Dulcie September, 5 Rue Fénelon, 44000 Nantes, France | La Galerie du Zéro Déchet, 5 Rue Fénelon, 44000 Nantes, France |
25 | 
26 | ## 2) bis/ter
27 | 
28 | Si votre adresse contient une particule bis ou ter, merci de formater votre adresse comme suit :
29 | 
30 | | Mauvais format | Correction |
31 | |----------|----------|
32 | | Mille club, 5T Rue Paul Serusier, Morlaix, France | Mille club, 5 ter Rue Paul Serusier, Morlaix, France |
33 | | Le Grand Bain, 20B Allée de la Maison Rouge, Nantes | Le Grand Bain, 20 bis Allée de la Maison Rouge, Nantes |
34 | 
35 | ## 3) Abréviations
36 | 
37 | Si votre adresse contient des abréviations, essayez d'utiliser le(s) mot(s) complet(s).
38 | 
39 | | Mauvais format | Correction |
40 | |----------|----------|
41 | | Palais du travail, 9 Pl. du Dr Lazare Goujon, 69100 Villeurbanne, France | Palais du travail, 9 Place du Docteur Lazare Goujon, 69100 Villeurbanne, France |
42 | | Melting Coop, 229 Cr Emile Zola, 69100 Villeurbanne, France | Melting Coop, 229 Cours Emile Zola, 69100 Villeurbanne, France |
43 | 
44 | ## 4) Nom du lieu
45 | 
46 | Peut-être que le nom du lieu n'est pas rattaché à l'adresse sur OSM. Pour le vérifier, tapez votre adresse sans le nom. Par exemple, si votre adresse est `Melting Coop, 229 Cours Emile Zinzolin, 69100 Villeurbanne, France`, tapez plutôt `229 Cours Emile Zinzolin, 69100 Villeurbanne, France`.
47 | 
48 | - Si vous n'obtenez pas de résultat, l'adresse (sans le nom du lieu) n'est pas répertoriée sur OpenStreetMap. Naviguez manuellement à l'adresse en vous déplaçant sur la carte pour récupérer l'adresse telle qu'elle apparaît dans OSM. Dans notre cas, on se rendra compte que l'adresse correcte est `229 Cours Emile Zola, 69100 Villeurbanne, France`.
49 | 
50 | - Si vous obtenez un résultat, deux cas de figure:
51 | 
52 |   - Soit, en naviguant manuellement sur la carte, le lieu où l'atelier est organisé est bien répertorié. Dans ce cas, il faut:
53 | 
54 |     - Si le nom du lieu n'a pas la bonne orthographe par rapport à la carte, ajustez le nom du lieu pour le faire correspondre à l'information de la carte.
55 | 
56 |     - Si l'orthographe du lieu est correcte dans votre adresse par rapport à la carte, il faut [rattacher une adresse à ce lieu](#rattacher-une-adresse-à-un-lieu-existant).
57 | 
58 |   - Soit, en naviguant manuellement sur la carte, le lieu où l'atelier est organisé n'apparaît pas.
Dans ce cas, il faut [ajouter un nouveau lieu et lui rattacher son adresse](#créer-un-lieu). 59 | 60 | ### Rattacher une adresse à un lieu existant 61 | 62 | Suivre les étapes suivantes: 63 | 64 | - Créer un [compte sur OpenStreetMap](https://www.openstreetmap.org/user/new). 65 | 66 | - Naviguer manuellement sur la carte jusqu'au lieu auquel une adresse doit être rattachée. 67 | 68 | - Cliquer sur "Modifier" en haut à gauche. 69 | 70 | - Sur la carte, cliquer sur la petite icône du lieu à modifier. 71 | 72 | - Dans le panneau latéral qui s'ouvre à gauche, ajouter l'adresse du lieu. Vous pouvez en profiter pour enrichir les données OSM avec des données supplémentaires comme le numéro de téléphone, le site web, etc. 73 | 74 | - Une fois les attributs renseignés, cliquer sur "Sauvegarder" en haut à droite. 75 | 76 | - Écrire un message décrivant vos modifications. Par exemple : "Ajout de l'adresse à un lieu existant". 77 | 78 | - Cliquer sur "Envoyer". 79 | 80 | Merci d'avoir contribué à OpenStreetMap ! 81 | 82 | Attendre une dizaine de minutes, et relire ce tutoriel depuis le début :) 83 | 84 | ### Créer un lieu 85 | 86 | Suivre les étapes suivantes: 87 | 88 | - Créer un [compte sur OpenStreetMap](https://www.openstreetmap.org/user/new). 89 | 90 | - Naviguer manuellement sur la carte jusqu'à l'endroit où le lieu doit être ajouté. 91 | 92 | - Cliquer sur "Point" en haut au centre. 93 | 94 | - Cliquer sur le bâtiment où le lieu doit être ajouté. 95 | 96 | - Dans le panneau latéral qui s'ouvre à gauche, choisir un type pour le lieu. Par exemple, `Café`, `Restaurant`, `Espace de coworking`, ou `Centre communautaire` pour un tiers-lieu. Ajouter ensuite le nom et l'adresse du lieu. Vous pouvez en profiter pour enrichir les données OSM avec des données supplémentaires comme le numéro de téléphone, le site web, etc. 97 | 98 | - Une fois les attributs renseignés, cliquer sur "Sauvegarder" en haut à droite. 99 | 100 | - Écrire un message décrivant vos modifications. Par exemple : "Ajout d'un lieu". 101 | 102 | - Cliquer sur "Envoyer". 103 | 104 | Merci d'avoir contribué à OpenStreetMap ! 
105 | 106 | Attendre une dizaine de minutes, et relire ce tutoriel depuis le début :) 107 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/apis/glorieuses.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import time 4 | import logging 5 | 6 | from datetime import datetime 7 | 8 | from trouver_une_fresque_scraper.db.records import get_record_dict 9 | from trouver_une_fresque_scraper.utils.errors import FreskError 10 | from trouver_une_fresque_scraper.utils.keywords import * 11 | from trouver_une_fresque_scraper.utils.language import detect_language_code 12 | from trouver_une_fresque_scraper.utils.location import get_address 13 | 14 | 15 | def get_glorieuses_data(source): 16 | logging.info("Getting data from Glorieuses API") 17 | 18 | json_records = [] 19 | records = [] 20 | 21 | try: 22 | response = requests.get(source["url"]) 23 | # Check if the request was successful (status code 200) 24 | if response.status_code == 200: 25 | json_records = response.json() 26 | else: 27 | logging.info(f"Request failed with status code: {response.status_code}") 28 | except requests.RequestException as e: 29 | logging.info(f"An error occurred: {e}") 30 | 31 | for json_record in json_records: 32 | time.sleep(1.5) 33 | logging.info("") 34 | 35 | ################################################################ 36 | # Get event id 37 | ################################################################ 38 | event_id = json_record["RECORD_ID()"] 39 | 40 | ################################################################ 41 | # Get event title 42 | ################################################################ 43 | title = json_record["Label event"] 44 | 45 | ################################################################ 46 | # Parse start and end dates 47 | ################################################################ 48 | event_start_time = json_record["Date"] 49 | 50 | try: 51 | # Convert time strings to datetime objects 52 | event_start_datetime = datetime.strptime(event_start_time, "%Y-%m-%dT%H:%M:%S.%fZ") 53 | except Exception as e: 54 | logging.info(f"Rejecting record: bad date format {e}") 55 | continue 56 | 57 | event_end_time = json_record["Date fin"] 58 | 59 | try: 60 | # Convert time strings to datetime objects 61 | event_end_datetime = datetime.strptime(event_end_time, "%Y-%m-%dT%H:%M:%S.%fZ") 62 | except Exception as e: 63 | logging.info(f"Rejecting record: bad date format {e}") 64 | continue 65 | 66 | ########################################################### 67 | # Is it an online event? 
68 | ################################################################ 69 | if "Format" in json_record and json_record["Format"] is not None: 70 | online = is_online(json_record["Format"]) 71 | else: 72 | logging.info(f"Rejecting record: no workshop format provided") 73 | continue 74 | 75 | ################################################################ 76 | # Location data 77 | ################################################################ 78 | full_location = "" 79 | location_name = "" 80 | address = "" 81 | city = "" 82 | department = "" 83 | longitude = "" 84 | latitude = "" 85 | zip_code = "" 86 | country_code = "" 87 | 88 | if not online: 89 | address = json_record["Adresse"] 90 | if not address: 91 | logging.info("Rejecting record: no address provided") 92 | continue 93 | 94 | city = json_record["Ville"] 95 | full_location = f"{address}, {city}" 96 | 97 | try: 98 | address_dict = get_address(full_location) 99 | ( 100 | location_name, 101 | address, 102 | city, 103 | department, 104 | zip_code, 105 | country_code, 106 | latitude, 107 | longitude, 108 | ) = address_dict.values() 109 | except json.JSONDecodeError: 110 | logging.info("Rejecting record: error while parsing API response") 111 | continue 112 | except FreskError as error: 113 | logging.info(f"Rejecting record: {error}.") 114 | continue 115 | 116 | ################################################################ 117 | # Description 118 | ################################################################ 119 | description = json_record["Label event"] 120 | 121 | ################################################################ 122 | # Training? 123 | ################################################################ 124 | training = is_training(json_record["Type"]) 125 | 126 | ################################################################ 127 | # Is it full? 128 | ################################################################ 129 | sold_out = False 130 | 131 | ################################################################ 132 | # Is it suited for kids? 
133 |         ################################################################
134 |         kids = False
135 | 
136 |         ################################################################
137 |         # Parse tickets link
138 |         ################################################################
139 |         tickets_link = json_record["Lien billeterie"]
140 |         source_link = tickets_link
141 | 
142 |         ################################################################
143 |         # Building final object
144 |         ################################################################
145 |         record = get_record_dict(
146 |             f"{source['id']}-{event_id}",
147 |             source["id"],
148 |             title,
149 |             event_start_datetime,
150 |             event_end_datetime,
151 |             full_location,
152 |             location_name,
153 |             address,
154 |             city,
155 |             department,
156 |             zip_code,
157 |             country_code,
158 |             latitude,
159 |             longitude,
160 |             source.get("language_code", detect_language_code(title, description)),
161 |             online,
162 |             training,
163 |             sold_out,
164 |             kids,
165 |             source_link,
166 |             tickets_link,
167 |             description,
168 |         )
169 | 
170 |         records.append(record)
171 |         logging.info(f"Successfully parsed API record\n{json.dumps(record, indent=4)}")
172 | 
173 |     return records
174 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # trouver-une-fresque-scraper
2 | 
3 | Le scraper de Trouver une Fresque est un outil open source permettant de détecter les ateliers disponibles dans votre département.
4 | 
5 | Les données sont extraites des billetteries officielles via la technique du scraping. La validité des adresses est vérifiée en utilisant les données d'OpenStreetMap.
6 | 
7 | Si vous utilisez ce code, merci de respecter la [charte de Nominatim](https://operations.osmfoundation.org/policies/nominatim/).
8 | 
9 | ## 🌍 Organisateurs: signaler un problème
10 | 
11 | Si vous êtes l'organisateur d'un atelier Fresque et que votre évènement n'apparaît pas sur la plateforme Trouver une Fresque, merci de lire le [tutoriel à destination des organisateurs de fresques](https://github.com/trouver-une-fresque/trouver-une-fresque/blob/main/TUTORIAL.md).
12 | 
13 | Ouvrez une [issue Github](https://github.com/thomas-bouvier/trouver-une-fresque/issues/new) si vous souhaitez signaler un problème non couvert dans le tutoriel, ou suggérer l'intégration d'un nouvel atelier.
14 | 
15 | Les ateliers actuellement supportés sont listés sur la [feuille de route](WORKSHOPS.md).
16 | 
17 | ## 🤖 Développeurs: installation
18 | 
19 | Le scraping est effectué en utilisant Selenium, qui s'appuie sur geckodriver pour afficher les données à récupérer. Notre outil peut être installé sur un Raspberry Pi sans problème.
20 | 
21 | ### Avec `flox` (méthode recommandée)
22 | 
23 | Flox est un gestionnaire de paquets multiplateforme qui vise à permettre la reproductibilité, la robustesse, la portabilité et la stabilité des systèmes d'information. Cette approche permet d'installer les paquets Python et dépendances système en une seule fois.
24 | 
25 | Suivez les instructions pour installer Flox sur votre système [ici](https://flox.dev/docs/install-flox/). Tout est prêt ! Utilisez la commande `flox activate` dans ce dossier pour commencer à développer.
26 | 
27 | Vérifiez que tout fonctionne:
28 | 
29 | ```console
30 | python -c "import trouver_une_fresque_scraper as m; print(m.__file__)"
31 | ```
32 | 
33 | ### Manuellement avec `uv`
34 | 
35 | Cette méthode d'installation n'est pas recommandée.
Préférez l'utilisation de Flox, qui vous facilitera la tâche et garantira d'avoir toutes les dépendances nécessaires pour lancer le scraper.
36 | 
37 | Téléchargez la version la plus récente de [geckodriver](https://github.com/mozilla/geckodriver/releases), puis extrayez le binaire `geckodriver` dans un dossier `bin/` (ou n'importe où sur votre système).
38 | 
39 | Les librairies suivantes doivent être installées sur votre système:
40 | 
41 | ```console
42 | apt install firefox-esr libpq-dev python3-dev
43 | ```
44 | 
45 | Enfin, suivez les instructions pour installer `uv` [ici](https://docs.astral.sh/uv/getting-started/installation/) et créez un environnement Python:
46 | 
47 | ```console
48 | uv venv .venv --python 3.13
49 | ```
50 | 
51 | Activez l'environnement:
52 | 
53 | ```console
54 | source .venv/bin/activate
55 | ```
56 | 
57 | Installez le scraper avec:
58 | 
59 | ```console
60 | uv sync
61 | ```
62 | 
63 | Vérifiez que tout fonctionne:
64 | 
65 | ```console
66 | python -c "import trouver_une_fresque_scraper as m; print(m.__file__)"
67 | ```
68 | 
69 | ## 🤖 Développeurs: utilisation
70 | 
71 | Avant de contribuer au projet, assurez-vous d'avoir lu le document [CONTRIBUTING.md](./CONTRIBUTING.md).
72 | 
73 | ### Configuration
74 | 
75 | Renommez le fichier de configuration `config.json.dist` en `config.json` et renseignez les champs.
76 | 
77 | ```json
78 | {
79 |     "webdriver": "",
80 |     "host" : "",
81 |     "port" : "",
82 |     "user" : "",
83 |     "psw" : "",
84 |     "database": "",
85 |     "timezone": "Europe/Paris"
86 | }
87 | ```
88 | 
89 | Le champ `webdriver` n'est à renseigner, avec le chemin vers le binaire `geckodriver`, que dans le cas d'une installation sans Flox (c'est-à-dire manuelle avec `uv`).
90 | 
91 | 
92 | ### Lancer le scraping
93 | 
94 | ```console
95 | python -m trouver_une_fresque_scraper.scrape
96 | # ou
97 | python -m trouver_une_fresque_scraper.scrape --headless --country ch --skip-dirty-check
98 | ```
99 | 
100 | À la fin du scraping, un fichier JSON nommé avec le format `events_20230814_153752.json` est créé dans le dossier `results/`.
101 | 
102 | L'option `--headless` exécute le scraping en mode headless, et `--push-to-db` pousse les résultats du fichier json de sortie dans la base de données en utilisant les identifiants définis dans `config.json`.
103 | 
104 | ### Base de données
105 | 
106 | Nous utilisons [Supabase](https://supabase.com/docs/guides/cli/local-development) pour persister les données scrapées, une alternative open source à Firebase qui fournit une base de données Postgres gratuitement.
107 | 
108 | Connectez-vous à la CLI, puis initialisez et démarrez la base de données. Au démarrage, si le fichier `supabase/seed.sql` est présent, ses instructions `INSERT` sont exécutées pour remplir la base avec des données de test.
109 | 
110 | ```console
111 | supabase login
112 | supabase init
113 | supabase start
114 | ```
115 | 
116 | Le fichier `supabase/tables.sql` contient les instructions SQL permettant de créer les tables nécessaires.
117 | 
118 | Pour pousser des données dans la base, utilisez la commande suivante :
119 | 
120 | ```console
121 | python push_to_db.py --input results/output.json
122 | ```
123 | 
124 | Cette commande effectue les actions suivantes :
125 | 
126 | - Tous les évènements sont insérés dans la table historique `events_scraped`, avec `most_recent=False` ; l'appel à `update_most_recent()` décrit ci-dessous pourra repasser cet attribut à `True`.
127 | - Tous les évènements de `events_future` sont supprimés avant d'être réinsérés, afin d'être mis à jour, avec `most_recent=True`.
128 | - L'attribut `most_recent` des évènements de `events_scraped` est repassé à `True` lorsque les conditions suivantes sont réunies :
129 |     - Une requête identifie les lignes de la table `events_scraped` qui n'ont pas de correspondance dans la table `events_future`.
130 |     - Pour ces lignes, elle détermine la `scrape_date` la plus récente pour chaque couple (`id`, `workshop_type`).
131 |     - Elle passe alors la colonne `most_recent` à `TRUE` pour ces lignes, mais uniquement si la `start_date` de l'évènement est déjà passée.
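 | 
 | Pour fixer les idées, la logique de `update_most_recent()` décrite ci-dessus peut se traduire par une requête de ce type. Il s'agit d'une esquisse hypothétique, et non du code réel du projet : les noms de colonnes (`id`, `workshop_type`, `scrape_date`, `start_date`, `most_recent`) sont supposés d'après la description ci-dessus, et le schéma exact figure dans `supabase/tables.sql`.
 | 
 | ```python
 | import psycopg
 | 
 | UPDATE_MOST_RECENT = """
 | UPDATE events_scraped AS es
 | SET most_recent = TRUE
 | FROM (
 |     SELECT s.id, s.workshop_type, MAX(s.scrape_date) AS last_scrape
 |     FROM events_scraped AS s
 |     WHERE NOT EXISTS (
 |         SELECT 1 FROM events_future AS f
 |         WHERE f.id = s.id AND f.workshop_type = s.workshop_type
 |     )
 |     GROUP BY s.id, s.workshop_type
 | ) AS latest
 | WHERE es.id = latest.id
 |   AND es.workshop_type = latest.workshop_type
 |   AND es.scrape_date = latest.last_scrape
 |   AND es.start_date < NOW();
 | """
 | 
 | def update_most_recent(conn: psycopg.Connection) -> None:
 |     # Repasse most_recent à TRUE pour la ligne la plus récente de chaque
 |     # évènement absent de events_future, si sa start_date est déjà passée.
 |     with conn.cursor() as cur:
 |         cur.execute(UPDATE_MOST_RECENT)
 | ```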
132 | 
133 | ### Lancer les tests
134 | 
135 | ```console
136 | cd tests
137 | python scrape_test.py
138 | ```
139 | 
140 | ## Comment contribuer
141 | 
142 | Pour proposer une modification, un ajout, ou décrire un bug sur l'outil de détection, vous pouvez ouvrir une [issue](https://github.com/thomas-bouvier/trouver-une-fresque/issues/new) ou une [Pull Request](https://github.com/thomas-bouvier/trouver-une-fresque/pulls) avec vos modifications.
143 | 
144 | Avant de développer, merci d'installer le hook git en suivant les instructions listées dans le fichier [CONTRIBUTING](https://github.com/trouver-une-fresque/trouver-une-fresque/blob/main/CONTRIBUTING.md). Pour le code en Python, veillez à respecter le standard PEP8 avant de soumettre une Pull Request. La plupart des IDEs et éditeurs de code modernes proposent des outils permettant de mettre en page votre code en suivant ce standard automatiquement.
145 | 
--------------------------------------------------------------------------------
/WORKSHOPS.md:
--------------------------------------------------------------------------------
1 | # Ateliers supportés et feuille de route
2 | 
3 | Développer et maintenir Trouver une Fresque nous prend beaucoup de temps et d'énergie, et l'infrastructure n'est pas gratuite non plus. Il nous reste encore beaucoup d'ateliers à intégrer, en France et à l'international.
N'hésitez pas à nous soutenir via un petit [don récurrent](https://fr.liberapay.com/trouver-une-fresque/), merci :)
4 | 
5 | ## Ateliers supportés
6 | 
7 | ### Country `fr`
8 | 
9 | | Atelier | Lien | Source | Supporté |
10 | | ------------- |:-------------:| :-----:| :-----:|
11 | | Fresque du Climat | https://fresqueduclimat.org/participer-a-un-atelier-grand-public | Scraping fdc | OK |
12 | | Atelier 2tonnes | https://www.eventbrite.fr/o/2-tonnes-29470123869 | Scraping Eventbrite | OK |
13 | | Fresque de la Biodiversité | https://www.fresquedelabiodiversite.org/#participer | Scraping Billetweb | OK |
14 | | Fresque Océane | https://www.billetweb.fr/pro/billetteriefo | Scraping Billetweb | OK |
15 | | Fresque Agri'Alim | https://www.billetweb.fr/pro/fresqueagrialim | Scraping Billetweb | OK |
16 | | Fresque du Numérique | https://www.fresquedunumerique.org/#participer | Scraping Billetweb | OK |
17 | | Fresque des Nouveaux Récits | https://www.billetweb.fr/pro/fdnr | Scraping Billetweb | OK |
18 | | Fresque de la Mobilité | https://www.billetweb.fr/pro/fresquedelamobilite | Scraping Billetweb | OK |
19 | | Fresque de l'Alimentation | https://www.billetweb.fr/pro/fresquealimentation | Scraping Billetweb | OK |
20 | | Fresque de la Construction | https://www.billetweb.fr/pro/fresquedelaconstruction | Scraping Billetweb | OK |
21 | | Fresque du Sexisme | https://www.billetweb.fr/pro/fresque-du-sexisme | Scraping Billetweb | OK |
22 | | Atelier OGRE | https://www.billetweb.fr/pro/atelierogre | Scraping Billetweb | OK |
23 | | Fresque Nos Vies Bas Carbone | https://www.billetweb.fr/multi_event.php?user=132897 | Scraping Billetweb | OK |
24 | | Fresque de l'Eau | https://www.billetweb.fr/multi_event.php?user=138110 | Scraping Billetweb | OK |
25 | | Atelier futurs proches | https://www.billetweb.fr/pro/futursproches | Scraping Billetweb | OK |
26 | | Fresque de la Diversité | https://www.billetweb.fr/multi_event.php?user=168799 | Scraping Billetweb | OK |
27 | | Fresque de l'Économie Circulaire | https://www.billetweb.fr/multi_event.php?user=246258 | Scraping Billetweb | OK |
28 | | Fresque du Textile | https://www.billetweb.fr/multi_event.php?user=166793 | Scraping Billetweb | OK |
29 | | Fresque des Déchets | https://www.billetweb.fr/multi_event.php?user=166793 | Scraping Billetweb | OK |
30 | | Fresque des Frontières Planétaires | https://1erdegre.glide.page/dl/6471c6 | Scraping Glide Pages | OK |
31 | | Fresque de la Finance | https://www.billetweb.fr/pro/fresquedelafinance | Scraping Billetweb | OK |
32 | | Puzzle Climat | https://www.puzzleclimat.org/ | Scraping Billetweb | OK |
33 | | Atelier Horizons Décarbonés | https://1erdegre.glide.page/dl/6471c6 | Scraping Glide Pages | OK |
34 | | 2030 Glorieuses | https://www.2030glorieuses.org/event | API | OK |
35 | | Fresque de la RSE | https://www.billetweb.fr/multi_event.php?user=139214 | Scraping Billetweb | OK |
36 | | Atelier des Transitions Urbaines | https://www.billetweb.fr/multi_event.php?user=216884 | Scraping Billetweb | OK |
37 | | Fresque de la Rénovation | https://www.helloasso.com/associations/fresque-de-la-renovation/ | Scraping HelloAsso | OK |
38 | | Fresque de l'Energie | https://www.helloasso.com/associations/la-fresque-de-l-energie | Scraping HelloAsso | OK |
39 | | Fresque des Possibles | https://www.helloasso.com/associations/le-lieu-dit | Scraping HelloAsso | OK |
40 | | Fresque de la Communication | https://www.helloasso.com/associations/la-fresque-de-la-communication | Scraping HelloAsso | OK |
41 | | Zoofresque | https://www.helloasso.com/associations/ajas-association-justice-animaux-savoie | Scraping HelloAsso | OK |
42 | | Atelier Compte-Gouttes | https://www.eventbrite.com/o/atelier-compte-gouttes-73003088333 | Scraping Eventbrite | OK |
43 | | Fresque du Bénévolat | https://www.eventbrite.fr/o/jeveuxaidergouvfr-77010082313 | Scraping Eventbrite | OK |
44 | | Fresque du Plastique | https://www.eventbrite.fr/o/la-fresque-du-plastique-45763194553 | Scraping Eventbrite | OK |
45 | | Cyber Fresque | https://www.eventbrite.fr/o/senscyb-89802295343 | Scraping Eventbrite | OK |
46 | | Fresque du Sol | https://fresquedusol.com/comment-participer/dates-a-venir/ | Calendrier ICS | OK |
47 | | Notre Tour | https://www.helloasso.com/associations/mush | Scraping HelloAsso | OK |
48 | | Planet C Play Again? | https://planetc.org/ | Calendrier ICS | OK |
49 | | Fresque de la Transition Energétique | https://fresques-tilleul.glide.page/dl/6471c6 | Scraping Glide Pages | Prévu, priorité 1 |
50 | | Pitch Climat | https://www.billetweb.fr/multi_event.php?user=186116 | Scraping Billetweb | Prévu, priorité 1 |
51 | | Fresque de l'Equité | https://www.fresquedelequite.fr/ | Scraping custom | Prévu, priorité 1 |
52 | | Fresque du Mouvement | https://www.eventbrite.fr/o/la-fresque-du-mouvement-108241184341 | Scraping Eventbrite | Prévu, priorité 1 |
53 | | Fresque de l'Environnement | | API | Prévu, priorité 2 |
54 | | Fresque de la Forêt | https://all4trees.org/agir/fresque-foret/evenements | Scraping site custom | Prévu, priorité 2 |
55 | | Atelier Découverte de la Renaissance Écologique | https://renaissanceecologique.org/ | Scraping site custom | Prévu, priorité 2 |
56 | | Atelier Éco-challenge Little Big Impact | https://www.billetweb.fr/pro/lbi-quiz-sedd | Scraping Billetweb | Prévu, priorité 2 |
57 | | Fresque de l'Attention | https://www.billetweb.fr/pro/fresquedelattention | Scraping Billetweb | Prévu, priorité 2 |
58 | | Fresque des Écrans | https://colori.fr/la-fresque-des-ecrans | Scraping custom | Prévu, priorité 2 |
59 | | Fresque de l'Éco-conception | https://www.lafresquedelecoconception.fr/infos-pratiques | Scraping site custom | Prévu, priorité 2 |
60 | | Atelier L'éco-naissance | https://www.eventbrite.fr/o/leco-naissance-62237583643 | Scraping Eventbrite | En réflexion |
61 | | Fresque de l'Emploi Durable | https://www.helloasso.com/associations/solidarites-nouvelles-face-au-chomage-snc | Scraping HelloAsso | En réflexion |
62 | | Atelier Marche du Temps Profond | https://www.helloasso.com/associations/ecotopia | Scraping HelloAsso | En réflexion |
63 | | Fresque des Entreprises Inclusives | https://www.helloasso.com/associations/tous-tes-possibles/evenements/fresque-des-entreprises-inclusives | Scraping HelloAsso | En réflexion |
64 | 
65 | ## Initiatives locales
66 | 
67 | Nous avons pris connaissance d'initiatives locales organisées au niveau du département. Nous réfléchissons à un moyen d'intégrer ces ateliers. Le souci est que certains ateliers pourraient être dupliqués par rapport aux billetteries officielles.
68 | 
69 | | Département | Lien | Source | Supporté |
70 | | ------------- |:-------------:| :-----:| :-----:|
71 | | Atelier Déclics | https://www.helloasso.com/associations/objective-zero | Scraping HelloAsso | Non |
72 | | Isère | https://enjeuxcommuns.fr/les-prochains-ateliers-en-isere/ | Airtable | Non |
73 | | Bas-Rhin/Haut-Rhin | https://hoplatransition.org/index.php/nos-evenements/ | Framagenda | Non |
74 | | Réunion | https://fresques.re/event-directory/ | Custom | Non |
75 | | Marseille | https://fada.earth/ | Airtable | Non |
76 | 
--------------------------------------------------------------------------------
/src/trouver_une_fresque_scraper/scrape.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import logging
4 | import subprocess
5 | import sys
6 | import pandas as pd
7 | import psycopg
8 | 
9 | from datetime import datetime
10 | from pathlib import Path
11 | from psycopg.conninfo import make_conninfo
12 | 
13 | from trouver_une_fresque_scraper.apis import main as main_apis
14 | from trouver_une_fresque_scraper.scraper import main as main_scraper
 | # These two imports were missing, causing a NameError when --push-to-db is
 | # used below (etl is expected to live in db/etl.py).
 | from trouver_une_fresque_scraper.db.etl import etl
 | from trouver_une_fresque_scraper.utils.utils import get_config
15 | 
16 | 
17 | def configure_logging(log_file_path, error_log_file_path):
18 |     """
19 |     Configures the logging system to write all levels of messages to both a file and the console,
20 |     and errors to a separate file.
21 | 
22 |     :param log_file_path: The path to the log file for all levels of messages.
23 |     :param error_log_file_path: The path to the log file for error messages only.
24 |     """
25 |     # Ensure the directories exist
26 |     log_file_path.parent.mkdir(parents=True, exist_ok=True)
27 |     error_log_file_path.parent.mkdir(parents=True, exist_ok=True)
28 | 
29 |     # Create a logger
30 |     logger = logging.getLogger()
31 |     logger.setLevel(logging.INFO)
32 | 
33 |     # Create a file handler for all levels of messages
34 |     file_handler = logging.FileHandler(log_file_path)
35 |     file_handler.setLevel(logging.INFO)
36 |     file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
37 | 
38 |     # Create a stream handler for all levels of messages
39 |     stream_handler = logging.StreamHandler()
40 |     stream_handler.setLevel(logging.INFO)
41 |     stream_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
42 | 
43 |     # Create a file handler for error messages only
44 |     error_file_handler = logging.FileHandler(error_log_file_path)
45 |     error_file_handler.setLevel(logging.ERROR)
46 |     error_file_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
47 | 
48 |     # Add handlers to the logger
49 |     logger.addHandler(file_handler)
50 |     logger.addHandler(stream_handler)
51 |     logger.addHandler(error_file_handler)
52 | 
53 | 
54 | def is_git_repository_dirty():
55 |     # Check if the repository is dirty (check=True makes a failing git command
 |     # raise CalledProcessError, so the except block below is actually reachable)
56 |     try:
57 |         result = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True, check=True)
58 |         return bool(result.stdout.strip())
59 |     except subprocess.CalledProcessError as e:
60 |         logging.error(f"Error checking git status: {e}")
61 |         sys.exit(1)
62 | 
63 | 
64 | def get_git_commit_hash():
65 |     # Get the current commit hash
66 |     try:
67 |         result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True)
68 |         return result.stdout.strip()
69 |     except subprocess.CalledProcessError as e:
70 |         logging.error(f"Error getting git commit hash: {e}")
71 |         sys.exit(1)
72 | 
73 | 
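 | # Each entry in countries/<country>.json is a dict validated below. An
 | # illustrative example, taken from countries/fr.json ("iframe" and "filter"
 | # are optional, scraper-specific keys on top of the required ones):
 | #
 | #     {
 | #         "name": "Fresque Océane",
 | #         "url": "https://www.billetweb.fr/pro/billetteriefo",
 | #         "type": "scraper",
 | #         "iframe": "event15247",
 | #         "id": 1
 | #     }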
logging.error(f"Failed to decode JSON: {e}") 79 | raise 80 | except Exception as e: 81 | logging.error(f"An unexpected error occurred: {e}") 82 | raise 83 | 84 | # Validate the data structure 85 | for d in data: 86 | if not isinstance(d, dict): 87 | logging.error(f"Invalid data structure: expected a dictionary, got {type(d).__name__}") 88 | raise 89 | 90 | required_keys = ["name", "id", "url", "type"] 91 | for key in required_keys: 92 | if key not in d: 93 | logging.error(f"Missing required key '{key}' in data: {d}") 94 | raise 95 | 96 | scrapers, apis = [], [] 97 | for d in data: 98 | if d["type"] == "scraper": 99 | scrapers.append(d) 100 | elif d["type"] == "api": 101 | apis.append(d) 102 | 103 | return scrapers, apis 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument( 109 | "--country", 110 | default="fr", 111 | help="run the scraper for the given json containing data sources", 112 | ) 113 | parser.add_argument( 114 | "--headless", 115 | action="store_true", 116 | default=False, 117 | help="run scraping in headless mode", 118 | ) 119 | parser.add_argument( 120 | "--push-to-db", 121 | action="store_true", 122 | default=False, 123 | help="push the scraped results to db", 124 | ) 125 | parser.add_argument( 126 | "--skip-dirty-check", 127 | action="store_true", 128 | default=False, 129 | help="skips checking that the git repository is clean", 130 | ) 131 | args = parser.parse_args() 132 | 133 | # This scraper should be run from a clean state to ensure reproducibility 134 | dirty = is_git_repository_dirty() 135 | if dirty and not args.skip_dirty_check: 136 | logging.warning("The git repository is dirty. Consider a clean state for reproducibility.") 137 | user_input = input("Do you want to continue? 
(y/n): ").strip().lower() 138 | if user_input != "y": 139 | logging.error("Operation cancelled.") 140 | sys.exit(0) 141 | 142 | # Validate the source file 143 | source_path = Path(f"countries/{args.country}.json") 144 | try: 145 | with open(source_path, "r") as file: 146 | content = file.read() 147 | except FileNotFoundError: 148 | logging.info(f"Source file {source_path} does not exist.") 149 | raise 150 | 151 | # Parse the sources 152 | scrapers, apis = get_sources(content) 153 | 154 | # Build the results path for this run 155 | dt = datetime.now() 156 | scraping_time = dt.strftime("%Y%m%d_%H%M%S") 157 | results_path = Path(f"results/{args.country}/{scraping_time}") 158 | results_path.mkdir(parents=True, exist_ok=True) 159 | commit_hash = get_git_commit_hash() 160 | with open(f"{results_path}/commit_hash.txt", "w") as file: 161 | file.write(commit_hash) 162 | if dirty: 163 | file.write("\n" + "dirty" + "\n") 164 | 165 | # Logging 166 | log_path = results_path / Path("log.txt") 167 | errors_path = results_path / Path("error_log.txt") 168 | configure_logging(log_path, errors_path) 169 | 170 | # Launch the scraper 171 | df1 = main_scraper(scrapers, headless=args.headless) 172 | df2 = main_apis(apis) 173 | df_merged = pd.concat([df1, df2]) 174 | 175 | dt = datetime.now() 176 | insert_time = dt.strftime("%Y%m%d_%H%M%S") 177 | with open(results_path / Path(f"events_{insert_time}.json"), "w", encoding="UTF-8") as file: 178 | df_merged.to_json(file, orient="records", force_ascii=False, indent=2) 179 | 180 | # Push the resulting json file to the database 181 | if args.push_to_db: 182 | logging.info("Pushing scraped results into db...") 183 | credentials = get_config() 184 | host = credentials["host"] 185 | port = credentials["port"] 186 | user = credentials["user"] 187 | psw = credentials["psw"] 188 | database = credentials["database"] 189 | 190 | with psycopg.connect( 191 | make_conninfo(dbname=database, user=user, password=psw, host=host, port=port) 192 | ) as conn: 193 | etl(conn, df_merged) 194 | 195 | logging.info("Done") 196 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/apis/ics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import datetime 3 | import json 4 | import pytz 5 | import re 6 | import requests 7 | import logging 8 | 9 | from trouver_une_fresque_scraper.db.records import get_record_dict 10 | from ics import Calendar 11 | import re 12 | from trouver_une_fresque_scraper.utils.errors import FreskError 13 | from trouver_une_fresque_scraper.utils.language import detect_language_code 14 | from trouver_une_fresque_scraper.utils.location import get_address 15 | import xml.etree.ElementTree as ET 16 | 17 | 18 | # from https://regexr.com/37i6s 19 | REGEX_URL = "https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)" 20 | 21 | IGNORABLE_DOMAINS = [ 22 | "https://meet.google.com", 23 | "https://support.google.com", 24 | "https://us02web.zoom.us", 25 | ] 26 | 27 | TICKETING_TEXT = ["billetterie", "registration", "ticket", "inscription"] 28 | 29 | 30 | # Returns a ticketing URL extracted from a description in plain text or formatted as HTML. 
31 | def get_ticketing_url_from_description(description):
32 |     # list of tuples: (URL, anchor text if HTML document otherwise same URL)
33 |     links = []
34 | 
35 |     try:
36 |         # try as HTML document
37 |         root = ET.fromstring(description)
38 |         for elem in root.findall(".//a[@href]"):
39 |             links.append((elem.get("href"), elem.text))
40 |     except ET.ParseError:
41 |         # fall back to plain text
42 |         for url in re.findall(REGEX_URL, description):
43 |             links.append((url, url))
44 | 
45 |     def should_link_be_kept(link):
46 |         url = link[0]
47 |         for domain in IGNORABLE_DOMAINS:
48 |             if url.startswith(domain):
49 |                 return False
50 |         return True
51 | 
52 |     links = list(filter(should_link_be_kept, links))
53 |     if len(links) == 1:
54 |         return links[0][0]
55 | 
56 |     def does_text_look_like_registration(link):
 |         # The anchor text may be None for an empty <a> tag; lower-case it so it
 |         # can actually match the lower-case TICKETING_TEXT keywords (the
 |         # original .upper() call could never match them).
57 |         lower_text = (link[1] or "").lower()
58 |         for text in TICKETING_TEXT:
59 |             if lower_text.find(text) > -1:
60 |                 return True
61 |         return False
62 | 
63 |     links = list(filter(does_text_look_like_registration, links))
64 |     if len(links) == 1:
65 |         return links[0][0]
66 | 
67 |     return None
68 | 
69 | 
70 | def get_ics_data(source):
71 |     logging.info(f"Getting iCalendar data from {source['url']}")
72 | 
73 |     calendar = None
74 |     records = []
75 | 
76 |     try:
77 |         response = requests.get(source["url"])
78 |         # Check if the request was successful (status code 200).
79 |         if response.status_code == 200:
80 |             # Remove VALARMs which incorrectly crash the ics library.
81 |             text = re.sub("BEGIN:VALARM.*END:VALARM", "", response.text, flags=re.DOTALL)
82 |             calendar = Calendar(text)
83 |         else:
84 |             logging.info(f"Request failed with status code: {response.status_code}")
85 |     except requests.RequestException as e:
86 |         logging.info(f"An error occurred: {e}")
87 | 
88 |     if not calendar:
89 |         return records
90 | 
91 |     for event in calendar.events:
92 |         logging.info(f"Processing event {event.name}")
93 | 
94 |         ################################################################
95 |         # Kick out event early if it is in the past
96 |         ################################################################
97 |         event_start_datetime = event.begin
98 |         event_end_datetime = event.end
99 |         if event_start_datetime < pytz.UTC.localize(datetime.datetime.now()):
100 |             logging.info("Rejecting record: start time before now.")
101 |             continue
102 | 
103 |         ################################################################
104 |         # Get basic event metadata
105 |         ################################################################
106 |         event_id = event.uid
107 |         title = event.name
108 |         description = event.description
109 | 
110 |         ################################################################
111 |         # Location data, or online
112 |         ################################################################
113 |         full_location = ""
114 |         location_name = ""
115 |         address = ""
116 |         city = ""
117 |         department = ""
118 |         longitude = ""
119 |         latitude = ""
120 |         zip_code = ""
121 |         country_code = ""
122 | 
123 |         online = event.location is None
124 |         if not online:
125 |             location = event.location.lstrip()
126 |             for domain in IGNORABLE_DOMAINS:
127 |                 if location.startswith(domain):
128 |                     online = True
129 |                     break
130 | 
131 |         if not online:
132 |             try:
133 |                 full_location = event.location
134 |                 address_dict = get_address(full_location.split("\n", 1).pop())
135 |                 (
136 |                     location_name,
137 |                     address,
138 |                     city,
139 |                     department,
140 |                     zip_code,
141 |                     country_code,
142 |                     latitude,
143 |                     longitude,
144 |                 ) = address_dict.values()
145 |             except FreskError as error:
146 |                 logging.info(f"Rejecting record: 
{error}.") 147 | continue 148 | 149 | ################################################################ 150 | # Infer more event metadata 151 | ################################################################ 152 | title_upper = title.upper() 153 | training = "FORMATION" in title_upper or "TRAINING" in title_upper 154 | sold_out = False 155 | kids = False 156 | 157 | ################################################################ 158 | # Get tickets link: try URL else extract from description 159 | ################################################################ 160 | tickets_link = event.url 161 | if not tickets_link and event.description: 162 | tickets_link = get_ticketing_url_from_description(event.description) 163 | if not tickets_link: 164 | logging.warning(f"Rejecting record {event_id}: no ticket link extracted.") 165 | continue 166 | source_link = tickets_link 167 | 168 | ################################################################ 169 | # Building final object 170 | ################################################################ 171 | record = get_record_dict( 172 | f"{source['id']}-{event_id}", 173 | source["id"], 174 | title, 175 | event_start_datetime, 176 | event_end_datetime, 177 | full_location, 178 | location_name, 179 | address, 180 | city, 181 | department, 182 | zip_code, 183 | country_code, 184 | latitude, 185 | longitude, 186 | source.get("language_code", detect_language_code(title, description)), 187 | online, 188 | training, 189 | sold_out, 190 | kids, 191 | source_link, 192 | tickets_link, 193 | description, 194 | ) 195 | 196 | records.append(record) 197 | logging.info(f"Successfully got record\n{json.dumps(record, indent=4)}") 198 | 199 | logging.info(f"Got {len(records)} records.") 200 | return records 201 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/utils/location.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 4 | from functools import lru_cache 5 | from trouver_une_fresque_scraper.utils.errors import * 6 | 7 | from geopy.geocoders import Nominatim 8 | 9 | geolocator = Nominatim(user_agent="trouver-une-fresque", timeout=10) 10 | 11 | departments = { 12 | "01": "Ain", 13 | "02": "Aisne", 14 | "03": "Allier", 15 | "04": "Alpes-de-Haute-Provence", 16 | "05": "Hautes-Alpes", 17 | "06": "Alpes-Maritimes", 18 | "07": "Ardèche", 19 | "08": "Ardennes", 20 | "09": "Ariège", 21 | "10": "Aube", 22 | "11": "Aude", 23 | "12": "Aveyron", 24 | "13": "Bouches-du-Rhône", 25 | "14": "Calvados", 26 | "15": "Cantal", 27 | "16": "Charente", 28 | "17": "Charente-Maritime", 29 | "18": "Cher", 30 | "19": "Corrèze", 31 | "2A": "Corse-du-Sud", 32 | "2B": "Haute-Corse", 33 | "21": "Côte-d'Or", 34 | "22": "Côtes-d'Armor", 35 | "23": "Creuse", 36 | "24": "Dordogne", 37 | "25": "Doubs", 38 | "26": "Drôme", 39 | "27": "Eure", 40 | "28": "Eure-et-Loir", 41 | "29": "Finistère", 42 | "30": "Gard", 43 | "31": "Haute-Garonne", 44 | "32": "Gers", 45 | "33": "Gironde", 46 | "34": "Hérault", 47 | "35": "Ille-et-Vilaine", 48 | "36": "Indre", 49 | "37": "Indre-et-Loire", 50 | "38": "Isère", 51 | "39": "Jura", 52 | "40": "Landes", 53 | "41": "Loir-et-Cher", 54 | "42": "Loire", 55 | "43": "Haute-Loire", 56 | "44": "Loire-Atlantique", 57 | "45": "Loiret", 58 | "46": "Lot", 59 | "47": "Lot-et-Garonne", 60 | "48": "Lozère", 61 | "49": "Maine-et-Loire", 62 | "50": "Manche", 63 | "51": "Marne", 64 | "52": "Haute-Marne", 65 | "53": "Mayenne", 66 | "54": 
"Meurthe-et-Moselle", 67 | "55": "Meuse", 68 | "56": "Morbihan", 69 | "57": "Moselle", 70 | "58": "Nièvre", 71 | "59": "Nord", 72 | "60": "Oise", 73 | "61": "Orne", 74 | "62": "Pas-de-Calais", 75 | "63": "Puy-de-Dôme", 76 | "64": "Pyrénées-Atlantiques", 77 | "65": "Hautes-Pyrénées", 78 | "66": "Pyrénées-Orientales", 79 | "67": "Bas-Rhin", 80 | "68": "Haut-Rhin", 81 | "69": "Rhône", 82 | "70": "Haute-Saône", 83 | "71": "Saône-et-Loire", 84 | "72": "Sarthe", 85 | "73": "Savoie", 86 | "74": "Haute-Savoie", 87 | "75": "Paris", 88 | "76": "Seine-Maritime", 89 | "77": "Seine-et-Marne", 90 | "78": "Yvelines", 91 | "79": "Deux-Sèvres", 92 | "80": "Somme", 93 | "81": "Tarn", 94 | "82": "Tarn-et-Garonne", 95 | "83": "Var", 96 | "84": "Vaucluse", 97 | "85": "Vendée", 98 | "86": "Vienne", 99 | "87": "Haute-Vienne", 100 | "88": "Vosges", 101 | "89": "Yonne", 102 | "90": "Territoire de Belfort", 103 | "91": "Essonne", 104 | "92": "Hauts-de-Seine", 105 | "93": "Seine-Saint-Denis", 106 | "94": "Val-de-Marne", 107 | "95": "Val-d'Oise", 108 | "971": "Guadeloupe", 109 | "972": "Martinique", 110 | "973": "Guyane", 111 | "974": "La Réunion", 112 | "976": "Mayotte", 113 | } 114 | 115 | cache = {} 116 | 117 | 118 | @lru_cache(maxsize=None) 119 | def geocode_location_string(location_string): 120 | """ 121 | Requests Nomatim to geocode an input string. All results are cached and 122 | reused thanks to the @lru_cache decorator. 123 | """ 124 | logging.info(f"Calling geocoder: {location_string}") 125 | return geolocator.geocode(location_string, addressdetails=True) 126 | 127 | 128 | def get_address(full_location): 129 | """ 130 | Gets structured location data from an input string, tries substrings if 131 | relevant, verifies that the result is sufficiently precise (address or park 132 | level) and returns a dictionnary with the address properties. 
133 | """ 134 | try: 135 | if not full_location: 136 | raise FreskAddressNotFound("") 137 | 138 | location = geocode_location_string(full_location) 139 | if location is None: 140 | full_location = re.sub(r"\(.*\)", "", full_location) 141 | location = geocode_location_string(full_location) 142 | if location is None: 143 | if "," in full_location: 144 | location = geocode_location_string(full_location.split(",", 1)[1]) 145 | if location is None: 146 | lines = full_location.splitlines(keepends=True) 147 | if len(lines) > 1: 148 | location = geocode_location_string("".join(lines[1:])) 149 | if location is None: 150 | raise FreskAddressNotFound(full_location) 151 | 152 | address = location.raw["address"] 153 | 154 | if address["country_code"] != "fr" and address["country_code"] != "ch" and address["country_code"] != "gb": 155 | raise FreskCountryNotSupported(address, full_location) 156 | 157 | house_number = "" 158 | if "house_number" in address.keys(): 159 | house_number = f"{address['house_number']} " 160 | 161 | road = "" 162 | if "road" in address.keys(): 163 | road = address["road"] 164 | elif "square" in address.keys(): 165 | road = address["square"] 166 | elif "park" in address.keys(): 167 | road = address["park"] 168 | else: 169 | raise FreskAddressBadFormat(address, full_location, "road") 170 | 171 | city = None 172 | if "city" in address.keys(): 173 | city = address["city"] 174 | elif "town" in address.keys(): 175 | city = address["town"] 176 | elif "village" in address.keys(): 177 | city = address["village"] 178 | else: 179 | raise FreskAddressBadFormat(address, full_location, "city") 180 | 181 | # Trying to infer the "department" code 182 | num_department = None 183 | if address["country_code"] == "fr": 184 | department = None 185 | if "state_district" in address.keys(): 186 | department = address["state_district"] 187 | elif "county" in address.keys(): 188 | department = address["county"] 189 | elif "city_district" in address.keys(): 190 | department = address["city_district"] 191 | elif "state" in address.keys(): 192 | department = address["state"] 193 | else: 194 | raise FreskAddressBadFormat(address, full_location, "department") 195 | try: 196 | num_department = department_to_num(department) 197 | except FreskError: 198 | raise 199 | if address["country_code"] == "ch": 200 | # Swiss department "numbers" are ISO codes from https://en.wikipedia.org/wiki/ISO_3166-2:CH. 
201 | if "ISO3166-2-lvl4" in address.keys(): 202 | canton = address["ISO3166-2-lvl4"] 203 | if not canton.startswith("CH-"): 204 | raise FreskAddressBadFormat(address, full_location, "department") 205 | num_department = canton[3:] 206 | else: 207 | raise FreskAddressBadFormat(address, full_location, "department") 208 | 209 | # Missing fields 210 | if "postcode" not in address: 211 | raise FreskAddressIncomplete(address, full_location, "postcode") 212 | 213 | except FreskError as e: 214 | logging.error(f"get_address: {e}") 215 | raise 216 | 217 | return { 218 | "location_name": location.raw["name"], 219 | "address": f"{house_number}{road}", 220 | "city": city, 221 | "department": num_department, 222 | "zip_code": address["postcode"], 223 | "country_code": address["country_code"], 224 | "latitude": location.raw["lat"], 225 | "longitude": location.raw["lon"], 226 | } 227 | 228 | 229 | def department_to_num(department): 230 | for k, v in departments.items(): 231 | if v == department: 232 | return k 233 | raise FreskDepartmentNotFound(f"Department number.") 234 | 235 | -------------------------------------------------------------------------------- /countries/fr.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "Fresque des Nouveaux Récits", 4 | "url": "https://www.billetweb.fr/pro/fdnr", 5 | "type": "scraper", 6 | "iframe": "event21569", 7 | "id": 0 8 | }, 9 | { 10 | "name": "Fresque Océane", 11 | "url": "https://www.billetweb.fr/pro/billetteriefo", 12 | "type": "scraper", 13 | "iframe": "event15247", 14 | "id": 1 15 | }, 16 | { 17 | "name": "Fresque de la Biodiversité", 18 | "url": "https://www.billetweb.fr/multi_event.php?user=82762", 19 | "type": "scraper", 20 | "iframe": "event17309", 21 | "id": 2 22 | }, 23 | { 24 | "name": "Fresque du Numérique", 25 | "url": "https://www.billetweb.fr/pro/atelier-fresque-du-numerique", 26 | "type": "scraper", 27 | "iframe": "event41180", 28 | "id": 3 29 | }, 30 | { 31 | "name": "Fresque Agri'Alim", 32 | "url": "https://www.billetweb.fr/pro/fresqueagrialim", 33 | "type": "scraper", 34 | "iframe": "event11421", 35 | "id": 4 36 | }, 37 | { 38 | "name": "Fresque de l'Alimentation", 39 | "url": "https://www.billetweb.fr/pro/fresquealimentation", 40 | "type": "scraper", 41 | "iframe": "event11155", 42 | "id": 5 43 | }, 44 | { 45 | "name": "Fresque de la Construction", 46 | "url": "https://www.billetweb.fr/pro/fresquedelaconstruction", 47 | "type": "scraper", 48 | "iframe": "event11574", 49 | "id": 6 50 | }, 51 | { 52 | "name": "Fresque de la Mobilité", 53 | "url": "https://app.fresquedelamobilite.org/", 54 | "type": "api", 55 | "id": 7 56 | }, 57 | { 58 | "name": "Fresque du Sexisme", 59 | "url": "https://www.billetweb.fr/pro/fresque-du-sexisme", 60 | "type": "scraper", 61 | "iframe": "event27112", 62 | "id": 8 63 | }, 64 | { 65 | "name": "Atelier OGRE", 66 | "url": "https://www.billetweb.fr/pro/atelierogre", 67 | "type": "scraper", 68 | "iframe": "event13026", 69 | "id": 9 70 | }, 71 | { 72 | "name": "Atelier Nos vies bas carbone", 73 | "url": "https://www.billetweb.fr/multi_event.php?user=132897", 74 | "type": "scraper", 75 | "iframe": "event22230", 76 | "id": 10 77 | }, 78 | { 79 | "name": "Fresque de l'Eau", 80 | "url": "https://www.billetweb.fr/multi_event.php?user=138110", 81 | "type": "scraper", 82 | "iframe": "eventu138110", 83 | "id": 11 84 | }, 85 | { 86 | "name": "futurs proches", 87 | "url": "https://www.billetweb.fr/pro/futursproches", 88 | "type": "scraper", 89 | "iframe": "event14893", 
90 | "id": 12 91 | }, 92 | { 93 | "name": "Fresque de la Diversité", 94 | "url": "https://www.billetweb.fr/multi_event.php?user=168799", 95 | "type": "scraper", 96 | "iframe": "event38362", 97 | "id": 13 98 | }, 99 | { 100 | "name": "Fresque du Textile", 101 | "url": "https://www.billetweb.fr/multi_event.php?user=166793", 102 | "type": "scraper", 103 | "iframe": "event27458", 104 | "filter": "textile", 105 | "id": 14 106 | }, 107 | { 108 | "name": "Fresque des Déchets", 109 | "url": "https://www.billetweb.fr/multi_event.php?user=166793", 110 | "type": "scraper", 111 | "iframe": "event27458", 112 | "filter": "dechet", 113 | "id": 15 114 | }, 115 | { 116 | "name": "Puzzle Climat", 117 | "url": "https://www.billetweb.fr/multi_event.php?user=121600", 118 | "type": "scraper", 119 | "iframe": "event21038", 120 | "id": 16 121 | }, 122 | { 123 | "name": "Fresque de la Finance", 124 | "url": "https://www.billetweb.fr/pro/fresquedelafinance", 125 | "type": "scraper", 126 | "iframe": "event34683", 127 | "id": 17 128 | }, 129 | { 130 | "name": "Fresque de la RSE", 131 | "url": "https://www.billetweb.fr/pro/fresque", 132 | "type": "scraper", 133 | "iframe": "event35904", 134 | "id": 18 135 | }, 136 | { 137 | "name": "Atelier des Transitions Urbaines", 138 | "url": "https://www.billetweb.fr/multi_event.php?user=216884", 139 | "type": "scraper", 140 | "iframe": "event38980", 141 | "id": 19 142 | }, 143 | { 144 | "name": "2tonnes", 145 | "url": "https://www.eventbrite.fr/o/2-tonnes-29470123869", 146 | "type": "scraper", 147 | "id": 100 148 | }, 149 | { 150 | "name": "Atelier Compte-Gouttes", 151 | "url": "https://www.eventbrite.fr/o/atelier-compte-gouttes-73003088333", 152 | "type": "scraper", 153 | "id": 101 154 | }, 155 | { 156 | "name": "Fresque du Bénévolat", 157 | "url": "https://www.eventbrite.fr/o/jeveuxaidergouvfr-77010082313", 158 | "type": "scraper", 159 | "id": 102 160 | }, 161 | { 162 | "name": "Fresque du Plastique", 163 | "url": "https://www.eventbrite.fr/o/la-fresque-du-plastique-45763194553", 164 | "type": "scraper", 165 | "id": 103 166 | }, 167 | { 168 | "name": "Cyber Fresque", 169 | "url": "https://www.eventbrite.fr/o/senscyb-89802295343", 170 | "type": "scraper", 171 | "id": 104 172 | }, 173 | { 174 | "name": "Fresque du Climat (ateliers)", 175 | "url": "https://fresqueduclimat.org/inscription-atelier/grand-public/", 176 | "type": "scraper", 177 | "id": 200 178 | }, 179 | { 180 | "name": "Fresque du Climat (formations)", 181 | "url": "https://fresqueduclimat.org/inscription-formation/grand-public/", 182 | "type": "scraper", 183 | "id": 200 184 | }, 185 | { 186 | "name": "Fresque de l'Economie Circulaire", 187 | "url": "https://www.billetweb.fr/multi_event.php?user=246258", 188 | "type": "scraper", 189 | "iframe": "event41148", 190 | "id": 300 191 | }, 192 | { 193 | "name": "Fresque des Frontières Planétaires (ateliers)", 194 | "url": "https://1erdegre.glide.page/dl/3b1bc8", 195 | "type": "scraper", 196 | "id": 500, 197 | "filter": "Fresque des frontières planétaires" 198 | }, 199 | { 200 | "name": "Fresque des Frontières Planétaires (formations)", 201 | "url": "https://1erdegre.glide.page/dl/dcc150", 202 | "type": "scraper", 203 | "id": 500, 204 | "filter": "Fresque des frontières planétaires" 205 | }, 206 | { 207 | "name": "Horizons Décarbonés (ateliers)", 208 | "url": "https://1erdegre.glide.page/dl/3b1bc8", 209 | "type": "scraper", 210 | "id": 501, 211 | "filter": "Horizons Décarbonés" 212 | }, 213 | { 214 | "name": "Horizons Décarbonés (formations)", 215 | "url": 
"https://1erdegre.glide.page/dl/dcc150", 216 | "type": "scraper", 217 | "id": 501, 218 | "filter": "Horizons Décarbonés" 219 | }, 220 | { 221 | "name": "30 Glorieuses", 222 | "url": "https://hook.eu1.make.com/koqwhb0igq5air3aysx58rsjeld1uacl", 223 | "type": "api", 224 | "id": 600 225 | }, 226 | { 227 | "name": "Fresque de la Rénovation", 228 | "url": "https://www.helloasso.com/associations/fresque-de-la-renovation", 229 | "type": "scraper", 230 | "id": 700 231 | }, 232 | { 233 | "name": "Fresque de l'Energie", 234 | "url": "https://www.helloasso.com/associations/la-fresque-de-l-energie", 235 | "type": "scraper", 236 | "id": 701 237 | }, 238 | { 239 | "name": "Fresque des Possibles", 240 | "url": "https://www.helloasso.com/associations/le-lieu-dit", 241 | "type": "scraper", 242 | "id": 702 243 | }, 244 | { 245 | "name": "Fresque de la Communication", 246 | "url": "https://www.helloasso.com/associations/la-fresque-de-la-communication", 247 | "type": "scraper", 248 | "id": 703 249 | }, 250 | { 251 | "name": "Zoofresque", 252 | "url": "https://www.helloasso.com/associations/ajas-association-justice-animaux-savoie", 253 | "type": "scraper", 254 | "id": 704 255 | }, 256 | { 257 | "name": "Notre Tour", 258 | "url": "https://www.helloasso.com/associations/mush", 259 | "type": "scraper", 260 | "id": 705 261 | }, 262 | { 263 | "name": "Fresque du Sol", 264 | "url": "https://framagenda.org/remote.php/dav/public-calendars/KwNwGA232xD38CnN/?export", 265 | "type": "api", 266 | "id": 801 267 | } 268 | ] 269 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/helloasso.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import time 4 | import logging 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import ( 15 | FreskError, 16 | FreskDateNotFound, 17 | FreskDateBadFormat, 18 | ) 19 | from trouver_une_fresque_scraper.utils.keywords import * 20 | from trouver_une_fresque_scraper.utils.language import detect_language_code 21 | from trouver_une_fresque_scraper.utils.location import get_address 22 | 23 | 24 | def scroll_to_bottom(driver): 25 | while True: 26 | logging.info("Scrolling to the bottom...") 27 | try: 28 | time.sleep(2) 29 | next_button = WebDriverWait(driver, 10).until( 30 | EC.element_to_be_clickable( 31 | ( 32 | By.CSS_SELECTOR, 33 | 'button[data-hook="load-more-button"]', 34 | ) 35 | ) 36 | ) 37 | desired_y = (next_button.size["height"] / 2) + next_button.location["y"] 38 | window_h = driver.execute_script("return window.innerHeight") 39 | window_y = driver.execute_script("return window.pageYOffset") 40 | current_y = (window_h / 2) + window_y 41 | scroll_y_by = desired_y - current_y 42 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 43 | time.sleep(2) 44 | next_button.click() 45 | except TimeoutException: 46 | break 47 | 48 | 49 | def get_helloasso_data(sources, service, options): 50 | logging.info("Scraping data from helloasso.com") 51 | 52 | driver = webdriver.Firefox(service=service, 
options=options) 53 | 54 | records = [] 55 | 56 | for page in sources: 57 | logging.info(f"==================\nProcessing page {page}") 58 | driver.get(page["url"]) 59 | driver.implicitly_wait(5) 60 | time.sleep(3) 61 | 62 | # Scroll to bottom to load all events 63 | desired_y = 2300 64 | window_h = driver.execute_script("return window.innerHeight") 65 | window_y = driver.execute_script("return window.pageYOffset") 66 | current_y = (window_h / 2) + window_y 67 | scroll_y_by = desired_y - current_y 68 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 69 | time.sleep(5) 70 | 71 | try: 72 | button = driver.find_element( 73 | By.XPATH, 74 | '//button[@data-ux="Explore_OrganizationPublicPage_Actions_ActionEvent_ShowAllActions"]', 75 | ) 76 | button.click() 77 | except NoSuchElementException: 78 | pass 79 | 80 | ele = driver.find_elements(By.CSS_SELECTOR, "a.ActionLink-Event") 81 | links = [e.get_attribute("href") for e in ele] 82 | num_el = len(ele) 83 | logging.info(f"Found {num_el} elements") 84 | 85 | for link in links: 86 | logging.info(f"\n-> Processing {link} ...") 87 | driver.get(link) 88 | driver.implicitly_wait(3) 89 | 90 | ################################################################ 91 | # Parse event id 92 | ################################################################ 93 | uuid = link.split("/")[-1] 94 | if not uuid: 95 | logging.info("Rejecting record: UUID not found") 96 | continue 97 | 98 | ################################################################ 99 | # Parse event title 100 | ################################################################ 101 | title_el = driver.find_element( 102 | by=By.TAG_NAME, 103 | value="h1", 104 | ) 105 | title = title_el.text 106 | 107 | ################################################################ 108 | # Parse start and end dates 109 | ################################################################ 110 | try: 111 | date_info_el = driver.find_element( 112 | by=By.CSS_SELECTOR, 113 | value="span.CampaignHeader--Date", 114 | ) 115 | event_time = date_info_el.text 116 | except NoSuchElementException as error: 117 | logging.info(f"Reject record: {error}") 118 | continue 119 | 120 | try: 121 | event_start_datetime, event_end_datetime = get_dates(event_time) 122 | except Exception as e: 123 | logging.info(f"Rejecting record: {e}") 124 | continue 125 | 126 | ################################################################ 127 | # Is it an online event? 
128 | ################################################################ 129 | online = is_online(title) 130 | 131 | ################################################################ 132 | # Location data 133 | ################################################################ 134 | full_location = "" 135 | location_name = "" 136 | address = "" 137 | city = "" 138 | department = "" 139 | longitude = "" 140 | latitude = "" 141 | zip_code = "" 142 | country_code = "" 143 | 144 | if not online: 145 | try: 146 | location_el = driver.find_element( 147 | By.CSS_SELECTOR, "section.CardAddress--Location" 148 | ) 149 | except NoSuchElementException: 150 | logging.info("Rejecting record: no location") 151 | continue 152 | 153 | full_location = location_el.text 154 | 155 | try: 156 | address_dict = get_address(full_location) 157 | ( 158 | location_name, 159 | address, 160 | city, 161 | department, 162 | zip_code, 163 | country_code, 164 | latitude, 165 | longitude, 166 | ) = address_dict.values() 167 | except FreskError as error: 168 | logging.info(f"Rejecting record: {error}.") 169 | continue 170 | 171 | ################################################################ 172 | # Description 173 | ################################################################ 174 | try: 175 | description_el = driver.find_element( 176 | By.CSS_SELECTOR, "div.CampaignHeader--Description" 177 | ) 178 | except NoSuchElementException: 179 | logging.info(f"Rejecting record: no description") 180 | continue 181 | 182 | description = description_el.text 183 | 184 | ################################################################ 185 | # Training? 186 | ################################################################ 187 | training = is_training(title) 188 | 189 | ################################################################ 190 | # Is it full? 191 | ################################################################ 192 | sold_out = False 193 | 194 | ################################################################ 195 | # Is it suited for kids? 
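# Note: kid-friendliness is likewise inferred from keywords in the title
# (utils.keywords.is_for_kids); unlike some of the other scrapers in this
# project, trainings are not explicitly excluded here.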
196 | ################################################################ 197 | kids = is_for_kids(title) 198 | 199 | ################################################################ 200 | # Parse tickets link 201 | ################################################################ 202 | tickets_link = link 203 | 204 | ################################################################ 205 | # Building final object 206 | ################################################################ 207 | record = get_record_dict( 208 | f"{page['id']}-{uuid}", 209 | page["id"], 210 | title, 211 | event_start_datetime, 212 | event_end_datetime, 213 | full_location, 214 | location_name, 215 | address, 216 | city, 217 | department, 218 | zip_code, 219 | country_code, 220 | latitude, 221 | longitude, 222 | page.get( 223 | "language_code", 224 | detect_language_code(title, description), 225 | ), 226 | online, 227 | training, 228 | sold_out, 229 | kids, 230 | link, 231 | link, 232 | description, 233 | ) 234 | 235 | records.append(record) 236 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 237 | 238 | driver.quit() 239 | 240 | return records 241 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/fec.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import logging 4 | 5 | from selenium import webdriver 6 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 7 | from selenium.webdriver.common.by import By 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | from selenium.webdriver.support import expected_conditions as EC 10 | 11 | from trouver_une_fresque_scraper.db.records import get_record_dict 12 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 13 | from trouver_une_fresque_scraper.utils.errors import ( 14 | FreskError, 15 | FreskDateBadFormat, 16 | FreskDateNotFound, 17 | FreskDateDifferentTimezone, 18 | ) 19 | from trouver_une_fresque_scraper.utils.keywords import * 20 | from trouver_une_fresque_scraper.utils.language import detect_language_code 21 | from trouver_une_fresque_scraper.utils.location import get_address 22 | 23 | 24 | def scroll_to_bottom(driver): 25 | while True: 26 | logging.info("Scrolling to the bottom...") 27 | try: 28 | time.sleep(2) 29 | next_button = WebDriverWait(driver, 10).until( 30 | EC.element_to_be_clickable( 31 | ( 32 | By.CSS_SELECTOR, 33 | 'button[data-hook="load-more-button"]', 34 | ) 35 | ) 36 | ) 37 | desired_y = (next_button.size["height"] / 2) + next_button.location["y"] 38 | window_h = driver.execute_script("return window.innerHeight") 39 | window_y = driver.execute_script("return window.pageYOffset") 40 | current_y = (window_h / 2) + window_y 41 | scroll_y_by = desired_y - current_y 42 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 43 | time.sleep(2) 44 | next_button.click() 45 | except TimeoutException: 46 | break 47 | 48 | 49 | def get_fec_data(sources, service, options): 50 | logging.info("Scraping data from lafresquedeleconomiecirculaire.com") 51 | 52 | driver = webdriver.Firefox(service=service, options=options) 53 | 54 | records = [] 55 | 56 | for page in sources: 57 | logging.info("========================") 58 | driver.get(page["url"]) 59 | driver.implicitly_wait(2) 60 | 61 | # Scroll to bottom to load all events 62 | scroll_to_bottom(driver) 63 | driver.execute_script("window.scrollTo(0, 0);") 64 | 65 | ele = 
driver.find_elements( 66 | By.CSS_SELECTOR, 'li[data-hook="events-card"] a[data-hook="title"]' 67 | ) 68 | links = [e.get_attribute("href") for e in ele] 69 | 70 | # Only events published on lafresquedeleconomiecirculaire.com can be extracted 71 | links = [l for l in links if "lafresquedeleconomiecirculaire.com" in l] 72 | 73 | for link in links: 74 | logging.info(f"\n-> Processing {link} ...") 75 | driver.get(link) 76 | driver.implicitly_wait(3) 77 | time.sleep(5) 78 | 79 | ################################################################ 80 | # Parse event id 81 | ################################################################ 82 | # The event id is the path segment following /event-details/ 83 | uuid = link.split("/event-details/")[-1] 84 | if not uuid: 85 | logging.info("Rejecting record: UUID not found") 86 | continue 87 | 88 | ################################################################ 89 | # Parse event title 90 | ################################################################ 91 | title_el = driver.find_element( 92 | by=By.TAG_NAME, 93 | value="h1", 94 | ) 95 | title = title_el.text 96 | 97 | ################################################################ 98 | # Parse start and end dates 99 | ################################################################ 100 | try: 101 | date_info_el = driver.find_element( 102 | by=By.CSS_SELECTOR, 103 | value='p[data-hook="event-full-date"]', 104 | ) 105 | event_time = date_info_el.text 106 | except NoSuchElementException: 107 | raise FreskDateNotFound 108 | 109 | try: 110 | event_start_datetime, event_end_datetime = get_dates(event_time) 111 | except FreskDateBadFormat as error: 112 | logging.info(f"Rejecting record: {error}") 113 | continue 114 | 115 | ################################################################ 116 | # Is it an online event?
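# Note: the event page carries the venue text in the 'event-full-location'
# hook; an event is flagged online when keywords recognized by
# utils.keywords.is_online appear in that text.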
117 | ################################################################ 118 | online = False 119 | try: 120 | online_el = driver.find_element( 121 | By.CSS_SELECTOR, 'p[data-hook="event-full-location"]' 122 | ) 123 | if is_online(online_el.text): 124 | online = True 125 | except NoSuchElementException: 126 | pass 127 | 128 | ################################################################ 129 | # Location data 130 | ################################################################ 131 | full_location = "" 132 | location_name = "" 133 | address = "" 134 | city = "" 135 | department = "" 136 | longitude = "" 137 | latitude = "" 138 | zip_code = "" 139 | country_code = "" 140 | 141 | if not online: 142 | location_el = driver.find_element( 143 | By.CSS_SELECTOR, 'p[data-hook="event-full-location"]' 144 | ) 145 | full_location = location_el.text 146 | 147 | try: 148 | address_dict = get_address(full_location) 149 | ( 150 | location_name, 151 | address, 152 | city, 153 | department, 154 | zip_code, 155 | country_code, 156 | latitude, 157 | longitude, 158 | ) = address_dict.values() 159 | except FreskError as error: 160 | logging.info(f"Rejecting record: {error}.") 161 | continue 162 | 163 | ################################################################ 164 | # Description 165 | ################################################################ 166 | driver.execute_script("window.scrollBy(0, document.body.scrollHeight);") 167 | 168 | # Click on "show more" button 169 | try: 170 | show_more_el = driver.find_element( 171 | By.CSS_SELECTOR, 'button[data-hook="about-section-button"]' 172 | ) 173 | show_more_el.click() 174 | except NoSuchElementException: 175 | pass 176 | 177 | try: 178 | description_el = driver.find_element( 179 | By.CSS_SELECTOR, 'div[data-hook="about-section-text"]' 180 | ) 181 | except NoSuchElementException: 182 | try: 183 | description_el = driver.find_element( 184 | By.CSS_SELECTOR, 'div[data-hook="about-section"]' 185 | ) 186 | except NoSuchElementException: 187 | logging.info(f"Rejecting record: no description") 188 | continue 189 | 190 | description = description_el.text 191 | 192 | ################################################################ 193 | # Training? 194 | ################################################################ 195 | training = is_training(title) 196 | 197 | ################################################################ 198 | # Is it full? 199 | ################################################################ 200 | sold_out = True 201 | try: 202 | _ = driver.find_element( 203 | by=By.CSS_SELECTOR, 204 | value='div[data-hook="event-sold-out"]', 205 | ) 206 | except NoSuchElementException: 207 | sold_out = False 208 | 209 | ################################################################ 210 | # Is it suited for kids? 
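# Note: based on title keywords only (utils.keywords.is_for_kids).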
211 | ################################################################ 212 | kids = is_for_kids(title) 213 | 214 | ################################################################ 215 | # Parse tickets link 216 | ################################################################ 217 | tickets_link = link 218 | 219 | ################################################################ 220 | # Building final object 221 | ################################################################ 222 | record = get_record_dict( 223 | f"{page['id']}-{uuid}", 224 | page["id"], 225 | title, 226 | event_start_datetime, 227 | event_end_datetime, 228 | full_location, 229 | location_name, 230 | address, 231 | city, 232 | department, 233 | zip_code, 234 | country_code, 235 | latitude, 236 | longitude, 237 | page.get( 238 | "language_code", 239 | detect_language_code(title, description), 240 | ), 241 | online, 242 | training, 243 | sold_out, 244 | kids, 245 | link, 246 | tickets_link, 247 | description, 248 | ) 249 | 250 | records.append(record) 251 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 252 | 253 | driver.quit() 254 | 255 | return records 256 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/fdc.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import time 4 | import logging 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import ( 15 | FreskError, 16 | FreskDateBadFormat, 17 | FreskLanguageNotRecognized, 18 | ) 19 | from trouver_une_fresque_scraper.utils.keywords import * 20 | from trouver_une_fresque_scraper.utils.language import get_language_code 21 | from trouver_une_fresque_scraper.utils.location import get_address 22 | 23 | 24 | def get_fdc_data(sources, service, options): 25 | logging.info("Scraping data from fresqueduclimat.org") 26 | 27 | driver = webdriver.Firefox(service=service, options=options) 28 | 29 | records = [] 30 | 31 | for page in sources: 32 | logging.info("========================") 33 | driver.get(page["url"]) 34 | driver.implicitly_wait(2) 35 | 36 | wait = WebDriverWait(driver, 10) 37 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 38 | driver.switch_to.frame(iframe) 39 | 40 | while True: 41 | ele = driver.find_elements(By.CSS_SELECTOR, "a.link-dark") 42 | links = [e.get_attribute("href") for e in ele] 43 | 44 | for link in links: 45 | logging.info(f"\n-> Processing {link} ...") 46 | driver.get(link) 47 | driver.implicitly_wait(3) 48 | 49 | ################################################################ 50 | # Parse event id 51 | ################################################################ 52 | # Define the regex pattern for UUIDs 53 | uuid_pattern = ( 54 | r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}" 55 | ) 56 | uuids = re.findall(uuid_pattern, link) 57 | if not uuids: 58 | logging.info("Rejecting record: UUID not found") 59 | driver.back() 60 | wait = WebDriverWait(driver, 10) 61 | iframe = 
wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 62 | driver.switch_to.frame(iframe) 63 | continue 64 | 65 | ################################################################ 66 | # Parse event title 67 | ################################################################ 68 | title_el = driver.find_element( 69 | by=By.TAG_NAME, 70 | value="h3", 71 | ) 72 | title = title_el.text 73 | 74 | ################################################################ 75 | # Parse start and end dates 76 | ################################################################ 77 | clock_icon = driver.find_element(By.CLASS_NAME, "fa-clock") 78 | parent_div = clock_icon.find_element(By.XPATH, "..") 79 | event_time = parent_div.text 80 | 81 | try: 82 | event_start_datetime, event_end_datetime = get_dates(event_time) 83 | except FreskDateBadFormat as error: 84 | logging.info(f"Reject record: {error}") 85 | driver.back() 86 | wait = WebDriverWait(driver, 10) 87 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 88 | driver.switch_to.frame(iframe) 89 | continue 90 | 91 | ################################################################ 92 | # Workshop language 93 | ################################################################ 94 | language_code = None 95 | try: 96 | globe_in_event = driver.find_element( 97 | By.XPATH, '//div[contains(@class, "mb-3")]/i[contains(@class, "fa-globe")]' 98 | ) 99 | parent = globe_in_event.find_element(By.XPATH, "..") 100 | language_code = get_language_code(parent.text) 101 | except FreskLanguageNotRecognized as e: 102 | logging.warning(f"Unable to parse workshop language: {e}") 103 | language_code = None 104 | except NoSuchElementException: 105 | logging.warning("Unable to find workshop language on the page.") 106 | language_code = None 107 | 108 | ################################################################ 109 | # Is it an online event? 
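# Note: fresqueduclimat.org marks remote workshops with a video icon
# (CSS class fa-video); its absence is treated as an in-person event.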
110 | ################################################################ 111 | online = True 112 | try: 113 | driver.find_element(By.CLASS_NAME, "fa-video") 114 | except NoSuchElementException: 115 | online = False 116 | 117 | ################################################################ 118 | # Location data 119 | ################################################################ 120 | full_location = "" 121 | location_name = "" 122 | address = "" 123 | city = "" 124 | department = "" 125 | longitude = "" 126 | latitude = "" 127 | zip_code = "" 128 | country_code = "" 129 | 130 | if not online: 131 | pin_icon = driver.find_element(By.CLASS_NAME, "fa-map-pin") 132 | parent_div = pin_icon.find_element(By.XPATH, "..") 133 | full_location = parent_div.text 134 | 135 | try: 136 | logging.info(f"Full location: {full_location}") 137 | address_dict = get_address(full_location) 138 | ( 139 | location_name, 140 | address, 141 | city, 142 | department, 143 | zip_code, 144 | country_code, 145 | latitude, 146 | longitude, 147 | ) = address_dict.values() 148 | except FreskError as error: 149 | logging.info(f"Rejecting record: {error}.") 150 | driver.back() 151 | wait = WebDriverWait(driver, 10) 152 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 153 | driver.switch_to.frame(iframe) 154 | continue 155 | 156 | ################################################################ 157 | # Description 158 | ################################################################ 159 | description_title_el = driver.find_element( 160 | By.XPATH, "//strong[text()='Description']" 161 | ) 162 | parent_description_el = description_title_el.find_element(By.XPATH, "..") 163 | description = parent_description_el.text 164 | 165 | ################################################################ 166 | # Training? 167 | ################################################################ 168 | training = is_training(title) 169 | 170 | ################################################################ 171 | # Is it full? 172 | ################################################################ 173 | user_icon = driver.find_element(By.CLASS_NAME, "fa-user") 174 | parent_container = user_icon.find_element(By.XPATH, "../..") 175 | sold_out = is_sold_out(parent_container.text) 176 | 177 | ################################################################ 178 | # Is it suited for kids? 
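# Note: unlike most scrapers in this project, the keyword search here
# runs on the description rather than the title, and trainings are
# explicitly excluded.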
179 | ################################################################ 180 | kids = is_for_kids(description) and not training 181 | 182 | ################################################################ 183 | # Parse tickets link 184 | ################################################################ 185 | user_icon = driver.find_element(By.CLASS_NAME, "fa-user") 186 | parent_link = user_icon.find_element(By.XPATH, "..") 187 | tickets_link = parent_link.get_attribute("href") 188 | 189 | ################################################################ 190 | # Building final object 191 | ################################################################ 192 | record = get_record_dict( 193 | f"{page['id']}-{uuids[0]}", 194 | page["id"], 195 | title, 196 | event_start_datetime, 197 | event_end_datetime, 198 | full_location, 199 | location_name, 200 | address, 201 | city, 202 | department, 203 | zip_code, 204 | country_code, 205 | latitude, 206 | longitude, 207 | language_code, 208 | online, 209 | training, 210 | sold_out, 211 | kids, 212 | link, 213 | tickets_link, 214 | description, 215 | ) 216 | 217 | records.append(record) 218 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 219 | 220 | driver.back() 221 | wait = WebDriverWait(driver, 10) 222 | iframe = wait.until(EC.presence_of_element_located((By.TAG_NAME, "iframe"))) 223 | driver.switch_to.frame(iframe) 224 | 225 | try: 226 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 227 | driver.implicitly_wait(2) 228 | time.sleep(2) 229 | next_button = WebDriverWait(driver, 10).until( 230 | EC.element_to_be_clickable( 231 | ( 232 | By.XPATH, 233 | "//a[@class='page-link' and contains(text(), 'Suivant')]", 234 | ) 235 | ) 236 | ) 237 | next_button.location_once_scrolled_into_view 238 | time.sleep(2) 239 | next_button.click() 240 | time.sleep(10) 241 | except TimeoutException: 242 | break 243 | 244 | driver.quit() 245 | 246 | return records 247 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/glide.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import json 4 | import logging 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import FreskError 15 | from trouver_une_fresque_scraper.utils.keywords import * 16 | from trouver_une_fresque_scraper.utils.language import detect_language_code 17 | from trouver_une_fresque_scraper.utils.location import get_address 18 | 19 | 20 | def get_glide_data(sources, service, options): 21 | logging.info("Scraping data from glide.page") 22 | 23 | driver = webdriver.Firefox(service=service, options=options) 24 | 25 | records = [] 26 | 27 | for page in sources: 28 | logging.info(f"==================\nProcessing page {page}") 29 | driver.get(page["url"]) 30 | driver.implicitly_wait(10) 31 | time.sleep(20) 32 | 33 | tab_button_element = driver.find_element( 34 | By.XPATH, 35 | f"//div[contains(@class, 'button-text') and text()='{page['filter']}']", 36 | ) 37 | 
tab_button_element.click() 38 | 39 | # Maybe there are multiple pages, so we loop. 40 | while True: 41 | time.sleep(5) 42 | ele = driver.find_elements( 43 | By.XPATH, 44 | "//div[contains(@class, 'collection-item') and @role='button']", 45 | ) 46 | num_el = len(ele) 47 | logging.info(f"Found {num_el} elements") 48 | 49 | for i in range(num_el): 50 | time.sleep(5) 51 | ele = driver.find_elements( 52 | By.XPATH, 53 | "//div[contains(@class, 'collection-item') and @role='button']", 54 | ) 55 | 56 | # The following is ugly, but necessary as elements are loaded dynamically in JS. 57 | # We have to make sure that all elements are loaded before proceeding. 58 | max_tries = 10 59 | count = 0 60 | while len(ele) != num_el: 61 | driver.refresh() 62 | time.sleep(5) 63 | ele = driver.find_elements( 64 | By.XPATH, 65 | "//div[contains(@class, 'collection-item') and @role='button']", 66 | ) 67 | 68 | count += 1 69 | if count == max_tries: 70 | raise RuntimeError( 71 | f"Cannot load the {num_el} JS elements after {count} tries." 72 | ) 73 | 74 | el = ele[i] 75 | el.click() 76 | 77 | time.sleep(5) 78 | link = driver.current_url 79 | logging.info(f"\n-> Processing {link} ...") 80 | driver.implicitly_wait(3) 81 | 82 | ################################################################ 83 | # Is it canceled? 84 | ################################################################ 85 | try: 86 | # Attempt to find the div element by its id 87 | large_title_el = driver.find_element(By.CSS_SELECTOR, "h2.headlineMedium") 88 | large_title = large_title_el.text 89 | if is_canceled(large_title): 90 | logging.info("Rejecting record: canceled") 91 | driver.back() 92 | continue 93 | except NoSuchElementException: 94 | pass 95 | 96 | ################################################################ 97 | # Parse event id 98 | ################################################################ 99 | uuid = link.split("/")[-1] 100 | if not uuid: 101 | logging.info("Rejecting record: UUID not found") 102 | driver.back() 103 | continue 104 | 105 | ################################################################ 106 | # Parse event title 107 | ################################################################ 108 | title_el = driver.find_element(by=By.CSS_SELECTOR, value="h2.headlineSmall") 109 | title = title_el.text 110 | 111 | ################################################################ 112 | # Parse start and end dates 113 | ################################################################ 114 | time_el = driver.find_element( 115 | by=By.XPATH, 116 | value="//li/div[contains(text(), 'Date')]", 117 | ) 118 | parent_el = time_el.find_element(by=By.XPATH, value="..") 119 | event_time_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 120 | event_time = event_time_el.text.lower() 121 | 122 | try: 123 | event_start_datetime, event_end_datetime = get_dates(event_time) 124 | except Exception as e: 125 | logging.info(f"Rejecting record: {e}") 126 | driver.back() 127 | continue 128 | 129 | ################################################################ 130 | # Is it an online event? 
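# Note: the Glide detail view renders fields as label/value pairs; the
# value next to the 'Format' label is matched against online keywords.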
131 | ################################################################ 132 | time_label_el = driver.find_element( 133 | by=By.XPATH, 134 | value="//li/div[contains(text(), 'Format')]", 135 | ) 136 | parent_el = time_label_el.find_element(by=By.XPATH, value="..") 137 | online_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 138 | online = is_online(online_el.text) 139 | 140 | ################################################################ 141 | # Location data 142 | ################################################################ 143 | full_location = "" 144 | location_name = "" 145 | address = "" 146 | city = "" 147 | department = "" 148 | longitude = "" 149 | latitude = "" 150 | zip_code = "" 151 | country_code = "" 152 | 153 | if not online: 154 | try: 155 | address_label_el = driver.find_element( 156 | by=By.XPATH, 157 | value="//li/div[contains(text(), 'Adresse')]", 158 | ) 159 | parent_el = address_label_el.find_element(by=By.XPATH, value="..") 160 | address_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 161 | except Exception: 162 | logging.info("Rejecting record: empty address") 163 | driver.back() 164 | continue 165 | 166 | full_location = address_el.text 167 | 168 | try: 169 | address_dict = get_address(full_location) 170 | ( 171 | location_name, 172 | address, 173 | city, 174 | department, 175 | zip_code, 176 | country_code, 177 | latitude, 178 | longitude, 179 | ) = address_dict.values() 180 | except FreskError as error: 181 | logging.info(f"Rejecting record: {error}.") 182 | driver.back() 183 | continue 184 | 185 | ################################################################ 186 | # Description 187 | ################################################################ 188 | description_label_el = driver.find_element( 189 | by=By.XPATH, 190 | value="//li/div[contains(text(), 'Description')]", 191 | ) 192 | parent_el = description_label_el.find_element(by=By.XPATH, value="..") 193 | description_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 194 | description = description_el.text 195 | 196 | ################################################################ 197 | # Training? 198 | ################################################################ 199 | training = is_training(title) 200 | 201 | ################################################################ 202 | # Is it full? 203 | ################################################################ 204 | attendees_label_el = driver.find_element( 205 | by=By.XPATH, 206 | value="//li/div[contains(text(), 'participant')]", 207 | ) 208 | parent_el = attendees_label_el.find_element(by=By.XPATH, value="..") 209 | attendees_el = parent_el.find_element(by=By.XPATH, value="./*[2]") 210 | attendees = attendees_el.text 211 | 212 | sold_out = attendees.split("/")[0] == attendees.split("/")[1] 213 | 214 | ################################################################ 215 | # Is it suited for kids? 
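# Note: no audience information is scraped from Glide pages, so this is
# hard-coded to False.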
216 | ################################################################ 217 | kids = False 218 | 219 | ################################################################ 220 | # Building final object 221 | ################################################################ 222 | record = get_record_dict( 223 | f"{page['id']}-{uuid}", 224 | page["id"], 225 | title, 226 | event_start_datetime, 227 | event_end_datetime, 228 | full_location, 229 | location_name, 230 | address, 231 | city, 232 | department, 233 | zip_code, 234 | country_code, 235 | latitude, 236 | longitude, 237 | page.get( 238 | "language_code", 239 | detect_language_code(title, description), 240 | ), 241 | online, 242 | training, 243 | sold_out, 244 | kids, 245 | link, 246 | link, 247 | description, 248 | ) 249 | 250 | records.append(record) 251 | logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}") 252 | 253 | driver.back() 254 | 255 | try: 256 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 257 | driver.implicitly_wait(2) 258 | time.sleep(2) 259 | next_button = WebDriverWait(driver, 10).until( 260 | EC.element_to_be_clickable( 261 | ( 262 | By.XPATH, 263 | "//button[@aria-label='Next']", 264 | ) 265 | ) 266 | ) 267 | next_button.location_once_scrolled_into_view 268 | time.sleep(2) 269 | next_button.click() 270 | time.sleep(2) 271 | except TimeoutException: 272 | break 273 | 274 | driver.quit() 275 | 276 | return records 277 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/billetweb.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import logging 4 | from datetime import timedelta 5 | 6 | from selenium import webdriver 7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | from trouver_une_fresque_scraper.db.records import get_record_dict 13 | from trouver_une_fresque_scraper.utils.date_and_time import get_dates 14 | from trouver_une_fresque_scraper.utils.errors import FreskError 15 | from trouver_une_fresque_scraper.utils.keywords import * 16 | from trouver_une_fresque_scraper.utils.language import detect_language_code 17 | from trouver_une_fresque_scraper.utils.location import get_address 18 | 19 | 20 | def get_billetweb_data(sources, service, options): 21 | logging.info("Scraping data from www.billetweb.fr") 22 | 23 | driver = webdriver.Firefox(service=service, options=options) 24 | wait = WebDriverWait(driver, 10) 25 | 26 | records = [] 27 | 28 | for page in sources: 29 | logging.info(f"==================\nProcessing page {page}") 30 | driver.get(page["url"]) 31 | 32 | try: 33 | wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, page["iframe"]))) 34 | except TimeoutException: 35 | logging.info("Rejecting record: iframe not found") 36 | continue 37 | 38 | wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete") 39 | ele = driver.find_elements(By.CSS_SELECTOR, "a.naviguate") 40 | links = [e.get_attribute("href") for e in ele] 41 | 42 | for link in links: 43 | logging.info(f"------------------\nProcessing event {link}") 44 | driver.get(link) 45 | wait.until( 46 | lambda driver: driver.execute_script("return document.readyState") == "complete" 47 | ) 48 | 49 | # Useful for 
different workshops sharing the same event link 50 | if "filter" in page: 51 | if page["filter"] not in link: 52 | logging.info( 53 | "Rejecting record: expected filter keyword not present in current link" 54 | ) 55 | continue 56 | 57 | # Description 58 | try: 59 | driver.find_element(By.ID, "more_info").click() 60 | except Exception: 61 | pass # normal case: the description has no "more info" section 62 | 63 | try: 64 | description = driver.find_element(by=By.CSS_SELECTOR, value="#description").text 65 | except Exception: 66 | logging.info("Rejecting record: no description") 67 | continue 68 | 69 | # Parse event id 70 | event_id_match = re.search(r"/([^/]+?)&", link) 71 | if not event_id_match: 72 | logging.info("Rejecting record: event_id not found") 73 | continue 74 | event_id = event_id_match.group(1) 75 | # Parse main title 76 | try: 77 | main_title = driver.find_element( 78 | by=By.CSS_SELECTOR, value="#event_title > div.event_name" 79 | ).text 80 | except NoSuchElementException: 81 | main_title = driver.find_element( 82 | by=By.CSS_SELECTOR, 83 | value="#description_block > div.event_title > div.event_name", 84 | ).text 85 | 86 | # Location data 87 | try: 88 | try: 89 | main_full_location = driver.find_element( 90 | by=By.CSS_SELECTOR, value="div.location_summary" 91 | ).text 92 | except NoSuchElementException: 93 | main_full_location = driver.find_element( 94 | by=By.CSS_SELECTOR, 95 | value="#page_block_location > div.location > div.location_info > div.address > a", 96 | ).text 97 | except Exception: 98 | main_full_location = "" 99 | 100 | event_info = [] 101 | 102 | # Retrieve sessions if they exist 103 | wait.until( 104 | EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#shop_block iframe")) 105 | ) 106 | wait.until( 107 | lambda driver: driver.execute_script("return document.readyState") == "complete" 108 | ) 109 | back_links = driver.find_elements(By.CSS_SELECTOR, ".back_header_link.summarizable") 110 | if back_links: 111 | # Multi-time event with a single date: we land directly on the basket, so go back to the sessions list 112 | driver.get(back_links[0].get_attribute("href")) 113 | wait.until( 114 | lambda driver: driver.execute_script("return document.readyState") == "complete" 115 | ) 116 | sessions = driver.find_elements(By.CSS_SELECTOR, "a.sesssion_href") 117 | sessions_links = [ 118 | s.get_attribute("href") for s in sessions 119 | ] # No sessions for Mono-time 120 | driver.switch_to.parent_frame() 121 | 122 | ################################################################ 123 | # Multi-time management 124 | ################################################################ 125 | for sessions_link in sessions_links: 126 | driver.get(sessions_link) 127 | wait.until( 128 | lambda driver: driver.execute_script("return document.readyState") == "complete" 129 | ) 130 | context = driver.find_element(By.CSS_SELECTOR, "#context_title").text 131 | 132 | # Parse title, dates, location 133 | if match := re.match( 134 | r"\s*((?P<title>.*) : )?(?P<event_time>.*)(\n\s*(?P<full_location>.*))?", 135 | context, 136 | ): 137 | if not match.group("title"): 138 | sub_title = main_title 139 | elif "atelier" in match.group("title").lower(): 140 | sub_title = match.group("title") 141 | else: 142 | sub_title = main_title + " - " + match.group("title") 143 | 144 | event_time = match.group("event_time") 145 | sub_full_location = ( 146 | match.group("full_location") 147 | if match.group("full_location") 148 | else main_full_location 149 | ) 150 | else: 151 | raise RuntimeError(f"Cannot parse session context: {context}") 152 | 153 | # Is it full?
154 | try: 155 | # The presence of div.block indicates that the event is sold out, 156 | # except if the text below is displayed. 157 | empty = driver.find_element(By.CSS_SELECTOR, "div.block") 158 | sold_out = not has_external_tickets(empty.text) 159 | except NoSuchElementException: 160 | sold_out = False 161 | 162 | # Parse session id 163 | session_id = re.search(r"&session=(\d+)", sessions_link).group(1) 164 | uuid = f"{event_id}-{session_id}" 165 | 166 | event_info.append( 167 | [sub_title, event_time, sub_full_location, sold_out, sessions_link, uuid] 168 | ) 169 | 170 | ################################################################ 171 | # Mono-time management 172 | ################################################################ 173 | if not sessions_links: 174 | # Parse start and end dates 175 | try: 176 | event_time = driver.find_element( 177 | by=By.CSS_SELECTOR, 178 | value="#event_title > div.event_start_time > span.text", 179 | ).text 180 | except NoSuchElementException: 181 | event_time = driver.find_element( 182 | by=By.CSS_SELECTOR, 183 | value="#description_block > div.event_title > span > a > div.event_start_time", 184 | ).text 185 | 186 | # Is it full? 187 | try: 188 | wait.until( 189 | EC.frame_to_be_available_and_switch_to_it( 190 | (By.CSS_SELECTOR, "#shop_block iframe") 191 | ) 192 | ) 193 | wait.until( 194 | lambda driver: driver.execute_script("return document.readyState") 195 | == "complete" 196 | ) 197 | 198 | # The presence of div.block indicates that the event is sold out, 199 | # except if the text below is displayed. 200 | empty = driver.find_element(By.CSS_SELECTOR, "div.block") 201 | sold_out = not has_external_tickets(empty.text) 202 | except NoSuchElementException: 203 | sold_out = False 204 | finally: 205 | driver.switch_to.parent_frame() 206 | 207 | event_info.append( 208 | [main_title, event_time, main_full_location, sold_out, link, event_id] 209 | ) 210 | 211 | ################################################################ 212 | # Session loop 213 | ################################################################ 214 | for index, (title, event_time, full_location, sold_out, ticket_link, uuid) in enumerate( 215 | event_info 216 | ): 217 | logging.info( 218 | f"\n-> Processing session {index+1}/{len(event_info)} {ticket_link} ..." 219 | ) 220 | if is_gift_card(title): 221 | logging.info("Rejecting record: gift card") 222 | continue 223 | 224 | ################################################################ 225 | # Date and time 226 | ################################################################ 227 | try: 228 | event_start_datetime, event_end_datetime = get_dates(event_time) 229 | except Exception as e: 230 | logging.info(f"Rejecting record: {e}") 231 | continue 232 | 233 | if event_end_datetime - event_start_datetime > timedelta(days=1): 234 | logging.info(f"Rejecting record: event is too long: {event_time}") 235 | continue 236 | 237 | # Is it an online event? 
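# Note: Billetweb adds an " Online event" button label to titles of
# remote events (see the comment below); keyword matching on the title
# or the location string flags these, and the suffix is then stripped.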
238 | online = is_online(title) or is_online(full_location) 239 | title = title.replace(" Online event", "") # Button added by billetweb 240 | 241 | ################################################################ 242 | # Location data 243 | ################################################################ 244 | location_name = ( 245 | address 246 | ) = city = department = longitude = latitude = zip_code = country_code = "" 247 | if not online: 248 | try: 249 | address_dict = get_address(full_location) 250 | ( 251 | location_name, 252 | address, 253 | city, 254 | department, 255 | zip_code, 256 | country_code, 257 | latitude, 258 | longitude, 259 | ) = address_dict.values() 260 | except FreskError as error: 261 | logging.info(f"Rejecting record: {error}.") 262 | continue 263 | 264 | # Training? 265 | training = is_training(title) 266 | 267 | # Is it suited for kids? 268 | kids = is_for_kids(title) and not training # no trainings for kids 269 | 270 | # Building final object 271 | record = get_record_dict( 272 | f"{page['id']}-{uuid}", 273 | page["id"], 274 | title, 275 | event_start_datetime, 276 | event_end_datetime, 277 | full_location, 278 | location_name, 279 | address, 280 | city, 281 | department, 282 | zip_code, 283 | country_code, 284 | latitude, 285 | longitude, 286 | page.get( 287 | "language_code", 288 | detect_language_code(title, description), 289 | ), 290 | online, 291 | training, 292 | sold_out, 293 | kids, 294 | link, 295 | ticket_link, 296 | description, 297 | ) 298 | records.append(record) 299 | logging.info(f"Successfully scraped:\n{json.dumps(record, indent=4)}") 300 | 301 | driver.quit() 302 | 303 | return records 304 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/utils/date_and_time.py: -------------------------------------------------------------------------------- 1 | import re 2 | import traceback 3 | import logging 4 | 5 | from datetime import datetime, timedelta 6 | from dateutil.parser import parse 7 | 8 | from trouver_une_fresque_scraper.utils.errors import ( 9 | FreskError, 10 | FreskDateBadFormat, 11 | FreskDateDifferentTimezone, 12 | ) 13 | 14 | DEFAULT_DURATION = 3 15 | CURRENT_YEAR = 2025 16 | 17 | FRENCH_SHORT_DAYS = { 18 | "lun": 1, 19 | "mar": 2, 20 | "mer": 3, 21 | "jeu": 4, 22 | "ven": 5, 23 | "sam": 6, 24 | "dim": 7, 25 | } 26 | 27 | FRENCH_DAYS = { 28 | "lundi": 1, 29 | "mardi": 2, 30 | "mercredi": 3, 31 | "jeudi": 4, 32 | "vendredi": 5, 33 | "samedi": 6, 34 | "dimanche": 7, 35 | } 36 | 37 | FRENCH_SHORT_MONTHS = { 38 | "janv": 1, 39 | "févr": 2, 40 | "mars": 3, 41 | "avr": 4, 42 | "mai": 5, 43 | "juin": 6, 44 | "juil": 7, 45 | "août": 8, 46 | "sept": 9, 47 | "oct": 10, 48 | "nov": 11, 49 | "déc": 12, 50 | } 51 | 52 | FRENCH_MONTHS = { 53 | "janvier": 1, 54 | "février": 2, 55 | "mars": 3, 56 | "avril": 4, 57 | "mai": 5, 58 | "juin": 6, 59 | "juillet": 7, 60 | "août": 8, 61 | "septembre": 9, 62 | "octobre": 10, 63 | "novembre": 11, 64 | "décembre": 12, 65 | } 66 | 67 | 68 | def get_dates(event_time): 69 | try: 70 | # =================== 71 | # FdC English 72 | 73 | # June 03, 2025, from 05:30pm to 09:30pm (Paris time) 74 | if match := re.match( 75 | r"(?P<date>\w+\s\d{2},\s\d{4})" 76 | r",\sfrom\s" 77 | r"(?P<start_time>\d{2}:\d{2}[ap]m)" 78 | r"\sto\s" 79 | r"(?P<end_time>\d{2}:\d{2}[ap]m)" 80 | r"\s\(.*\stime\)", 81 | event_time, 82 | ): 83 | event_start_datetime = parse(f"{match.group('date')} {match.group('start_time')}") 84 | event_end_datetime = parse(f"{match.group('date')} 
{match.group('end_time')}") 85 | return event_start_datetime, event_end_datetime 86 | 87 | # =================== 88 | # Billetweb 89 | 90 | # Thu Oct 19, 2023 from 01:00 PM to 02:00 PM 91 | if match := re.match( 92 | r"(?P<date>.*)\s" r"from\s" r"(?P<start_time>.*)\s" r"to\s" r"(?P<end_time>.*)", 93 | event_time, 94 | ): 95 | event_start_datetime = parse(f"{match.group('date')} {match.group('start_time')}") 96 | event_end_datetime = parse(f"{match.group('date')} {match.group('end_time')}") 97 | return event_start_datetime, event_end_datetime 98 | 99 | # =================== 100 | # Billetweb 101 | 102 | # Thu Oct 19, 2023 at 01:00 PM to Sat Feb 24, 2024 at 02:00 PM 103 | elif match := re.match( 104 | r"(?P<start_date>.*)\s" 105 | r"at\s" 106 | r"(?P<start_time>.*)\s" 107 | r"to\s" 108 | r"(?P<end_date>.*)\s" 109 | r"at\s" 110 | r"(?P<end_time>.*)", 111 | event_time, 112 | ): 113 | event_start_datetime = parse(f"{match.group('start_date')} {match.group('start_time')}") 114 | event_end_datetime = parse(f"{match.group('end_date')} {match.group('end_time')}") 115 | return event_start_datetime, event_end_datetime 116 | 117 | # =================== 118 | # Billetweb 119 | 120 | # Thu Oct 19, 2023 at 01:00 PM 121 | # March 7, 2025 at 10:00 AM 122 | elif match := re.match(r"(?P<date>.*)\s" r"at\s" r"(?P<time>.*)", event_time): 123 | event_start_datetime = parse(f"{match.group('date')} {match.group('time')}") 124 | event_end_datetime = event_start_datetime + timedelta(hours=DEFAULT_DURATION) 125 | return event_start_datetime, event_end_datetime 126 | 127 | # =================== 128 | # Eventbrite 129 | 130 | # ven. 11 avr. 2025 14:00 - 17:30 CEST 131 | elif match := re.match( 132 | rf"(?P<day_of_week>{'|'.join(FRENCH_SHORT_DAYS.keys())})\.?\s" 133 | r"(?P<day>\d{1,2})\s" 134 | rf"(?P<month>{'|'.join(FRENCH_SHORT_MONTHS.keys())})\.?\s" 135 | r"(?P<year>\d{4})\s" 136 | r"(?P<start_time>\d{2}:\d{2})\s" 137 | r"-\s" 138 | r"(?P<end_time>\d{2}:\d{2})\s" 139 | r"(?P<timezone>.*)", 140 | event_time, 141 | ): 142 | event_start_datetime = datetime( 143 | int(match.group("year")), 144 | FRENCH_SHORT_MONTHS[match.group("month")], 145 | int(match.group("day")), 146 | int(match.group("start_time").split(":")[0]), 147 | int(match.group("start_time").split(":")[1]), 148 | ) 149 | event_end_datetime = datetime( 150 | int(match.group("year")), 151 | FRENCH_SHORT_MONTHS[match.group("month")], 152 | int(match.group("day")), 153 | int(match.group("end_time").split(":")[0]), 154 | int(match.group("end_time").split(":")[1]), 155 | ) 156 | return event_start_datetime, event_end_datetime 157 | 158 | # =================== 159 | # FdC French 160 | 161 | # 16 mai 2025, de 18h30 à 21h30 (heure de Paris) 162 | elif match := re.match( 163 | r"(?P<day>\d{1,2})\s" 164 | rf"(?P<month>{'|'.join(FRENCH_MONTHS.keys())})\s" 165 | r"(?P<year>\d{4}),\s" 166 | r"de\s" 167 | r"(?P<start_time>\d{1,2}h\d{2})\s" 168 | r"à\s" 169 | r"(?P<end_time>\d{1,2}h\d{2})", 170 | event_time, 171 | ): 172 | # Construct the datetime objects 173 | event_start_datetime = datetime( 174 | int(match.group("year")), 175 | FRENCH_MONTHS[match.group("month")], 176 | int(match.group("day")), 177 | int(match.group("start_time").split("h")[0]), 178 | int(match.group("start_time").split("h")[1]), 179 | ) 180 | event_end_datetime = datetime( 181 | int(match.group("year")), 182 | FRENCH_MONTHS[match.group("month")], 183 | int(match.group("day")), 184 | int(match.group("end_time").split("h")[0]), 185 | int(match.group("end_time").split("h")[1]), 186 | ) 187 | return 
event_start_datetime, event_end_datetime 188 | 189 | # =================== 190 | # FEC 191 | 192 | # 03 mars 2025, 14:00 – 17:00 UTC+1 193 | elif match := re.match( 194 | rf"((?P<day_of_week>{'|'.join(FRENCH_SHORT_DAYS.keys())})\.?\s)?" 195 | r"(?P<day>\d{1,2})\s" 196 | rf"(?P<month>{'|'.join(FRENCH_SHORT_MONTHS.keys())})\.?\s" 197 | r"(?P<year>\d{4})?,\s" 198 | r"(?P<start_time>\d{2}:\d{2})\s" 199 | r"–\s" 200 | r"(?P<end_time>\d{2}:\d{2})" 201 | r"(\sUTC(?P<timezone>.*))?", 202 | event_time, 203 | ): 204 | timezone = match.group("timezone") 205 | if timezone and timezone not in ("+1", "+2"): 206 | raise FreskDateDifferentTimezone(event_time) 207 | 208 | event_start_datetime = datetime( 209 | int(match.group("year")), 210 | FRENCH_SHORT_MONTHS[match.group("month")], 211 | int(match.group("day")), 212 | int(match.group("start_time").split(":")[0]), 213 | int(match.group("start_time").split(":")[1]), 214 | ) 215 | event_end_datetime = datetime( 216 | int(match.group("year")), 217 | FRENCH_SHORT_MONTHS[match.group("month")], 218 | int(match.group("day")), 219 | int(match.group("end_time").split(":")[0]), 220 | int(match.group("end_time").split(":")[1]), 221 | ) 222 | return event_start_datetime, event_end_datetime 223 | 224 | # =================== 225 | # Glide 226 | 227 | # mercredi 12 février 2025 de 19h00 à 22h00 228 | elif match := re.match( 229 | rf"((?P<day_of_week>{'|'.join(FRENCH_DAYS.keys())})\s)?" 230 | r"(?P<day>\d{1,2})\s" 231 | rf"(?P<month>{'|'.join(FRENCH_MONTHS)})\s" 232 | r"(?P<year>\d{4})\s" 233 | r"de\s" 234 | r"(?P<start_time>\d{1,2}h\d{2})\s" 235 | r"à\s" 236 | r"(?P<end_time>\d{1,2}h\d{2})", 237 | event_time, 238 | ): 239 | event_start_datetime = datetime( 240 | int(match.group("year")), 241 | FRENCH_MONTHS[match.group("month")], 242 | int(match.group("day")), 243 | int(match.group("start_time").split("h")[0]), 244 | int(match.group("start_time").split("h")[1]), 245 | ) 246 | event_end_datetime = datetime( 247 | int(match.group("year")), 248 | FRENCH_MONTHS[match.group("month")], 249 | int(match.group("day")), 250 | int(match.group("end_time").split("h")[0]), 251 | int(match.group("end_time").split("h")[1]), 252 | ) 253 | return event_start_datetime, event_end_datetime 254 | 255 | # =================== 256 | # HelloAsso 257 | 258 | # Le 12 février 2025, de 18h à 20h 259 | elif match := re.match( 260 | r"Le\s" 261 | r"(?P<day>\d{1,2})\s" 262 | rf"(?P<month>{'|'.join(FRENCH_MONTHS)})\s" 263 | r"(?P<year>\d{4}),\s" 264 | r"de\s" 265 | r"(?P<start_time>\d{1,2}h\d{0,2})\s" 266 | r"à\s" 267 | r"(?P<end_time>\d{1,2}h\d{0,2})", 268 | event_time, 269 | ): 270 | start_parts = match.group("start_time").split("h") 271 | event_start_datetime = datetime( 272 | int(match.group("year")), 273 | FRENCH_MONTHS[match.group("month")], 274 | int(match.group("day")), 275 | int(start_parts[0]), 276 | (int(start_parts[1]) if len(start_parts) > 1 and len(start_parts[1]) else 0), 277 | ) 278 | end_parts = match.group("end_time").split("h") 279 | event_end_datetime = datetime( 280 | int(match.group("year")), 281 | FRENCH_MONTHS[match.group("month")], 282 | int(match.group("day")), 283 | int(end_parts[0]), 284 | int(end_parts[1]) if len(end_parts) > 1 and len(end_parts[1]) else 0, 285 | ) 286 | return event_start_datetime, event_end_datetime 287 | 288 | else: 289 | raise FreskDateBadFormat(event_time) 290 | 291 | except Exception as e: 292 | if not isinstance(e, FreskError): 293 | traceback.print_exc() 294 | logging.error(f"get_dates: {event_time}") 295 | raise FreskDateBadFormat(event_time) 296 | 
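# Illustrative examples of inputs handled by get_dates above (a sketch,
# not an exhaustive list; returned datetimes are naive, in the event's
# local time):
#
#   get_dates("Thu Oct 19, 2023 from 01:00 PM to 02:00 PM")
#   -> (datetime(2023, 10, 19, 13, 0), datetime(2023, 10, 19, 14, 0))
#
#   get_dates("16 mai 2025, de 18h30 à 21h30 (heure de Paris)")
#   -> (datetime(2025, 5, 16, 18, 30), datetime(2025, 5, 16, 21, 30))
#
# Any input matching none of the patterns raises FreskDateBadFormat.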
297 | 298 | def get_dates_from_element(el): 299 | """Returns start and end datetime objects extracted from the element. 300 | 301 | If the element has a "datetime" attribute, it provides the date and the hours are parsed from the element text; otherwise we fall back on get_dates to parse everything from the text. 302 | 303 | Raises FreskDateDifferentTimezone, FreskDateBadFormat, or any exception thrown by get_dates when parsing fails. 304 | """ 305 | event_day = el.get_attribute("datetime") 306 | event_time = el.text 307 | 308 | try: 309 | # Leverage the datetime attribute if present. 310 | # datetime: 2025-12-05 311 | # text: déc. 5 de 9am à 12pm UTC+1 312 | if event_day: 313 | day_match = re.match(r"(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})", event_day) 314 | 315 | def PATTERN_TIME(hour_name, minute_name, pm_name): 316 | return ( 317 | r"(?P<" 318 | + hour_name 319 | + r">\d{1,2})(?P<" 320 | + minute_name 321 | + r">:\d{2})?(?P<" 322 | + pm_name 323 | + r">(am|pm|vorm\.|nachm\.))" 324 | ) 325 | 326 | def ParseTime(match_object, hour_name, minute_name, pm_name): 327 | hour = int(match_object.group(hour_name)) 328 | PATTERN_PM = ["pm", "nachm."] 329 | if match_object.group(pm_name) in PATTERN_PM and hour < 12: 330 | hour += 12 331 | 332 | minute = 0 333 | match_minute = match_object.group(minute_name) 334 | if match_minute: 335 | minute = int(match_minute[1:]) 336 | 337 | return hour, minute 338 | 339 | # TODO: add proper support for timezone. 340 | # We use re.search so that the date text at the beginning of the string is skipped. 341 | hour_match = re.search( 342 | r"(de|von)\s" 343 | + PATTERN_TIME("start_hour", "start_minute", "start_am_or_pm") 344 | + r"\s" 345 | + r"(à|bis)\s" 346 | + PATTERN_TIME("end_hour", "end_minute", "end_am_or_pm") 347 | + r"\s" 348 | + r"((UTC|MEZ)(?P<timezone>.*))", 349 | event_time, 350 | ) 351 | if day_match and hour_match: 352 | timezone = hour_match.group("timezone") 353 | if timezone and timezone not in ("+1", "+2"): 354 | raise FreskDateDifferentTimezone(event_time) 355 | dt = datetime( 356 | int(day_match.group("year")), 357 | int(day_match.group("month")), 358 | int(day_match.group("day")), 359 | ) 360 | start_hour, start_minute = ParseTime( 361 | hour_match, "start_hour", "start_minute", "start_am_or_pm" 362 | ) 363 | end_hour, end_minute = ParseTime( 364 | hour_match, "end_hour", "end_minute", "end_am_or_pm" 365 | ) 366 | return datetime(dt.year, dt.month, dt.day, start_hour, start_minute), datetime( 367 | dt.year, dt.month, dt.day, end_hour, end_minute 368 | ) 369 | 370 | return get_dates(event_time) 371 | 372 | except Exception as e: 373 | if not isinstance(e, FreskError): 374 | traceback.print_exc() 375 | logging.error(f"get_dates_from_element: {event_time}") 376 | raise FreskDateBadFormat(event_time) 377 | -------------------------------------------------------------------------------- /src/trouver_une_fresque_scraper/scraper/eventbrite.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import json 4 | import logging 5 | import re 6 | 7 | from selenium import webdriver 8 | from selenium.common.exceptions import ( 9 | StaleElementReferenceException, 10 | NoSuchElementException, 11 | TimeoutException, 12 | ) 13 | from selenium.webdriver.common.by import By 14 | from selenium.webdriver.support.ui import WebDriverWait 15 | from selenium.webdriver.support import expected_conditions as EC 16 | 17 | from trouver_une_fresque_scraper.db.records import get_record_dict 18 | from
trouver_une_fresque_scraper.utils.date_and_time import get_dates_from_element 19 | from trouver_une_fresque_scraper.utils.errors import ( 20 | FreskError, 21 | FreskDateBadFormat, 22 | FreskDateNotFound, 23 | ) 24 | from trouver_une_fresque_scraper.utils.keywords import * 25 | from trouver_une_fresque_scraper.utils.language import detect_language_code 26 | from trouver_une_fresque_scraper.utils.location import get_address 27 | 28 | 29 | def delete_cookies_overlay(driver): 30 | try: 31 | transcend_element = WebDriverWait(driver, 10).until( 32 | EC.presence_of_element_located((By.CSS_SELECTOR, "#transcend-consent-manager")) 33 | ) 34 | 35 | # Use JavaScript to remove the transcend-consent-manager element 36 | script = """ 37 | var element = arguments[0]; 38 | element.parentNode.removeChild(element); 39 | """ 40 | driver.execute_script(script, transcend_element) 41 | except Exception as e: 42 | logging.info(f"Transcend consent manager element couldn't be removed: {e}") 43 | 44 | 45 | def scroll_to_bottom(driver): 46 | more_content = True 47 | while more_content: 48 | logging.info("Scrolling to the bottom...") 49 | try: 50 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 51 | time.sleep(5) # Give the page some time to load new content 52 | 53 | # Function to safely click the next button 54 | def click_next_button(): 55 | try: 56 | next_button = WebDriverWait(driver, 10).until( 57 | EC.element_to_be_clickable( 58 | ( 59 | By.CSS_SELECTOR, 60 | "div.organizer-profile__section--content div.organizer-profile__show-more > button", 61 | ) 62 | ) 63 | ) 64 | 65 | desired_y = (next_button.size["height"] / 2) + next_button.location["y"] 66 | window_h = driver.execute_script("return window.innerHeight") 67 | window_y = driver.execute_script("return window.pageYOffset") 68 | current_y = (window_h / 2) + window_y 69 | scroll_y_by = desired_y - current_y 70 | 71 | driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_y_by) 72 | next_button.click() 73 | 74 | except StaleElementReferenceException: 75 | click_next_button() # Retry if the element is stale 76 | 77 | click_next_button() 78 | 79 | except TimeoutException: 80 | more_content = False 81 | 82 | 83 | def get_eventbrite_data(sources, service, options): 84 | logging.info("Scraping data from eventbrite.fr") 85 | 86 | driver = webdriver.Firefox(service=service, options=options) 87 | 88 | records = [] 89 | 90 | for page in sources: 91 | logging.info(f"==================\nProcessing page {page}") 92 | driver.get(page["url"]) 93 | driver.implicitly_wait(5) 94 | 95 | # Scroll to bottom to load all events 96 | scroll_to_bottom(driver) 97 | driver.execute_script("window.scrollTo(0, 0);") 98 | 99 | elements = [] 100 | future_events = driver.find_element( 101 | By.CSS_SELECTOR, 'div[data-testid="organizer-profile__future-events"]' 102 | ) 103 | event_card_divs = future_events.find_elements(By.CSS_SELECTOR, "div.event-card") 104 | 105 | logging.info(f"Found {len(event_card_divs)} events") 106 | 107 | for event_card_div in event_card_divs: 108 | link_elements = event_card_div.find_elements(By.CSS_SELECTOR, "a.event-card-link") 109 | elements.extend(link_elements) 110 | 111 | links = [] 112 | for link_element in elements: 113 | href = link_element.get_attribute("href") 114 | if href: 115 | links.append(href) 116 | links = np.unique(links) 117 | 118 | for link in links: 119 | logging.info(f"\n-> Processing {link} ...") 120 | driver.get(link) 121 | delete_cookies_overlay(driver) 122 | driver.implicitly_wait(3) 123 | time.sleep(3) 
# Pages are quite long to load 124 | 125 | ################################################################ 126 | # Has it expired? 127 | ################################################################ 128 | try: 129 | badge = driver.find_element( 130 | By.XPATH, '//div[@data-testid="enhancedExpiredEventsBadge"]' 131 | ) 132 | # If the element has children elements, it is enabled 133 | try: 134 | if badge.find_elements(By.XPATH, "./*"): 135 | logging.info("Rejecting record: event expired") 136 | continue 137 | except StaleElementReferenceException: 138 | if driver.find_element( 139 | By.XPATH, '//div[@data-testid="enhancedExpiredEventsBadge"]' 140 | ).find_elements(By.XPATH, "./*"): 141 | logging.info("Rejecting record: event expired") 142 | continue 143 | 144 | except NoSuchElementException: 145 | pass 146 | 147 | try: 148 | badge = driver.find_element(By.CSS_SELECTOR, "div.enhanced-expired-badge") 149 | logging.info("Rejecting record: event expired") 150 | continue 151 | except NoSuchElementException: 152 | pass 153 | 154 | ################################################################ 155 | # Is it full? 156 | ################################################################ 157 | sold_out = False 158 | try: 159 | badge = driver.find_element(By.XPATH, '//div[@data-testid="salesEndedMessage"]') 160 | # If the element has children elements, it is enabled 161 | sold_out = bool(badge.find_elements(By.XPATH, "./*")) 162 | except NoSuchElementException: 163 | pass 164 | 165 | if sold_out: 166 | # We reject sold out events as the Eventbrite UX hides 167 | # relevant info in this case (which looks like an awful practice) 168 | logging.info("Rejecting record: sold out") 169 | continue 170 | 171 | ################################################################ 172 | # Parse event title 173 | ################################################################ 174 | title_el = driver.find_element( 175 | by=By.TAG_NAME, 176 | value="h1", 177 | ) 178 | title = title_el.text 179 | 180 | if is_plenary(title): 181 | logging.info("Rejecting record: plénière") 182 | continue 183 | 184 | ########################################################### 185 | # Is it an online event? 
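# Note: the title is checked first; if that is inconclusive, the short
# location line under the date is scanned for online keywords as well.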
            ################################################################
            # Is it an online event?
            ################################################################
            online = is_online(title)
            if not online:
                try:
                    short_location_el = driver.find_element(
                        By.CSS_SELECTOR, "span.start-date-and-location__location"
                    )
                    online = is_online(short_location_el.text)
                except NoSuchElementException:
                    pass

            ################################################################
            # Location data
            ################################################################
            full_location = ""
            location_name = ""
            address = ""
            city = ""
            department = ""
            longitude = ""
            latitude = ""
            zip_code = ""
            country_code = ""

            if not online:
                try:
                    full_location_el = driver.find_element(
                        By.CSS_SELECTOR, 'div[class^="Location-module__addressWrapper___"]'
                    )
                except NoSuchElementException:
                    logging.error(
                        f"Location element not found for offline event {link}.",
                    )
                    continue
                full_location = full_location_el.text.replace("\n", ", ")

                try:
                    address_dict = get_address(full_location)
                    (
                        location_name,
                        address,
                        city,
                        department,
                        zip_code,
                        country_code,
                        latitude,
                        longitude,
                    ) = address_dict.values()
                except FreskError as error:
                    logging.info(f"Rejecting record: {error}.")
                    continue

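            # Note on the unpacking above: reading address_dict.values() into
            # eight names relies on dict insertion order (guaranteed since
            # Python 3.7) matching this exact field order in get_address(). A
            # hedged, order-independent alternative would access keys
            # explicitly (key names assumed here, to be checked against
            # get_address()):
            #
            #     location_name = address_dict["location_name"]
            #     city = address_dict["city"]
            #     ...and so on for the remaining fields.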
            ################################################################
            # Description
            ################################################################
            try:
                description_el = driver.find_element(By.CSS_SELECTOR, "div.event-description")
                description = description_el.text
            except NoSuchElementException:
                logging.info("Rejecting record: description not found.")
                continue

            ################################################################
            # Training?
            ################################################################
            training = is_training(title)

            ################################################################
            # Is it suited for kids?
            ################################################################
            kids = False

            ################################################################
            # Multiple events
            ################################################################
            event_info = []

            try:
                date_time_div = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.select-date-and-time"))
                )
                if date_time_div:
                    driver.execute_script("window.scrollBy(0, arguments[0]);", 800)

                    li_elements = date_time_div.find_elements(
                        By.CSS_SELECTOR, "li:not([data-heap-id])"
                    )
                    for li in li_elements:
                        clickable_li = WebDriverWait(driver, 10).until(
                            EC.element_to_be_clickable(li)
                        )
                        clickable_li.click()

                        ################################################################
                        # Dates
                        ################################################################
                        try:
                            date_info_el = driver.find_element(
                                by=By.CSS_SELECTOR,
                                value="time.start-date-and-location__date",
                            )
                        except NoSuchElementException:
                            raise FreskDateNotFound

                        try:
                            event_start_datetime, event_end_datetime = get_dates_from_element(
                                date_info_el
                            )
                        except FreskDateBadFormat as error:
                            logging.info(f"Rejecting record: {error}")
                            continue

                        ################################################################
                        # Parse tickets link
                        ################################################################
                        tickets_link = driver.current_url

                        ################################################################
                        # Parse event id
                        ################################################################
                        uuid = re.search(r"/e/([^/?]+)", tickets_link).group(1)

                        # Selenium clicks on "sold out" cards (li elements), but this
                        # has no effect. Worse, it appends the previous non-sold-out
                        # event a second time. Such cases can be detected by scanning
                        # the event ids collected so far.
                        already_scanned = False
                        for event in event_info:
                            if uuid == event[0]:
                                already_scanned = True

                        if not already_scanned:
                            event_info.append(
                                [
                                    uuid,
                                    event_start_datetime,
                                    event_end_datetime,
                                    tickets_link,
                                ]
                            )

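                        # A hedged alternative to the linear scan above (sketch
                        # only): track seen ids in a set kept next to event_info,
                        # e.g. `seen_uuids = set()` before the loop, then:
                        #
                        #     if uuid not in seen_uuids:
                        #         seen_uuids.add(uuid)
                        #         event_info.append([uuid, ...])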
            # A timeout means there is only a single event on this page.
            except TimeoutException:
                ################################################################
                # Single event with multiple dates (a "collection").
                ################################################################
                try:
                    check_availability_btn = driver.find_element(
                        by=By.CSS_SELECTOR, value="button.check-availability-btn__button"
                    )
                    # TODO: add support for this.
                    logging.error(f"Eventbrite collection not supported in event {link}.")
                    continue
                except NoSuchElementException:
                    pass

                ################################################################
                # Dates
                ################################################################
                try:
                    date_info_el = driver.find_element(
                        by=By.CSS_SELECTOR,
                        value="time.start-date-and-location__date",
                    )
                except NoSuchElementException:
                    raise FreskDateNotFound

                try:
                    event_start_datetime, event_end_datetime = get_dates_from_element(date_info_el)
                except FreskDateBadFormat as error:
                    logging.info(f"Rejecting record: {error}")
                    continue

                ################################################################
                # Parse tickets link
                ################################################################
                tickets_link = driver.current_url

                ################################################################
                # Parse event id
                ################################################################
                uuid = re.search(r"/e/([^/?]+)", tickets_link).group(1)

                event_info.append([uuid, event_start_datetime, event_end_datetime, tickets_link])

            ################################################################
            # Session loop
            ################################################################
            for index, (
                uuid,
                event_start_datetime,
                event_end_datetime,
                link,
            ) in enumerate(event_info):
                record = get_record_dict(
                    f"{page['id']}-{uuid}",
                    page["id"],
                    title,
                    event_start_datetime,
                    event_end_datetime,
                    full_location,
                    location_name,
                    address,
                    city,
                    department,
                    zip_code,
                    country_code,
                    latitude,
                    longitude,
                    page.get(
                        "language_code",
                        detect_language_code(title, description),
                    ),
                    online,
                    training,
                    sold_out,
                    kids,
                    link,
                    link,
                    description,
                )
                records.append(record)
                logging.info(f"Successfully scraped {link}\n{json.dumps(record, indent=4)}")

    driver.quit()

    return records

--------------------------------------------------------------------------------