├── s3 └── .empty ├── secrets └── .empty ├── alembic ├── versions │ ├── .keep │ ├── 2450da0e6c60_number_of_keywords_20_30_40.py │ ├── 5cc9e1ec5362_add_info_public_to_program_metadata.py │ ├── c1d78b9968fe_add_info_public_to_program_metadata.py │ ├── a5c39db3c8e9_add_new_column_test_for_table_keywords.py │ ├── 43103d5b49c9_program_add_start_end_date_for_grid_.py │ ├── 5ccd746ee292_add_updated_at.py │ ├── 055173743036_keywords_add_channel_title.py │ ├── a0a707673259_add_radio_to_program_metadata.py │ ├── 30abfd828007_program_metadata.py │ ├── 5bff4dceda53_add_info_public_to_program_metadata.py │ ├── 827fb6dde3bb_time_monitored_new_table.py │ ├── c08231a9eb37_program_add_created_at_updated_at.py │ ├── 2c48f626a749_keywords_program_name.py │ ├── 4ccd746ee291_add_20_30.py │ ├── af956a85658f_add_new_column_number_of_keywords_.py │ ├── 356882459cec_remove_category_keywords_change_columns_.py │ ├── 4333bc46985d_keywords_program_id_foreign_key.py │ ├── 44f13b7eebd4_dictionary_category.py │ ├── ac96222af6fe_hrfp_counters.py │ └── a578d21d7aee_add_tables_labelstudio.py ├── script.py.mako └── env.py ├── my_dbt_project ├── analyses │ └── .gitkeep ├── macros │ └── .gitkeep ├── seeds │ ├── .gitkeep │ └── time_monitored.csv ├── snapshots │ └── .gitkeep ├── tests │ └── .gitkeep ├── pytest_tests │ ├── .gitkeep │ └── test_dbt_model_analytics.py ├── .gitignore ├── dbt │ ├── .user.yml │ └── profiles.yml ├── README.md └── models │ ├── analytics │ └── environmental_shares_with_desinfo_counts.sql │ └── dashboards │ ├── core_query_causal_links.sql │ ├── core_query_thematics_keywords_i8n.sql │ ├── thematic_query_ocean.sql │ └── core_query_thematics_keywords.sql ├── quotaclimat ├── utils │ ├── __init__.py │ ├── coverquotaclimat.png │ ├── logger.py │ ├── healthcheck_config.py │ └── sentry.py ├── data_ingestion │ ├── __init__.py │ ├── ingest_db │ │ ├── __init__.py │ │ └── ingest_sitemap_in_db.py │ ├── labelstudio │ │ └── configs.py │ └── scrap_html │ │ └── scrap_description_article.py ├── data_processing │ ├── __init__.py │ └── mediatree │ │ ├── i8n │ │ ├── dictionary.py │ │ ├── brazil │ │ │ ├── __init__.py │ │ │ └── channel_titles.py │ │ ├── france │ │ │ ├── __init__.py │ │ │ └── channel_titles.py │ │ ├── poland │ │ │ ├── __init__.py │ │ │ └── channel_titles.py │ │ ├── spain │ │ │ ├── __init__.py │ │ │ ├── channel_titles.py │ │ │ └── channel_program.py │ │ └── germany │ │ │ ├── __init__.py │ │ │ ├── channel_titles.py │ │ │ └── channel_program.py │ │ ├── config.py │ │ ├── api_import_utils │ │ └── db.py │ │ └── time_monitored │ │ └── models.py └── __init__.py ├── document-experts └── .download-from-gdrive.empty ├── .dockerignore ├── postgres ├── schemas │ ├── base.py │ └── sitemap.pgsql ├── insert_existing_data_example.py ├── database_connection.py └── insert_data.py ├── docs └── images │ └── data_tiers.png ├── mockwebsite ├── README.md ├── cnews_sitemap.xml ├── lefigaro_localhost_sitemap.xml ├── 20minutes_sitemap.xml ├── lefigaro_sitemap.xml ├── lacroix_sitemap.xml ├── midilibre_sitemap.xml ├── franceinter_sitemap.xml ├── republiquepyrenees_sitemap.xml ├── liberation_sitemap.xml ├── nicematin_sitemap.xml ├── letelegramme_sitemap.xml ├── leparisien_sitemap.xml ├── lexpress_sitemap.xml └── francebleu_sitemap.xml ├── test ├── s3 │ ├── one-day-one-channel.parquet │ └── test_s3.py ├── sitemap │ ├── test_utils.py │ ├── test_scrap_html.py │ ├── test_mediatree_utils.py │ ├── test_keywords.py │ └── test_main_import_api.py ├── time_monitored │ └── test_time_monitored.py ├── i8n │ └── test_country.py └── mediatree │ └── 
test_mediatree_queries.py ├── .flake8 ├── i8n └── mediatree_output │ └── year=2024 │ └── month=10 │ └── day=1 │ └── channel=LAUNE │ └── data.parquet ├── docker-entrypoint_stop_word.sh ├── .github ├── dependabot.yml └── workflows │ ├── docker-compose.yml │ ├── dependabot-auto-approve.yml │ ├── scaleway-down.yml │ ├── scaleway-up.yml │ ├── scaleway-start-import-job-update.yml │ └── deploy-main.yml ├── .vscode └── launch.json ├── Dockerfile_ingest ├── LICENSE ├── Dockerfile ├── Dockerfile_stop_word ├── Dockerfile_api_to_s3 ├── Dockerfile_api_import ├── pyproject.toml ├── docker-entrypoint.sh ├── .gitignore ├── analyse └── mediatree │ └── test_program_durations.ipynb └── alembic.ini /s3/.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /secrets/.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /alembic/versions/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/pytest_tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document-experts/.download-from-gdrive.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_ingestion/ingest_db/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/dictionary.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /my_dbt_project/dbt/.user.yml: -------------------------------------------------------------------------------- 1 | id: e72efce9-d03e-4b9f-b04b-c919cc719b38 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | pgdata 2 | .git 3 | .venv 4 | venv 5 | .vscode 6 | notebooks 7 | LICENSE 8 | .idea 9 | -------------------------------------------------------------------------------- /postgres/schemas/base.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import declarative_base 2 | 3 | Base = declarative_base() -------------------------------------------------------------------------------- /docs/images/data_tiers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/docs/images/data_tiers.png -------------------------------------------------------------------------------- /mockwebsite/README.md: -------------------------------------------------------------------------------- 1 | Everything in this folder is served by an nginx Docker image so it can be tested locally. -------------------------------------------------------------------------------- /test/s3/one-day-one-channel.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/test/s3/one-day-one-channel.parquet -------------------------------------------------------------------------------- /quotaclimat/utils/coverquotaclimat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/quotaclimat/utils/coverquotaclimat.png -------------------------------------------------------------------------------- /quotaclimat/__init__.py: -------------------------------------------------------------------------------- 1 | # Useless in the current structure 2 | # from quotaclimat.ui.streamlit_dashboard import main as build_dashboard 3 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/brazil/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_brazil 2 | from .channel_titles import channel_titles_brazil -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/france/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_france 2 | from .channel_titles import channel_titles_france -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/poland/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_poland 2 | 
from .channel_titles import channel_titles_poland -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/spain/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_spain 2 | from .channel_titles import channel_titles_spain -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/germany/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_germany 2 | from .channel_titles import channel_titles_germany -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | extend-ignore = E203,E501,F401 4 | exclude = 5 | .venv, 6 | .git 7 | per-file-ignores = 8 | */__init__.py:F403,F401 9 | -------------------------------------------------------------------------------- /i8n/mediatree_output/year=2024/month=10/day=1/channel=LAUNE/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/i8n/mediatree_output/year=2024/month=10/day=1/channel=LAUNE/data.parquet -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/germany/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_germany = { 2 | "daserste":"Das Erste", 3 | "zdf-neo":"ZDFneo", 4 | "zdf":"ZDF", 5 | "rtl-television":"RTL", 6 | "sat1":"Sat.1", 7 | "prosieben":"ProSieben", 8 | "kabel-eins":"Kabel Eins", 9 | } -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/brazil/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_brazil = { 2 | "tvbrasil":"TV Brasil", 3 | "tvglobo":"TV Globo", 4 | "tvrecord":"TV Record", 5 | "sbt":"SBT", 6 | "redebandeirantes":"Band", 7 | "jovempan":"Jovem Pan", 8 | "cnnbrasil":"CNN Brasil", 9 | } -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/spain/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_spain = { 2 | "antenna-3": "Antenna 3", 3 | "rtve-la-1": "RTVE La 1", 4 | "rtve-24h": "RTVE 24h", 5 | "lasexta-news": "LaSexta News", 6 | "telecinco-news": "Telecinco News", 7 | "cuatro-news": "Cuatro News", 8 | } -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/poland/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_poland = { 2 | "tvp": "TVP", 3 | "polsat": "Polsat", 4 | "tvn": "TVN", 5 | "polskie-radio": "Polskie Radio", 6 | "tofkm": "TOFKM", 7 | "radio-zet": "Radio Zet", 8 | "eska": "Eska", 9 | "tokfm": "TOKFM", 10 | } -------------------------------------------------------------------------------- /quotaclimat/data_ingestion/labelstudio/configs.py: -------------------------------------------------------------------------------- 1 | db_config = [ 2 | {"database": "labelstudio", "countries": {6: "france", 9: 
"brazil", 20: "germany"}}, 3 | {"database": "labelstudio-climate-poland-prod-db", "countries": {1: "poland"}}, 4 | {"database": "labelstudio-climate-spain-prod-db", "countries": {1: "spain"}}, 5 | ] -------------------------------------------------------------------------------- /postgres/schemas/sitemap.pgsql: -------------------------------------------------------------------------------- 1 | CREATE TABLE sitemap_table( 2 | publication_name VARCHAR(255) NOT NULL, 3 | news_title TEXT NOT NULL, 4 | download_date DATE NOT NULL, 5 | news_publication_date DATE NOT NULL, 6 | news_keywords TEXT, 7 | section TEXT, 8 | image_caption TEXT, 9 | media_type VARCHAR(255) 10 | ) 11 | 12 | -------------------------------------------------------------------------------- /docker-entrypoint_stop_word.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run migrations before starting the application 4 | echo "Running migrations with alembic if exists" 5 | poetry run alembic upgrade head 6 | 7 | if [[ $? -eq 0 ]]; then 8 | echo "Command succeeded" 9 | else 10 | echo "Command failed" 11 | fi 12 | 13 | echo "starting stop_word import app" 14 | python quotaclimat/data_processing/mediatree/stop_word/main.py -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/docker-compose.yml: -------------------------------------------------------------------------------- 1 | name: Docker Compose CI 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - name: init and load data 12 | run: docker compose up -d 13 | - name: sleep 14 | run: sleep 60 15 | - name: log sitemap 16 | run: docker logs sitemap 17 | - name: log db ingestion 18 | run: docker logs ingest_to_db 19 | - name: log streamlit 20 | run: docker logs streamlit -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/france/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_france = { 2 | "tf1": "TF1", 3 | "france2": "France 2", 4 | "fr3-idf": "France 3-idf", 5 | "m6": "M6", 6 | "arte": "Arte", 7 | "d8": "C8", 8 | "bfmtv": "BFM TV", 9 | "lci": "LCI", 10 | "franceinfotv": "France Info TV", 11 | "itele": "CNews", 12 | "europe1": "Europe 1", 13 | "france-culture": "France Culture", 14 | "france-inter": "France Inter", 15 | "sud-radio": "Sud Radio", 16 | "rmc": "RMC", 17 | "rtl": "RTL", 18 | "france24": "France 24", 19 | "france-info": "FranceinfoRadio", 20 | "rfi": "RFI", 21 | } 
-------------------------------------------------------------------------------- /my_dbt_project/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - cd my_dbt_project 7 | - dbt debug 8 | - dbt run 9 | - dbt test 10 | 11 | 12 | ### Resources: 13 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 14 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 15 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 16 | - Find [dbt events](https://events.getdbt.com) near you 17 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 18 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-auto-approve.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot auto-approve 2 | on: pull_request 3 | 4 | permissions: 5 | pull-requests: write 6 | 7 | jobs: 8 | dependabot: 9 | runs-on: ubuntu-latest 10 | if: github.event.pull_request.user.login == 'dependabot[bot]' && github.repository == 'dataforgoodfr/quotaclimat' 11 | steps: 12 | - name: Dependabot metadata 13 | id: metadata 14 | uses: dependabot/fetch-metadata@v2 15 | with: 16 | github-token: "${{ secrets.GITHUB_TOKEN }}" 17 | - name: Approve a PR 18 | run: gh pr review --approve "$PR_URL" 19 | env: 20 | PR_URL: ${{github.event.pull_request.html_url}} 21 | GH_TOKEN: ${{secrets.GITHUB_TOKEN}} 22 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # if the env var holds the Docker secret path, read the whole secret file into a string 4 | def get_password(): 5 | password = os.environ.get("MEDIATREE_PASSWORD") 6 | if(password == '/run/secrets/pwd_api'): 7 | password = open("/run/secrets/pwd_api", "r").read() 8 | return password 9 | 10 | def get_auth_url(): 11 | return os.environ.get("MEDIATREE_AUTH_URL") 12 | 13 | def get_user(): 14 | USER = os.environ.get("MEDIATREE_USER") 15 | if(USER == '/run/secrets/username_api'): 16 | USER = open("/run/secrets/username_api", "r").read() 17 | return USER 18 | 19 | # https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList 20 | def get_keywords_url(): 21 | return os.environ.get("KEYWORDS_URL")
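# NOTE: illustrative sketch of the secret-handling pattern above, not part of the original
# file. With Docker secrets, the environment variable may hold the literal path
# "/run/secrets/<name>" while the secret's content is mounted at that path, hence the
# equality check before reading the file. With a hypothetical secret name:
#
#     value = os.environ.get("MY_SECRET")  # may be the path "/run/secrets/my_secret"
#     if value == "/run/secrets/my_secret":
#         value = open("/run/secrets/my_secret", "r").read()  # the actual secret content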
-------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | ${imports if imports else ""} 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = ${repr(up_revision)} 16 | down_revision: Union[str, None] = ${repr(down_revision)} 17 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} 18 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} 19 | 20 | 21 | def upgrade() -> None: 22 | ${upgrades if upgrades else "pass"} 23 | 24 | 25 | def downgrade() -> None: 26 | ${downgrades if downgrades else "pass"} 27 | -------------------------------------------------------------------------------- /my_dbt_project/dbt/profiles.yml: -------------------------------------------------------------------------------- 1 | my_dbt_project: 2 | outputs: 3 | docker: 4 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 5 | port: "{{ env_var('POSTGRES_PORT') | as_number }}" 6 | schema: public 7 | threads: 4 8 | type: postgres 9 | user: "{{ env_var('POSTGRES_USER') }}" 10 | dbname: "{{ env_var('POSTGRES_DB') }}" 11 | host: "{{ env_var('POSTGRES_HOST') }}" 12 | analytics: 13 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 14 | port: "{{ env_var('POSTGRES_PORT') | as_number }}" 15 | schema: analytics 16 | threads: 4 17 | type: postgres 18 | user: "{{ env_var('POSTGRES_USER') }}" 19 | dbname: "{{ env_var('POSTGRES_DB') }}" 20 | host: "{{ env_var('POSTGRES_HOST') }}" 21 | target: docker 22 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: Current File", 6 | "type": "python", 7 | "request": "launch", 8 | "program": "${file}", 9 | "console": "integratedTerminal", 10 | "justMyCode": true 11 | }, 12 | { 13 | "name": "Python: File", 14 | "type": "python", 15 | "request": "launch", 16 | "program": "${file}", 17 | "justMyCode": true 18 | }, 19 | { 20 | "name": "Python data: Current File", 21 | "type": "python", 22 | "request": "launch", 23 | "program": "${file}", 24 | "console": "integratedTerminal" 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /postgres/insert_existing_data_example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pandas as pd 5 | 6 | from quotaclimat.data_ingestion.scrap_sitemap import get_sitemap_cols 7 | 8 | 9 | def parse_section(section: str): 10 | logging.debug(section) 11 | if "," not in section: 12 | return section 13 | else: 14 | return ",".join(map(str, section)) 15 | 16 | def transformation_from_dumps_to_table_entry(df: pd.DataFrame): 17 | try: 18 | cols = get_sitemap_cols() 19 | df_template_db = pd.DataFrame(columns=cols) 20 | df_consistent = pd.concat([df, df_template_db]) 21 | 22 | df_consistent.section = df_consistent.section.apply(parse_section) 23 | 24 | return df_consistent[cols] 25 | except Exception as err: 26 | logging.error("Could not transform %s" % (err)) 27 | return None -------------------------------------------------------------------------------- /alembic/versions/2450da0e6c60_number_of_keywords_20_30_40.py: -------------------------------------------------------------------------------- 1 | """number of keywords 20,30,40 2 | 3 | Revision ID: 2450da0e6c60 4 | Revises: 055173743036 5 | Create Date: 2024-06-19 10:21:34.624231 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic.
15 | revision: str = '2450da0e6c60' 16 | down_revision: Union[str, None] = '055173743036' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/5cc9e1ec5362_add_info_public_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add info/public to program metadata 2 | 3 | Revision ID: 5cc9e1ec5362 4 | Revises: 356882459cec 5 | Create Date: 2024-05-03 08:54:16.764307 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '5cc9e1ec5362' 16 | down_revision: Union[str, None] = '356882459cec' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/c1d78b9968fe_add_info_public_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add info/public to program metadata 2 | 3 | Revision ID: c1d78b9968fe 4 | Revises: 5cc9e1ec5362 5 | Create Date: 2024-05-03 08:56:47.087189 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'c1d78b9968fe' 16 | down_revision: Union[str, None] = '5cc9e1ec5362' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/a5c39db3c8e9_add_new_column_test_for_table_keywords.py: -------------------------------------------------------------------------------- 1 | """Add new column test for table keywords 2 | 3 | Revision ID: a5c39db3c8e9 4 | Revises: 5ccd746ee292 5 | Create Date: 2024-09-12 14:10:26.305593 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'a5c39db3c8e9' 16 | down_revision: Union[str, None] = '5ccd746ee292' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! 
### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/43103d5b49c9_program_add_start_end_date_for_grid_.py: -------------------------------------------------------------------------------- 1 | """program: add start/end date for grid evolution 2 | 3 | Revision ID: 43103d5b49c9 4 | Revises: af956a85658f 5 | Create Date: 2024-10-02 13:18:56.251135 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '43103d5b49c9' 16 | down_revision: Union[str, None] = 'af956a85658f' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/5ccd746ee292_add_updated_at.py: -------------------------------------------------------------------------------- 1 | """add updated_at 2 | 3 | Revision ID: 5ccd746ee292 4 | Revises: 4ccd746ee291 5 | Create Date: 2024-07-03 06:35:00.316441 6 | """ 7 | from typing import Sequence, Union 8 | 9 | from alembic import op 10 | import sqlalchemy as sa 11 | from sqlalchemy.dialects import postgresql 12 | 13 | # revision identifiers, used by Alembic. 14 | revision: str = '5ccd746ee292' 15 | down_revision: Union[str, None] = '4ccd746ee291' 16 | branch_labels: Union[str, Sequence[str], None] = None 17 | depends_on: Union[str, Sequence[str], None] = None 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('keywords', sa.Column('updated_at',sa.DateTime(), nullable=True)) 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('keywords', 'updated_at') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /alembic/versions/055173743036_keywords_add_channel_title.py: -------------------------------------------------------------------------------- 1 | """keywords: add channel_title 2 | 3 | 4 | Revision ID: 055173743036 5 | Revises: a0a707673259 6 | Create Date: 2024-06-05 11:43:22.071610 7 | 8 | """ 9 | from typing import Sequence, Union 10 | 11 | from alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 16 | revision: str = '055173743036' 17 | down_revision: Union[str, None] = 'a0a707673259' 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade() -> None: 23 | # ### commands auto generated by Alembic - please adjust! ### 24 | op.add_column('keywords', sa.Column('channel_title', sa.String(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('keywords','channel_title') 31 | # ### end Alembic commands ### 32 | -------------------------------------------------------------------------------- /alembic/versions/a0a707673259_add_radio_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add radio to program metadata 2 | 3 | Revision ID: a0a707673259 4 | Revises: 5bff4dceda53 5 | Create Date: 2024-05-03 09:36:04.954535 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'a0a707673259' 16 | down_revision: Union[str, None] = '5bff4dceda53' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('radio', sa.Boolean(), nullable=True)) 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | op.drop_column('program_metadata', 'radio') 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /test/sitemap/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pandas as pd 4 | def get_localhost(): 5 | localhost = "" 6 | if(os.environ.get("ENV") == "docker"): 7 | localhost = "http://nginxtest:80" 8 | else: 9 | localhost = "http://localhost:8000" 10 | return localhost 11 | 12 | def debug_df(df: pd.DataFrame): 13 | pd.set_option('display.max_columns', None) 14 | logging.warning("--------------------DEBUG DF-------------------") 15 | logging.info(df.dtypes) 16 | logging.info(df.head(3)) 17 | logging.warning("--------------------DEBUG DF-------------------") 18 | 19 | 20 | def list_of_dicts_to_set_of_frozensets(list_of_dicts): 21 | # Convert each dictionary to a frozenset to make it hashable 22 | return {frozenset(d.items()) for d in list_of_dicts} 23 | 24 | def compare_unordered_lists_of_dicts(list1, list2): 25 | # Convert each list of dictionaries to a set of frozensets 26 | set1 = list_of_dicts_to_set_of_frozensets(list1) 27 | set2 = list_of_dicts_to_set_of_frozensets(list2) 28 | 29 | # Check if the sets are equal 30 | return set1 == set2
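# NOTE: illustrative sketch, not part of the original file. The frozenset-based comparison
# above ignores ordering and collapses duplicates, and it requires every dict value to be
# hashable. Hypothetical usage:
#
#     assert compare_unordered_lists_of_dicts(
#         [{"a": 1}, {"b": 2}],
#         [{"b": 2}, {"a": 1}],
#     )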
-------------------------------------------------------------------------------- /Dockerfile_ingest: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.11 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.11-slim as runtime 21 | 22 | WORKDIR /app 23 | 24 | ENV VIRTUAL_ENV=/app/.venv 25 | ENV PATH="/app/.venv/bin:$PATH" 26 | ENV PATH="$PYENV_ROOT/bin:$PATH" 27 | ENV PYTHONPATH=/app 28 | 29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 30 | 31 | # App code is included with docker-compose as well 32 | 33 | 
COPY quotaclimat ./quotaclimat 34 | COPY postgres ./postgres 35 | COPY pyproject.toml pyproject.toml 36 | 37 | # healthcheck 38 | EXPOSE 5000 39 | 40 | 41 | ENTRYPOINT ["python", "quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py"] 42 | -------------------------------------------------------------------------------- /test/time_monitored/test_time_monitored.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | import pandas as pd 4 | 5 | from postgres.schemas.models import get_db_session, connect_to_db, create_tables 6 | from quotaclimat.data_processing.mediatree.time_monitored.models import * 7 | import zoneinfo 8 | 9 | @pytest.fixture(scope="module", autouse=True) 10 | def init_tables(): 11 | create_tables() 12 | 13 | def test_save_time_monitored(): 14 | start = datetime(2025, 1, 14, 15, 18, 43, 807525, tzinfo=zoneinfo.ZoneInfo(key='Europe/Paris')) 15 | channel_name = "test_channel" 16 | country = "france" 17 | id = get_consistent_hash(f"{channel_name}_{start}_{country}") 18 | duration_minutes = 30 19 | 20 | time_monitored = Time_Monitored( 21 | id=id, 22 | channel_name=channel_name, 23 | start=start, 24 | duration_minutes=duration_minutes, 25 | country=country 26 | ) 27 | save_time_monitored(number_of_rows=int(duration_minutes/2), day=start, channel=channel_name, country=country) 28 | 29 | output = get_time_monitored(id) 30 | assert output.duration_minutes == time_monitored.duration_minutes -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Data For Good France 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /alembic/versions/30abfd828007_program_metadata.py: -------------------------------------------------------------------------------- 1 | """program metadata 2 | 3 | Revision ID: 30abfd828007 4 | Revises: 43103d5b49c9 5 | Create Date: 2024-10-03 14:18:09.874225 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = '30abfd828007' 16 | down_revision: Union[str, None] = '43103d5b49c9' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('program_grid_start', sa.DateTime(), nullable=True)) 24 | op.add_column('program_metadata', sa.Column('program_grid_end', sa.DateTime(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('program_metadata', 'program_grid_end') 31 | op.drop_column('program_metadata', 'program_grid_start') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /alembic/versions/5bff4dceda53_add_info_public_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add info/public to program metadata 2 | 3 | Revision ID: 5bff4dceda53 4 | Revises: c1d78b9968fe 5 | Create Date: 2024-05-03 09:09:44.751432 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '5bff4dceda53' 16 | down_revision: Union[str, None] = 'c1d78b9968fe' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('public', sa.Boolean(), nullable=True)) 24 | op.add_column('program_metadata', sa.Column('infocontinue', sa.Boolean(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('program_metadata', 'infocontinue') 31 | op.drop_column('program_metadata', 'public') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /alembic/versions/827fb6dde3bb_time_monitored_new_table.py: -------------------------------------------------------------------------------- 1 | """time monitored new table 2 | 3 | Revision ID: 827fb6dde3bb 4 | Revises: c08231a9eb37 5 | Create Date: 2025-04-29 13:29:54.299095 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = '827fb6dde3bb' 16 | down_revision: Union[str, None] = 'c08231a9eb37' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | def upgrade() -> None: 21 | # Create the time_monitored table 22 | op.create_table( 23 | 'time_monitored', 24 | sa.Column('id', sa.String(), nullable=False), 25 | sa.Column('channel_name', sa.String(), nullable=False), 26 | sa.Column('start', sa.DateTime(), nullable=False), 27 | sa.Column('duration_minutes', sa.Integer(), nullable=True), 28 | sa.Column('country', sa.String(), nullable=False), 29 | sa.PrimaryKeyConstraint('id') 30 | ) 31 | 32 | 33 | def downgrade() -> None: 34 | # Drop the time_monitored table 35 | op.drop_table('time_monitored') -------------------------------------------------------------------------------- /alembic/versions/c08231a9eb37_program_add_created_at_updated_at.py: -------------------------------------------------------------------------------- 1 | """program: add created_at updated_at 2 | 3 | Revision ID: c08231a9eb37 4 | Revises: 4333bc46985d 5 | Create Date: 2025-03-29 08:17:51.997077 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'c08231a9eb37' 16 | down_revision: Union[str, None] = '4333bc46985d' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text("(now() at time zone 'utc')"), nullable=True)) 24 | op.add_column('program_metadata', sa.Column('updated_at', sa.DateTime(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('program_metadata', 'updated_at') 31 | op.drop_column('program_metadata', 'created_at') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /alembic/versions/2c48f626a749_keywords_program_name.py: -------------------------------------------------------------------------------- 1 | """keywords: program name 2 | 3 | Revision ID: 2c48f626a749 4 | Revises: 5 | Create Date: 2024-04-12 12:44:23.512407 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '2c48f626a749' 16 | down_revision: Union[str, None] = None 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('channel_program', sa.String(), nullable=True)) 24 | op.add_column('keywords', sa.Column('channel_program_type', sa.String(), nullable=True)) 25 | op.add_column('keywords', sa.Column('category', sa.JSON(), nullable=True)) 26 | # ### end Alembic commands ### 27 | 28 | 29 | def downgrade() -> None: 30 | # ### commands auto generated by Alembic - please adjust! 
### 31 | op.drop_column('keywords', 'category') 32 | op.drop_column('keywords', 'channel_program_type') 33 | op.drop_column('keywords', 'channel_program') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /quotaclimat/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | class CustomFormatter(logging.Formatter): 4 | 5 | grey = "\x1b[38;20m" 6 | yellow = "\x1b[33;20m" 7 | red = "\x1b[31;20m" 8 | bold_red = "\x1b[31;1m" 9 | reset = "\x1b[0m" 10 | light_blue = "\x1b[36m" 11 | format = "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d | %(message)s" 12 | 13 | FORMATS = { 14 | logging.DEBUG: grey + format + reset, 15 | logging.INFO: light_blue + format + reset, 16 | logging.WARNING: yellow + format + reset, 17 | logging.ERROR: red + format + reset, 18 | logging.CRITICAL: bold_red + format + reset 19 | } 20 | 21 | def format(self, record): 22 | log_fmt = self.FORMATS.get(record.levelno) 23 | formatter = logging.Formatter(log_fmt) 24 | return formatter.format(record) 25 | 26 | def getLogger(): 27 | # get the root logger and set its level from the LOGLEVEL env var 28 | logger = logging.getLogger() 29 | logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper()) 30 | # replace any existing handlers with a console handler using the custom formatter 31 | if (logger.hasHandlers()): 32 | logger.handlers.clear() 33 | ch = logging.StreamHandler() 34 | ch.setFormatter(CustomFormatter()) 35 | logger.addHandler(ch) 36 | 37 | return logger -------------------------------------------------------------------------------- /alembic/versions/4ccd746ee291_add_20_30.py: -------------------------------------------------------------------------------- 1 | """add 20/30 2 | 3 | Revision ID: 4ccd746ee291 4 | Revises: 2450da0e6c60 5 | Create Date: 2024-06-20 06:35:00.316441 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | from sqlalchemy.dialects import postgresql 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '4ccd746ee291' 16 | down_revision: Union[str, None] = '2450da0e6c60' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('number_of_keywords_20', sa.Integer(), nullable=True)) 24 | op.add_column('keywords', sa.Column('number_of_keywords_30', sa.Integer(), nullable=True)) 25 | op.add_column('keywords', sa.Column('number_of_keywords_40', sa.Integer(), nullable=True)) 26 | # ### end Alembic commands ### 27 | 28 | 29 | def downgrade() -> None: 30 | # ### commands auto generated by Alembic - please adjust! 
### 31 | op.drop_column('keywords', 'number_of_keywords_20') 32 | op.drop_column('keywords', 'number_of_keywords_30') 33 | op.drop_column('keywords', 'number_of_keywords_40') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /.github/workflows/scaleway-down.yml: -------------------------------------------------------------------------------- 1 | name: Stop Scaleway 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | schedule: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule 7 | - cron: '49 21 * * *' 8 | 9 | jobs: 10 | down: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Use CLI 14 | uses: jawher/action-scw@v2.34.0 15 | env: 16 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 17 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 18 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 19 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 20 | with: 21 | args: container container list name=${{ secrets.CONTAINER_NAME }} --output json 22 | 23 | - name: Get CONTAINER_ID env var 24 | run: echo "CONTAINER_ID=$(cat "${GITHUB_WORKSPACE}/scw.output" | jq -r '.[0].id')" >> $GITHUB_ENV 25 | 26 | 27 | - name: 0 instances 28 | uses: jawher/action-scw@v2.34.0 29 | env: 30 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 31 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 32 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 33 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 34 | with: 35 | args: container container update min-scale=0 ${{ env.CONTAINER_ID }} 36 | -------------------------------------------------------------------------------- /.github/workflows/scaleway-up.yml: -------------------------------------------------------------------------------- 1 | name: Start Scaleway 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | schedule: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule 7 | - cron: '52 05 * * *' 8 | 9 | jobs: 10 | up: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Use CLI 14 | uses: jawher/action-scw@v2.34.0 15 | env: 16 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 17 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 18 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 19 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 20 | with: 21 | args: container container list name=${{ secrets.CONTAINER_NAME }} --output json 22 | 23 | - name: Get CONTAINER_ID env var 24 | run: echo "CONTAINER_ID=$(cat "${GITHUB_WORKSPACE}/scw.output" | jq -r '.[0].id')" >> $GITHUB_ENV 25 | 26 | - name: start 1 instances 27 | uses: jawher/action-scw@v2.34.0 28 | env: 29 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 30 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 31 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 32 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 33 | with: 34 | args: container container update min-scale=1 ${{ env.CONTAINER_ID }} 35 | -------------------------------------------------------------------------------- /alembic/versions/af956a85658f_add_new_column_number_of_keywords_.py: -------------------------------------------------------------------------------- 1 | """Add new column number_of_keywords climat/biod/r 2 | 3 | Revision ID: af956a85658f 4 | Revises: a5c39db3c8e9 5 | Create Date: 2024-09-12 14:15:12.049367 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 
10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'af956a85658f' 16 | down_revision: Union[str, None] = 'a5c39db3c8e9' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('number_of_keywords_climat', sa.Integer(), nullable=True)) 24 | op.add_column('keywords', sa.Column('number_of_keywords_biodiversite', sa.Integer(), nullable=True)) 25 | op.add_column('keywords', sa.Column('number_of_keywords_ressources', sa.Integer(), nullable=True)) 26 | # ### end Alembic commands ### 27 | 28 | 29 | def downgrade() -> None: 30 | # ### commands auto generated by Alembic - please adjust! ### 31 | op.drop_column('keywords', 'number_of_keywords_ressources') 32 | op.drop_column('keywords', 'number_of_keywords_biodiversite') 33 | op.drop_column('keywords', 'number_of_keywords_climat') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | RUN apt update && apt-get install -y git 23 | 24 | WORKDIR /app 25 | 26 | ENV VIRTUAL_ENV=/app/.venv 27 | ENV PATH="/app/.venv/bin:$PATH" 28 | ENV PATH="$PYENV_ROOT/bin:$PATH" 29 | ENV PYTHONPATH=/app 30 | ENV DBT_PROFILES_DIR=/app/my_dbt_project/dbt 31 | ENV DBT_PROJECT_DIR=/app/my_dbt_project 32 | 33 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 34 | 35 | # For streamlit only 36 | COPY pyproject.toml poetry.lock ./ 37 | RUN pip install poetry 38 | 39 | # App code is included with docker-compose as well 40 | 41 | COPY quotaclimat ./quotaclimat 42 | COPY postgres ./postgres 43 | COPY alembic/ ./alembic 44 | COPY transform_program.py ./transform_program.py 45 | COPY my_dbt_project/ ./my_dbt_project 46 | 47 | # Docker compose overwrites this config to have only one Dockerfile 48 | CMD ["ls"] 49 | -------------------------------------------------------------------------------- /Dockerfile_stop_word: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | WORKDIR /app 23 | 24 | ENV VIRTUAL_ENV=/app/.venv 25 | ENV 
PATH="/app/.venv/bin:$PATH" 26 | ENV PATH="$PYENV_ROOT/bin:$PATH" 27 | ENV PYTHONPATH=/app 28 | 29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 30 | 31 | # App code is include with docker-compose as well 32 | 33 | COPY quotaclimat ./quotaclimat 34 | COPY postgres ./postgres 35 | COPY pyproject.toml pyproject.toml 36 | COPY alembic/ ./alembic 37 | COPY alembic.ini ./alembic.ini 38 | COPY transform_program.py ./transform_program.py 39 | 40 | # healthcheck 41 | EXPOSE 5050 42 | 43 | # Use a separate script to handle migrations and start the application 44 | COPY docker-entrypoint_stop_word.sh ./docker-entrypoint_stop_word.sh 45 | RUN chmod +x ./docker-entrypoint_stop_word.sh 46 | 47 | ENTRYPOINT ["./docker-entrypoint_stop_word.sh"] -------------------------------------------------------------------------------- /quotaclimat/utils/healthcheck_config.py: -------------------------------------------------------------------------------- 1 | 2 | import http.server 3 | import socketserver 4 | import os 5 | import logging 6 | import asyncio 7 | import tomli 8 | 9 | def get_app_version(): 10 | # Open and read the pyproject.toml file 11 | with open('pyproject.toml', 'rb') as toml_file: 12 | pyproject_data = tomli.load(toml_file) 13 | 14 | # Access the version from the pyproject.toml file 15 | version = pyproject_data['project']['version'] 16 | return version 17 | 18 | version = get_app_version() 19 | 20 | class HealthCheckHandler(http.server.SimpleHTTPRequestHandler): 21 | def do_GET(self): 22 | self.send_response(200) 23 | self.end_headers() 24 | self.wfile.write((f"Healthy.\n\nApp version {version}").encode()) 25 | 26 | async def run_health_check_server(): 27 | PORT = int(os.environ.get("PORT_HS", 5050)) 28 | SERVER_ADDRESS = os.environ.get("HEALTHCHECK_SERVER", "") 29 | 30 | logging.info(f"App version {version}") 31 | logging.info(f"Healthcheck at '{SERVER_ADDRESS}' : port {PORT}") 32 | with socketserver.TCPServer((SERVER_ADDRESS, PORT), HealthCheckHandler) as httpd: 33 | try: 34 | await asyncio.to_thread(httpd.serve_forever) 35 | except asyncio.CancelledError: 36 | logging.info("health check cancel") 37 | httpd.shutdown() # to terminal infinite loop "serve_forever" 38 | return 39 | -------------------------------------------------------------------------------- /Dockerfile_api_to_s3: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | WORKDIR /app 23 | 24 | ENV VIRTUAL_ENV=/app/.venv 25 | ENV PATH="/app/.venv/bin:$PATH" 26 | ENV PATH="$PYENV_ROOT/bin:$PATH" 27 | ENV PYTHONPATH=/app 28 | 29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 30 | 31 | # App code is include with docker-compose as well 32 | 33 | COPY quotaclimat ./quotaclimat 34 | COPY postgres ./postgres 35 | COPY pyproject.toml pyproject.toml 36 | COPY alembic/ ./alembic 37 | COPY alembic.ini ./alembic.ini 38 | COPY transform_program.py ./transform_program.py 39 | 40 | # healthcheck 41 | EXPOSE 5050 
42 | 43 | # Use a separate script to handle migrations and start the application 44 | COPY docker-entrypoint.sh ./docker-entrypoint.sh 45 | RUN chmod +x ./docker-entrypoint.sh 46 | 47 | 48 | ENTRYPOINT ["python", "quotaclimat/data_processing/mediatree/s3/api_to_s3.py"] 49 | -------------------------------------------------------------------------------- /Dockerfile_api_import: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | RUN apt update && apt-get install -y git 23 | 24 | WORKDIR /app 25 | 26 | ENV VIRTUAL_ENV=/app/.venv 27 | ENV PATH="/app/.venv/bin:$PATH" 28 | ENV PATH="$PYENV_ROOT/bin:$PATH" 29 | ENV PYTHONPATH=/app 30 | 31 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 32 | 33 | # App code is include with docker-compose as well 34 | 35 | COPY quotaclimat ./quotaclimat 36 | COPY postgres ./postgres 37 | COPY pyproject.toml pyproject.toml 38 | COPY alembic/ ./alembic 39 | COPY alembic.ini ./alembic.ini 40 | COPY transform_program.py ./transform_program.py 41 | COPY my_dbt_project/ ./my_dbt_project 42 | COPY i8n/ ./i8n 43 | ENV DBT_PROFILES_DIR=/app/my_dbt_project/dbt 44 | ENV DBT_PROJECT_DIR=/app/my_dbt_project 45 | 46 | # healthcheck 47 | EXPOSE 5050 48 | 49 | # Use a separate script to handle migrations and start the application 50 | COPY docker-entrypoint.sh ./docker-entrypoint.sh 51 | RUN chmod +x ./docker-entrypoint.sh 52 | 53 | ENTRYPOINT ["./docker-entrypoint.sh"] 54 | -------------------------------------------------------------------------------- /my_dbt_project/seeds/time_monitored.csv: -------------------------------------------------------------------------------- 1 | id,channel_name,start,duration_minutes,country 2 | f48e555ced0b59dc6016b9ed62e4ca0b630ff98d48ac459c8f3ae0945d81a534,daserste,"February 01, 2025, 12:00 AM",258,germany 3 | 3a6fd867f15cafbddc489509576a495b1794633e895ff0f18a48250bb6f1cf25,zdf-neo,"February 01, 2025, 12:00 AM",352,germany 4 | 31a2db38f49bd7b3d1689369a409bca7f031f2cab2c2d2c8715d367560651277,rtl-television,"February 01, 2025, 12:00 AM",294,germany 5 | 37d6723cd58f3b137045298c8b3dded8563da30df84e979cf27441808c7381ec,sat1,"February 01, 2025, 12:00 AM",222,germany 6 | f015abc528de99458ea833d94cdea466ab0e9c4445727a2d005bca9b2ea4adff,prosieben,"February 01, 2025, 12:00 AM",156,germany 7 | 143cfbae72cbf7c634645fe8f0b3dce52c3e95c0d27d01af10210252ec3e67e8,kabel-eins,"February 01, 2025, 12:00 AM",36,germany 8 | cf6d8f980175b1335583bce4a40595eca5886fcaa9ebeaf7611557fc41b6cf21,tf1,"February 01, 2025, 12:00 AM",258,france 9 | 6b7e0d69c3111ceb6b9f176f5c3748b5c9d44a898f5c2d9ecc7e3f0a37cb5adf,france2,"February 01, 2025, 12:00 AM",334,france 10 | 3b046c77314301e63bef3a4142eb9ac62b48fe52b72602de1ab3d93eb1c5d24b,fr3-idf,"February 01, 2025, 12:00 AM",240,france 11 | b51fe8a6a65b06ead17099a2eac4312b526f76f9b1f256d8d3779c76533a3b6a,m6,"February 01, 2025, 12:00 AM",316,france 12 | 
9b1ebe8bc77b319560f91fc1c768079ff16e9f01f544b5aad25065d335c5f3f7,arte,"February 01, 2025, 12:00 AM",88,france
13 | 6aba0a0299934ed1a3411289a51ccbd11b6d9236ffef2adc8df0d76b003357f0,bfmtv,"February 01, 2025, 12:00 AM","1,030",france
14 | 0bb8064e6500c8bc63e9e30f42d21d9ad5322d508f04dd024c1b76956f0d40c4,franceinfotv,"February 01, 2025, 12:00 AM","1,030",france
--------------------------------------------------------------------------------
/test/sitemap/test_scrap_html.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, get_hat_20minutes, get_url_content
4 | from quotaclimat.data_ingestion.scrap_sitemap import get_description_article
5 | from bs4 import BeautifulSoup
6 | from test_utils import get_localhost, debug_df
7 |
8 | localhost = get_localhost()
9 |
10 | @pytest.mark.asyncio
11 | async def test_get_description_article():
12 |     url_to_parse = f"{localhost}/mediapart_website.html"
13 |     media = "Le Figaro"
14 |     df_articles = pd.DataFrame([{
15 |         "url": url_to_parse,
16 |         "news_title": media,
17 |     }])
18 |
19 |     expected_result = pd.DataFrame([{
20 |         "url": url_to_parse,
21 |         "news_title": media,
22 |         "news_description": "description could be parsed with success"
23 |     }])
24 |
25 |     df_articles["news_description"] = await get_description_article(media, df_articles)
26 |     debug_df(df_articles)
27 |     pd.testing.assert_frame_equal(df_articles.reset_index(drop=True), expected_result.reset_index(drop=True))
28 |
29 | @pytest.mark.asyncio
30 | async def test_get_meta_news():
31 |     url_to_parse = f"{localhost}/mediapart_website.html"
32 |
33 |     output = await get_meta_news(url_to_parse, "media")
34 |     assert output["description"] == "description could be parsed with success"
35 |
36 | @pytest.mark.asyncio
37 | async def test_get_hat_20minutes():
38 |     url_to_parse = f"{localhost}/20minutes_website.html"
39 |
40 |     response = await get_url_content(url_to_parse)
41 |     hat = get_hat_20minutes(BeautifulSoup(response, "html.parser"))
42 |     assert hat == "howdy there"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "quotaclimat"
3 | version = "2.0.74"
4 | description = ""
5 | authors = [
6 |     {name = "Rambier Estelle", email = "estelle.rambier@hotmail.fr"},
7 |     {name = "Paul Leclercq", email = "paul@epauler.fr"}
8 | ]
9 | readme = "README.md"
10 |
11 | [tool.pytest.ini_options]
12 | log_cli = 1
13 | log_cli_level = "INFO"
14 | testpaths = [
15 |     "test"
16 | ]
17 |
18 | [tool.poetry.dependencies]
19 | s3fs = {extras = ["boto3"], version = ">=2023.12.0"}
20 | boto3 = "*"
21 | botocore = "*"
22 | python = ">=3.11,<=3.13"
23 | s3transfer = "0.10.4"
24 | pandas = "^2.2.3"
25 | advertools = "^0.14.1"
26 | xmltodict = "^0.13.0"
27 | sqlalchemy = "^2.0.35"
28 | psycopg2-binary = "^2.9.5"
29 | alembic = "^1.13.1"
30 | beautifulsoup4 = "^4.11.1"
31 | asyncio = "^3.4.3"
32 | tomli = "^2.0.1"
33 | aiohttp = "^3.10.8"
34 | pytest-asyncio = "^0.23.5"
35 | swifter = "^1.4.0"
36 | tenacity = "^8.2.3"
37 | sentry-sdk = ">=2.53.0"
38 | modin = {extras = ["ray"], version = "^0.32.0"}
39 | openpyxl = "^3.1.5"
40 | requests = "^2.32.3"
41 | thefuzz = "^0.22.1"
42 | dbt-core = "^1.9.2"
43 | dbt-postgres = "^1.9.0"
44 | ruff = "^0.13.3"
45 | graphviz = "^0.21"
46 | matplotlib = "^3.10.7"
47 | plotly = "^6.5.0"
48 |
nbformat = "^5.10.4" 49 | kaleido = "^1.2.0" 50 | [build-system] 51 | requires = ["poetry-core>=1.1"] 52 | build-backend = "poetry.core.masonry.api" 53 | 54 | 55 | 56 | [tool.poetry.group.dev.dependencies] 57 | coverage = "^7.5.4" 58 | pytest = "^8.1.1" 59 | pytest-cov = "^5.0.0" 60 | poetry-bumpversion = "^0.3.1" 61 | pre-commit = "^2.18.1" 62 | black = "^22.3.0" 63 | isort = "^5.10.1" 64 | flake8 = "^4.0.1" 65 | invoke = "^1.7.3" 66 | deptry = "^0.20.0" 67 | graphviz = "^0.21" 68 | ipykernel = "^7.0.1" 69 | -------------------------------------------------------------------------------- /my_dbt_project/models/analytics/environmental_shares_with_desinfo_counts.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental', 3 | unique_key=['start','channel_name','country'] 4 | ) 5 | }} 6 | 7 | with env_shares as ( 8 | with name_map as ( 9 | select 10 | channel_title, 11 | max(channel_name) channel_name 12 | from 13 | program_metadata pm 14 | where pm.country='france' 15 | group by 16 | channel_title 17 | ) 18 | select 19 | start, 20 | cqes."Program Metadata - Channel Name__channel_title" as "channel_title", 21 | name_map.channel_name, 22 | cqes.sum_duration_minutes, 23 | cqes."% climat" as weekly_perc_climat, 24 | 'france' as country 25 | from 26 | public.core_query_environmental_shares cqes 27 | left join 28 | name_map 29 | on 30 | name_map.channel_title=cqes."Program Metadata - Channel Name__channel_title" 31 | union all 32 | select 33 | cqesin."start", 34 | cqesin.channel_title, 35 | cqesin.channel_name, 36 | cqesin.sum_duration_minutes, 37 | cqesin."% climat" as weekly_perc_climat, 38 | country 39 | from 40 | public.core_query_environmental_shares_i8n cqesin 41 | where country!='france' 42 | ), 43 | weekly_desinfo as ( 44 | select 45 | date_trunc('week', tgc.data_item_start) week_start, 46 | tgc.data_item_channel_name, 47 | tgc.country, 48 | sum(case when tgc.mesinfo_correct is null then 0 else tgc.mesinfo_correct end) total_mesinfo 49 | from 50 | {{ ref("task_global_completion") }} tgc 51 | where tgc."Annotation Version"=1 52 | group by 53 | week_start, 54 | tgc.data_item_channel_name, 55 | tgc.country 56 | ) 57 | select 58 | env_shares.*, 59 | case when weekly_desinfo.total_mesinfo is null then 0 else weekly_desinfo.total_mesinfo end total_mesinfo 60 | from 61 | env_shares 62 | left join 63 | weekly_desinfo 64 | on 65 | env_shares.start=weekly_desinfo.week_start 66 | and env_shares.channel_name=weekly_desinfo.data_item_channel_name 67 | and env_shares.country=weekly_desinfo.country 68 | -------------------------------------------------------------------------------- /my_dbt_project/models/dashboards/core_query_causal_links.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental', 3 | incremental_strategy='append', 4 | on_schema_change='append_new_columns' 5 | ) 6 | }} 7 | 8 | {% set process_month = var("process_month", date_trunc('month', current_date)) %} 9 | 10 | SELECT 11 | public.keywords.id, 12 | public.keywords.channel_title, 13 | public.keywords.country, 14 | public.keywords.start, 15 | kw_consequence ->> 'keyword' AS keyword, 16 | CASE 17 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%climat%' THEN 'Crise climatique' 18 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité' 19 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources' 20 | ELSE 'Autre' 21 | END AS 
crise,
22 |     (
23 |         SELECT COUNT(*)
24 |         FROM public.keywords k2
25 |         WHERE k2.channel_title = public.keywords.channel_title
26 |         AND k2.number_of_changement_climatique_constat_no_hrfp > 0
27 |         AND k2.start BETWEEN public.keywords.start - interval '4 minutes' AND public.keywords.start + interval '4 minutes'
28 |         and date_trunc('month', public.keywords.start) = cast('{{ var("process_month") }}' as date)
29 |     ) AS nb_constats_climat_neighbor,
30 |     (
31 |         SELECT COUNT(*)
32 |         FROM public.keywords k3
33 |         WHERE k3.channel_title = public.keywords.channel_title
34 |         AND k3.number_of_biodiversite_concepts_generaux_no_hrfp > 0
35 |         AND k3.start BETWEEN public.keywords.start - interval '4 minutes' AND public.keywords.start + interval '4 minutes'
36 |         and date_trunc('month', public.keywords.start) = cast('{{ var("process_month") }}' as date)
37 |     ) AS nb_constats_biodiversite_neighbor
38 | FROM public.keywords
39 | CROSS JOIN LATERAL json_array_elements(public.keywords.keywords_with_timestamp::json) kw_consequence
40 | WHERE LOWER(kw_consequence ->> 'theme') LIKE '%consequence%'
41 | and date_trunc('month', public.keywords.start) = cast('{{ var("process_month") }}' as date)
--------------------------------------------------------------------------------
/test/i8n/test_country.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from quotaclimat.data_processing.mediatree.i8n.country import *
4 |
5 | def test_validate_country_code_fra():
6 |     france_code = validate_country_code("fra")
7 |     assert france_code == FRANCE.code
8 |
9 | def test_validate_country_code_invalid():
10 |     with pytest.raises(ValueError, match="Invalid country code: nz"):
11 |         validate_country_code("nz")
12 |
13 | def test_get_country_from_code_fra():
14 |     france = get_country_from_code("fra")
15 |     assert france == FRANCE
16 |
17 | def test_get_channels_brazil():
18 |     os.environ['ENV'] = 'prod'
19 |     channels = get_channels(country_code=BRAZIL.code)
20 |     assert channels == BRAZIL.channels
21 |     os.environ['ENV'] = 'docker'
22 |
23 | def test_get_channels_default_docker():
24 |     os.environ['ENV'] = 'docker'
25 |     channels = get_channels()
26 |     assert channels == ["france2"]
27 |
28 |
29 | def test_get_channels_default_prod():
30 |     os.environ['ENV'] = 'prod'
31 |     channels = get_channels()
32 |     assert channels == FRANCE.channels
33 |     os.environ['ENV'] = 'docker'
34 |
35 | def test_get_channel_title_for_name():
36 |     assert get_channel_title_for_name("tf1") == "TF1"
37 |
38 | def test_get_channel_title_for_name_germany():
39 |     assert get_channel_title_for_name("rtl-television", GERMANY) == "RTL"
40 |
41 | def test_get_channels_poland():
42 |     os.environ['ENV'] = 'prod'
43 |     channels = get_channels(country_code=POLAND.code)
44 |     assert channels == POLAND.channels
45 |     os.environ['ENV'] = 'docker'
46 |
47 | def test_get_channel_title_for_name_poland():
48 |     assert get_channel_title_for_name("tvp", POLAND) == "TVP"
49 |
50 | def test_get_channels_spain():
51 |     os.environ['ENV'] = 'prod'
52 |     channels = get_channels(country_code=SPAIN.code)
53 |     assert channels == SPAIN.channels
54 |     os.environ['ENV'] = 'docker'
55 |
56 | def test_get_channel_title_for_name_spain():
57 |     assert get_channel_title_for_name("antenna-3", SPAIN) == "Antenna 3"
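58 |
59 | # For reference, these tests assume a country module shaped roughly like the
60 | # sketch below (illustrative only, not the real implementation):
61 | #   FRANCE = Country(code="fra", name="france", channels=[...])
62 | #   def get_channels(country_code: str = FRANCE.code) -> list[str]: ...
63 | #   def get_channel_title_for_name(name: str, country: Country = FRANCE) -> str: ...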
--------------------------------------------------------------------------------
/alembic/versions/356882459cec_remove_category_keywords_change_columns_.py:
--------------------------------------------------------------------------------
1 | """Remove: category keywords / change column names
2 |
3 | Revision ID: 356882459cec
4 | Revises: 2c48f626a749
5 | Create Date: 2024-04-29 10:14:27.240887
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '356882459cec'
16 | down_revision: Union[str, None] = '2c48f626a749'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('keywords', sa.Column('number_of_ressources', sa.Integer(), nullable=True))
24 |     op.add_column('keywords', sa.Column('number_of_ressources_solutions', sa.Integer(), nullable=True))
25 |     op.drop_column('keywords', 'number_of_ressources_naturelles_causes')
26 |     op.drop_column('keywords', 'number_of_ressources_naturelles_concepts_generaux')
27 |     op.drop_column('keywords', 'category')
28 |     op.drop_column('keywords', 'number_of_ressources_naturelles_solutions')
29 |     # ### end Alembic commands ###
30 |
31 |
32 | def downgrade() -> None:
33 |     # ### commands auto generated by Alembic - please adjust! ###
34 |     op.add_column('keywords', sa.Column('number_of_ressources_naturelles_solutions', sa.INTEGER(), autoincrement=False, nullable=True))
35 |     op.add_column('keywords', sa.Column('category', postgresql.JSON(astext_type=sa.Text()), autoincrement=False, nullable=True))
36 |     op.add_column('keywords', sa.Column('number_of_ressources_naturelles_concepts_generaux', sa.INTEGER(), autoincrement=False, nullable=True))
37 |     op.add_column('keywords', sa.Column('number_of_ressources_naturelles_causes', sa.INTEGER(), autoincrement=False, nullable=True))
38 |     op.drop_column('keywords', 'number_of_ressources_solutions')
39 |     op.drop_column('keywords', 'number_of_ressources')
40 |     # ### end Alembic commands ###
41 |
--------------------------------------------------------------------------------
/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Run migrations before starting the application
4 | echo "Running alembic migrations, if any"
5 | poetry run alembic upgrade head
6 |
7 |
8 | echo "update program metadata file"
9 | poetry run python3 transform_program.py
10 | if [[ $? -eq 0 ]]; then
11 |     echo "Command succeeded"
12 | else
13 |     echo "Command failed"
14 | fi
15 | if [[ "${REPARSE_CAUSAL_LINKS:-0}" -eq 1 ]]; then
16 |     echo "Reparsing core_query_causal_links"
17 |
18 |     for y in $(seq 2022 2025); do
19 |         for mm in $(seq -w 1 12); do
20 |             date="$y-$mm-01"
21 |             echo "Processing month: $date"
22 |             poetry run dbt run --select core_query_causal_links --vars "{\"process_month\": \"$date\"}"
23 |         done
24 |     done
25 | else
26 |     echo "starting mediatree import app"
27 |     python quotaclimat/data_processing/mediatree/api_import.py
28 |
29 |     echo "ingest labelstudio data into barometre database"
30 |     poetry run python -m quotaclimat.data_ingestion.labelstudio.ingest_labelstudio
31 |
32 |     echo "apply dbt models - except causal links and analytics tables"
33 |     poetry run dbt run --full-refresh \
34 |         --exclude core_query_causal_links \
35 |         --exclude task_global_completion \
36 |         --exclude environmental_shares_with_desinfo_counts
37 |
38 |     echo "apply dbt models to build analytics tables in 'analytics' schema."
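39 |     # The --target flag below assumes a second output named 'analytics' in
40 |     # my_dbt_project/dbt/profiles.yml, pointing the same Postgres database at a
41 |     # separate 'analytics' schema; a rough sketch of such an output (illustrative
42 |     # only, not the actual profile):
43 |     #   outputs:
44 |     #     analytics:
45 |     #       type: postgres
46 |     #       schema: analytics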
47 |     poetry run dbt run --full-refresh --target analytics \
48 |         --select task_global_completion \
49 |         --select environmental_shares_with_desinfo_counts
50 |
51 |     echo "Causal query case: Checking if today is the first of the month..."
52 |     day=$(date +%d)
53 |
54 |     if [ "$day" -eq 01 ]; then
55 |         echo "✅ It's the 1st — running DBT for the previous month"
56 |
57 |         # previous month (first day)
58 |         prev_month=$(date -d "$(date +%Y-%m-01) -1 month" +%Y-%m-01)
59 |
60 |         echo "Processing month: $prev_month"
61 |         poetry run dbt run --select core_query_causal_links --vars "{\"process_month\": \"$prev_month\"}"
62 |     else
63 |         echo "⏭️ Not the 1st — skipping DBT run"
64 |     fi
65 |
66 | fi
67 |
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/scrap_html/scrap_description_article.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import aiohttp
4 | from bs4 import BeautifulSoup
5 | import asyncio
6 | import re
7 |
8 | agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"}
9 | async def get_url_content(url_article: str):
10 |     async with aiohttp.ClientSession() as session:
11 |         async with session.get(url_article, headers=agent) as response:
12 |             return await response.text()
13 |
14 | def get_hat_20minutes(soup, url_article = ""):
15 |     hat = soup.select_one(".hat-summary")
16 |     if hat is not None:
17 |         return (hat.text).strip()
18 |     else:
19 |         logging.warning(f"could not get hat : {url_article}")
20 |         return ""
21 |
22 | # parse <meta> tags, see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta
23 | async def get_meta_news(url_article, media):
24 |     result = {
25 |         "title": "",
26 |         "description": "",
27 |     }
28 |
29 |     if(media != "ouest-france"): # anti-robot protection
30 |         response = await get_url_content(str(url_article))
31 |     else:
32 |         return result
33 |
34 |     soup = BeautifulSoup(response, "html.parser")
35 |     soup_description = soup.find(name="meta", attrs={'name': 'description'})
36 |     if soup_description is not None:
37 |         description = soup_description.get("content").strip()
38 |         logging.debug(f"description for {url_article} is \n {description}")
39 |         result["description"] = description
40 |     elif media == "20_minutes": # does not have meta description
41 |         hat = get_hat_20minutes(soup, url_article)
42 |         logging.info(f"reading hat for {media} - {hat}")
43 |         result["description"] = hat
44 |     else:
45 |         logging.warning(f"could not find description for {url_article} - response \n {response}")
46 |
47 |     # TODO : use it someday to parse missing data
48 |     soup_title = soup.find(name="title")
49 |     if soup_title is not None:
50 |         result["title"] = (soup_title.string).strip()
51 |
52 |     return result
53 |
54 |
55 |
--------------------------------------------------------------------------------
/postgres/database_connection.py:
--------------------------------------------------------------------------------
1 | import os
2 | from sqlalchemy import create_engine, URL, Engine
3 | from sqlalchemy.orm import sessionmaker, Session
4 | import logging
5 |
6 | logging.basicConfig(level=logging.INFO)
7 |
8 |
9 | def connect_to_db(
10 |     database: str = os.environ.get("POSTGRES_DB", "barometre"),
11 |     user: str = os.environ.get("POSTGRES_USER", "user"),
12 |     host: str = os.environ.get("POSTGRES_HOST", "localhost"),
13 |     port: int = int(os.environ.get("POSTGRES_PORT", 5432)),
14 |     password: str = os.environ.get("POSTGRES_PASSWORD", "password"),
15 | ):
16 |     """
17 |     Connect to the PostgreSQL database using environment variables or provided parameters.
18 |
19 |     Parameters:
20 |     - database (str, optional): The name of the database. Defaults to 'barometre'.
21 |     - user (str, optional): The username for accessing the database. Defaults to 'user'.
22 |     - host (str, optional): The hostname of the database server. Defaults to 'localhost'.
23 |     - port (int, optional): The port number on which the database server is listening. Defaults to 5432.
24 |     - password (str, optional): The password for accessing the database. Defaults to 'password'.
25 |
26 |     Returns:
27 |     - Engine: The SQLAlchemy engine object representing the connection to the database.
28 |     """
29 |
30 |     logging.info("Connect to the host %s for DB %s" % (host, database))
31 |
32 |     url = URL.create(
33 |         drivername="postgresql",
34 |         username=user,
35 |         host=host,
36 |         database=database,
37 |         port=port,
38 |         password=password,
39 |     )
40 |
41 |     engine = create_engine(url)
42 |
43 |     return engine
44 |
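45 | # A minimal usage sketch for these helpers, assuming the POSTGRES_* variables
46 | # above are set (`text` comes from sqlalchemy):
47 | #
48 | #   from sqlalchemy import text
49 | #   engine = connect_to_db()
50 | #   with get_db_session(engine) as session:
51 | #       session.execute(text("SELECT 1"))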
"password"), 15 | ): 16 | """ 17 | Connect to the PostgreSQL database using environment variables or provided parameters. 18 | 19 | Parameters: 20 | - database (str, optional): The name of the database. Defaults to 'barometre'. 21 | - user (str, optional): The username for accessing the database. Defaults to 'user'. 22 | - localhost (str, optional): The hostname of the database server. Defaults to 'localhost'. 23 | - port (int, optional): The port number on which the database server is listening. Defaults to 5432. 24 | - password (str, optional): The password for accessing the database. Defaults to 'password'. 25 | 26 | Returns: 27 | - Engine: The SQLAlchemy engine object representing the connection to the database. 28 | """ 29 | 30 | logging.info("Connect to the host %s for DB %s" % (host, database)) 31 | 32 | url = URL.create( 33 | drivername="postgresql", 34 | username=user, 35 | host=host, 36 | database=database, 37 | port=port, 38 | password=password, 39 | ) 40 | 41 | engine = create_engine(url) 42 | 43 | return engine 44 | 45 | 46 | def get_db_session(engine: Engine = None) -> Session: 47 | """ 48 | Create a session for interacting with the database using the provided engine. 49 | 50 | Parameters: 51 | - engine (Engine, optional): The SQLAlchemy engine object. If not provided, it calls `connect_to_db()` to obtain one. 52 | 53 | Returns: 54 | - Session: A SQLAlchemy session bound to the provided engine or created by calling `connect_to_db()`. 55 | """ 56 | if engine is None: 57 | engine = connect_to_db() 58 | 59 | Session = sessionmaker(bind=engine) 60 | return Session() 61 | -------------------------------------------------------------------------------- /quotaclimat/utils/sentry.py: -------------------------------------------------------------------------------- 1 | 2 | import ray 3 | import os 4 | import logging 5 | from quotaclimat.utils.healthcheck_config import get_app_version 6 | import sentry_sdk 7 | from sentry_sdk.integrations.logging import LoggingIntegration 8 | 9 | # read SENTRY_DSN from env 10 | functions_to_trace = [ 11 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_cts_in_ms_for_keywords"}, 12 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_keyword_with_same_timestamp"}, 13 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_themes_keywords_duration"}, 14 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.count_keywords_duration_overlap"}, 15 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_and_tag_by_theme"}, 16 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.add_primary_key"}, 17 | {"qualified_name": "quotaclimat.data_processing.mediatree.api_import.extract_api_sub"}, 18 | {"qualified_name": "quotaclimat.data_processing.mediatree.api_import.parse_reponse_subtitle"}, 19 | ] 20 | 21 | def sentry_init(): 22 | if(os.environ.get("SENTRY_DSN", None) != None): 23 | logging.info("Sentry init") 24 | logging_kwargs = {} 25 | if os.getenv("SENTRY_LOGGING") == "true": 26 | logging_kwargs = dict( 27 | enable_logs=True, 28 | integrations=[ 29 | # Only send WARNING (and higher) logs to Sentry logs, 30 | # even if the logger is set to a lower level. 31 | LoggingIntegration(sentry_logs_level=logging.INFO), 32 | ] 33 | ) 34 | sentry_sdk.init( 35 | traces_sample_rate=0.3, 36 | # To set a uniform sample rate 37 | # Set profiles_sample_rate to 1.0 to profile 100% 38 | # of sampled transactions. 
39 | # We recommend adjusting this value in production, 40 | profiles_sample_rate=0.3, 41 | release=get_app_version(), 42 | # functions_to_trace=functions_to_trace, 43 | # integrations=[ # TODO : https://docs.sentry.io/platforms/python/integrations/ray/ 44 | # RayIntegration(), 45 | # ], 46 | **logging_kwargs 47 | ) 48 | else: 49 | logging.info("Sentry not init - SENTRY_DSN not found") -------------------------------------------------------------------------------- /.github/workflows/scaleway-start-import-job-update.yml: -------------------------------------------------------------------------------- 1 | name: Import job Scaleway 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | 7 | jobs: 8 | start-job-image: 9 | strategy: 10 | matrix: 11 | dates: [ 12 | {start_date: "2023-04-01", end_date: "2023-05-01"} 13 | ,{start_date: "2023-05-01", end_date: "2023-06-01"} 14 | ,{start_date: "2023-06-01", end_date: "2023-07-01"} 15 | ,{start_date: "2023-07-01", end_date: "2023-08-01"} 16 | ,{start_date: "2023-08-01", end_date: "2023-09-01"} 17 | ,{start_date: "2023-09-01", end_date: "2023-10-01"} 18 | ,{start_date: "2023-10-01", end_date: "2023-11-01"} 19 | ,{start_date: "2023-11-01", end_date: "2023-12-01"} 20 | ,{start_date: "2023-12-01", end_date: "2024-01-01"} 21 | ,{start_date: "2024-01-01", end_date: "2024-02-01"} 22 | ,{start_date: "2024-02-01", end_date: "2024-03-01"} 23 | ,{start_date: "2024-03-01", end_date: "2024-04-01"} 24 | ,{start_date: "2024-04-01", end_date: "2024-05-01"} 25 | ,{start_date: "2024-05-01", end_date: "2024-06-01"} 26 | ,{start_date: "2024-06-01", end_date: "2024-07-01"} 27 | ,{start_date: "2024-07-01", end_date: "2024-08-01"} 28 | ,{start_date: "2024-08-01", end_date: "2024-09-01"} 29 | ,{start_date: "2024-09-01", end_date: "2024-10-01"} 30 | ,{start_date: "2024-10-01", end_date: "2024-11-01"} 31 | ,{start_date: "2024-11-01", end_date: "2024-12-01"} 32 | ,{start_date: "2024-12-01", end_date: "2025-01-01"} 33 | ,{start_date: "2025-01-01", end_date: "2025-02-01"} 34 | ] 35 | runs-on: ubuntu-latest 36 | steps: 37 | - name: start import job to reapply logic to all elements start_date matrix 38 | uses: jawher/action-scw@v2.34.0 39 | env: 40 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 41 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 42 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 43 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 44 | with: 45 | args: jobs definition start ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} environment-variables.UPDATE=true environment-variables.BIODIVERSITY_ONLY=true environment-variables.START_DATE_UPDATE=${{ matrix.dates.start_date }} environment-variables.END_DATE=${{ matrix.dates.end_date }} 46 | -------------------------------------------------------------------------------- /alembic/versions/4333bc46985d_keywords_program_id_foreign_key.py: -------------------------------------------------------------------------------- 1 | """keywords: program_id foreign key 2 | 3 | Revision ID: 4333bc46985d 4 | Revises: ac96222af6fe 5 | Create Date: 2025-03-21 14:25:06.180296 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | from sqlalchemy.dialects import postgresql 13 | 14 | # revision identifiers, used by Alembic. 
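15 | # Alembic orders migrations through this chain: each revision names its
16 | # parent in down_revision, so `alembic upgrade head` applies ac96222af6fe
17 | # before this one; `alembic history --verbose` prints the resolved chain.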
18 | revision: str = '4333bc46985d'
19 | down_revision: Union[str, None] = 'ac96222af6fe'
20 | branch_labels: Union[str, Sequence[str], None] = None
21 | depends_on: Union[str, Sequence[str], None] = None
22 |
23 |
24 | def upgrade() -> None:
25 |     # ### commands auto generated by Alembic - please adjust! ###
26 |     op.add_column('keywords', sa.Column('program_metadata_id', sa.Text(), nullable=True))
27 |     op.create_foreign_key('fk_keywords_program_metadata_id', 'keywords', 'program_metadata', ['program_metadata_id'], ['id'])
28 |     op.alter_column('sitemap_table', 'download_date',
29 |                existing_type=postgresql.TIMESTAMP(timezone=True),
30 |                type_=sa.DateTime(),
31 |                existing_nullable=True)
32 |     op.alter_column('sitemap_table', 'news_publication_date',
33 |                existing_type=postgresql.TIMESTAMP(timezone=True),
34 |                type_=sa.DateTime(),
35 |                existing_nullable=True)
36 |     op.alter_column('sitemap_table', 'updated_on',
37 |                existing_type=postgresql.TIMESTAMP(timezone=True),
38 |                type_=sa.DateTime(),
39 |                existing_nullable=True)
40 |     # ### end Alembic commands ###
41 |
42 |
43 | def downgrade() -> None:
44 |     # ### commands auto generated by Alembic - please adjust! ###
45 |     op.alter_column('sitemap_table', 'updated_on',
46 |                existing_type=sa.DateTime(),
47 |                type_=postgresql.TIMESTAMP(timezone=True),
48 |                existing_nullable=True)
49 |     op.alter_column('sitemap_table', 'news_publication_date',
50 |                existing_type=sa.DateTime(),
51 |                type_=postgresql.TIMESTAMP(timezone=True),
52 |                existing_nullable=True)
53 |     op.alter_column('sitemap_table', 'download_date',
54 |                existing_type=sa.DateTime(),
55 |                type_=postgresql.TIMESTAMP(timezone=True),
56 |                existing_nullable=True)
57 |     op.drop_constraint('fk_keywords_program_metadata_id', 'keywords', type_='foreignkey')
58 |     op.drop_column('keywords', 'program_metadata_id')
59 |     # ### end Alembic commands ###
60 |
--------------------------------------------------------------------------------
/alembic/versions/44f13b7eebd4_dictionary_category.py:
--------------------------------------------------------------------------------
1 | """dictionary category
2 |
3 | Revision ID: 44f13b7eebd4
4 | Revises: 827fb6dde3bb
5 | Create Date: 2025-05-23 12:54:53.323525
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '44f13b7eebd4'
16 | down_revision: Union[str, None] = '827fb6dde3bb'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('dictionary', sa.Column('category', sa.String(), nullable=True))
24 |     op.add_column('dictionary', sa.Column('theme', sa.String(), nullable=True))
25 |
26 |     op.drop_column('dictionary', 'categories')
27 |     op.drop_column('dictionary', 'themes')
28 |
29 |     op.drop_column('dictionary', 'solution')
30 |     op.drop_column('dictionary', 'consequence')
31 |     op.drop_column('dictionary', 'cause')
32 |     op.drop_column('dictionary', 'general_concepts')
33 |     op.drop_column('dictionary', 'statement')
34 |
35 |     op.drop_column('dictionary', 'crisis_climate')
36 |     op.drop_column('dictionary', 'crisis_biodiversity')
37 |     op.drop_column('dictionary', 'crisis_resource')
38 |
39 |     # ### end Alembic commands ###
40 |
41 |
42 | def downgrade() -> None:
43 |     op.add_column('dictionary', sa.Column('categories', postgresql.ARRAY(sa.String()), nullable=True))
44 |     op.add_column('dictionary', sa.Column('themes', postgresql.ARRAY(sa.String()), nullable=True))
45 |     op.add_column('dictionary', sa.Column('solution', sa.Boolean(), nullable=True, server_default=sa.text('false')))
46 |     op.add_column('dictionary', sa.Column('consequence', sa.Boolean(), nullable=True, server_default=sa.text('false')))
47 |     op.add_column('dictionary', sa.Column('cause', sa.Boolean(), nullable=True, server_default=sa.text('false')))
48 |     op.add_column('dictionary', sa.Column('general_concepts', sa.Boolean(), nullable=True, server_default=sa.text('false')))
49 |     op.add_column('dictionary', sa.Column('statement', sa.Boolean(), nullable=True, server_default=sa.text('false')))
50 |
51 |     op.add_column('dictionary', sa.Column('crisis_climate', sa.Boolean(), nullable=True, server_default=sa.text('true')))
52 |     op.add_column('dictionary', sa.Column('crisis_biodiversity', sa.Boolean(), nullable=True, server_default=sa.text('true')))
53 |     op.add_column('dictionary', sa.Column('crisis_resource', sa.Boolean(), nullable=True, server_default=sa.text('true')))
54 |
55 |     op.drop_column('dictionary', 'category')
56 |     op.drop_column('dictionary', 'theme')
--------------------------------------------------------------------------------
/mockwebsite/cnews_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | https://www.cnews.fr/culture/2023-10-25/mort-de-richard-roundtree-samuel-l-jackson-gabrielle-union-carl-weathers-les
5 |
6 |
7 | CNEWS
8 | fr
9 |
10 | Mort de Richard Roundtree : Samuel L.
Jackson, Gabrielle Union, Carl Weathers… Les stars rendent hommage à l’acteur de «Shaft»
11 | 2023-10-25T08:51:25+00:00
12 | Cinéma, culture, Carnet noir, hommages, People
13 |
14 |
15 | https://static.cnews.fr/sites/default/files/richard_roundtree_hommages_6538c96cd0e46_0.jpg
16 |
17 |
18 |
19 |
20 | https://www.cnews.fr/france/2023-10-25/squat-de-saint-martin-du-touch-toulouse-pres-de-200-personnes-evacuees-1410951
21 |
22 |
23 | CNEWS
24 | fr
25 |
26 | Squat de Saint-Martin-du-Touch à Toulouse : près de 200 personnes évacuées
27 | 2023-10-25T08:47:27+00:00
28 | Squat, Toulouse, Squatteurs
29 |
30 |
31 | https://static.cnews.fr/sites/default/files/capture_decran_2023-10-25_a_10.10.05_6538ce23a0be6_0.png
32 |
33 |
34 |
35 |
36 | https://www.cnews.fr/videos/monde/2023-10-25/israel-hamas-des-que-jai-vu-lhorreur-je-suis-monte-dans-le-premier-avion
37 |
38 |
39 | CNEWS
40 | fr
41 |
42 | Israël-Hamas : «Dès que j'ai vu l'horreur, je suis monté dans le premier avion», explique un soldat de la réserve de Tsahal
43 | 2023-10-25T08:29:51+00:00
44 | Israël, Tsahal, Armée, Hamas
45 |
46 |
47 | https://static.cnews.fr/sites/default/files/Video/x8p2xa3_6538a94a625ad_0.jpg
48 | Témoignage d'un réserviste mobilisé en Israël
49 |
50 |
51 |
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 | import sys,time
4 | import os
5 | from postgres.insert_data import insert_data_in_sitemap_table
6 | from postgres.insert_existing_data_example import \
7 |     transformation_from_dumps_to_table_entry
8 | from postgres.schemas.models import create_tables, connect_to_db, get_last_month_sitemap_id
9 | from quotaclimat.utils.healthcheck_config import run_health_check_server
10 | from quotaclimat.utils.logger import CustomFormatter
11 | import sentry_sdk
12 | from sentry_sdk.crons import monitor
13 | from quotaclimat.utils.sentry import sentry_init
14 | import asyncio
15 | from quotaclimat.data_ingestion.scrap_sitemap import \
16 |     query_one_sitemap_and_transform, get_sitemap_list
17 |
18 |
19 |
20 | async def batch_sitemap(exit_event):
21 |     create_tables()
22 |
23 |     conn = connect_to_db()
24 |     sitemap_list = get_sitemap_list().items()
25 |     logging.info("Going to parse %s" % (sitemap_list))
26 |     df_from_pg = get_last_month_sitemap_id(conn)
27 |     for media, sitemap_conf in sitemap_list:
28 |         try:
29 |             df = await query_one_sitemap_and_transform(media, sitemap_conf, df_from_pg)
30 |             df_to_insert = transformation_from_dumps_to_table_entry(df)
31 |             await asyncio.to_thread(insert_data_in_sitemap_table, df_to_insert, conn)
32 |         except TypeError as err:
33 |             logging.debug("Asyncio error %s" % (err))
34 |             continue
35 |         except Exception as err:
36 |             logging.error("Could not ingest data in db for media %s:(%s) %s" % (media,type(err).__name__, err))
37 |             continue
38 |
39 |     logging.info("finished")
40 |     conn.dispose()
41 |     exit_event.set()
42 |     return
43 |
44 | async def main():
45 |     with monitor(monitor_slug='sitemap'): #https://docs.sentry.io/platforms/python/crons/
46 |         event_finish = asyncio.Event()
47 |         # Start the health check server in the background
48 |         health_check_task = asyncio.create_task(run_health_check_server())
49 |
50 |         # Start batch job
51 |         asyncio.create_task(batch_sitemap(event_finish))
52 |
53 |         # Wait for both tasks to complete
54 |         await event_finish.wait()
55 |
56 |
res=health_check_task.cancel() 57 | logging.info("Exiting with success") 58 | sys.exit(0) 59 | 60 | if __name__ == "__main__": 61 | # create logger with 'spam_application' 62 | logger = logging.getLogger() 63 | logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper()) 64 | sentry_init() 65 | # create console handler with a higher log level 66 | if (logger.hasHandlers()): 67 | logger.handlers.clear() 68 | ch = logging.StreamHandler() 69 | ch.setFormatter(CustomFormatter()) 70 | logger.addHandler(ch) 71 | 72 | asyncio.run(main()) 73 | sys.exit(0) 74 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/api_import_utils/db.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import logging 3 | from typing import Tuple 4 | from quotaclimat.data_processing.mediatree.utils import * 5 | from quotaclimat.data_processing.mediatree.config import * 6 | from postgres.schemas.models import Keywords 7 | from sqlalchemy.orm import Session 8 | from sqlalchemy import Select, select, func, cast, Date, Integer, text, and_ 9 | from quotaclimat.data_processing.mediatree.i8n.country import * 10 | from typing import NamedTuple 11 | 12 | class KeywordLastStats(NamedTuple): 13 | last_day_saved: date 14 | number_of_previous_days_from_yesterday: int 15 | 16 | # Security nets to catch up delays from production servers errors 17 | 18 | def get_last_date_and_number_of_delay_saved_in_keywords(session: Session, days_filter: int = 30, country = FRANCE) -> KeywordLastStats: 19 | logging.debug(f"get_last_date_and_number_of_delay_saved_in_keywords") 20 | try: 21 | source_subquery = ( 22 | select( 23 | Keywords.start.label("start"), 24 | cast( 25 | func.extract( 26 | "day", 27 | func.date_trunc("day", (func.now() - text("INTERVAL '1 day'"))) - func.date_trunc("day", Keywords.start), 28 | ), 29 | Integer, 30 | ).label("previous_days"), 31 | ) 32 | .select_from(Keywords) 33 | .where( 34 | and_( 35 | Keywords.start >= func.now() - text(f"INTERVAL '{days_filter} days'"), 36 | Keywords.country == country.name 37 | ) 38 | ) 39 | .subquery("source") 40 | ) 41 | 42 | statement: Select[Tuple[date, int]] = ( 43 | select( 44 | func.max(cast(source_subquery.c.start, Date)).label("last_day_saved"), 45 | func.min(source_subquery.c.previous_days).label("number_of_previous_days_from_yesterday"), 46 | ) 47 | ) 48 | 49 | result = session.execute(statement).fetchone() 50 | return KeywordLastStats(result[0], result[1]) 51 | except Exception as err: 52 | logging.error("get_top_keywords_by_channel crash (%s) %s" % (type(err).__name__, err)) 53 | raise err 54 | 55 | def get_delay_date(lastSavedKeywordsDate: KeywordLastStats, normal_delay_in_days: int = 1): 56 | logging.warning(f"Delay detected : {lastSavedKeywordsDate.number_of_previous_days_from_yesterday } days, it should be {normal_delay_in_days} day") 57 | default_start_date = get_epoch_from_datetime(datetime(lastSavedKeywordsDate.last_day_saved.year,lastSavedKeywordsDate.last_day_saved.month,lastSavedKeywordsDate.last_day_saved.day)) 58 | default_number_of_previous_days = lastSavedKeywordsDate.number_of_previous_days_from_yesterday 59 | return default_start_date, default_number_of_previous_days -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | secrets/pwd_api.txt 2 | secrets/username_api.txt 3 | secrets/* 4 | s3/* 5 | 
i8n/mediatree_output/ 6 | i8n/csa-belge/ 7 | documents-experts/* 8 | i8n/mediatree_output 9 | i8n/csa-belge 10 | 11 | i8n/germany_big.parquet 12 | test/i8n 13 | llm/ 14 | cc-bio.json 15 | *.xlsx 16 | coverage_re 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | pip-wheel-metadata/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | data/* 147 | .vscode/settings.json 148 | notebooks/nlp/df_all.csv 149 | notebooks/nlp/df_X_tfidf.pkl 150 | .vscode/settings.json 151 | 152 | .DS_Store 153 | pgdata 154 | mb-data 155 | .idea 156 | pgdump/ -------------------------------------------------------------------------------- /alembic/versions/ac96222af6fe_hrfp_counters.py: -------------------------------------------------------------------------------- 1 | """hrfp counters 2 | 3 | Revision ID: ac96222af6fe 4 | Revises: 30abfd828007 5 | Create Date: 2024-12-02 14:36:21.970968 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = 'ac96222af6fe' 16 | down_revision: Union[str, None] = '30abfd828007' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('number_of_changement_climatique_constat_no_hrfp', sa.Integer(), nullable=True)) 24 | op.add_column('keywords', sa.Column('number_of_changement_climatique_causes_no_hrfp', sa.Integer(), nullable=True)) 25 | op.add_column('keywords', sa.Column('number_of_changement_climatique_consequences_no_hrfp', sa.Integer(), nullable=True)) 26 | op.add_column('keywords', sa.Column('number_of_attenuation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True)) 27 | op.add_column('keywords', sa.Column('number_of_adaptation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True)) 28 | op.add_column('keywords', sa.Column('number_of_ressources_no_hrfp', sa.Integer(), nullable=True)) 29 | op.add_column('keywords', sa.Column('number_of_ressources_solutions_no_hrfp', sa.Integer(), nullable=True)) 30 | op.add_column('keywords', sa.Column('number_of_biodiversite_concepts_generaux_no_hrfp', sa.Integer(), nullable=True)) 31 | op.add_column('keywords', sa.Column('number_of_biodiversite_causes_no_hrfp', sa.Integer(), nullable=True)) 32 | op.add_column('keywords', sa.Column('number_of_biodiversite_consequences_no_hrfp', sa.Integer(), nullable=True)) 33 | op.add_column('keywords', sa.Column('number_of_biodiversite_solutions_no_hrfp', sa.Integer(), nullable=True)) 34 | # ### end Alembic commands ### 35 | 36 | 37 | def downgrade() -> None: 38 | # ### commands auto generated by Alembic - please adjust! ### 39 | op.drop_column('keywords', 'number_of_biodiversite_solutions_no_hrfp') 40 | op.drop_column('keywords', 'number_of_biodiversite_consequences_no_hrfp') 41 | op.drop_column('keywords', 'number_of_biodiversite_causes_no_hrfp') 42 | op.drop_column('keywords', 'number_of_biodiversite_concepts_generaux_no_hrfp') 43 | op.drop_column('keywords', 'number_of_ressources_solutions_no_hrfp') 44 | op.drop_column('keywords', 'number_of_ressources_no_hrfp') 45 | op.drop_column('keywords', 'number_of_adaptation_climatique_solutions_no_hrfp') 46 | op.drop_column('keywords', 'number_of_attenuation_climatique_solutions_no_hrfp') 47 | op.drop_column('keywords', 'number_of_changement_climatique_consequences_no_hrfp') 48 | op.drop_column('keywords', 'number_of_changement_climatique_causes_no_hrfp') 49 | op.drop_column('keywords', 'number_of_changement_climatique_constat_no_hrfp') 50 | # ### end Alembic commands ### 51 | -------------------------------------------------------------------------------- /mockwebsite/lefigaro_localhost_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | http://localhost:8000/mediapart_website.html 5 | 2023-10-12T17:34:28+02:00 6 | 7 | 8 | Le Figaro 9 | fr 10 | 11 | 2023-10-12T06:13:00+02:00 12 | EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi 13 | Israël, Hamas, conflit israélo-palestinien, International, actualité internationale, affaires étrangères, ministère des affaires étrangères, politique étrangère 14 | Blog 15 | 16 | 17 | https://i.f1g.fr/media/cms/orig/2023/10/12/eccf7495cede8869a8a35d6fd70a1635759a12dbef68dd16e82e34162f69ec4f.jpg 18 | Explosion dans le centre de la ville de Gaza ce jeudi 12 octobre. 
19 | 20 | 21 | 22 | http://localhost:8000/20minutes_website.html 23 | 2023-10-12T17:34:21+02:00 24 | 25 | 26 | Le Figaro 27 | fr 28 | 29 | 2023-10-11T16:16:00+02:00 30 | Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir 31 | grève, salaires, social, RH, ressources humaines, primes, conjoncture, entreprise, œuvres sociales, trséorerie, finance, comoité d'entreprise, elections syndicales, gestion entreprise, TPE, PME, PMI, CAC 40, fiscalité des entreprises, actualités sociales 32 | 33 | 34 | https://i.f1g.fr/media/cms/orig/2023/10/09/8f1062e1948f5c0abb930b0665ec4958613a74853c8fba9dfb7f374b3ec82065.jpg 35 | Grève: à quoi faut-il s’attendre ce 13 octobre ? 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /mockwebsite/20minutes_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.20minutes.fr/justice/4059662-20231027-prisons-proces-rugby-re-passe-heure-voiture-eric-dupond-moretti20minutes.frfr2023-10-27T10:07:37+02:00Prisons, procès, rugby… On a (re) passé une heure en voiture avec Éric Dupond-Morettihttps://img.20mn.fr/DWn2CVxERkK9ZEKE_2ASMyk/1200x768_eric-dupond-moretti-au-centre-a-inaugure-mercrediEric Dupond-Moretti (au centre) a inauguré mercredi le centre pénitentiaire de Troyes-Lavau, dans l'Aube, aux côtés du maire de Troyes, François Baroin (à droiteà, et celui de Lavau, Jacques Gachowski (à gauche)https://www.20minutes.fr/guide-achat/guide-achat-bon-plan-cdiscount/4059580-20231026-top-5-meilleures-trottinettes-electriques-petit-prix-chez-cdiscount20minutes.frfr2023-10-27T10:05:36+02:00Top 5 des meilleures trottinettes électriques à petit prix chez Cdiscounthttps://img.20mn.fr/ilZnoCiMQsyvdlq67n7upyk/1200x768_top-5-des-meilleures-trottinettes-electriques-a-petit-prix-chez-cdiscountTop 5 des meilleures trottinettes électriques à petit prix chez Cdiscounthttps://www.20minutes.fr/monde/etats-unis/4059735-20231027-fusillades-etats-unis-direct-police-americaine-toujours-recherche-robert-card20minutes.frfr2023-10-27T10:04:16+02:00Fusillades aux Etats-Unis EN DIRECT : La police américaine toujours à la recherche de Robert Card…https://img.20mn.fr/OB_g4z-PQ6yJwXKhJBgf5yk/1200x768_oct-26-2023-bowdoin-maine-usa-law-enforcement-officers-search-the-area-of-bowdoin-maine-the-day-after-a-suspect-killed-at-least-18-people-during-multiple-shootings-in-the-lewiston-area-mandatory-credit-camille-fine-usa-today-sipa-usa-49221769-zd5-2310270429Des agents des forces de l'ordre fouillent la zone de Bowdoin, dans le Maine, au lendemain du jour où un suspect a tué au moins 18 personnes lors de multiples fusillades dans la région de Lewiston. 
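<!-- Each mock entry above follows the Google News sitemap shape, roughly:
     <url><loc>…</loc><news:news><news:publication><news:name>…</news:name>
     <news:language>fr</news:language></news:publication>
     <news:publication_date>…</news:publication_date><news:title>…</news:title>
     </news:news><image:image><image:loc>…</image:loc><image:caption>…</image:caption></image:image></url> -->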
--------------------------------------------------------------------------------
/mockwebsite/lefigaro_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | http://nginxtest:80/mediapart_website.html
5 | 2023-10-12T17:34:28+02:00
6 |
7 |
8 | Le Figaro
9 | fr
10 |
11 | 2023-10-12T06:13:00+02:00
12 | EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi
13 | Israël, Hamas, conflit israélo-palestinien, International, actualité internationale, affaires étrangères, ministère des affaires étrangères, politique étrangère
14 | Blog
15 |
16 |
17 | https://i.f1g.fr/media/cms/orig/2023/10/12/eccf7495cede8869a8a35d6fd70a1635759a12dbef68dd16e82e34162f69ec4f.jpg
18 | Explosion dans le centre de la ville de Gaza ce jeudi 12 octobre.
19 |
20 |
21 |
22 | http://nginxtest:80/20minutes_website.html
23 | 2023-10-12T17:34:21+02:00
24 |
25 |
26 | Le Figaro
27 | fr
28 |
29 | 2023-10-11T16:16:00+02:00
30 |
31 | Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir
32 | grève, salaires, social, RH, ressources humaines, primes, conjoncture, entreprise, œuvres sociales, trséorerie, finance, comoité d'entreprise, elections syndicales, gestion entreprise, TPE, PME, PMI, CAC 40, fiscalité des entreprises, actualités sociales
33 |
34 |
35 | https://i.f1g.fr/media/cms/orig/2023/10/09/8f1062e1948f5c0abb930b0665ec4958613a74853c8fba9dfb7f374b3ec82065.jpg
36 | Grève: à quoi faut-il s’attendre ce 13 octobre ?
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/my_dbt_project/models/dashboards/core_query_thematics_keywords_i8n.sql:
--------------------------------------------------------------------------------
1 | {{ config(
2 |     materialized='incremental'
3 |     ,unique_key=['week','channel_title']
4 |     )
5 | }}
6 |
7 | WITH keyword_occurrences AS (
8 |     SELECT DISTINCT
9 |         COALESCE(pm.channel_title, k.channel_title) AS channel_title,
10 |         DATE_TRUNC('week', k.start)::date AS week,
11 |         k.start AS occurrence_time,
12 |         k.country AS country,
13 |         -- Semantic tags
14 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%solution%' THEN TRUE ELSE FALSE END AS is_solution,
15 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%consequence%' THEN TRUE ELSE FALSE END AS is_consequence,
16 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%cause%' THEN TRUE ELSE FALSE END AS is_cause,
17 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%concepts_generaux%' THEN TRUE ELSE FALSE END AS is_general_concepts,
18 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%constat%' THEN TRUE ELSE FALSE END AS is_statement,
19 |         -- Crisis type
20 |         CASE
21 |             WHEN LOWER(kw ->> 'theme') LIKE '%climat%' THEN 'Crise climatique'
22 |             WHEN LOWER(kw ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité'
23 |             WHEN LOWER(kw ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources'
24 |             ELSE 'Autre'
25 |         END AS crise_type,
26 |         kw ->> 'theme' AS theme,
27 |         kw ->> 'keyword' AS keyword
28 |     FROM public.keywords k
29 |     LEFT JOIN public.program_metadata pm
30 |         ON k.channel_program = pm.channel_program
31 |         AND k.channel_name = pm.channel_name
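32 |         -- The CASE below converts Postgres EXTRACT(DOW ...) (Sunday=0 .. Saturday=6)
33 |         -- to the Monday=1 .. Sunday=7 convention compared against pm.weekday:
34 |         -- adding 7 changes nothing modulo 7, and Sunday's 0 is remapped to 7.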
35 |         AND (
36 |             (
37 |                 CASE
38 |                     WHEN ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7) = 0 THEN 7
39 |                     ELSE ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7)
40 |                 END = pm.weekday
41 |             )
42 |         )
43 |         -- AND k.country = pm.country
44 |         AND CAST(k.start AS date) BETWEEN CAST(pm.program_grid_start AS date)
45 |             AND CAST(pm.program_grid_end AS date)
46 |     , json_array_elements(k.keywords_with_timestamp::json) AS kw
47 |     WHERE
48 |         LOWER(kw ->> 'theme') NOT LIKE '%indirect%'
49 | )
50 |
51 | SELECT
52 |     ko.channel_title,
53 |     ko.country,
54 |     ko.week,
55 |     COALESCE(NULLIF(d.category, ''), 'Transversal') AS category,
56 |     d.high_risk_of_false_positive,
57 |     ko.is_solution,
58 |     ko.is_consequence,
59 |     ko.is_cause,
60 |     ko.is_general_concepts,
61 |     ko.is_statement,
62 |     ko.crise_type,
63 |     ko.theme,
64 |     ko.keyword,
65 |     COUNT(*) AS count
66 | FROM keyword_occurrences ko
67 | LEFT JOIN public.dictionary d
68 |     ON d.keyword = ko.keyword AND d.theme LIKE ko.theme || '%' -- ensure match with indirect themes inside the dictionary table
69 | GROUP BY
70 |     ko.country,
71 |     ko.channel_title,
72 |     ko.week,
73 |     d.high_risk_of_false_positive,
74 |     COALESCE(NULLIF(d.category, ''), 'Transversal'),
75 |     ko.is_solution,
76 |     ko.is_consequence,
77 |     ko.is_cause,
78 |     ko.is_general_concepts,
79 |     ko.is_statement,
80 |     ko.crise_type,
81 |     ko.theme,
82 |     ko.keyword
83 | ORDER BY
84 |     ko.channel_title, ko.week, ko.crise_type
--------------------------------------------------------------------------------
/test/sitemap/test_mediatree_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 |
4 | from test_utils import get_localhost
5 | from quotaclimat.data_processing.mediatree.utils import *
6 |
7 | import logging
8 | from time import strftime,localtime
9 |
10 | localhost = get_localhost()
11 |
12 | def test_get_date_sql_query():
13 |     date = datetime(2024, 12, 12, 0, 0, 0)
14 |     expected = "'2024-12-12 00:00:00.000 +00:00'"
15 |
16 |     assert get_date_sql_query(date) == expected
17 |
18 | def test_get_yesterday():
19 |     yesterday = get_yesterday()
20 |     yesterday_string = strftime('%Y-%m-%d %H:%M:%S', localtime(yesterday))
21 |     logging.info(f"yesterday_string {yesterday_string}")
22 |     assert '00:00:00' in yesterday_string
23 |
24 | def test_is_it_tuesday():
25 |     date = pd.Timestamp("2024-02-13 15:34:28")
26 |     assert is_it_tuesday(date) == True
27 |
28 |     date = pd.Timestamp("2024-01-01 15:34:28")
29 |     assert is_it_tuesday(date) == False
30 |
31 | def test_get_end_of_month():
32 |     assert get_end_of_month("2024-04-01") == "2024-04-30"
33 |     assert get_end_of_month("2024-02-01") == "2024-02-29"
34 |     assert get_end_of_month("2024-02-15") == "2024-02-29"
35 |
36 | def test_get_first_of_month():
37 |     date = datetime(2024, 12, 12, 0, 0, 0)
38 |     assert get_first_of_month(date) == "2024-12-01"
39 |
40 | def test_get_date_now_minus_days():
41 |     date = datetime(2024, 12, 12, 0, 0, 0)
42 |     assert get_date_now_minus_days(start=date, minus_days=6) == "2024-12-06"
43 |     assert get_date_now_minus_days(start=date, minus_days=13) == "2024-11-29"
44 |
45 |
46 | def test_get_start_end_date_env_variable_with_default():
47 |     start_date = 0
48 |
49 |     assert get_start_end_date_env_variable_with_default(start_date, minus_days=1) == (get_yesterday(), None)
50 |
51 | def test_get_start_end_date_env_variable_with_start_date_value():
52 |     start_date = 1734508085
53 |     number_of_previous_days = 7
54 |     start_date_minus_days = start_date - (number_of_previous_days * 24 * 60 * 60)
55 |
56 |     assert get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days) == (int(start_date), start_date_minus_days)
57 |
58 | def test_get_start_end_date_with_get_date_range():
59 |     start_date =
1734508085 60 | number_of_previous_days = 7 61 | (start,end) = get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days) 62 | 63 | expected = pd.DatetimeIndex(['2024-12-11', '2024-12-12', '2024-12-13', '2024-12-14', '2024-12-15', '2024-12-16', '2024-12-17', '2024-12-18'], 64 | dtype='datetime64[ns]', freq='D') 65 | 66 | output = get_date_range(start,end) 67 | assert len(output) == number_of_previous_days + 1 68 | pd.testing.assert_index_equal(output, expected) 69 | 70 | def test_get_start_end_date_with_get_date_range_default(): 71 | start_date = 0 72 | number_of_previous_days = 7 73 | (start,end) = get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days) 74 | 75 | 76 | output = get_date_range(start,end, minus_days=number_of_previous_days) 77 | assert len(output) == number_of_previous_days -------------------------------------------------------------------------------- /test/s3/test_s3.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | from quotaclimat.data_processing.mediatree.s3.api_to_s3 import get_bucket_key, get_bucket_key_folder, get_partition_s3 4 | from quotaclimat.data_processing.mediatree.s3.s3_utils import read_folder_from_s3, transform_raw_keywords 5 | from quotaclimat.data_processing.mediatree.channel_program import * 6 | from quotaclimat.data_processing.mediatree.i8n.country import * 7 | 8 | def test_get_bucket_key_default(): 9 | friday_6h26 = 1726719981 10 | date = pd.to_datetime(friday_6h26, unit='s', utc=True) 11 | channel = "tf1" 12 | assert get_bucket_key(date, channel) == "year=2024/month=9/day=19/channel=tf1/*.parquet" 13 | 14 | def test_get_bucket_key_france(): 15 | friday_6h26 = 1726719981 16 | date = pd.to_datetime(friday_6h26, unit='s', utc=True) 17 | channel = "tf1" 18 | assert get_bucket_key(date, channel, country=FRANCE) == "year=2024/month=9/day=19/channel=tf1/*.parquet" 19 | 20 | def test_get_bucket_key_country(): 21 | friday_6h26 = 1726719981 22 | date = pd.to_datetime(friday_6h26, unit='s', utc=True) 23 | channel = "tf1" 24 | assert get_bucket_key(date, channel, country=GERMANY) == f"country={GERMANY.name}/year=2024/month=9/day=19/channel=tf1/*.parquet" 25 | 26 | def test_get_bucket_key_first_of_the_month(): 27 | first_december = 1733040125 28 | date = pd.to_datetime(first_december, unit='s', utc=True) 29 | channel = "tf1" 30 | assert get_bucket_key(date, channel) == "year=2024/month=12/day=1/channel=tf1/*.parquet" 31 | 32 | def test_get_bucket_key_first_of_the_month_default(): 33 | first_december = 1733040125 34 | date = pd.to_datetime(first_december, unit='s', utc=True) 35 | channel = "tf1" 36 | assert get_bucket_key_folder(date, channel) == "year=2024/month=12/day=1/channel=tf1/" 37 | 38 | def test_get_bucket_key_first_of_the_month_france(): 39 | first_december = 1733040125 40 | date = pd.to_datetime(first_december, unit='s', utc=True) 41 | channel = "tf1" 42 | key_folder = f"year=2024/month=12/day=1/channel=tf1/" 43 | assert get_bucket_key_folder(date, channel, country=FRANCE) == key_folder 44 | 45 | def test_get_bucket_key_first_of_the_month_brazil(): 46 | first_december = 1733040125 47 | date = pd.to_datetime(first_december, unit='s', utc=True) 48 | channel = "tf1" 49 | key_folder = f"country={BRAZIL.name}/year=2024/month=12/day=1/channel=tf1/" 50 | assert get_bucket_key_folder(date, channel, country=BRAZIL) == key_folder 51 | 52 | def test_get_partition_s3_france_legacy(): 53 | assert 
get_partition_s3(FRANCE) == ['year', 'month', 'day', 'channel'] 54 | 55 | def test_get_partition_s3_other_country_than_france(): 56 | assert get_partition_s3(GERMANY) == ['country','year', 'month', 'day', 'channel'] 57 | assert get_partition_s3(BRAZIL) == ['country','year', 'month', 'day', 'channel'] 58 | 59 | # TODO need to mock s3 reads 60 | # def test_read_folder_from_s3(): 61 | # first_december = 1733040125 62 | # date = pd.to_datetime(first_december, unit='s', utc=True) 63 | # read_folder_from_s3(date=date, channel="tf1", storage_options=None) 64 | 65 | # assert False == True 66 | 67 | def test_transform_raw_keywords(): 68 | df= pd.read_parquet(path="test/s3/one-day-one-channel.parquet") 69 | df_programs = get_programs() 70 | output = transform_raw_keywords(df, df_programs=df_programs) 71 | 72 | assert len(output) == 31 -------------------------------------------------------------------------------- /mockwebsite/lacroix_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.la-croix.com/Evasion-Reau-helicoptere-heure-verdict-Redoine-Faid-2023-10-24-13012881582023-10-25T09:49:48+01:00La Croixfr2023-10-24T23:56:04+01:00Evasion par hélicoptère de Rédoine Faïd: le verdict attendu en fin d'après-midiprocès, prison, prisonniers, évasion, assises, 75http://i.la-croix.com/x/2023/10/24/1301288158/Croquis-audience-Redoine-Faid-ouverture-proces-devant-assises-Paris-5-septembre-2023_0.jpgCroquis d'audience de Rédoine Faïd à l'ouverture de son procès devant la cour d'assises de Paris, le 5 septembre 2023 https://www.la-croix.com/international/guerre-israel-hamas-jour-19-attaque-bande-gaza-otages-liban-resume-2023-10-25-12012881672023-10-25T09:36:14+01:00La Croixfr2023-10-25T05:16:56+01:00Guerre Israël-Hamas : Macron à Amman puis au Caire, 80 morts à Gaza selon le Hamasconflit israélo-palestinien, Israël, Hamas, Moyen-Orienthttp://i.la-croix.com/x/2023/10/25/1201288167/camions-daide-humanitaire-attendent-pouvoir-franchir-passage-Rafah-permettant-dacceder-bande-Gaza-Egypte-24-octobre-2023_0.jpgDes camions d’aide humanitaire attendent de pouvoir franchir le passage de Rafah permettant d’accéder à la bande de Gaza, en Égypte, le 24 octobre 2023.https://www.la-croix.com/debat/Vie-destin-saint-Crepin-2023-10-25-12012881852023-10-25T09:29:28+01:00La Croixfr2023-10-25T09:29:28+01:00Vie et destin de saint CrépinAlain Rémond, Chroniqueshttp://i.la-croix.com/x/2023/10/25/1201288185/Alain-Remond_0.jpgAlain Rémond.https://www.la-croix.com/Boxe-Naoya-Inoue-defier-Marlon-Tapales-devenir-roi-inconteste-super-coqs-2023-10-25-13012881842023-10-25T09:26:10+01:00La Croixfr2023-10-25T09:26:10+01:00Boxe: Naoya Inoue va défier Marlon Tapales pour devenir le roi incontesté des super-coqsBox, JPN, Inoue, PHI, Tapaleshttp://i.la-croix.com/x/2023/10/25/1301288184/boxeur-japonais-Naoya-Inoue-25-octobre-2023-Yokohama_0.jpgLe boxeur japonais Naoya Inoue, le 25 octobre 2023 à Yokohama -------------------------------------------------------------------------------- /mockwebsite/midilibre_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.midilibre.fr/2023/10/24/emmanuel-macron-en-israel-le-president-annonce-que-les-sept-kidnappes-francais-sont-bien-vivantss-11539314.php 5 | 6 | 7 | Midi Libre 8 | fr 9 | 10 | 2023-10-24T10:01:57+02:00 11 | Les neuf "kidnappés Français" par le Hamas sont "bien vivants", annonce Emmanuel Macron en visite en Israël 12 | Attaque du Hamas contre Israël, Emmanuel Macron 13 | 
14 | 15 | https://images.midilibre.fr/api/v1/images/view/653760e38756005f7e7a81d9/hd/image.jpg?v=1 16 | Les neuf "kidnappés Français" par le Hamas sont "bien vivants", annonce Emmanuel Macron en visite en Israël 17 | 18 | 19 | 20 | https://www.midilibre.fr/2023/10/24/controle-technique-des-deux-roues-motos-scooters-comment-la-mesure-va-t-elle-etre-mise-en-place-a-partir-de-2024-11539363.php 21 | 22 | 23 | Midi Libre 24 | fr 25 | 26 | 2023-10-24T10:01:03+02:00 27 | Contrôle technique des deux roues : motos, scooters... comment la mesure va-t-elle être mise en place à partir de 2024 28 | Auto-moto 29 | 30 | 31 | https://images.midilibre.fr/api/v1/images/view/6537772054da116cc865b469/hd/image.jpg?v=1 32 | Contrôle technique des deux roues : motos, scooters... comment la mesure va-t-elle être mise en place à partir de 2024 33 | 34 | 35 | 36 | https://www.midilibre.fr/2023/10/24/podcast-comment-les-caves-cooperatives-viticoles-sont-nees-et-quel-avenir-pour-ces-structures-aujourdhui-11532063.php 37 | 38 | 39 | Midi Libre 40 | fr 41 | 42 | 2023-10-24T10:06:02+02:00 43 | PODCAST. Comment les caves coopératives viticoles sont nées et quel avenir pour ces structures aujourd'hui 44 | Podcasts, Viticulture, Aude 45 | 46 | 47 | https://images.midilibre.fr/api/v1/images/view/6530efb5eea84505924071ba/hd/image.jpg?v=1 48 | PODCAST. Comment les caves coopératives viticoles sont nées et quel avenir pour ces structures aujourd'hui 49 | 50 | 51 | -------------------------------------------------------------------------------- /postgres/insert_data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import pandas as pd 5 | from sqlalchemy import DateTime 6 | from sqlalchemy.dialects.postgresql import insert 7 | from sqlalchemy import JSON 8 | from postgres.schemas.models import sitemap_table, Keywords, Stop_Word, keywords_table 9 | from datetime import datetime 10 | 11 | def clean_data(df: pd.DataFrame): 12 | df = df.drop_duplicates(subset="id") 13 | return df.query("id != 'empty'") # TODO improve - should be a None ? 
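# Illustrative sketch (not part of the original module, names below are hypothetical):
# the UPSERT helpers that follow implement pandas' custom-insertion callback for
# DataFrame.to_sql(method=...). pandas invokes the callable once per chunk as
# callable(table, conn, keys, data_iter), where `table` is a pandas SQLTable wrapper
# (hence `table.table` for the underlying SQLAlchemy Table), `keys` is the list of
# column names, and `data_iter` yields row tuples. A minimal callback with the same
# signature could look like:
#
# def log_only(table, conn, keys, data_iter):
#     rows = [dict(zip(keys, row)) for row in data_iter]
#     logging.debug("would insert %s rows into %s", len(rows), table.table.name)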
14 | 15 | ## UPSERT 16 | def insert_or_update_on_conflict(table, conn, keys, data_iter): 17 | data = [dict(zip(keys, row)) for row in data_iter] 18 | insert_stmt = insert(table.table).values(data) 19 | # pk for tables 20 | if table.table.name == keywords_table: 21 | pk = ("id", "start") # pk of keywords 22 | else: 23 | pk = ("id",) 24 | 25 | upsert_stmt = insert_stmt.on_conflict_do_update( 26 | index_elements=list(pk), 27 | set_={k: insert_stmt.excluded[k] for k in keys if k not in pk} 28 | ) 29 | 30 | return conn.execute(upsert_stmt) 31 | 32 | # despite its name, this updates the row when the primary key already exists (upsert on the table's _pkey constraint) 33 | # from https://stackoverflow.com/a/69421596/3535853 34 | def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter): 35 | data = [dict(zip(keys, row)) for row in data_iter] 36 | 37 | insert_statement = insert(table.table).values(data) 38 | 39 | on_duplicate_key_stmt = insert_statement.on_conflict_do_update( 40 | constraint=f"{table.table.name}_pkey", 41 | set_={c.key: c for c in insert_statement.excluded}, 42 | ) 43 | 44 | return conn.execute(on_duplicate_key_stmt) 45 | 46 | def show_sitemaps_dataframe(df: pd.DataFrame): 47 | try: 48 | df_tmp = df.groupby(by="id").size().reset_index(name="count").nlargest(5, "count") 49 | df_final = df_tmp[df_tmp['count'] > 1] 50 | if df_final.empty: 51 | logging.debug("No duplicates detected") 52 | else: 53 | logging.warning("Duplicates to remove : %s out of %s" % (len(df_final), len(df))) 54 | except Exception as err: 55 | logging.warning("Could not show sitemap before saving : \n %s \n %s" % (err, df.head(1).to_string())) 56 | 57 | 58 | def save_to_pg(df, table, conn): 59 | number_of_elements = len(df) 60 | logging.info(f"Saving {number_of_elements} elements to PG table '{table}'") 61 | 62 | try: 63 | logging.debug("Schema before saving\n%s", df.dtypes) 64 | if table == keywords_table: 65 | df['updated_at'] = datetime.now() 66 | 67 | df.to_sql( 68 | table, 69 | index=False, 70 | con=conn, 71 | if_exists="append", 72 | chunksize=1000, 73 | method=insert_or_update_on_conflict, # TODO upsert 74 | dtype={"keywords_with_timestamp": JSON, "theme": JSON, "srt": JSON}, # only for keywords 75 | ) 76 | logging.info("Saved dataframe to PG") 77 | return len(df) 78 | except Exception as err: 79 | logging.error("Could not save : \n %s" % (err)) 80 | raise err 81 | 82 | def insert_data_in_sitemap_table(df: pd.DataFrame, conn): 83 | number_of_rows = len(df) 84 | if number_of_rows == 0: 85 | logging.warning("0 elements to parse") 86 | else: 87 | logging.info("Received %s elements", number_of_rows) 88 | 89 | show_sitemaps_dataframe(df) 90 | 91 | df = clean_data(df) 92 | save_to_pg(df, sitemap_table, conn) 93 | 94 | -------------------------------------------------------------------------------- /test/mediatree/test_mediatree_queries.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from sqlalchemy import Engine 4 | 5 | from quotaclimat.data_processing.mediatree.stop_word.main import * 6 | from postgres.schemas.models import get_db_session, connect_to_db, drop_tables 7 | from quotaclimat.data_processing.mediatree.api_import_utils.db import * 8 | from postgres.insert_data import save_to_pg 9 | from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db, drop_tables, empty_tables,keywords_table 10 | from datetime import date 11 | from quotaclimat.data_processing.mediatree.update_pg_keywords import * 12 | 13 | conn = connect_to_db() 14 | session =
get_db_session(conn) 15 | 16 | 17 | 18 | def test_mediatree_get_last_date_and_number_of_delay_saved_in_keywords(): 19 | conn: Engine = connect_to_db() 20 | create_tables(conn) 21 | session = get_db_session(conn) 22 | start = pd.to_datetime("2025-01-26 12:18:54", utc=True).tz_convert('Europe/Paris') 23 | wrong_value = 1 24 | pk = "delete_me" 25 | df = pd.DataFrame([{ 26 | "id" : pk, 27 | "start": start, 28 | "plaintext": "test", 29 | "channel_name": "test", 30 | "channel_radio": False, 31 | "theme":[], 32 | "keywords_with_timestamp": [], 33 | "srt": [], 34 | "number_of_keywords": wrong_value, # wrong data to reapply our custom logic for "new_value" 35 | "number_of_changement_climatique_constat": wrong_value, 36 | "number_of_changement_climatique_causes_directes": wrong_value, 37 | "number_of_changement_climatique_consequences": wrong_value, 38 | "number_of_attenuation_climatique_solutions_directes": wrong_value, 39 | "number_of_adaptation_climatique_solutions_directes": wrong_value, 40 | "number_of_ressources": wrong_value, 41 | "number_of_ressources_solutions": wrong_value, 42 | "number_of_biodiversite_concepts_generaux": wrong_value, 43 | "number_of_biodiversite_causes_directes": wrong_value, 44 | "number_of_biodiversite_consequences": wrong_value, 45 | "number_of_biodiversite_solutions_directes" : wrong_value, 46 | "channel_program_type": "to change", 47 | "channel_program":"to change" 48 | ,"program_metadata_id":"336643dc7fa09ac7335a4ceba43270ed3f553be3383a9b3b6e3cced101f2a87a" 49 | ,"channel_title":"channel_title" 50 | ,"number_of_keywords_climat": wrong_value 51 | ,"number_of_keywords_biodiversite": wrong_value 52 | ,"number_of_keywords_ressources": wrong_value 53 | ,"country" :"france" 54 | }]) 55 | 56 | save_to_pg(df, keywords_table, conn) 57 | 58 | keywordStats = get_last_date_and_number_of_delay_saved_in_keywords(session, days_filter=3000) 59 | expected_max_date = KeywordLastStats(date(2025, 1, 26), 2) 60 | 61 | assert expected_max_date.last_day_saved == keywordStats.last_day_saved 62 | assert keywordStats.number_of_previous_days_from_yesterday > 1 63 | delete_keywords_id(session, pk) 64 | session.commit() 65 | session.close() 66 | 67 | 68 | def test_get_delay_date(): 69 | unixtimestamp_2025_01_26 = 1737849600 70 | expected_max_date = KeywordLastStats(date(2025, 1, 26), 2) 71 | default_start_date, default_number_of_previous_days = get_delay_date(expected_max_date, normal_delay_in_days=1) 72 | 73 | assert default_start_date == unixtimestamp_2025_01_26 74 | assert default_number_of_previous_days == 2 -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from sqlalchemy import create_engine 4 | from postgres.schemas.base import Base 5 | from quotaclimat.data_ingestion.labelstudio.models import TargetBase 6 | from alembic import context 7 | 8 | import re 9 | import os 10 | 11 | # this is the Alembic Config object, which provides 12 | # access to the values within the .ini file in use. 13 | config = context.config 14 | 15 | # Interpret the config file for Python logging. 16 | # This line sets up loggers basically. 
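# Illustrative sketch (not executed by this env.py, key name below is hypothetical):
# besides sqlalchemy.url, arbitrary keys can be read from the [alembic] section of
# alembic.ini through the same Config object:
#
# my_schema = config.get_main_option("my_schema", "public")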
17 | if config.config_file_name is not None: 18 | fileConfig(config.config_file_name) 19 | 20 | # add your model's MetaData object here 21 | # for 'autogenerate' support 22 | # from myapp import mymodel 23 | # target_metadata = mymodel.Base.metadata 24 | target_metadata = [Base.metadata, TargetBase.metadata] 25 | 26 | # from https://stackoverflow.com/a/63672522/3535853 27 | # https://alembic.sqlalchemy.org/en/latest/cookbook.html#don-t-generate-any-drop-table-directives-with-autogenerate 28 | def include_object(object, name, type_, reflected, compare_to): 29 | if type_ == "table" and reflected and compare_to is None: 30 | return False 31 | else: 32 | return True 33 | 34 | # other values from the config, defined by the needs of env.py, 35 | # can be acquired: 36 | # my_important_option = config.get_main_option("my_important_option") 37 | # ... etc. 38 | 39 | 40 | def run_migrations_offline() -> None: 41 | """Run migrations in 'offline' mode. 42 | 43 | This configures the context with just a URL 44 | and not an Engine, though an Engine is acceptable 45 | here as well. By skipping the Engine creation 46 | we don't even need a DBAPI to be available. 47 | 48 | Calls to context.execute() here emit the given string to the 49 | script output. 50 | 51 | """ 52 | url = config.get_main_option("sqlalchemy.url") 53 | context.configure( 54 | url=url, 55 | target_metadata=target_metadata, 56 | literal_binds=True, 57 | dialect_opts={"paramstyle": "named"}, 58 | include_object=include_object 59 | ) 60 | 61 | with context.begin_transaction(): 62 | context.run_migrations() 63 | 64 | 65 | def run_migrations_online() -> None: 66 | """Run migrations in 'online' mode. 67 | 68 | In this scenario we need to create an Engine 69 | and associate a connection with the context. 
70 | 71 | """ 72 | url_tokens = { 73 | "POSTGRES_USER": os.getenv("POSTGRES_USER",""), 74 | "POSTGRES_DB": os.getenv("POSTGRES_DB",""), 75 | "POSTGRES_PASSWORD": os.getenv("POSTGRES_PASSWORD",""), 76 | "POSTGRES_HOST": os.getenv("POSTGRES_HOST",""), 77 | "POSTGRES_PORT": os.getenv("POSTGRES_PORT","") 78 | } 79 | 80 | url = config.get_main_option("sqlalchemy.url") 81 | 82 | url = re.sub(r"\${(.+?)}", lambda m: url_tokens[m.group(1)], url) 83 | 84 | connectable = create_engine(url) 85 | 86 | with connectable.connect() as connection: 87 | context.configure( 88 | connection=connection, 89 | target_metadata=target_metadata, 90 | compare_type=True, 91 | compare_server_default=True, 92 | include_object=include_object 93 | ) 94 | 95 | with context.begin_transaction(): 96 | context.run_migrations() 97 | 98 | if context.is_offline_mode(): 99 | run_migrations_offline() 100 | else: 101 | run_migrations_online() 102 | -------------------------------------------------------------------------------- /analyse/mediatree/test_program_durations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "fa23a75a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "ce7a2095", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "df = pd.read_csv(\"data/mediatree_channel_coverages_2025-12-15\")\n", 22 | "df.date = pd.to_datetime(df.date)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "44b06fa4", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "df.date.max().strftime(\"%d %b\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "a36a6874", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "e638c622", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import plotly.express as px\n", 51 | "import os\n", 52 | "\n", 53 | "for country, group in df.groupby(\"country\"):\n", 54 | " start_date = group.date.min().strftime(\"%d %B\")\n", 55 | " end_date = group.date.max().strftime(\"%d %B\")\n", 56 | " fig = px.line(group, x=\"date\", y=\"coverage\", color='channel_name', title=f\"{country.title()}: {start_date} - {end_date}\")\n", 57 | " os.makedirs(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}\", exist_ok=True)\n", 58 | " fig.write_image(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}/coverage_{country}_chains_{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}.png\")\n", 59 | " fig.show()\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "45d55028", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "for country, group in df.groupby(\"country\"):\n", 70 | " start_date = group.date.min().strftime(\"%d %B\")\n", 71 | " end_date = group.date.max().strftime(\"%d %B\")\n", 72 | " df_mean = group.groupby(\"date\").agg({\"coverage\": \"mean\"})\n", 73 | " fig = px.line(df_mean, y=\"coverage\", title=f\"Mean Coverage {country.title()}: {start_date} - {end_date}\")\n", 74 | " 
fig.write_image(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}/coverage_{country}_mean_{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}.png\")\n", 75 | " fig.show()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "727893ea", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": ".venv", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.11.6" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 5 108 | } 109 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/time_monitored/models.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column, DateTime, String, Text, Boolean, ARRAY, JSON, Integer, Table, MetaData, ForeignKey, PrimaryKeyConstraint 5 | from sqlalchemy.orm import declarative_base, sessionmaker, relationship 6 | from sqlalchemy.exc import SQLAlchemyError 7 | from sqlalchemy.dialects.postgresql import insert 8 | import pandas as pd 9 | from sqlalchemy import text 10 | from postgres.database_connection import connect_to_db, get_db_session 11 | from postgres.schemas.base import Base 12 | from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS 13 | from quotaclimat.data_processing.mediatree.i8n.country import FRANCE 14 | from quotaclimat.data_ingestion.scrap_sitemap import get_consistent_hash 15 | import os 16 | import json 17 | from json import JSONDecodeError 18 | 19 | 20 | import traceback 21 | 22 | # The duration in minutes of media monitoring based on number of chunks of 2 minutes saved in S3 23 | class Time_Monitored(Base): 24 | __tablename__ = "time_monitored" 25 | id = Column(Text, primary_key=True) 26 | channel_name = Column(String, nullable=False) 27 | start = Column(DateTime(), nullable=False) 28 | duration_minutes= Column(Integer) 29 | country = Column(String, nullable=False) 30 | 31 | def get_time_monitored(id: str): 32 | session = get_db_session() 33 | return session.get(Time_Monitored, id) 34 | 35 | # count how many rows are in the dataframe and save it to postgresql inside a new table called time_monitor 36 | def save_time_monitored(number_of_rows : int, day: datetime, channel :str, country : str,session=None): 37 | """ 38 | Save the number of rows (chunk) to the time_monitor table in PostgreSQL. 39 | 40 | Args: 41 | number_of_rows (int): The number of rows (2 minute chunk) to save. 42 | day (datetime): The date of the monitoring. 43 | channel (str): The name of the channel. 44 | country (str): The country name. 45 | """ 46 | try: 47 | duration_minutes = number_of_rows * 2 # 2 minutes per chunk 48 | logging.info(f"Saving time monitored of {duration_minutes} minutes ({number_of_rows} chunks of 2 minutes) for {day} - {channel} - {country}") 49 | max_hours = 23 50 | if duration_minutes / 60 > max_hours: 51 | logging.error(f"Duration of {duration_minutes / 60} hours is above {max_hours} hours. 
Please check the data.") 52 | 53 | if session is None: 54 | session = get_db_session() 55 | 56 | stmt = insert(Time_Monitored).values( 57 | id=get_consistent_hash(f"{channel}_{day}_{country}"), 58 | channel_name=channel, 59 | start=day, 60 | duration_minutes=duration_minutes, 61 | country=country 62 | ) 63 | # upsert 64 | stmt = stmt.on_conflict_do_update( 65 | index_elements=['id'], # Use the 'id' column as the conflict target 66 | set_={ 67 | 'channel_name': stmt.excluded.channel_name, 68 | 'start': stmt.excluded.start, 69 | 'duration_minutes': stmt.excluded.duration_minutes, 70 | 'country': stmt.excluded.country 71 | } 72 | ) 73 | 74 | # Execute the statement 75 | session.execute(stmt) 76 | 77 | session.commit() 78 | logging.info("Saved time monitored") 79 | except SQLAlchemyError as e: 80 | logging.error(f"Error saving time monitored data: {e}") 81 | logging.error(traceback.format_exc()) 82 | finally: 83 | session.close() -------------------------------------------------------------------------------- /test/sitemap/test_keywords.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pandas as pd 4 | from quotaclimat.data_processing.mediatree.utils import * 5 | from quotaclimat.data_processing.mediatree.detect_keywords import * 6 | from quotaclimat.data_processing.mediatree.keyword.stop_words import STOP_WORDS 7 | 8 | def test_get_remove_stopwords_recycler(): 9 | stop_words_list = [ 10 | "recycler" 11 | ] 12 | ad = "nous les recycler pour en faire de nouvelles en fabriquant nous-mêmes du plastique recyclé pour cela nous avons créé trois usines exclusivement dédié au recyclage dès cette année cristallines est capable de recycler autant de bouteilles" 13 | 14 | assert remove_stopwords(ad, stop_words_list) == "nous les pour en faire de nouvelles en fabriquant nous-mêmes du plastique recyclé pour cela nous avons créé trois usines exclusivement dédié au recyclage dès cette année cristallines est capable de autant de bouteilles" 15 | 16 | def test_get_remove_stopwords_no_modification(): 17 | stop_words_list = [ 18 | "recycler" 19 | ] 20 | ad = "no keywords" 21 | 22 | assert remove_stopwords(ad, stop_words_list) == ad 23 | 24 | def test_remove_stopwords_huile(): 25 | stop_words_list = [ 26 | "recycler", 27 | "huile de coude était aussi une énergie renouvelable", 28 | "est à fond sur le tri sélectif" 29 | ] 30 | assert remove_stopwords("l' huile de coude était aussi une énergie renouvelable stéphane est à fond sur le tri sélectif",stop_words_list) \ 31 | == "l' stéphane " 32 | 33 | 34 | def test_remove_stopwords_energie(): 35 | plaintext = "quand le prix de l' énergie augmente il y a ceux qui se couvre plus ceux qui sortent moins et il y a ceux qui choisissent d' optimiser leurs énergies panneaux solaires isolations thermique pompes à chaleur chaque jour fleuron industrie parcourt la france pour vous aider à optimiser votre énergie florent industries point com en ce moment la centrale photovoltaïque de trois kilowatts et deux mille cinq cents euros et oui deux deux mille cinq cents euros cents dépêchez euros vous dépêchez vous de réserver votre kit sur fleuron industries point com la rénovation énergétique avec ici pour changer de maison sans changer de maison isolation chauffage solaire plus de confort et d' économie avec ici pas à mal casser pas mal vous avez fait une toute la pâte à modeler la je fais comment une tartine de pâte à modeler sans pâte à modeler c' est pas interdit ça s' appelle dupin juste merci pour le 
partage le jour où vous aimerez la pâte" 36 | output = remove_stopwords(plaintext,STOP_WORDS) 37 | # the ad blocks are in STOP_WORDS, so these terms are stripped from the output 38 | assert "photovoltaïque" not in output 39 | assert "rénovation énergétique" not in output 40 | assert "chauffage" not in output 41 | 42 | def test_remove_stopwords_fleuron(): 43 | plaintext = "chaque jour fleuron industrie parcourt" 44 | output = remove_stopwords(plaintext,STOP_WORDS) 45 | # the whole plaintext is a stop-word ad, so nothing is left 46 | assert output == "" 47 | 48 | def test_remove_stopwords_photovoltaique(): 49 | plaintext = "point com en ce moment la centrale photovoltaïque de trois kilowatt et à deux m" 50 | output = remove_stopwords(plaintext,STOP_WORDS) 51 | # the whole ad is removed, including photovoltaïque 52 | assert "photovoltaïque" not in output 53 | assert len(output) == 0 54 | 55 | 56 | def test_replace_word_with_context_unk(): 57 | plaintext = " quand le prix de l' énergie augmente il y a ceux qui se couvren" 58 | output = replace_word_with_context(text=plaintext, word=" ", length_to_remove=0) 59 | assert output == "quand le prix de l' énergie augmente il y a ceux qui se couvren" 60 | -------------------------------------------------------------------------------- /mockwebsite/franceinter_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.radiofrance.fr/franceinter/attentat-de-l-opera-en-2018-le-meilleur-ami-du-terroriste-dans-le-box-des-accuses-7790800France Interfr2023-10-25T04:22:39+00:00Attentat de l'Opéra en 2018 : le meilleur ami du terroriste dans le box des accusésJustice, Attentats en France, Djihadisme, Terrorisme, Société, https://www.radiofrance.fr/s3/cruiser-production/2023/10/609eccc2-ca90-4694-a42e-9175f318a68a/1200x680_sc_maxnewsworldfour522282.jpgUne personne est décédée, quatre autres ont été blessées, lors de l'attaque dans le quartier Opéra de Paris en mai 2018. - Nicolas Jouberthttps://www.radiofrance.fr/franceinter/bronchiolite-par-manque-de-traitements-des-maternites-obligees-de-trier-les-bebes-eligibles-au-beyfortus-6279386France Interfr2023-10-25T04:16:27+00:00Bronchiolite : par manque de traitements, des maternités obligées de trier les bébés éligibles au BeyfortusSanté, Maternité, Enfance, Sociétéhttps://www.radiofrance.fr/s3/cruiser-production/2023/10/06db6c5f-d163-4f1d-af2a-f44283e87ff4/1200x680_sc_080-hl-amorcillo-2084694.jpgLes bébés peuvent bénéficier d'un traitement, le Beyfortus, permettant d'éviter les formes graves de la bronchiolite.
- Aline Morcillohttps://www.radiofrance.fr/franceinter/sur-tik-tok-des-influenceurs-soutirent-des-milliers-d-euros-a-leurs-abonnes-pour-des-cadeaux-virtuels-9624456France Interfr2023-10-25T04:12:31+00:00Sur TikTok, des influenceurs soutirent des milliers d'euros à leurs abonnés pour des cadeaux virtuelsTech – Web, Applications mobiles, Société, https://www.radiofrance.fr/s3/cruiser-production/2023/10/3f085f10-3a39-43cd-82ad-617fc92b5e3c/1200x680_sc_illustration-tiktok.jpgCapture d"écran d'un "live" TikTok, au cours duquel sont proposés des cadeaux virtuels - Xavier Demagnyhttps://www.radiofrance.fr/franceinter/feminisation-attractivite-et-creativite-six-choses-a-savoir-sur-l-industrie-francaise-du-jeu-video-7091011France Interfr2023-10-24T15:50:56+00:00Féminisation, attractivité et créativité : six choses à savoir sur l'industrie française du jeu vidéo en 2023Entreprises – Marchés, Jeux vidéo, Économie, Arts et Divertissementhttps://www.radiofrance.fr/s3/cruiser-production/2023/10/9c284c6e-8797-47c7-b69a-2cab32e9917a/1200x680_sc_maxnewsfrfive059827.jpgStand d'Ubisoft, un des poids lourds du jeu vidéo français, lors de la Paris Games Week 2022 - Bruno Levesque / IP3 -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/spain/channel_program.py: -------------------------------------------------------------------------------- 1 | channels_programs_spain = [ 2 | {"channel_name": "antenna-3", "start": "06:15", "end": "08:50", "weekday": "weekday", "program_name": "Noticia de la manana", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 3 | {"channel_name": "antenna-3", "start": "15:00", "end": "15:30", "weekday": "weekday", "program_name": "Noticias 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 4 | {"channel_name": "antenna-3", "start": "21:00", "end": "21:30", "weekday": "weekday", "program_name": "Noticias", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 5 | 6 | {"channel_name": "rtve-la-1", "start": "06:00", "end": "06:30", "weekday": "weekday", "program_name": "Telediaro 06:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 7 | {"channel_name": "rtve-la-1", "start": "15:00", "end": "15:40", "weekday": "weekday", "program_name": "Telediaro 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 8 | {"channel_name": "rtve-la-1", "start": "21:00", "end": "21:30", "weekday": "weekday", "program_name": "Telediaro", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 9 | {"channel_name": "rtve-la-1", "start": "15:00", "end": "15:40", "weekday": "weekend", "program_name": "Telediaro fin de semana", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 10 | 11 | {"channel_name": "rtve-24h", "start": "14:00", "end": "14:45", "weekday": "*", "program_name": "Information 24 horas 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 12 | {"channel_name": "rtve-24h", "start": "20:00", "end": "20:45", "weekday": "*", "program_name": "Information 24 horas", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 13 | 14 | {"channel_name": "lasexta-news", "start": "11:00", "end": "15:00", "weekday": "weekday", "program_name": "Al Rojo Vivo", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 15 | {"channel_name": 
"lasexta-news", "start": "14:00", "end": "14:45", "weekday": "*", "program_name": "La Sexta Noticias 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 16 | {"channel_name": "lasexta-news", "start": "20:00", "end": "20:45", "weekday": "*", "program_name": "La Sexta Noticias", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 17 | 18 | {"channel_name": "telecinco-news", "start": "07:00", "end": "09:00", "weekday": "weekday", "program_name": "El Matinal 07:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 19 | {"channel_name": "telecinco-news", "start": "15:00", "end": "15:30", "weekday": "weekday", "program_name": "El Matinal 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 20 | {"channel_name": "telecinco-news", "start": "21:00", "end": "21:40", "weekday": "weekday", "program_name": "El Matinal", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 21 | 22 | {"channel_name": "cuatro-news", "start": "14:00", "end": "14:55", "weekday": "weekday", "program_name": "Noticias Cuatro 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 23 | {"channel_name": "cuatro-news", "start": "20:00", "end": "20:40", "weekday": "weekday", "program_name": "Noticias Cuatro", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 24 | {"channel_name": "cuatro-news", "start": "10:30", "end": "14:00", "weekday": "weekday", "program_name": "En Boca de Todos", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 25 | 26 | ] 27 | -------------------------------------------------------------------------------- /alembic/versions/a578d21d7aee_add_tables_labelstudio.py: -------------------------------------------------------------------------------- 1 | """Add tables labelstudio 2 | 3 | Revision ID: a578d21d7aee 4 | Revises: 44f13b7eebd4 5 | Create Date: 2025-10-09 14:18:14.410103 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'a578d21d7aee' 16 | down_revision: Union[str, None] = '44f13b7eebd4' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! 
### 23 | op.create_table('labelstudio_task_aggregate', 24 | sa.Column('task_aggregate_id', sa.String(), nullable=False), 25 | sa.Column('id', sa.Integer(), nullable=False), 26 | sa.Column('data', sa.JSON(), nullable=False), 27 | sa.Column('created_at', sa.DateTime(), nullable=False), 28 | sa.Column('updated_at', sa.DateTime(), nullable=False), 29 | sa.Column('is_labeled', sa.Boolean(), nullable=False), 30 | sa.Column('project_id', sa.Integer(), nullable=True), 31 | sa.Column('meta', sa.JSON(), nullable=True), 32 | sa.Column('overlap', sa.Integer(), nullable=False), 33 | sa.Column('file_upload_id', sa.Integer(), nullable=True), 34 | sa.Column('updated_by_id', sa.Integer(), nullable=True), 35 | sa.Column('inner_id', sa.BigInteger(), nullable=True), 36 | sa.Column('total_annotations', sa.Integer(), nullable=False), 37 | sa.Column('cancelled_annotations', sa.Integer(), nullable=False), 38 | sa.Column('total_predictions', sa.Integer(), nullable=False), 39 | sa.Column('comment_count', sa.Integer(), nullable=False), 40 | sa.Column('last_comment_updated_at', sa.DateTime(), nullable=True), 41 | sa.Column('unresolved_comment_count', sa.Integer(), nullable=False), 42 | sa.Column('country', sa.String(), nullable=False), 43 | sa.PrimaryKeyConstraint('task_aggregate_id') 44 | ) 45 | op.create_table('labelstudio_task_completion_aggregate', 46 | sa.Column('task_completion_aggregate_id', sa.String(), nullable=False), 47 | sa.Column('task_aggregate_id', sa.String(), nullable=False), 48 | sa.Column('id', sa.Integer(), nullable=False), 49 | sa.Column('result', sa.JSON(), nullable=True), 50 | sa.Column('was_cancelled', sa.Boolean(), nullable=False), 51 | sa.Column('ground_truth', sa.Boolean(), nullable=False), 52 | sa.Column('created_at', sa.DateTime(), nullable=False), 53 | sa.Column('updated_at', sa.DateTime(), nullable=False), 54 | sa.Column('task_id', sa.Integer(), nullable=True), 55 | sa.Column('prediction', sa.JSON(), nullable=True), 56 | sa.Column('lead_time', sa.Double(), nullable=True), 57 | sa.Column('result_count', sa.Integer(), nullable=False), 58 | sa.Column('completed_by_id', sa.Integer(), nullable=True), 59 | sa.Column('parent_prediction_id', sa.Integer(), nullable=True), 60 | sa.Column('parent_annotation_id', sa.Integer(), nullable=True), 61 | sa.Column('last_action', sa.Text(), nullable=True), 62 | sa.Column('last_created_by_id', sa.Integer(), nullable=True), 63 | sa.Column('project_id', sa.Integer(), nullable=True), 64 | sa.Column('updated_by_id', sa.Integer(), nullable=True), 65 | sa.Column('unique_id', sa.Uuid(), nullable=True), 66 | sa.Column('draft_created_at', sa.DateTime(), nullable=True), 67 | sa.Column('import_id', sa.BigInteger(), nullable=True), 68 | sa.Column('bulk_created', sa.Boolean(), nullable=True), 69 | sa.Column('country', sa.String(), nullable=False), 70 | sa.ForeignKeyConstraint(['task_aggregate_id'], ['labelstudio_task_aggregate.task_aggregate_id'], ), 71 | sa.PrimaryKeyConstraint('task_completion_aggregate_id') 72 | ) 73 | # ### end Alembic commands ### 74 | 75 | 76 | def downgrade() -> None: 77 | # ### commands auto generated by Alembic - please adjust! 
### 78 | op.drop_table('labelstudio_task_completion_aggregate') 79 | op.drop_table('labelstudio_task_aggregate') 80 | # ### end Alembic commands ### 81 | -------------------------------------------------------------------------------- /my_dbt_project/models/dashboards/thematic_query_ocean.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental' 3 | ,unique_key=['id'] 4 | ) 5 | }} 6 | 7 | with clean_keywords AS ( 8 | SELECT 9 | "public"."keywords"."id" AS "id", 10 | json_array_elements( 11 | "public"."keywords"."keywords_with_timestamp" :: json 12 | ) AS kw 13 | FROM 14 | "public"."keywords" 15 | WHERE 16 | "public"."keywords"."start" >= '2025-01-01' 17 | AND "public"."keywords"."number_of_keywords" > 0 18 | AND "public"."keywords"."country" = 'france' 19 | AND "public"."keywords"."channel_title" <> 'C8' 20 | ), 21 | 22 | filtered_keywords AS ( 23 | SELECT 24 | * 25 | FROM clean_keywords 26 | INNER JOIN "public"."dictionary" 27 | ON "public"."dictionary"."keyword" = clean_keywords.kw ->> 'keyword' 28 | AND "public"."dictionary"."theme" LIKE clean_keywords.kw ->> 'theme' || '%' -- ensure a match with the indirect theme inside the dictionary table 29 | WHERE 30 | "public"."dictionary"."keyword" IN ( 31 | 'acidification des océans', 32 | 'acidification des oceans', 33 | 'algues vertes', 34 | 'aménagement résilient', 35 | 'chalut', 36 | 'chalutage', 37 | 'chalutier', 38 | 'conservation marine', 39 | 'deep sea mining', 40 | 'dessalement de l’eau de mer', 41 | 'élévation du niveau de la mer', 42 | 'élévation du niveau des océans', 43 | 'érosion des côtes', 44 | 'érosion du littoral', 45 | 'exploitation fonds marins', 46 | 'exploitation gazière', 47 | 'exploitation pétrolière', 48 | 'filets de pêche', 49 | 'filets maillants', 50 | 'gestion du littoral', 51 | 'halieutique', 52 | 'hausse du niveau de la mer', 53 | 'hausse du niveau des océans', 54 | 'industrie de la pêche', 55 | 'journée mondiale des océans', 56 | 'limiter l’érosion des côtes', 57 | 'littoral', 58 | 'macro déchet plastique', 59 | 'mer', 60 | 'micro déchet plastique', 61 | 'montée du niveau de la mer', 62 | 'montée du niveau des océans', 63 | 'nano plastique', 64 | 'océan', 65 | 'océanographe', 66 | 'palangre', 67 | 'parc naturel marin', 68 | 'pêche artisanale', 69 | 'pêche au large', 70 | 'pêche côtière', 71 | 'pêche durable', 72 | 'pêche industrielle', 73 | 'pêche professionnelle', 74 | 'pêche responsable', 75 | 'pêcheur', 76 | 'petite pêche', 77 | 'plan de prévention des risques littoraux', 78 | 'pollution de la mer', 79 | 'protection des côtes', 80 | 'protection des océans', 81 | 'quota de pêche', 82 | 'réchauffement des océans', 83 | 'recul du trait de côte', 84 | 'septième continent', 85 | 'stress thermique', 86 | 'système de drainage', 87 | 'surpêche', 88 | 'the metals company', 89 | 'zone marine protégée', 90 | 'zone maritime' 91 | ) 92 | ), 93 | 94 | distinct_kw AS ( 95 | SELECT 96 | DISTINCT(id) AS "distinct_id" 97 | FROM 98 | filtered_keywords 99 | ) 100 | 101 | SELECT 102 | "public"."keywords"."id", 103 | "public"."keywords"."start", 104 | "public"."keywords"."channel_title", 105 | "public"."keywords"."plaintext", 106 | "public"."keywords"."number_of_keywords", 107 | "public"."keywords"."keywords_with_timestamp", 108 | "public"."keywords"."country", 109 | "public"."keywords"."channel_name" 110 | FROM 111 | "public"."keywords" 112 | INNER JOIN distinct_kw ON distinct_kw.distinct_id = "public"."keywords".id 113 | WHERE 114 | "public"."keywords"."start" >=
'2025-01-01' 115 | AND "public"."keywords"."number_of_keywords" > 0 116 | AND "public"."keywords"."country" = 'france' 117 | AND "public"."keywords"."channel_title" <> 'C8' 118 | AND "public"."keywords"."channel_title" IS NOT NULL 119 | AND "public"."keywords"."channel_title" <> '' -------------------------------------------------------------------------------- /mockwebsite/republiquepyrenees_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.larepubliquedespyrenees.fr/pyrenees-atlantiques/pontiacq-viellepinte/pontiacq-lamayou-c-est-parti-pour-le-36e-tournoi-de-pala-17196604.php2023-10-25T11:00:49+02:00https://images.larepubliquedespyrenees.fr/17196604/1200x-1/morlaasvic-bilh-0e06cc7df3274c2190c3d751fbe2f787-151648-ph0.jpgLe premier match du tournoi a vu la victoire de Sébastien Pina et Fabrice Lajus contre Romain Tillet et Maxime Delas.La République des Pyrénéesfr2023-10-25T11:00:49+02:00Pontiacq-Lamayou : c’est parti pour le 36e tournoi de pala !https://www.larepubliquedespyrenees.fr/pyrenees-atlantiques/vallee-d-aspe/vallee-d-aspe-des-changements-au-1er-novembre-pour-le-transport-a-la-demande-17196907.php2023-10-25T10:55:33+02:00https://images.larepubliquedespyrenees.fr/17196907/1200x-1/oloronvalleesbearnaises-6b1cd659e6db43dda3cc25d5e4b7efaa-154147-ph0.jpgLes panneaux signalétiques jaune et blanc ont fleuri dans chaque commune.La République des Pyrénéesfr2023-10-25T10:55:33+02:00Vallée d’Aspe : des changements au 1er novembre pour le transport à la demandehttps://www.larepubliquedespyrenees.fr/sport/equitation/le-concours-5-etoiles-de-pau-devient-un-evenement-familial-17147228.php2023-10-25T10:55:01+02:00https://images.larepubliquedespyrenees.fr/17147228/1200x-1/rep-10211-hd141476.jpgL’an dernier, 40 000 personnes ont assisté au concours.La République des Pyrénéesfr2023-10-25T10:55:01+02:00Le concours 5 étoiles de Pau devient un événement familialhttps://www.larepubliquedespyrenees.fr/societe/afp/evasion-par-helicoptere-de-redoine-faid-le-verdict-attendu-en-fin-d-apres-midi-17205823.php2023-10-25T10:49:48+02:00https://images.larepubliquedespyrenees.fr/17205823/1200x-1/pp-6538d860a43f5e284d9c2bef-ph0.jpgCroquis d'audience de Rédoine Faïd à l'ouverture de son procès devant la cour d'assises de Paris, le 5 septembre 2023La République des Pyrénéesfr2023-10-25T10:49:48+02:00Evasion par hélicoptère de Rédoine Faïd: le verdict attendu en fin d'après-midihttps://www.larepubliquedespyrenees.fr/culture-et-loisirs/pyrenees-gaming-notre-jeu-du-mois-assassin-s-creed-mirage-un-retour-aux-sources-de-la-saga-17170841.php2023-10-25T10:49:27+02:00https://images.larepubliquedespyrenees.fr/17170841/1200x-1/lcl3ybzh.jpg« Assassin’s Creed Mirage » a été développé par Ubisoft Bordeaux.La République des Pyrénéesfr2023-10-25T10:49:27+02:00▶️ Pyrénées Gaming. 
Notre jeu du mois : « Assassin’s Creed Mirage », « un retour aux sources, de la saga » -------------------------------------------------------------------------------- /mockwebsite/liberation_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.liberation.fr/international/moyen-orient/en-direct-guerre-hamas-israel-otages-liberees-macron-a-tel-aviv-bombardements-sur-gaza-crainte-dun-embrasement-regional-aide-humanitaire-retrouvez-toutes-les-informations-de-ce-mardi-24-octobre-20231024_6DU6EBVRLZAELLAYU47IAHF6Z4/2023-10-24T08:27:52.306Zalways0.5Libérationfr2023-10-24T08:27:52.306Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/SKQApBHpBaSJVcpqIDj1h4O-sfU=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/5RDM4TAUGFEZPIHYWG3CVDUR7Y.jpghttps://www.liberation.fr/politique/elections/le-gros-bobard-de-jean-philippe-tanguy-sur-le-gud-ennemi-historique-du-rassemblement-national-20231023_EWA5NEN4QFEUXHLIN74PSPEDJ4/2023-10-23T15:15:23.928Zalways0.5Libérationfr2023-10-23T15:15:23.928Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/3hZXTi8Ccr2O3s6zyYqk8-Us3Qw=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/AREAUQIPLZCWFKB6HUTFE7VQ24.jpghttps://www.liberation.fr/societe/police-justice/chateau-de-versailles-un-lanceur-de-fausse-alerte-condamne-a-huit-mois-de-prison-avec-sursis-20231023_F2KK3TWLVVGSDAJOPW4KM6OCZQ/2023-10-23T17:16:09.315Zalways0.5Libérationfr2023-10-23T17:16:09.315Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/cJtbpHCkwdNZbFOVGCSkmRz9FUs=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/DNYUPQGQ2JE2NEWLG4UQLCIYAY.jpghttps://www.liberation.fr/international/europe/plusieurs-disparus-apres-une-collision-entre-deux-cargos-en-mer-du-nord-20231024_325S36NYBRGRLJ7LUAYQ2K5TKQ/2023-10-24T07:57:03.897Zalways0.5Libérationfr2023-10-24T07:57:03.897Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/nIoB0Sv-h1lexX5KgABQaf4px5Y=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/KIBODBZQNREILHF6YWE7KFF4Z4.jpg -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python>=3.9 or backports.zoneinfo library. 
20 | # Any required deps can installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to ZoneInfo() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # set to 'true' to search source files recursively 55 | # in each "version_locations" directory 56 | # new in Alembic version 1.10 57 | # recursive_version_locations = false 58 | 59 | # the output encoding used when revision files 60 | # are written from script.py.mako 61 | # output_encoding = utf-8 62 | sqlalchemy.url = postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} 63 | 64 | [post_write_hooks] 65 | # post_write_hooks defines scripts or Python functions that are run 66 | # on newly generated revision scripts. 
See the documentation for further 67 | # detail and examples 68 | 69 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 70 | # hooks = black 71 | # black.type = console_scripts 72 | # black.entrypoint = black 73 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 74 | 75 | # lint with attempts to fix using "ruff" - use the exec runner, execute a binary 76 | # hooks = ruff 77 | # ruff.type = exec 78 | # ruff.executable = %(here)s/.venv/bin/ruff 79 | # ruff.options = --fix REVISION_SCRIPT_FILENAME 80 | 81 | # Logging configuration 82 | [loggers] 83 | keys = root,sqlalchemy,alembic 84 | 85 | [handlers] 86 | keys = console 87 | 88 | [formatters] 89 | keys = generic 90 | 91 | [logger_root] 92 | level = WARN 93 | handlers = console 94 | qualname = 95 | 96 | [logger_sqlalchemy] 97 | level = WARN 98 | handlers = 99 | qualname = sqlalchemy.engine 100 | 101 | [logger_alembic] 102 | level = INFO 103 | handlers = 104 | qualname = alembic 105 | 106 | [handler_console] 107 | class = StreamHandler 108 | args = (sys.stderr,) 109 | level = NOTSET 110 | formatter = generic 111 | 112 | [formatter_generic] 113 | format = %(levelname)-5.5s [%(name)s] %(message)s 114 | datefmt = %H:%M:%S 115 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/germany/channel_program.py: -------------------------------------------------------------------------------- 1 | channels_programs_germany = [ 2 | {"channel_name": "daserste", "start": "05:30", "end": "09:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "ZDF-Morgenmagazin", "program_type": "Information - Magazine"}, 3 | {"channel_name": "daserste", "start": "12:00", "end": "14:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Mittagsmagazin", "program_type": "Information - Magazine"}, 4 | {"channel_name": "daserste", "start": "17:00", "end": "18:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Tagesschau", "program_type": "Information - Journal"}, 5 | {"channel_name": "daserste", "start": "19:30", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Prime Time", "program_type": "Entertainment - Various"}, 6 | {"channel_name": "daserste", "start": "21:45", "end": "00:00", "weekday": "6", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Sunday Special", "program_type": "Information - Magazine"}, 7 | 8 | {"channel_name": "zdf-neo", "start": "00:00", "end": "01:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Nighttime Programming", "program_type": "Entertainment - Talk Show"}, 9 | {"channel_name": "zdf-neo", "start": "05:30", "end": "11:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "ZDF-Morgenmagazin", "program_type": "Information - Journal"}, 10 | {"channel_name": "zdf-neo", "start": "12:00", "end": "14:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Mittagsmagazin", "program_type": "Information - Magazine"}, 11 | {"channel_name": "zdf-neo", "start": "21:30", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Late Evening Show", "program_type": "Entertainment - Various"}, 12 | 13 | 
{"channel_name": "rtl-television", "start": "00:00", "end": "01:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "RTL Late Night", "program_type": "Entertainment - Talk Show"}, 14 | {"channel_name": "rtl-television", "start": "06:00", "end": "09:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Guten Morgen Deutschland ", "program_type": "Information - Magazine"}, 15 | {"channel_name": "rtl-television", "start": "12:00", "end": "15:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Midday Show", "program_type": "Entertainment - Various"}, 16 | {"channel_name": "rtl-television", "start": "18:30", "end": "20:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "RTL Aktuell", "program_type": "Information - Journal"}, 17 | {"channel_name": "rtl-television", "start": "22:00", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Primetime Shows", "program_type": "Entertainment - Various"}, 18 | 19 | {"channel_name": "sat1", "start": "05:30", "end": "10:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Sat.1 Frühstücksfernsehen", "program_type": "Information - Magazine"}, 20 | {"channel_name": "sat1", "start": "19:30", "end": "20:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Prime Time Show", "program_type": "Entertainment - Various"}, 21 | 22 | {"channel_name": "prosieben", "start": "17:00", "end": "20:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "taff & Galileo", "program_type": "Information - Magazine"}, 23 | 24 | {"channel_name": "kabel-eins", "start": "16:30", "end": "18:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Abenteuer Leben täglich ", "program_type": "Information - Magazine"}, 25 | ] 26 | -------------------------------------------------------------------------------- /mockwebsite/nicematin_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.nicematin.com/education/cest-un-scandale-une-mere-de-famille-en-colere-apres-avoir-mis-de-longs-mois-a-trouver-un-mode-de-garde-pour-sa-fille-sur-la-cote-dazur-881497 5 | 6 | Nice-Matin 7 | fr 8 | 2023-10-25T10:55:00+02:00"C'est un scandale": une mère de famille en colère après avoir mis de longs mois à trouver un mode de garde pour sa fille sur la Côte-d'Azurhttps://fyooyzbm.filerobot.com/v7/nounou2-C8iVj9UI.jpg?vh=bb8c9a&ci_seal=1795970eb9&w=750&h=375&gravity=auto&func=cropAprès de long mois, Emilie a fini par trouver une solution de garde pour sa fille Mélina. Non sans répercussions sur sa vie professionnelle. 9 | https://www.nicematin.com/faits-divers/a-nice-la-replique-dun-gilet-tactique-abandonne-avec-une-grenade-provoque-lintervention-de-la-police-881520 10 | 11 | Nice-Matin 12 | fr 13 | 2023-10-25T10:49:00+02:00À Nice, la réplique d'un gilet tactique abandonné avec une grenade provoque l'intervention de la policehttps://fyooyzbm.filerobot.com/v7/maxmatinarch530448-Zr26gJZK.jpg?vh=9bf068&ci_seal=812f0dc672&w=750&h=375&gravity=auto&func=cropL'intervention a eu lieu rue Delille à Nice. 
14 | https://www.nicematin.com/temoignage/rien-que-par-le-bouche-a-oreille-dans-ma-residence-jai-deja-des-appels-a-51-ans-elle-plaque-tout-pour-devenir-assistante-maternelle-a-nice-881495 15 | 16 | Nice-Matin 17 | fr 18 | 2023-10-25T10:35:00+02:00"Rien que par le bouche-à-oreille dans ma résidence, j’ai déjà des appels": à 51 ans, elle plaque tout pour devenir assistante maternelle à Nicehttps://fyooyzbm.filerobot.com/v7/assistantenounou+%281%29-cHTI0xtv.webp?ci_seal=30e64b9995&tl_px=6,9&br_px=1270,735&w=750&h=375&gravity=auto&func=cropDans les Alpes-Maritimes, près de 500 assistants maternels ont quitté leur fonction depuis quatre ans. 19 | https://www.nicematin.com/environnement/totalenergies-accuse-par-greenpeace-detre-implique-dans-33-projets-fossiles-catastrophiques-pour-le-climat-881516 20 | 21 | Nice-Matin 22 | fr 23 | 2023-10-25T10:35:00+02:00TotalEnergies accusé par Greenpeace d'être impliqué dans 33 projets fossiles "catastrophiques pour le climat"https://fyooyzbm.filerobot.com/v7/000_33A94W3-g3Zrfh2z.jpg?vh=7d1a0a&ci_seal=df517fd0af&w=750&h=375&gravity=auto&func=cropTotalEnergies participe à 33 projets de gaz et de pétrole "super-émetteurs" en gaz à effet de serre, accuse mercredi l'ONG Greenpeace dans une étude visant à démontrer une "logique d'expansion fossile" en contradiction avec les objectifs climatiques. 24 | -------------------------------------------------------------------------------- /mockwebsite/letelegramme_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.letelegramme.fr/monde/coups-de-feu-a-bruxelles-deux-morts-le-suspect-en-fuite-6450252.php 5 | 6 | https://media.letelegramme.fr/api/v1/images/view/652d8f34710625629665f40a/web_golden_xxl/652d8f34710625629665f40a.1 7 | Un périmètre de sécurité a été installé autour de la place Sainctelette. (Hatim Kaghat/AFP) 8 | Un périmètre de sécurité a été installé autour de la place Sainctelette. 9 | 10 | 11 | 12 | Le Télégramme 13 | fr 14 | 15 | 2023-10-16T19:29:56+00:00 16 | Coups de feu à Bruxelles : deux morts, le suspect en fuite, la piste terroriste évoquée 17 | 18 | 19 | 20 | https://www.letelegramme.fr/monde/mali-la-mission-de-lonu-engage-sous-tension-une-nouvelle-phase-de-son-retrait-6450249.php 21 | 22 | 23 | Le Télégramme 24 | fr 25 | 26 | 2023-10-16T19:18:00+00:00 27 | Mali : la mission de l’Onu engage sous tension une nouvelle phase de son retrait 28 | 29 | 30 | 31 | https://www.letelegramme.fr/morbihan/vannes-56000/circulation-et-stationnement-a-la-gare-de-vannes-ca-va-etre-tres-complique-pendant-deux-ans-6450250.php 32 | 33 | https://media.letelegramme.fr/api/v1/images/view/652d8d905a16a826de416f33/web_golden_xxl/652d8d905a16a826de416f33.1 34 | Le côté sud de la gare vu d’en haut. L’avenue Favrel et Lincy deviendra une voie de bus dans le sens descendant, une voie pour les voitures et bus dans le sens montant. Le parvis de la gare sera élargi et végétalisé. Les vélos y trouveront leur place. (Image : Villes et paysages) 35 | Le côté sud de la gare vu d’en haut. L’avenue Favrel et Lincy deviendra une voie de bus dans le sens descendant, une voie pour les voitures et bus dans le sens montant. Le parvis de la gare sera élargi et végétalisé. Les vélos y trouveront leur place. 
36 | 37 | 38 | 39 | Le Télégramme 40 | fr 41 | 42 | 2023-10-16T19:13:00+00:00 43 | Circulation et stationnement à la gare de Vannes : « Ça va être très compliqué pendant deux ans » 44 | Futur quartier de la gare de Vannes,Gare 45 | 46 | 47 | 48 | https://www.letelegramme.fr/finistere/ergue-gaberic-29500/cinq-blesses-dans-un-accident-de-circulation-a-ergue-gaberic-6450248.php 49 | 50 | https://media.letelegramme.fr/api/v1/images/view/652d8cd651450e731a713e6a/web_golden_xxl/652d8cd651450e731a713e6a.1 51 | Un homme a été transporté dans un état critique à l’hôpital de Quimper. (Photo d’illustration Lionel Le Saux/Le Télégramme) 52 | Un homme a été transporté dans un état critique à l’hôpital de Quimper. 53 | 54 | 55 | 56 | Le Télégramme 57 | fr 58 | 59 | 2023-10-16T19:08:00+00:00 60 | Cinq blessés dans un accident de la circulation à Ergué-Gabéric 61 | Faits divers,Accident 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /my_dbt_project/pytest_tests/test_dbt_model_analytics.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | import subprocess 5 | from decimal import * 6 | 7 | import psycopg2 8 | import pytest 9 | 10 | from my_dbt_project.pytest_tests.test_dbt_model_homepage import run_dbt_command 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | def db_connection(): 15 | conn = psycopg2.connect( 16 | dbname=os.getenv("POSTGRES_DB", ""), 17 | user=os.getenv("POSTGRES_USER", ""), 18 | password=os.getenv("POSTGRES_PASSWORD", ""), 19 | host=os.getenv("POSTGRES_HOST", ""), 20 | port=os.getenv("POSTGRES_PORT", ""), 21 | ) 22 | yield conn 23 | conn.close() 24 | 25 | 26 | def seed_dbt_labelstudio(): 27 | """Run dbt seed once before any test.""" 28 | commands = [ 29 | "seed", 30 | "--select", 31 | "labelstudio_task_aggregate", 32 | "--select", 33 | "labelstudio_task_completion_aggregate", 34 | "--full-refresh", 35 | ] 36 | logging.info(f"pytest running dbt seed : {commands}") 37 | run_dbt_command(commands) 38 | # seed and dbt run upstream tables 39 | commands = [ 40 | "seed", 41 | "--select", 42 | "program_metadata", 43 | "--select", 44 | "time_monitored", 45 | "--select", 46 | "keywords", 47 | "--select", 48 | "dictionary", 49 | "--select", 50 | "keyword_macro_category", 51 | "--full-refresh", 52 | ] 53 | run_dbt_command(commands) 54 | 55 | seed_dbt_labelstudio() 56 | 57 | @pytest.fixture(scope="module", autouse=True) 58 | def run_analytics(): 59 | logging.info("Run dbt for the thematics model once before related tests.") 60 | run_dbt_command( 61 | [ 62 | "run", 63 | "--exclude", 64 | "core_query_causal_links", 65 | "--exclude", 66 | "task_global_completion", 67 | "--exclude", 68 | "environmental_shares_with_desinfo_counts", 69 | "--full-refresh", 70 | ] 71 | ) 72 | logging.info("pytest running dbt task_global_completion") 73 | run_dbt_command( 74 | [ 75 | "run", 76 | "--select", 77 | "task_global_completion", 78 | "--select", 79 | "environmental_shares_with_desinfo_counts", 80 | "--target", 81 | "analytics", 82 | "--full-refresh", 83 | ] 84 | ) 85 | 86 | 87 | def test_task_global_completion(db_connection): 88 | with db_connection.cursor() as cur: 89 | cur.execute(""" 90 | SELECT 91 | "analytics"."task_global_completion"."task_completion_aggregate_id", 92 | "analytics"."task_global_completion"."country", 93 | "analytics"."task_global_completion"."data_item_channel_name", 94 | "analytics"."task_global_completion"."mesinfo_choice", 95 | 
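-- note: sum_duration_minutes may legitimately be NULL for this model (the expected tuple below ends with None)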
"analytics"."task_global_completion"."sum_duration_minutes" 96 | FROM analytics.task_global_completion 97 | ORDER BY analytics.task_global_completion.task_completion_aggregate_id 98 | LIMIT 1 99 | """) 100 | row = cur.fetchone() 101 | 102 | expected = ( 103 | "0e7ee7f70a223e21b10c0dad27464bebb8cc6a7f4bd5f5b7746c661a44ec7b45", 104 | "france", 105 | "europe1", 106 | "Correct", 107 | None, 108 | ) 109 | 110 | assert row == expected, f"Unexpected values: {row}" 111 | 112 | def test_environmental_shares_desinfo(db_connection): 113 | with db_connection.cursor() as cur: 114 | cur.execute(""" 115 | SELECT 116 | "analytics"."environmental_shares_with_desinfo_counts"."start", 117 | "analytics"."environmental_shares_with_desinfo_counts"."channel_name", 118 | "analytics"."environmental_shares_with_desinfo_counts"."sum_duration_minutes", 119 | "analytics"."environmental_shares_with_desinfo_counts"."weekly_perc_climat", 120 | "analytics"."environmental_shares_with_desinfo_counts"."total_mesinfo" 121 | FROM analytics.environmental_shares_with_desinfo_counts 122 | ORDER BY analytics.environmental_shares_with_desinfo_counts.start 123 | LIMIT 1 124 | """) 125 | row = cur.fetchone() 126 | expected = ( 127 | datetime.datetime(2025, 1, 27, 0, 0), 128 | "arte", 129 | 65, 130 | 0.13846153846153847, 131 | 0, 132 | ) 133 | assert row == expected -------------------------------------------------------------------------------- /my_dbt_project/models/dashboards/core_query_thematics_keywords.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental', 3 | unique_key=['week','channel_title'], 4 | on_schema_change='append_new_columns' 5 | ) 6 | }} 7 | 8 | -- Core Query Thematics Keywords makes only sense when looking for keywords,theme, and category together (otherwise duplicates 9 | -- as a keyword inside keyword_with_timestamp is present 4 times, if the keyword has 4 themes) 10 | 11 | WITH program_durations AS ( 12 | SELECT 13 | pm.channel_title, 14 | pm.channel_program, 15 | pm.weekday, 16 | CAST(pm.program_grid_start AS date) AS program_start, 17 | CAST(pm.program_grid_end AS date) AS program_end, 18 | pm.duration_minutes 19 | FROM public.program_metadata pm 20 | WHERE pm.country = 'france' 21 | ), 22 | program_weeks AS ( 23 | SELECT 24 | pd.channel_title, 25 | pd.channel_program, 26 | pd.duration_minutes, 27 | pd.weekday, 28 | generate_series( 29 | date_trunc('week', pd.program_start), 30 | date_trunc('week', pd.program_end), 31 | interval '1 week' 32 | )::date AS week_start 33 | FROM program_durations pd 34 | ), 35 | program_airings AS ( 36 | SELECT 37 | channel_title, 38 | channel_program, 39 | duration_minutes, 40 | -- calculate actual airing date per week + weekday offset 41 | (week_start + (weekday - 1) * INTERVAL '1 day')::date AS airing_date, 42 | week_start 43 | FROM program_weeks 44 | ), 45 | weekly_program_durations AS ( 46 | SELECT 47 | channel_title, 48 | week_start AS week, 49 | SUM(duration_minutes) AS weekly_duration_minutes 50 | FROM program_airings 51 | GROUP BY channel_title, week_start 52 | ), 53 | keyword_occurrences AS ( 54 | SELECT DISTINCT 55 | COALESCE(pm.channel_title, k.channel_title) AS channel_title, 56 | DATE_TRUNC('week', k.start)::date AS week, 57 | k.start AS occurrence_time, 58 | -- Semantic tags 59 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%solution%' THEN TRUE ELSE FALSE END AS is_solution, 60 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%consequence%' THEN TRUE ELSE FALSE END AS is_consequence, 61 | CASE WHEN 
LOWER(kw ->> 'theme') LIKE '%cause%' THEN TRUE ELSE FALSE END AS is_cause, 62 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%concepts_generaux%' THEN TRUE ELSE FALSE END AS is_general_concepts, 63 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%constat%' THEN TRUE ELSE FALSE END AS is_statement, 64 | -- Crisis type 65 | CASE 66 | WHEN LOWER(kw ->> 'theme') LIKE '%climat%' THEN 'Crise climatique' 67 | WHEN LOWER(kw ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité' 68 | WHEN LOWER(kw ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources' 69 | ELSE 'Autre' 70 | END AS crise_type, 71 | kw ->> 'theme' AS theme, 72 | kw ->> 'keyword' AS keyword 73 | FROM public.keywords k 74 | LEFT JOIN public.program_metadata pm 75 | ON k.channel_program = pm.channel_program 76 | AND k.channel_name = pm.channel_name 77 | AND ( 78 | ( 79 | CASE 80 | WHEN ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7) = 0 THEN 7 81 | ELSE ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7) 82 | END = pm.weekday 83 | ) 84 | ) 85 | AND CAST(k.start AS date) BETWEEN CAST(pm.program_grid_start AS date) 86 | AND CAST(pm.program_grid_end AS date) 87 | , json_array_elements(k.keywords_with_timestamp::json) AS kw 88 | WHERE 89 | LOWER(kw ->> 'theme') NOT LIKE '%indirect%' 90 | AND k.country = 'france' 91 | ) 92 | SELECT 93 | ko.channel_title, 94 | ko.week, 95 | COALESCE(NULLIF(d.category, ''), 'Transversal') AS category, 96 | d.high_risk_of_false_positive, 97 | ko.is_solution, 98 | ko.is_consequence, 99 | ko.is_cause, 100 | ko.is_general_concepts, 101 | ko.is_statement, 102 | ko.crise_type, 103 | ko.theme, 104 | ko.keyword, 105 | kmc.general, 106 | kmc.agriculture, 107 | kmc.transport, 108 | kmc.batiments, 109 | kmc.energie, 110 | kmc.industrie, 111 | kmc.eau, 112 | kmc.ecosysteme, 113 | kmc.economie_ressources, 114 | COUNT(*) AS count, 115 | COALESCE(wpd.weekly_duration_minutes, 0) AS sum_duration_minutes 116 | FROM keyword_occurrences ko 117 | LEFT JOIN public.dictionary d 118 | ON d.keyword = ko.keyword AND d.theme LIKE ko.theme || '%' -- ensure matc with indirect theme inside the dictionary table 119 | LEFT JOIN weekly_program_durations wpd 120 | ON wpd.channel_title = ko.channel_title AND wpd.week = ko.week 121 | LEFT JOIN public.keyword_macro_category kmc 122 | ON kmc.keyword = ko.keyword 123 | GROUP BY 124 | ko.channel_title, 125 | ko.week, 126 | d.high_risk_of_false_positive, 127 | COALESCE(NULLIF(d.category, ''), 'Transversal'), 128 | ko.is_solution, 129 | ko.is_consequence, 130 | ko.is_cause, 131 | ko.is_general_concepts, 132 | ko.is_statement, 133 | ko.crise_type, 134 | ko.theme, 135 | ko.keyword, 136 | kmc.general, 137 | kmc.agriculture, 138 | kmc.transport, 139 | kmc.batiments, 140 | kmc.energie, 141 | kmc.industrie, 142 | kmc.eau, 143 | kmc.ecosysteme, 144 | kmc.economie_ressources, 145 | wpd.weekly_duration_minutes 146 | ORDER BY 147 | ko.channel_title, ko.week, ko.crise_type -------------------------------------------------------------------------------- /test/sitemap/test_main_import_api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from modin.pandas.dataframe import DataFrame 4 | 5 | from quotaclimat.data_processing.mediatree.update_pg_keywords import * 6 | 7 | from postgres.insert_data import (clean_data, 8 | insert_data_in_sitemap_table) 9 | 10 | from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db, drop_tables, empty_tables 11 | from postgres.insert_data import save_to_pg 12 | from 
quotaclimat.data_processing.mediatree.detect_keywords import * 13 | from quotaclimat.data_processing.mediatree.api_import import * 14 | from quotaclimat.data_processing.mediatree.keyword.stop_words import STOP_WORDS 15 | from quotaclimat.data_processing.mediatree.stop_word.main import save_append_stop_word 16 | from quotaclimat.data_processing.mediatree.s3.api_to_s3 import parse_reponse_subtitle 17 | from test_utils import get_localhost, debug_df, compare_unordered_lists_of_dicts 18 | 19 | import time as t 20 | 21 | 22 | def insert_mediatree_json(conn, json_file_path='test/sitemap/mediatree.json'): 23 | create_tables(conn) 24 | empty_tables(get_db_session(conn), stop_word=False) 25 | logging.info(f"reading {json_file_path}") 26 | with open(json_file_path, 'r') as file: 27 | json_response = json.load(file) 28 | start_time = t.time() 29 | df = parse_reponse_subtitle(json_response) 30 | df = filter_and_tag_by_theme(df) 31 | df["id"] = df.apply(lambda x: add_primary_key(x), axis=1) 32 | end_time = t.time() 33 | logging.info(f"Elapsed time for api import {end_time - start_time}") 34 | 35 | # must df._to_pandas() because to_sql does not handle modin dataframe 36 | save_to_pg(df._to_pandas(), keywords_table, conn) 37 | 38 | return len(df) 39 | 40 | def insert_stop_word(conn): 41 | logging.info("test saving stop words") 42 | to_save = [] 43 | for stop in STOP_WORDS: 44 | stop_word = dict() 45 | stop_word['id'] = stop 46 | stop_word['context'] = stop 47 | to_save.append(stop_word) 48 | 49 | save_append_stop_word(conn, to_save) 50 | 51 | def test_main_api_import(): 52 | conn = connect_to_db() 53 | drop_tables(conn) 54 | create_tables(conn) 55 | insert_stop_word(conn) 56 | len_df = insert_mediatree_json(conn, json_file_path="test/sitemap/light.json") 57 | 58 | session = get_db_session(conn) 59 | saved_keywords = get_keywords_columns(session, start_date="2024-02-01", end_date="2024-02-29") 60 | assert len(saved_keywords) != 0 61 | assert len(saved_keywords) == len_df 62 | 63 | def test_first_row_api_import(): 64 | primary_key = "29d2b1f8267b206cb62e475b960de3247e835273f396af012f5ce21bf3056472" 65 | 66 | specific_keyword = get_keyword(primary_key) 67 | logging.info(f"Getting {primary_key} :\n {specific_keyword}") 68 | assert set(specific_keyword.theme) == set([ 69 | 'biodiversite_concepts_generaux_indirectes', 70 | 'changement_climatique_consequences_indirectes', 71 | 'changement_climatique_constat_indirectes' 72 | ]) 73 | 74 | assert specific_keyword.number_of_keywords == 0 75 | 76 | def test_second_row_api_import(): 77 | 78 | primary_key = "9f0fb1987371c1dc0b4a165a11feb7ca7ed9b6f9f40d3d6b4fc0748e2ca59c3f" 79 | specific_keyword = get_keyword(primary_key) 80 | assert len(set(specific_keyword.theme)) > 0 81 | assert specific_keyword.number_of_keywords > 0 82 | 83 | 84 | def test_third_row_api_import(): 85 | primary_key = "32cb864fe56a4436151bcf78c385a7cc4226316e0563a298ac6988d1b8ee955b" 86 | 87 | specific_keyword = get_keyword(primary_key) 88 | assert len(set(specific_keyword.theme)) > 0 89 | 90 | assert specific_keyword.number_of_keywords == 1 91 | 92 | def test_get_api_stop(): 93 | conn = connect_to_db() 94 | session = get_db_session(conn) 95 | stopwords = get_stop_words(session, country=None) 96 | assert type(stopwords[0]) == str 97 | 98 | def test_transform_raw_keywords_srt_to_mediatree(): 99 | conn = connect_to_db() 100 | 101 | channel = "LAUNE" 102 | primary_key = "df0d86983f0c4ed074800f5cdabbd577671b90845fb6208a5de1ae3802fb10e0" 103 | df: DataFrame= 
pd.read_parquet(path=f"i8n/mediatree_output/year=2024/month=10/day=1/channel={channel}") 104 | df_programs = get_programs() 105 | output = transform_raw_keywords(df, df_programs=df_programs,country=BELGIUM) 106 | 107 | output_dict = output.to_dict(orient='records') 108 | filtered = output[output["id"] == primary_key] 109 | row_dict = filtered.iloc[0].to_dict() 110 | assert row_dict["country"] == "belgium" 111 | assert row_dict["channel_name"] == channel 112 | 113 | assert len(output) == 29 114 | save_to_pg(df=output,conn=conn, table=keywords_table) 115 | specific_keyword = get_keyword(primary_key) 116 | assert set(specific_keyword.theme) == set([ 117 | 'changement_climatique_causes_indirectes', 118 | ]) 119 | 120 | assert specific_keyword.number_of_keywords == 0 -------------------------------------------------------------------------------- /mockwebsite/leparisien_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.leparisien.fr/sports/cyclisme/tour-de-france/direct-tour-de-france-lannonce-des-parcours-2024-a-suivre-en-live-25-10-2023-SGPV57QEYVAOJKR2VTRVETMVSY.php2023-10-25T08:53:25.512ZLe Parisienfr2023-10-25T08:53:26.556Zhttps://www.leparisien.fr/resizer/pZgWLK34dnSm3PnePH4YT7PDeLI=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/RBJCS5CIAVHK3IG3DKVEMSO56Y.jpghttps://www.leparisien.fr/sports/football/sadio-mane-arrive-aux-commandes-de-bourges-foot-18-club-de-national-2-25-10-2023-Z7GNAIUG65ECXC33R7XZPG6V2E.php2023-10-25T08:52:34.982ZLe Parisienfr2023-10-25T08:52:35.420Zhttps://www.leparisien.fr/resizer/oYbqphCAWq15Lf1aZAo4uO651ZI=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/XC6BTYGH3VENLEONZWY7LRTYOY.jpghttps://www.leparisien.fr/faits-divers/le-pilote-americain-qui-a-tente-de-couper-les-moteurs-dun-avion-avait-consomme-des-hallucinogenes-25-10-2023-OBK4GDNF4NFN7MXLD4NY4FVUEU.php2023-10-25T08:50:28.302ZLe Parisienfr2023-10-25T08:50:28.762Zhttps://www.leparisien.fr/resizer/dpRGItWIAA5vHv2D6cmBGmOff7U=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/TAPHNZFSIRBAVHYEOIIN3U7AP4.jpghttps://www.leparisien.fr/futurs/punaises-de-lit-comment-sen-debarrasser-les-reconnaitre-dou-viennent-elles-posez-nous-vos-questions-25-10-2023-A5ZSPB6LSBBVLHCFIX4OIAZFWA.php2023-10-25T08:49:39.415ZLe Parisienfr2023-10-25T08:49:40.613Zhttps://www.leparisien.fr/resizer/02sdSrjueqCNoNETKkV7cTpDO_0=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/4QGY2FZT2JF4XFUFQGSNWNZ5WQ.jpghttps://www.leparisien.fr/culture-loisirs/cinema/le-syndrome-des-amours-passees-mais-pourquoi-couchent-ils-avec-leurs-ex-25-10-2023-SOCRYLNKVBHH5N7RZONQFRQLII.php2023-10-25T08:49:30.367ZLe Parisienfr2023-10-25T08:49:30.814Zhttps://www.leparisien.fr/resizer/yk9qwslNqiBUhMh5EhTUDc5JoRc=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/7W2I45MNRNHLRHKSYZK2BZ6IMI.jpg -------------------------------------------------------------------------------- /.github/workflows/deploy-main.yml: -------------------------------------------------------------------------------- 1 | name: Build & Deploy to Scaleway 2 | 3 | on: 4 | push: 5 | # Sequence of patterns matched against refs/heads 6 | branches: 7 | - main 8 | 9 | # to be able to force deploy 10 | workflow_dispatch: 11 | 12 | 13 | env: 14 | PYTHON_VERSION: '3.12' 15 | POETRY_VERSION: '2.1.3' 16 | 17 | jobs: 18 | build: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: actions/setup-python@v5 23 | with: 24 | 
python-version: ${{ env.PYTHON_VERSION }} 25 | 26 | - name: Install Poetry 27 | uses: snok/install-poetry@v1 28 | with: 29 | version: ${{ env.POETRY_VERSION }} 30 | virtualenvs-create: true 31 | virtualenvs-in-project: true 32 | installer-parallel: true 33 | - name: Poetry install & bump version 34 | run: | 35 | poetry install --only dev 36 | poetry version patch 37 | PROJECT_VERSION=$(poetry version --short) 38 | echo "PROJECT_VERSION=$PROJECT_VERSION" >> $GITHUB_ENV 39 | git config user.name barometre-github-actions 40 | git config user.email barometre-github-actions@github.com 41 | git add pyproject.toml 42 | git commit -m "[no ci]: $PROJECT_VERSION bumping version" 43 | git push origin main 44 | - name: Login to Scaleway Container Registry 45 | uses: docker/login-action@v3 46 | with: 47 | username: nologin 48 | password: ${{ secrets.SCALEWAY_API_KEY }} 49 | registry: ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }} 50 | 51 | - name: Build mediatree_import image 52 | run: docker build -f Dockerfile_api_import . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} 53 | - name: Tag mediatree_import latest image 54 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:latest 55 | - name: Push mediatree_import Image 56 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import 57 | 58 | - name: update scaleway job definition with version mediatree_import 59 | uses: jawher/action-scw@v2.34.0 60 | env: 61 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 62 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 63 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 64 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 65 | with: 66 | args: jobs definition update ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} 67 | 68 | - name: Build s3 image 69 | run: docker build -f Dockerfile_api_to_s3 . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }} 70 | - name: Tag s3 latest image 71 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:latest 72 | - name: Push s3 Image 73 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3 74 | 75 | - name: update scaleway job definition with version s3 76 | uses: jawher/action-scw@v2.34.0 77 | env: 78 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 79 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 80 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 81 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 82 | with: 83 | args: jobs definition update ${{ secrets.SCALEWAY_JOB_S3_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }} 84 | 85 | - name: Build stop_word image 86 | run: docker build -f Dockerfile_stop_word . 
-t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }} 87 | - name: Tag stop_word latest image 88 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:latest 89 | - name: Push stop_word Image 90 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word 91 | 92 | - name: update scaleway job definition with version stopwords 93 | uses: jawher/action-scw@v2.34.0 94 | env: 95 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 96 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 97 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 98 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 99 | with: 100 | args: jobs definition update ${{ secrets.SCALEWAY_STOP_WORDS_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }} 101 | 102 | - name: update scaleway job update job 103 | uses: jawher/action-scw@v2.34.0 104 | env: 105 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 106 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 107 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 108 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 109 | with: 110 | args: jobs definition update ${{ secrets.SCALEWAY_UPDATE_JOB_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} 111 | -------------------------------------------------------------------------------- /mockwebsite/lexpress_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.lexpress.fr/societe/evasion-de-reau-par-helicoptere-lheure-du-verdict-pour-redoine-faid-SYPRU6BXSRB27DFSOLH23QSCRY/2023-10-25T10:49:48.000+02:00always0.5L'Expressfr2023-10-25T08:49:48Zhttps://www.lexpress.fr/resizer/rBes-Zxn7XqcPvpVdnoTR_0vEIM=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/4WE5RWALHVECPOS3OAQ4U6MGUI.jpghttps://www.lexpress.fr/monde/europe/le-ministre-russe-de-la-defense-sur-la-zone-de-loperation-militaire-en-ukraine-JBK5YZUYZZFNLIDRKF54LHHEAE/2023-10-25T10:31:43.539+02:00always0.5L'Expressfr2023-10-25T08:31:43.539Zhttps://www.lexpress.fr/resizer/5OIiTmRnwqg0l6dHTKEoovcmCCM=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/UAC35INRQZFKFF6GIWQRWM6RJU.jpghttps://www.lexpress.fr/monde/proche-moyen-orient/guerre-hamas-israel-macron-va-rencontrer-le-roi-de-jordanie-a-amman-ZR4BAAKC45FRRE7O4JV454AUKY/2023-10-25T10:17:31.494+02:00always0.5L'Expressfr2023-10-25T08:17:31.494Zhttps://www.lexpress.fr/resizer/cqcHx_xhgHOc6D2tPodgor6yp5M=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/CHZX6RK67VB5TGGYQJDI4FD5K4.jpghttps://www.lexpress.fr/monde/japon-decision-de-justice-tres-attendue-sur-le-changement-detat-civil-des-personnes-transgenres-TDU6FGHBANHVPNU5ZM5FI75KIU/2023-10-25T10:02:23.000+02:00always0.5L'Expressfr2023-10-25T08:02:23Zhttps://www.lexpress.fr/resizer/tDdXsSjhswEleE1mcoIdChtNwTw=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/VCTHD6G5MJAA3NYI3BDYZ6X2VI.jpghttps://www.lexpress.fr/societe/deserts-medicaux-le-senat-retoque-la-repartition-des-medecins-VEF4G4QZDZFFRLBAAWP4UP57JU/2023-10-25T09:59:37.725+02:00always0.5L'Expressfr2023-10-25T07:59:37.725Zhttps://www.lexpress.fr/resizer/NSdrjfJ2Na62498cuhWsqyqjRuk=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/XSQJAM2USRBMZFKDB2AFJ3MMAM.jpghttps://www.lexpress.fr/monde/laide-de-lonu-a-gaza-menacee-de-paralysie-discussions-autour-dune-pause-hum
anitaire-XV7BLYTMLVGJRI2XPLSREFVC2Q/2023-10-25T09:20:53.000+02:00always0.5L'Expressfr2023-10-25T07:20:53Zhttps://www.lexpress.fr/resizer/l-bVyVd-EWXxoZLn5QoX6xp5cAI=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/SG5MCUKVT5EDHN4HJYAONCMGDM.jpg -------------------------------------------------------------------------------- /mockwebsite/francebleu_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.francebleu.fr/infos/faits-divers-justice/accident-villognon-le-maitre-d-hotel-blesse-au-pied-droit-sera-indemnise-1322246France Bleufr2023-10-26T09:57:37+00:00Accident Villognon : le maître d'hôtel blessé au pied droit sera indemniséFaits divers - Justice, François Hollande, justice, Infoshttps://www.francebleu.fr/infos/culture-loisirs/en-route-pour-une-nouvelle-semaine-de-cadeaux-avec-france-bleu-vaucluse-9545500France Bleufr2023-10-26T09:55:40+00:00En route pour une nouvelle semaine de cadeaux avec France Bleu Vaucluse ! Culture - Loisirs, Infoshttps://www.francebleu.fr/infos/economie-social/dans-les-deux-charentes-payez-vos-factures-d-electricite-moins-cheres-grace-a-l-achat-groupe-d-energie-9597185France Bleufr2023-10-26T09:51:10+00:00Dans les deux Charentes : payez vos factures d'électricité moins chères grâce à l'achat groupé d'énergieÉconomie - Social, Énergie, UFC Que Choisir, Électricité, Inflation, Économies d'énergie – Éco-gestes, Infoshttps://www.francebleu.fr/infos/faits-divers-justice/cette-histoire-m-a-traumatise-francis-nachbar-ancien-magistrat-publie-un-livre-sur-les-affaires-fourniret-6174798France Bleufr2023-10-26T09:47:09+00:00 "Cette histoire m'a traumatisé", Francis Nachbar, ancien magistrat publie un livre sur les affaires FourniretFaits divers - Justice, Les affaires Fourniret, justice, Monique Olivier, Enquêtes – Investigation, Infoshttps://www.francebleu.fr/sports/football/liverpool-tfc-gagner-a-anfield-c-est-si-rare-pour-un-club-francais-5163393France Bleufr2023-10-26T09:46:03+00:00Liverpool-TFC : gagner à Anfield, c'est si rare pour un club françaisFootball, TFC - Toulouse Football Club, Europa League, Toulouse, Sportshttps://www.francebleu.fr/infos/faits-divers-justice/caen-coups-de-marteau-menaces-de-mort-et-videos-humiliantes-un-jeune-homme-condamne-a-2-ans-de-prison-ferme-4264720France Bleufr2023-10-26T09:43:03+00:00Caen: coups de marteau, menaces de mort et vidéos humiliantes: un jeune homme condamné à 2 ans de prison fermeFaits divers - Justice, Infoshttps://www.francebleu.fr/infos/faits-divers-justice/seine-maritime-un-jeune-homme-de-21-ans-tue-par-balles-a-maromme-l-auteur-en-fuite-5680400France Bleufr2023-10-26T09:40:36+00:00Seine-Maritime : un jeune homme de 21 ans tué par balles à Maromme, l'auteur en fuiteFaits divers - Justice, Armes à feu, Agression, Police nationale, Enquêtes – Investigation, Infoshttps://www.francebleu.fr/infos/societe/ehpad-une-enquete-de-60-millions-de-consommateurs-pointe-du-doigt-la-qualite-des-repas-servis-6540286France Bleufr2023-10-26T09:39:07+00:00Ehpad : une enquête de 60 millions de consommateurs pointe du doigt la qualité des repas servisSociété, Maisons de retraite – Ehpad, Alimentation, Infoshttps://www.francebleu.fr/infos/societe/poule-cherche-nouvelle-maison-a-pia-un-eleveur-met-a-la-vente-ses-pondeuses-avec-poule-pour-tous-4774679France Bleufr2023-10-26T09:31:46+00:00Poule cherche nouvelle maison, à Pia un éleveur met à la vente ses pondeuses avec Poule pour tous Société, poulet, animaux, Élevage, 
Infoshttps://www.francebleu.fr/infos/faits-divers-justice/cannabis-cocaine-ecstasy-1-homme-et-1-femme-arretes-a-bagnols-sur-ceze-5907058France Bleufr2023-10-26T09:27:41+00:00Cannabis, cocaïne, ecstasy : un homme et une femme arrêtés à Bagnols-sur-CèzeFaits divers - Justice, Gard, Drogues, Police nationale, Infoshttps://www.francebleu.fr/infos/societe/une-cinquantaine-d-habitants-de-mourenx-denoncent-les-odeurs-intenables-du-methaniseur-biobearn-2192014France Bleufr2023-10-26T09:12:24+00:00Une cinquantaine d'habitants de Mourenx dénoncent les odeurs "intenables" du méthaniseur BiobéarnSociété, Entreprises, Infos --------------------------------------------------------------------------------
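
A note on the weekday matching used in my_dbt_project/models/dashboards/core_query_thematics_keywords.sql above: the expression ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7), with 0 then remapped to 7, converts PostgreSQL's day-of-week numbering (Sunday = 0 .. Saturday = 6) into the Monday = 1 .. Sunday = 7 numbering that pm.weekday is compared against. Below is a minimal Python sketch of the same mapping, assuming ISO-style weekday semantics on the program_metadata side (the helper name is illustrative and not part of the repo):

from datetime import date, timedelta

def postgres_dow_to_iso(dow: int) -> int:
    """Mirror the SQL expression ((dow + 1 + 6) % 7), with 0 remapped to 7."""
    shifted = (dow + 1 + 6) % 7  # adding 7 is a no-op modulo 7, so this equals dow for 0..6
    return 7 if shifted == 0 else shifted

# Sanity check against Python's ISO weekday (Monday = 1 .. Sunday = 7):
for offset in range(7):
    d = date(2024, 10, 6) + timedelta(days=offset)  # 2024-10-06 is a Sunday
    postgres_dow = (d.weekday() + 1) % 7  # PostgreSQL DOW: Sunday = 0 .. Saturday = 6
    assert postgres_dow_to_iso(postgres_dow) == d.isoweekday()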