├── s3
│   └── .empty
├── secrets
│   └── .empty
├── alembic
│   ├── versions
│   │   ├── .keep
│   │   ├── 2450da0e6c60_number_of_keywords_20_30_40.py
│   │   ├── 5cc9e1ec5362_add_info_public_to_program_metadata.py
│   │   ├── c1d78b9968fe_add_info_public_to_program_metadata.py
│   │   ├── a5c39db3c8e9_add_new_column_test_for_table_keywords.py
│   │   ├── 43103d5b49c9_program_add_start_end_date_for_grid_.py
│   │   ├── 5ccd746ee292_add_updated_at.py
│   │   ├── 055173743036_keywords_add_channel_title.py
│   │   ├── a0a707673259_add_radio_to_program_metadata.py
│   │   ├── 30abfd828007_program_metadata.py
│   │   ├── 5bff4dceda53_add_info_public_to_program_metadata.py
│   │   ├── 827fb6dde3bb_time_monitored_new_table.py
│   │   ├── c08231a9eb37_program_add_created_at_updated_at.py
│   │   ├── 2c48f626a749_keywords_program_name.py
│   │   ├── 4ccd746ee291_add_20_30.py
│   │   ├── af956a85658f_add_new_column_number_of_keywords_.py
│   │   ├── 356882459cec_remove_category_keywords_change_columns_.py
│   │   ├── 4333bc46985d_keywords_program_id_foreign_key.py
│   │   ├── 44f13b7eebd4_dictionary_category.py
│   │   ├── ac96222af6fe_hrfp_counters.py
│   │   └── a578d21d7aee_add_tables_labelstudio.py
│   ├── script.py.mako
│   └── env.py
├── my_dbt_project
│   ├── analyses
│   │   └── .gitkeep
│   ├── macros
│   │   └── .gitkeep
│   ├── seeds
│   │   ├── .gitkeep
│   │   └── time_monitored.csv
│   ├── snapshots
│   │   └── .gitkeep
│   ├── tests
│   │   └── .gitkeep
│   ├── pytest_tests
│   │   ├── .gitkeep
│   │   └── test_dbt_model_analytics.py
│   ├── .gitignore
│   ├── dbt
│   │   ├── .user.yml
│   │   └── profiles.yml
│   ├── README.md
│   └── models
│       ├── analytics
│       │   └── environmental_shares_with_desinfo_counts.sql
│       └── dashboards
│           ├── core_query_causal_links.sql
│           ├── core_query_thematics_keywords_i8n.sql
│           ├── thematic_query_ocean.sql
│           └── core_query_thematics_keywords.sql
├── quotaclimat
│   ├── utils
│   │   ├── __init__.py
│   │   ├── coverquotaclimat.png
│   │   ├── logger.py
│   │   ├── healthcheck_config.py
│   │   └── sentry.py
│   ├── data_ingestion
│   │   ├── __init__.py
│   │   ├── ingest_db
│   │   │   ├── __init__.py
│   │   │   └── ingest_sitemap_in_db.py
│   │   ├── labelstudio
│   │   │   └── configs.py
│   │   └── scrap_html
│   │       └── scrap_description_article.py
│   ├── data_processing
│   │   ├── __init__.py
│   │   └── mediatree
│   │       ├── i8n
│   │       │   ├── dictionary.py
│   │       │   ├── brazil
│   │       │   │   ├── __init__.py
│   │       │   │   └── channel_titles.py
│   │       │   ├── france
│   │       │   │   ├── __init__.py
│   │       │   │   └── channel_titles.py
│   │       │   ├── poland
│   │       │   │   ├── __init__.py
│   │       │   │   └── channel_titles.py
│   │       │   ├── spain
│   │       │   │   ├── __init__.py
│   │       │   │   ├── channel_titles.py
│   │       │   │   └── channel_program.py
│   │       │   └── germany
│   │       │       ├── __init__.py
│   │       │       ├── channel_titles.py
│   │       │       └── channel_program.py
│   │       ├── config.py
│   │       ├── api_import_utils
│   │       │   └── db.py
│   │       └── time_monitored
│   │           └── models.py
│   └── __init__.py
├── document-experts
│   └── .download-from-gdrive.empty
├── .dockerignore
├── postgres
│   ├── schemas
│   │   ├── base.py
│   │   └── sitemap.pgsql
│   ├── insert_existing_data_example.py
│   ├── database_connection.py
│   └── insert_data.py
├── docs
│   └── images
│       └── data_tiers.png
├── mockwebsite
│   ├── README.md
│   ├── cnews_sitemap.xml
│   ├── lefigaro_localhost_sitemap.xml
│   ├── 20minutes_sitemap.xml
│   ├── lefigaro_sitemap.xml
│   ├── lacroix_sitemap.xml
│   ├── midilibre_sitemap.xml
│   ├── franceinter_sitemap.xml
│   ├── republiquepyrenees_sitemap.xml
│   ├── liberation_sitemap.xml
│   ├── nicematin_sitemap.xml
│   ├── letelegramme_sitemap.xml
│   ├── leparisien_sitemap.xml
│   ├── lexpress_sitemap.xml
│   └── francebleu_sitemap.xml
├── test
│   ├── s3
│   │   ├── one-day-one-channel.parquet
│   │   └── test_s3.py
│   ├── sitemap
│   │   ├── test_utils.py
│   │   ├── test_scrap_html.py
│   │   ├── test_mediatree_utils.py
│   │   ├── test_keywords.py
│   │   └── test_main_import_api.py
│   ├── time_monitored
│   │   └── test_time_monitored.py
│   ├── i8n
│   │   └── test_country.py
│   └── mediatree
│       └── test_mediatree_queries.py
├── .flake8
├── i8n
│   └── mediatree_output
│       └── year=2024
│           └── month=10
│               └── day=1
│                   └── channel=LAUNE
│                       └── data.parquet
├── docker-entrypoint_stop_word.sh
├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── docker-compose.yml
│       ├── dependabot-auto-approve.yml
│       ├── scaleway-down.yml
│       ├── scaleway-up.yml
│       ├── scaleway-start-import-job-update.yml
│       └── deploy-main.yml
├── .vscode
│   └── launch.json
├── Dockerfile_ingest
├── LICENSE
├── Dockerfile
├── Dockerfile_stop_word
├── Dockerfile_api_to_s3
├── Dockerfile_api_import
├── pyproject.toml
├── docker-entrypoint.sh
├── .gitignore
├── analyse
│   └── mediatree
│       └── test_program_durations.ipynb
└── alembic.ini
/s3/.empty:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/secrets/.empty:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/alembic/versions/.keep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/my_dbt_project/analyses/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/my_dbt_project/macros/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/my_dbt_project/seeds/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/my_dbt_project/snapshots/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/my_dbt_project/tests/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/quotaclimat/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/my_dbt_project/pytest_tests/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/quotaclimat/data_processing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/document-experts/.download-from-gdrive.empty:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/ingest_db/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/dictionary.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/my_dbt_project/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | target/
3 | dbt_packages/
4 | logs/
5 |
--------------------------------------------------------------------------------
/my_dbt_project/dbt/.user.yml:
--------------------------------------------------------------------------------
1 | id: e72efce9-d03e-4b9f-b04b-c919cc719b38
2 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | pgdata
2 | .git
3 | .venv
4 | venv
5 | .vscode
6 | notebooks
7 | LICENSE
8 | .idea
9 |
--------------------------------------------------------------------------------
/postgres/schemas/base.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.orm import declarative_base
2 |
3 | Base = declarative_base()
--------------------------------------------------------------------------------
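All ORM models share this declarative `Base`. As a purely illustrative sketch (the project's real models live in `postgres/schemas/models.py`, which the tests import but which is not part of this dump), a mapped class would be declared like this:

```python
# Hypothetical model, for illustration only; not a class from this repository.
from sqlalchemy import Column, DateTime, Integer, String

from postgres.schemas.base import Base


class ExampleKeyword(Base):
    __tablename__ = "example_keywords"

    id = Column(String, primary_key=True)
    channel_name = Column(String, nullable=False)
    start = Column(DateTime(), nullable=False)
    number_of_keywords = Column(Integer, nullable=True)
```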
/docs/images/data_tiers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/docs/images/data_tiers.png
--------------------------------------------------------------------------------
/mockwebsite/README.md:
--------------------------------------------------------------------------------
1 | Everything in this folder is served by an nginx Docker image so the sitemaps can be tested locally.
--------------------------------------------------------------------------------
/test/s3/one-day-one-channel.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/test/s3/one-day-one-channel.parquet
--------------------------------------------------------------------------------
/quotaclimat/utils/coverquotaclimat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/quotaclimat/utils/coverquotaclimat.png
--------------------------------------------------------------------------------
/quotaclimat/__init__.py:
--------------------------------------------------------------------------------
1 | # Useless in the current structure
2 | # from quotaclimat.ui.streamlit_dashboard import main as build_dashboard
3 |
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/brazil/__init__.py:
--------------------------------------------------------------------------------
1 | from .channel_program import channels_programs_brazil
2 | from .channel_titles import channel_titles_brazil
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/france/__init__.py:
--------------------------------------------------------------------------------
1 | from .channel_program import channels_programs_france
2 | from .channel_titles import channel_titles_france
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/poland/__init__.py:
--------------------------------------------------------------------------------
1 | from .channel_program import channels_programs_poland
2 | from .channel_titles import channel_titles_poland
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/spain/__init__.py:
--------------------------------------------------------------------------------
1 | from .channel_program import channels_programs_spain
2 | from .channel_titles import channel_titles_spain
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/germany/__init__.py:
--------------------------------------------------------------------------------
1 | from .channel_program import channels_programs_germany
2 | from .channel_titles import channel_titles_germany
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | extend-ignore = E203,E501,F401
4 | exclude =
5 |     .venv,
6 |     .git
7 | per-file-ignores =
8 |     */__init__.py:F403,F401
9 |
--------------------------------------------------------------------------------
/i8n/mediatree_output/year=2024/month=10/day=1/channel=LAUNE/data.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/i8n/mediatree_output/year=2024/month=10/day=1/channel=LAUNE/data.parquet
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/germany/channel_titles.py:
--------------------------------------------------------------------------------
1 | channel_titles_germany = {
2 |     "daserste":"Das Erste",
3 |     "zdf-neo":"ZDFneo",
4 |     "zdf":"ZDF",
5 |     "rtl-television":"RTL",
6 |     "sat1":"Sat.1",
7 |     "prosieben":"ProSieben",
8 |     "kabel-eins":"Kabel Eins",
9 | }
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/brazil/channel_titles.py:
--------------------------------------------------------------------------------
1 | channel_titles_brazil = {
2 |     "tvbrasil":"TV Brasil",
3 |     "tvglobo":"TV Globo",
4 |     "tvrecord":"TV Record",
5 |     "sbt":"SBT",
6 |     "redebandeirantes":"Band",
7 |     "jovempan":"Jovem Pan",
8 |     "cnnbrasil":"CNN Brasil",
9 | }
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/spain/channel_titles.py:
--------------------------------------------------------------------------------
1 | channel_titles_spain = {
2 |     "antenna-3": "Antenna 3",
3 |     "rtve-la-1": "RTVE La 1",
4 |     "rtve-24h": "RTVE 24h",
5 |     "lasexta-news": "LaSexta News",
6 |     "telecinco-news": "Telecinco News",
7 |     "cuatro-news": "Cuatro News",
8 | }
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/poland/channel_titles.py:
--------------------------------------------------------------------------------
1 | channel_titles_poland = {
2 |     "tvp": "TVP",
3 |     "polsat": "Polsat",
4 |     "tvn": "TVN",
5 |     "polskie-radio": "Polskie Radio",
6 |     "tofkm": "TOFKM",
7 |     "radio-zet": "Radio Zet",
8 |     "eska": "Eska",
9 |     "tokfm": "TOKFM",
10 | }
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/labelstudio/configs.py:
--------------------------------------------------------------------------------
1 | db_config = [
2 |     {"database": "labelstudio", "countries": {6: "france", 9: "brazil", 20: "germany"}},
3 |     {"database": "labelstudio-climate-poland-prod-db", "countries": {1: "poland"}},
4 |     {"database": "labelstudio-climate-spain-prod-db", "countries": {1: "spain"}},
5 | ]
--------------------------------------------------------------------------------
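`db_config` maps each Label Studio database to the country covered by each of its project ids. A small lookup sketch (the helper `country_for` is hypothetical, not repository code):

```python
# Hypothetical helper showing how db_config can be consumed.
from quotaclimat.data_ingestion.labelstudio.configs import db_config


def country_for(database: str, project_id: int) -> str | None:
    # Return the country handled by a given Label Studio database/project pair.
    for entry in db_config:
        if entry["database"] == database:
            return entry["countries"].get(project_id)
    return None


print(country_for("labelstudio", 9))  # -> "brazil"
```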
/postgres/schemas/sitemap.pgsql:
--------------------------------------------------------------------------------
1 | CREATE TABLE sitemap_table(
2 |     publication_name VARCHAR(255) NOT NULL,
3 |     news_title TEXT NOT NULL,
4 |     download_date DATE NOT NULL,
5 |     news_publication_date DATE NOT NULL,
6 |     news_keywords TEXT,
7 |     section TEXT,
8 |     image_caption TEXT,
9 |     media_type VARCHAR(255)
10 | );
11 |
--------------------------------------------------------------------------------
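For illustration, a row matching this schema could be inserted from Python as below. This is a hedged sketch only: psycopg2 and the connection settings are assumptions, and the repository's own ingestion code lives in postgres/insert_data.py (contents not shown in this dump).

```python
# Sketch: insert one row into sitemap_table with psycopg2 (assumed dependency).
import psycopg2

conn = psycopg2.connect(
    host="localhost", dbname="quotaclimat", user="user", password="password"  # placeholders
)
with conn, conn.cursor() as cur:
    cur.execute(
        """
        INSERT INTO sitemap_table
            (publication_name, news_title, download_date, news_publication_date,
             news_keywords, section, image_caption, media_type)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """,
        ("lemonde", "Example title", "2024-01-01", "2024-01-01",
         "climat", "planete", None, "webpress"),
    )
conn.close()
```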
/docker-entrypoint_stop_word.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Run migrations before starting the application
4 | echo "Running alembic migrations if any exist"
5 | poetry run alembic upgrade head
6 |
7 | if [[ $? -eq 0 ]]; then
8 |     echo "Migrations succeeded"
9 | else
10 |     echo "Migrations failed"
11 | fi
12 |
13 | echo "Starting stop_word import app"
14 | python quotaclimat/data_processing/mediatree/stop_word/main.py
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5 |
6 | version: 2
7 | updates:
8 |   - package-ecosystem: "pip" # See documentation for possible values
9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 |
--------------------------------------------------------------------------------
/.github/workflows/docker-compose.yml:
--------------------------------------------------------------------------------
1 | name: Docker Compose CI
2 |
3 | on:
4 |   workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch
5 |
6 | jobs:
7 |   build:
8 |     runs-on: ubuntu-latest
9 |     steps:
10 |       - uses: actions/checkout@v3
11 |       - name: init and load data
12 |         run: docker compose up -d
13 |       - name: sleep
14 |         run: sleep 60
15 |       - name: log sitemap
16 |         run: docker logs sitemap
17 |       - name: log db ingestion
18 |         run: docker logs ingest_to_db
19 |       - name: log streamlit
20 |         run: docker logs streamlit
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/france/channel_titles.py:
--------------------------------------------------------------------------------
1 | channel_titles_france = {
2 |     "tf1": "TF1",
3 |     "france2": "France 2",
4 |     "fr3-idf": "France 3-idf",
5 |     "m6": "M6",
6 |     "arte": "Arte",
7 |     "d8": "C8",
8 |     "bfmtv": "BFM TV",
9 |     "lci": "LCI",
10 |     "franceinfotv": "France Info TV",
11 |     "itele": "CNews",
12 |     "europe1": "Europe 1",
13 |     "france-culture": "France Culture",
14 |     "france-inter": "France Inter",
15 |     "sud-radio": "Sud Radio",
16 |     "rmc": "RMC",
17 |     "rtl": "RTL",
18 |     "france24": "France 24",
19 |     "france-info": "FranceinfoRadio",
20 |     "rfi": "RFI",
21 | }
--------------------------------------------------------------------------------
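Each `i8n` country module exposes a slug-to-display-title dict like the one above; the keys appear to be the channel ids used by the Mediatree API, and the values are human-readable names. A minimal lookup example:

```python
from quotaclimat.data_processing.mediatree.i8n.france.channel_titles import (
    channel_titles_france,
)

assert channel_titles_france["itele"] == "CNews"

# Tolerant variant that falls back to the raw slug for unknown channels:
def display_title(slug: str) -> str:
    return channel_titles_france.get(slug, slug)
```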
/my_dbt_project/README.md:
--------------------------------------------------------------------------------
1 | Welcome to your new dbt project!
2 |
3 | ### Using the starter project
4 |
5 | Try running the following commands:
6 | - cd my_dbt_project
7 | - dbt debug
8 | - dbt run
9 | - dbt test
10 |
11 |
12 | ### Resources:
13 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
14 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
15 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
16 | - Find [dbt events](https://events.getdbt.com) near you
17 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
18 |
--------------------------------------------------------------------------------
/.github/workflows/dependabot-auto-approve.yml:
--------------------------------------------------------------------------------
1 | name: Dependabot auto-approve
2 | on: pull_request
3 |
4 | permissions:
5 |   pull-requests: write
6 |
7 | jobs:
8 |   dependabot:
9 |     runs-on: ubuntu-latest
10 |     if: github.event.pull_request.user.login == 'dependabot[bot]' && github.repository == 'dataforgoodfr/quotaclimat'
11 |     steps:
12 |       - name: Dependabot metadata
13 |         id: metadata
14 |         uses: dependabot/fetch-metadata@v2
15 |         with:
16 |           github-token: "${{ secrets.GITHUB_TOKEN }}"
17 |       - name: Approve a PR
18 |         run: gh pr review --approve "$PR_URL"
19 |         env:
20 |           PR_URL: ${{github.event.pull_request.html_url}}
21 |           GH_TOKEN: ${{secrets.GITHUB_TOKEN}}
22 |
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # When the env var holds the literal secret path, read the value from the mounted Docker secret
4 | def get_password():
5 |     password = os.environ.get("MEDIATREE_PASSWORD")
6 |     if password == '/run/secrets/pwd_api':
7 |         with open("/run/secrets/pwd_api", "r") as secret_file:
8 |             password = secret_file.read()
9 |     return password
10 |
11 | def get_auth_url():
12 |     return os.environ.get("MEDIATREE_AUTH_URL")
13 |
14 | def get_user():
15 |     user = os.environ.get("MEDIATREE_USER")
16 |     if user == '/run/secrets/username_api':
17 |         with open("/run/secrets/username_api", "r") as secret_file:
18 |             user = secret_file.read()
19 |     return user
20 |
21 | # https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList
22 | def get_keywords_url():
23 |     return os.environ.get("KEYWORDS_URL")
--------------------------------------------------------------------------------
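These getters follow a Docker-secrets convention: the environment variable either carries the credential directly or holds the literal secret path, in which case the value is read from the mounted file. A usage sketch with placeholder values:

```python
# Sketch: consuming the Mediatree credential getters with placeholder env values.
import os

from quotaclimat.data_processing.mediatree.config import get_password, get_user

os.environ.setdefault("MEDIATREE_USER", "demo-user")          # placeholder
os.environ.setdefault("MEDIATREE_PASSWORD", "demo-password")  # placeholder

credentials = {"username": get_user(), "password": get_password()}
```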
/alembic/script.py.mako:
--------------------------------------------------------------------------------
1 | """${message}
2 |
3 | Revision ID: ${up_revision}
4 | Revises: ${down_revision | comma,n}
5 | Create Date: ${create_date}
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | ${imports if imports else ""}
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = ${repr(up_revision)}
16 | down_revision: Union[str, None] = ${repr(down_revision)}
17 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
18 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
19 |
20 |
21 | def upgrade() -> None:
22 |     ${upgrades if upgrades else "pass"}
23 |
24 |
25 | def downgrade() -> None:
26 |     ${downgrades if downgrades else "pass"}
27 |
--------------------------------------------------------------------------------
/my_dbt_project/dbt/profiles.yml:
--------------------------------------------------------------------------------
1 | my_dbt_project:
2 |   outputs:
3 |     docker:
4 |       pass: "{{ env_var('POSTGRES_PASSWORD') }}"
5 |       port: "{{ env_var('POSTGRES_PORT') | as_number }}"
6 |       schema: public
7 |       threads: 4
8 |       type: postgres
9 |       user: "{{ env_var('POSTGRES_USER') }}"
10 |       dbname: "{{ env_var('POSTGRES_DB') }}"
11 |       host: "{{ env_var('POSTGRES_HOST') }}"
12 |     analytics:
13 |       pass: "{{ env_var('POSTGRES_PASSWORD') }}"
14 |       port: "{{ env_var('POSTGRES_PORT') | as_number }}"
15 |       schema: analytics
16 |       threads: 4
17 |       type: postgres
18 |       user: "{{ env_var('POSTGRES_USER') }}"
19 |       dbname: "{{ env_var('POSTGRES_DB') }}"
20 |       host: "{{ env_var('POSTGRES_HOST') }}"
21 |   target: docker
22 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 |     "version": "0.2.0",
3 |     "configurations": [
4 |         {
5 |             "name": "Python: Current File",
6 |             "type": "python",
7 |             "request": "launch",
8 |             "program": "${file}",
9 |             "console": "integratedTerminal",
10 |             "justMyCode": true
11 |         },
12 |         {
13 |             "name": "Python: File",
14 |             "type": "python",
15 |             "request": "launch",
16 |             "program": "${file}",
17 |             "justMyCode": true
18 |         },
19 |         {
20 |             "name": "Python data: Current File",
21 |             "type": "python",
22 |             "request": "launch",
23 |             "program": "${file}",
24 |             "console": "integratedTerminal"
25 |         }
26 |     ]
27 | }
--------------------------------------------------------------------------------
/postgres/insert_existing_data_example.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | import pandas as pd
5 |
6 | from quotaclimat.data_ingestion.scrap_sitemap import get_sitemap_cols
7 |
8 |
9 | def parse_section(section):
10 |     logging.debug(section)
11 |     # Strings are kept as-is; lists of section names are joined with commas
12 |     if isinstance(section, str):
13 |         return section
14 |     return ",".join(map(str, section))
15 |
16 | def transformation_from_dumps_to_table_entry(df: pd.DataFrame):
17 |     try:
18 |         cols = get_sitemap_cols()
19 |         df_template_db = pd.DataFrame(columns=cols)
20 |         df_consistent = pd.concat([df, df_template_db])
21 |
22 |         df_consistent.section = df_consistent.section.apply(parse_section)
23 |
24 |         return df_consistent[cols]
25 |     except Exception as err:
26 |         logging.error("Could not transform %s" % (err))
27 |         return None
--------------------------------------------------------------------------------
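`parse_section` normalizes the `section` column: plain strings pass through, while lists are flattened to a comma-separated string. For example (assuming the full repository is importable, since this module also pulls in `quotaclimat.data_ingestion.scrap_sitemap`):

```python
from postgres.insert_existing_data_example import parse_section

assert parse_section("politique") == "politique"                  # string kept as-is
assert parse_section(["climat", "energie"]) == "climat,energie"   # list joined
```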
/alembic/versions/2450da0e6c60_number_of_keywords_20_30_40.py:
--------------------------------------------------------------------------------
1 | """number of keywords 20,30,40
2 |
3 | Revision ID: 2450da0e6c60
4 | Revises: 055173743036
5 | Create Date: 2024-06-19 10:21:34.624231
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '2450da0e6c60'
16 | down_revision: Union[str, None] = '055173743036'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     pass
24 |     # ### end Alembic commands ###
25 |
26 |
27 | def downgrade() -> None:
28 |     # ### commands auto generated by Alembic - please adjust! ###
29 |     pass
30 |     # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/alembic/versions/5cc9e1ec5362_add_info_public_to_program_metadata.py:
--------------------------------------------------------------------------------
1 | """Add info/public to program metadata
2 |
3 | Revision ID: 5cc9e1ec5362
4 | Revises: 356882459cec
5 | Create Date: 2024-05-03 08:54:16.764307
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '5cc9e1ec5362'
16 | down_revision: Union[str, None] = '356882459cec'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     pass
24 |     # ### end Alembic commands ###
25 |
26 |
27 | def downgrade() -> None:
28 |     # ### commands auto generated by Alembic - please adjust! ###
29 |     pass
30 |     # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/alembic/versions/c1d78b9968fe_add_info_public_to_program_metadata.py:
--------------------------------------------------------------------------------
1 | """Add info/public to program metadata
2 |
3 | Revision ID: c1d78b9968fe
4 | Revises: 5cc9e1ec5362
5 | Create Date: 2024-05-03 08:56:47.087189
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = 'c1d78b9968fe'
16 | down_revision: Union[str, None] = '5cc9e1ec5362'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     pass
24 |     # ### end Alembic commands ###
25 |
26 |
27 | def downgrade() -> None:
28 |     # ### commands auto generated by Alembic - please adjust! ###
29 |     pass
30 |     # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/alembic/versions/a5c39db3c8e9_add_new_column_test_for_table_keywords.py:
--------------------------------------------------------------------------------
1 | """Add new column test for table keywords
2 |
3 | Revision ID: a5c39db3c8e9
4 | Revises: 5ccd746ee292
5 | Create Date: 2024-09-12 14:10:26.305593
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = 'a5c39db3c8e9'
16 | down_revision: Union[str, None] = '5ccd746ee292'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     pass
24 |     # ### end Alembic commands ###
25 |
26 |
27 | def downgrade() -> None:
28 |     # ### commands auto generated by Alembic - please adjust! ###
29 |     pass
30 |     # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/alembic/versions/43103d5b49c9_program_add_start_end_date_for_grid_.py:
--------------------------------------------------------------------------------
1 | """program: add start/end date for grid evolution
2 |
3 | Revision ID: 43103d5b49c9
4 | Revises: af956a85658f
5 | Create Date: 2024-10-02 13:18:56.251135
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '43103d5b49c9'
16 | down_revision: Union[str, None] = 'af956a85658f'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     pass
24 |     # ### end Alembic commands ###
25 |
26 |
27 | def downgrade() -> None:
28 |     # ### commands auto generated by Alembic - please adjust! ###
29 |     pass
30 |     # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/alembic/versions/5ccd746ee292_add_updated_at.py:
--------------------------------------------------------------------------------
1 | """add 20/30
2 |
3 | Revision ID: 5ccd746ee292
4 | Revises: 4ccd746ee291
5 | Create Date: 2024-07-03 06:35:00.316441
6 | """
7 | from typing import Sequence, Union
8 |
9 | from alembic import op
10 | import sqlalchemy as sa
11 | from sqlalchemy.dialects import postgresql
12 |
13 | # revision identifiers, used by Alembic.
14 | revision: str = '5ccd746ee292'
15 | down_revision: Union[str, None] = '4ccd746ee291'
16 | branch_labels: Union[str, Sequence[str], None] = None
17 | depends_on: Union[str, Sequence[str], None] = None
18 |
19 | def upgrade() -> None:
20 |     # ### commands auto generated by Alembic - please adjust! ###
21 |     op.add_column('keywords', sa.Column('updated_at', sa.DateTime(), nullable=True))
22 |     # ### end Alembic commands ###
23 |
24 |
25 | def downgrade() -> None:
26 |     # ### commands auto generated by Alembic - please adjust! ###
27 |     op.drop_column('keywords', 'updated_at')
28 |     # ### end Alembic commands ###
29 |
--------------------------------------------------------------------------------
/alembic/versions/055173743036_keywords_add_channel_title.py:
--------------------------------------------------------------------------------
1 | """keywords: add channel_title
2 |
3 |
4 | Revision ID: 055173743036
5 | Revises: a0a707673259
6 | Create Date: 2024-06-05 11:43:22.071610
7 |
8 | """
9 | from typing import Sequence, Union
10 |
11 | from alembic import op
12 | import sqlalchemy as sa
13 |
14 |
15 | # revision identifiers, used by Alembic.
16 | revision: str = '055173743036'
17 | down_revision: Union[str, None] = 'a0a707673259'
18 | branch_labels: Union[str, Sequence[str], None] = None
19 | depends_on: Union[str, Sequence[str], None] = None
20 |
21 |
22 | def upgrade() -> None:
23 |     # ### commands auto generated by Alembic - please adjust! ###
24 |     op.add_column('keywords', sa.Column('channel_title', sa.String(), nullable=True))
25 |     # ### end Alembic commands ###
26 |
27 |
28 | def downgrade() -> None:
29 |     # ### commands auto generated by Alembic - please adjust! ###
30 |     op.drop_column('keywords', 'channel_title')
31 |     # ### end Alembic commands ###
32 |
--------------------------------------------------------------------------------
/alembic/versions/a0a707673259_add_radio_to_program_metadata.py:
--------------------------------------------------------------------------------
1 | """Add radio to program metadata
2 |
3 | Revision ID: a0a707673259
4 | Revises: 5bff4dceda53
5 | Create Date: 2024-05-03 09:36:04.954535
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = 'a0a707673259'
16 | down_revision: Union[str, None] = '5bff4dceda53'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('program_metadata', sa.Column('radio', sa.Boolean(), nullable=True))
24 |     # ### end Alembic commands ###
25 |
26 |
27 | def downgrade() -> None:
28 |     # ### commands auto generated by Alembic - please adjust! ###
29 |     op.drop_column('program_metadata', 'radio')
30 |     # ### end Alembic commands ###
31 |
--------------------------------------------------------------------------------
/test/sitemap/test_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import pandas as pd
4 |
5 | def get_localhost():
6 |     if os.environ.get("ENV") == "docker":
7 |         return "http://nginxtest:80"
8 |     return "http://localhost:8000"
9 |
10 | def debug_df(df: pd.DataFrame):
11 |     pd.set_option('display.max_columns', None)
12 |     logging.warning("--------------------DEBUG DF-------------------")
13 |     logging.info(df.dtypes)
14 |     logging.info(df.head(3))
15 |     logging.warning("--------------------DEBUG DF-------------------")
16 |
17 |
18 | def list_of_dicts_to_set_of_frozensets(list_of_dicts):
19 |     # Convert each dictionary to a frozenset to make it hashable
20 |     return {frozenset(d.items()) for d in list_of_dicts}
21 |
22 | def compare_unordered_lists_of_dicts(list1, list2):
23 |     # Convert each list of dictionaries to a set of frozensets
24 |     set1 = list_of_dicts_to_set_of_frozensets(list1)
25 |     set2 = list_of_dicts_to_set_of_frozensets(list2)
26 |
27 |     # Check if the sets are equal
28 |     return set1 == set2
--------------------------------------------------------------------------------
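`compare_unordered_lists_of_dicts` provides order-insensitive equality for lists of flat dicts with hashable values (note that duplicate dicts collapse into a single frozenset entry). A quick example, assuming the repo root is on `PYTHONPATH`:

```python
from test.sitemap.test_utils import compare_unordered_lists_of_dicts

a = [{"channel": "tf1", "count": 2}, {"channel": "m6", "count": 1}]
b = [{"channel": "m6", "count": 1}, {"channel": "tf1", "count": 2}]

assert compare_unordered_lists_of_dicts(a, b)          # same dicts, different order
assert not compare_unordered_lists_of_dicts(a, a[:1])  # an element is missing
```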
/Dockerfile_ingest:
--------------------------------------------------------------------------------
1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0
2 | FROM python:3.11 as builder
3 |
4 | ENV VIRTUAL_ENV=/app/.venv
5 |
6 | ENV POETRY_NO_INTERACTION=1 \
7 |     POETRY_VIRTUALENVS_IN_PROJECT=1 \
8 |     POETRY_VIRTUALENVS_CREATE=1 \
9 |     POETRY_CACHE_DIR=/tmp/poetry_cache
10 |
11 | WORKDIR /app
12 |
13 | COPY pyproject.toml poetry.lock ./
14 |
15 | RUN pip install poetry==2.1.3
16 |
17 | RUN poetry install --no-root
18 |
19 | # The runtime image, used to just run the code provided its virtual environment
20 | FROM python:3.11-slim as runtime
21 |
22 | WORKDIR /app
23 |
24 | ENV VIRTUAL_ENV=/app/.venv
25 | ENV PATH="/app/.venv/bin:$PATH"
26 | ENV PATH="$PYENV_ROOT/bin:$PATH"
27 | ENV PYTHONPATH=/app
28 |
29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
30 |
31 | # App code is also included via docker-compose
32 |
33 | COPY quotaclimat ./quotaclimat
34 | COPY postgres ./postgres
35 | COPY pyproject.toml pyproject.toml
36 |
37 | # healthcheck
38 | EXPOSE 5000
39 |
40 |
41 | ENTRYPOINT ["python", "quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py"]
42 |
--------------------------------------------------------------------------------
/test/time_monitored/test_time_monitored.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pytest
3 | import pandas as pd
4 |
5 | from postgres.schemas.models import get_db_session, connect_to_db, create_tables
6 | from quotaclimat.data_processing.mediatree.time_monitored.models import *
7 | import zoneinfo
8 |
9 | @pytest.fixture(scope="module", autouse=True)
10 | def init_tables():
11 |     create_tables()
12 |
13 | def test_save_time_monitored():
14 |     start = datetime(2025, 1, 14, 15, 18, 43, 807525, tzinfo=zoneinfo.ZoneInfo(key='Europe/Paris'))
15 |     channel_name = "test_channel"
16 |     country = "france"
17 |     id = get_consistent_hash(f"{channel_name}_{start}_{country}")
18 |     duration_minutes = 30
19 |
20 |     time_monitored = Time_Monitored(
21 |         id=id,
22 |         channel_name=channel_name,
23 |         start=start,
24 |         duration_minutes=duration_minutes,
25 |         country=country
26 |     )
27 |     save_time_monitored(number_of_rows=int(duration_minutes/2), day=start, channel=channel_name, country=country)
28 |
29 |     output = get_time_monitored(id)
30 |     assert output.duration_minutes == time_monitored.duration_minutes
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Data For Good France
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/alembic/versions/30abfd828007_program_metadata.py:
--------------------------------------------------------------------------------
1 | """program metadata
2 |
3 | Revision ID: 30abfd828007
4 | Revises: 43103d5b49c9
5 | Create Date: 2024-10-03 14:18:09.874225
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '30abfd828007'
16 | down_revision: Union[str, None] = '43103d5b49c9'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('program_metadata', sa.Column('program_grid_start', sa.DateTime(), nullable=True))
24 |     op.add_column('program_metadata', sa.Column('program_grid_end', sa.DateTime(), nullable=True))
25 |     # ### end Alembic commands ###
26 |
27 |
28 | def downgrade() -> None:
29 |     # ### commands auto generated by Alembic - please adjust! ###
30 |     op.drop_column('program_metadata', 'program_grid_end')
31 |     op.drop_column('program_metadata', 'program_grid_start')
32 |     # ### end Alembic commands ###
33 |
--------------------------------------------------------------------------------
/alembic/versions/5bff4dceda53_add_info_public_to_program_metadata.py:
--------------------------------------------------------------------------------
1 | """Add info/public to program metadata
2 |
3 | Revision ID: 5bff4dceda53
4 | Revises: c1d78b9968fe
5 | Create Date: 2024-05-03 09:09:44.751432
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '5bff4dceda53'
16 | down_revision: Union[str, None] = 'c1d78b9968fe'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('program_metadata', sa.Column('public', sa.Boolean(), nullable=True))
24 |     op.add_column('program_metadata', sa.Column('infocontinue', sa.Boolean(), nullable=True))
25 |     # ### end Alembic commands ###
26 |
27 |
28 | def downgrade() -> None:
29 |     # ### commands auto generated by Alembic - please adjust! ###
30 |     op.drop_column('program_metadata', 'infocontinue')
31 |     op.drop_column('program_metadata', 'public')
32 |     # ### end Alembic commands ###
33 |
--------------------------------------------------------------------------------
/alembic/versions/827fb6dde3bb_time_monitored_new_table.py:
--------------------------------------------------------------------------------
1 | """time monitored new table
2 |
3 | Revision ID: 827fb6dde3bb
4 | Revises: c08231a9eb37
5 | Create Date: 2025-04-29 13:29:54.299095
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '827fb6dde3bb'
16 | down_revision: Union[str, None] = 'c08231a9eb37'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 | def upgrade() -> None:
21 |     # Create the time_monitored table
22 |     op.create_table(
23 |         'time_monitored',
24 |         sa.Column('id', sa.String(), nullable=False),
25 |         sa.Column('channel_name', sa.String(), nullable=False),
26 |         sa.Column('start', sa.DateTime(), nullable=False),
27 |         sa.Column('duration_minutes', sa.Integer(), nullable=True),
28 |         sa.Column('country', sa.String(), nullable=False),
29 |         sa.PrimaryKeyConstraint('id')
30 |     )
31 |
32 |
33 | def downgrade() -> None:
34 |     # Drop the time_monitored table
35 |     op.drop_table('time_monitored')
--------------------------------------------------------------------------------
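The ORM counterpart of this table lives in `quotaclimat/data_processing/mediatree/time_monitored/models.py`, which is not included in this dump. Purely as a hypothetical sketch derived from the columns above, a matching mapped class could look like:

```python
# Hypothetical mapped class mirroring the time_monitored migration.
# The project's real model (Time_Monitored) is in time_monitored/models.py, not shown here.
from sqlalchemy import Column, DateTime, Integer, String

from postgres.schemas.base import Base


class TimeMonitoredSketch(Base):
    __tablename__ = "time_monitored_sketch"  # illustrative name, to avoid clashing

    id = Column(String, primary_key=True)
    channel_name = Column(String, nullable=False)
    start = Column(DateTime(), nullable=False)
    duration_minutes = Column(Integer, nullable=True)
    country = Column(String, nullable=False)
```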
/alembic/versions/c08231a9eb37_program_add_created_at_updated_at.py:
--------------------------------------------------------------------------------
1 | """program: add created_at updated_at
2 |
3 | Revision ID: c08231a9eb37
4 | Revises: 4333bc46985d
5 | Create Date: 2025-03-29 08:17:51.997077
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = 'c08231a9eb37'
16 | down_revision: Union[str, None] = '4333bc46985d'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('program_metadata', sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text("(now() at time zone 'utc')"), nullable=True))
24 |     op.add_column('program_metadata', sa.Column('updated_at', sa.DateTime(), nullable=True))
25 |     # ### end Alembic commands ###
26 |
27 |
28 | def downgrade() -> None:
29 |     # ### commands auto generated by Alembic - please adjust! ###
30 |     op.drop_column('program_metadata', 'updated_at')
31 |     op.drop_column('program_metadata', 'created_at')
32 |     # ### end Alembic commands ###
33 |
--------------------------------------------------------------------------------
/alembic/versions/2c48f626a749_keywords_program_name.py:
--------------------------------------------------------------------------------
1 | """keywords: program name
2 |
3 | Revision ID: 2c48f626a749
4 | Revises:
5 | Create Date: 2024-04-12 12:44:23.512407
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '2c48f626a749'
16 | down_revision: Union[str, None] = None
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('keywords', sa.Column('channel_program', sa.String(), nullable=True))
24 |     op.add_column('keywords', sa.Column('channel_program_type', sa.String(), nullable=True))
25 |     op.add_column('keywords', sa.Column('category', sa.JSON(), nullable=True))
26 |     # ### end Alembic commands ###
27 |
28 |
29 | def downgrade() -> None:
30 |     # ### commands auto generated by Alembic - please adjust! ###
31 |     op.drop_column('keywords', 'category')
32 |     op.drop_column('keywords', 'channel_program_type')
33 |     op.drop_column('keywords', 'channel_program')
34 |     # ### end Alembic commands ###
35 |
--------------------------------------------------------------------------------
/quotaclimat/utils/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | class CustomFormatter(logging.Formatter):
5 |
6 |     grey = "\x1b[38;20m"
7 |     yellow = "\x1b[33;20m"
8 |     red = "\x1b[31;20m"
9 |     bold_red = "\x1b[31;1m"
10 |     reset = "\x1b[0m"
11 |     light_blue = "\x1b[36m"
12 |     format = "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d | %(message)s"
13 |
14 |     FORMATS = {
15 |         logging.DEBUG: grey + format + reset,
16 |         logging.INFO: light_blue + format + reset,
17 |         logging.WARNING: yellow + format + reset,
18 |         logging.ERROR: red + format + reset,
19 |         logging.CRITICAL: bold_red + format + reset
20 |     }
21 |
22 |     def format(self, record):
23 |         log_fmt = self.FORMATS.get(record.levelno)
24 |         formatter = logging.Formatter(log_fmt)
25 |         return formatter.format(record)
26 |
27 | def getLogger():
28 |     # Configure the root logger; level comes from the LOGLEVEL env var (default INFO)
29 |     logger = logging.getLogger()
30 |     logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper())
31 |     # Replace any existing handlers with a single colorized console handler
32 |     if logger.hasHandlers():
33 |         logger.handlers.clear()
34 |     ch = logging.StreamHandler()
35 |     ch.setFormatter(CustomFormatter())
36 |     logger.addHandler(ch)
37 |
38 |     return logger
--------------------------------------------------------------------------------
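Typical usage: `getLogger()` configures the root logger once, with the level driven by the `LOGLEVEL` environment variable and a single colorized console handler:

```python
import os

from quotaclimat.utils.logger import getLogger

os.environ.setdefault("LOGLEVEL", "DEBUG")
logger = getLogger()

logger.debug("grey")      # each level gets its own ANSI color from CustomFormatter
logger.info("light blue")
logger.warning("yellow")
logger.error("red")
```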
/alembic/versions/4ccd746ee291_add_20_30.py:
--------------------------------------------------------------------------------
1 | """add 20/30
2 |
3 | Revision ID: 4ccd746ee291
4 | Revises: 2450da0e6c60
5 | Create Date: 2024-06-20 06:35:00.316441
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '4ccd746ee291'
16 | down_revision: Union[str, None] = '2450da0e6c60'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('keywords', sa.Column('number_of_keywords_20', sa.Integer(), nullable=True))
24 |     op.add_column('keywords', sa.Column('number_of_keywords_30', sa.Integer(), nullable=True))
25 |     op.add_column('keywords', sa.Column('number_of_keywords_40', sa.Integer(), nullable=True))
26 |     # ### end Alembic commands ###
27 |
28 |
29 | def downgrade() -> None:
30 |     # ### commands auto generated by Alembic - please adjust! ###
31 |     op.drop_column('keywords', 'number_of_keywords_20')
32 |     op.drop_column('keywords', 'number_of_keywords_30')
33 |     op.drop_column('keywords', 'number_of_keywords_40')
34 |     # ### end Alembic commands ###
35 |
--------------------------------------------------------------------------------
/.github/workflows/scaleway-down.yml:
--------------------------------------------------------------------------------
1 | name: Stop Scaleway
2 |
3 | on:
4 |   workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch
5 |
6 |   schedule: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule
7 |     - cron: '49 21 * * *'
8 |
9 | jobs:
10 |   down:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Use CLI
14 |         uses: jawher/action-scw@v2.34.0
15 |         env:
16 |           SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
17 |           SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
18 |           SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
19 |           SCW_ZONE: ${{ secrets.SCW_ZONE }}
20 |         with:
21 |           args: container container list name=${{ secrets.CONTAINER_NAME }} --output json
22 |
23 |       - name: Get CONTAINER_ID env var
24 |         run: echo "CONTAINER_ID=$(cat "${GITHUB_WORKSPACE}/scw.output" | jq -r '.[0].id')" >> $GITHUB_ENV
25 |
26 |
27 |       - name: 0 instances
28 |         uses: jawher/action-scw@v2.34.0
29 |         env:
30 |           SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
31 |           SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
32 |           SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
33 |           SCW_ZONE: ${{ secrets.SCW_ZONE }}
34 |         with:
35 |           args: container container update min-scale=0 ${{ env.CONTAINER_ID }}
36 |
--------------------------------------------------------------------------------
/.github/workflows/scaleway-up.yml:
--------------------------------------------------------------------------------
1 | name: Start Scaleway
2 |
3 | on:
4 |   workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch
5 |
6 |   schedule: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule
7 |     - cron: '52 05 * * *'
8 |
9 | jobs:
10 |   up:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Use CLI
14 |         uses: jawher/action-scw@v2.34.0
15 |         env:
16 |           SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
17 |           SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
18 |           SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
19 |           SCW_ZONE: ${{ secrets.SCW_ZONE }}
20 |         with:
21 |           args: container container list name=${{ secrets.CONTAINER_NAME }} --output json
22 |
23 |       - name: Get CONTAINER_ID env var
24 |         run: echo "CONTAINER_ID=$(cat "${GITHUB_WORKSPACE}/scw.output" | jq -r '.[0].id')" >> $GITHUB_ENV
25 |
26 |       - name: start 1 instance
27 |         uses: jawher/action-scw@v2.34.0
28 |         env:
29 |           SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
30 |           SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
31 |           SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
32 |           SCW_ZONE: ${{ secrets.SCW_ZONE }}
33 |         with:
34 |           args: container container update min-scale=1 ${{ env.CONTAINER_ID }}
35 |
--------------------------------------------------------------------------------
/alembic/versions/af956a85658f_add_new_column_number_of_keywords_.py:
--------------------------------------------------------------------------------
1 | """Add new column number_of_keywords climat/biod/r
2 |
3 | Revision ID: af956a85658f
4 | Revises: a5c39db3c8e9
5 | Create Date: 2024-09-12 14:15:12.049367
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = 'af956a85658f'
16 | down_revision: Union[str, None] = 'a5c39db3c8e9'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('keywords', sa.Column('number_of_keywords_climat', sa.Integer(), nullable=True))
24 |     op.add_column('keywords', sa.Column('number_of_keywords_biodiversite', sa.Integer(), nullable=True))
25 |     op.add_column('keywords', sa.Column('number_of_keywords_ressources', sa.Integer(), nullable=True))
26 |     # ### end Alembic commands ###
27 |
28 |
29 | def downgrade() -> None:
30 |     # ### commands auto generated by Alembic - please adjust! ###
31 |     op.drop_column('keywords', 'number_of_keywords_ressources')
32 |     op.drop_column('keywords', 'number_of_keywords_biodiversite')
33 |     op.drop_column('keywords', 'number_of_keywords_climat')
34 |     # ### end Alembic commands ###
35 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0
2 | FROM python:3.12.10 as builder
3 |
4 | ENV VIRTUAL_ENV=/app/.venv
5 |
6 | ENV POETRY_NO_INTERACTION=1 \
7 |     POETRY_VIRTUALENVS_IN_PROJECT=1 \
8 |     POETRY_VIRTUALENVS_CREATE=1 \
9 |     POETRY_CACHE_DIR=/tmp/poetry_cache
10 |
11 | WORKDIR /app
12 |
13 | COPY pyproject.toml poetry.lock ./
14 |
15 | RUN pip install poetry==2.1.3
16 |
17 | RUN poetry install --no-root
18 |
19 | # The runtime image, used to just run the code provided its virtual environment
20 | FROM python:3.12.10-slim as runtime
21 |
22 | RUN apt update && apt-get install -y git
23 |
24 | WORKDIR /app
25 |
26 | ENV VIRTUAL_ENV=/app/.venv
27 | ENV PATH="/app/.venv/bin:$PATH"
28 | ENV PATH="$PYENV_ROOT/bin:$PATH"
29 | ENV PYTHONPATH=/app
30 | ENV DBT_PROFILES_DIR=/app/my_dbt_project/dbt
31 | ENV DBT_PROJECT_DIR=/app/my_dbt_project
32 |
33 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
34 |
35 | # For streamlit only
36 | COPY pyproject.toml poetry.lock ./
37 | RUN pip install poetry
38 |
39 | # App code is also included via docker-compose
40 |
41 | COPY quotaclimat ./quotaclimat
42 | COPY postgres ./postgres
43 | COPY alembic/ ./alembic
44 | COPY transform_program.py ./transform_program.py
45 | COPY my_dbt_project/ ./my_dbt_project
46 |
47 | # Docker compose overrides this CMD so the same Dockerfile can run different services
48 | CMD ["ls"]
49 |
--------------------------------------------------------------------------------
/Dockerfile_stop_word:
--------------------------------------------------------------------------------
1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0
2 | FROM python:3.12.10 as builder
3 |
4 | ENV VIRTUAL_ENV=/app/.venv
5 |
6 | ENV POETRY_NO_INTERACTION=1 \
7 |     POETRY_VIRTUALENVS_IN_PROJECT=1 \
8 |     POETRY_VIRTUALENVS_CREATE=1 \
9 |     POETRY_CACHE_DIR=/tmp/poetry_cache
10 |
11 | WORKDIR /app
12 |
13 | COPY pyproject.toml poetry.lock ./
14 |
15 | RUN pip install poetry==2.1.3
16 |
17 | RUN poetry install --no-root
18 |
19 | # The runtime image, used to just run the code provided its virtual environment
20 | FROM python:3.12.10-slim as runtime
21 |
22 | WORKDIR /app
23 |
24 | ENV VIRTUAL_ENV=/app/.venv
25 | ENV PATH="/app/.venv/bin:$PATH"
26 | ENV PATH="$PYENV_ROOT/bin:$PATH"
27 | ENV PYTHONPATH=/app
28 |
29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
30 |
31 | # App code is also included via docker-compose
32 |
33 | COPY quotaclimat ./quotaclimat
34 | COPY postgres ./postgres
35 | COPY pyproject.toml pyproject.toml
36 | COPY alembic/ ./alembic
37 | COPY alembic.ini ./alembic.ini
38 | COPY transform_program.py ./transform_program.py
39 |
40 | # healthcheck
41 | EXPOSE 5050
42 |
43 | # Use a separate script to handle migrations and start the application
44 | COPY docker-entrypoint_stop_word.sh ./docker-entrypoint_stop_word.sh
45 | RUN chmod +x ./docker-entrypoint_stop_word.sh
46 |
47 | ENTRYPOINT ["./docker-entrypoint_stop_word.sh"]
--------------------------------------------------------------------------------
/quotaclimat/utils/healthcheck_config.py:
--------------------------------------------------------------------------------
1 |
2 | import http.server
3 | import socketserver
4 | import os
5 | import logging
6 | import asyncio
7 | import tomli
8 |
9 | def get_app_version():
10 |     # Open and read the pyproject.toml file
11 |     with open('pyproject.toml', 'rb') as toml_file:
12 |         pyproject_data = tomli.load(toml_file)
13 |
14 |     # Access the version from the pyproject.toml file
15 |     version = pyproject_data['project']['version']
16 |     return version
17 |
18 | version = get_app_version()
19 |
20 | class HealthCheckHandler(http.server.SimpleHTTPRequestHandler):
21 |     def do_GET(self):
22 |         self.send_response(200)
23 |         self.end_headers()
24 |         self.wfile.write((f"Healthy.\n\nApp version {version}").encode())
25 |
26 | async def run_health_check_server():
27 |     PORT = int(os.environ.get("PORT_HS", 5050))
28 |     SERVER_ADDRESS = os.environ.get("HEALTHCHECK_SERVER", "")
29 |
30 |     logging.info(f"App version {version}")
31 |     logging.info(f"Healthcheck at '{SERVER_ADDRESS}' : port {PORT}")
32 |     with socketserver.TCPServer((SERVER_ADDRESS, PORT), HealthCheckHandler) as httpd:
33 |         try:
34 |             await asyncio.to_thread(httpd.serve_forever)
35 |         except asyncio.CancelledError:
36 |             logging.info("health check cancel")
37 |             httpd.shutdown()  # terminate the "serve_forever" infinite loop
38 |             return
--------------------------------------------------------------------------------
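`run_health_check_server` blocks inside `asyncio.to_thread`, so it is designed to run alongside the main job and be cancelled when the job ends. A minimal sketch (`main_job` is a placeholder, not repository code):

```python
# Minimal sketch: serve the healthcheck alongside a worker coroutine.
import asyncio
import contextlib

from quotaclimat.utils.healthcheck_config import run_health_check_server


async def main_job():
    await asyncio.sleep(5)  # placeholder for the real import work


async def main():
    health = asyncio.create_task(run_health_check_server())
    await main_job()
    health.cancel()  # triggers the CancelledError branch, which shuts the server down
    with contextlib.suppress(asyncio.CancelledError):
        await health


if __name__ == "__main__":
    asyncio.run(main())
```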
/Dockerfile_api_to_s3:
--------------------------------------------------------------------------------
1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0
2 | FROM python:3.12.10 as builder
3 |
4 | ENV VIRTUAL_ENV=/app/.venv
5 |
6 | ENV POETRY_NO_INTERACTION=1 \
7 |     POETRY_VIRTUALENVS_IN_PROJECT=1 \
8 |     POETRY_VIRTUALENVS_CREATE=1 \
9 |     POETRY_CACHE_DIR=/tmp/poetry_cache
10 |
11 | WORKDIR /app
12 |
13 | COPY pyproject.toml poetry.lock ./
14 |
15 | RUN pip install poetry==2.1.3
16 |
17 | RUN poetry install --no-root
18 |
19 | # The runtime image, used to just run the code provided its virtual environment
20 | FROM python:3.12.10-slim AS runtime
21 |
22 | WORKDIR /app
23 |
24 | ENV VIRTUAL_ENV=/app/.venv
25 | ENV PATH="/app/.venv/bin:$PATH"
26 | ENV PATH="$PYENV_ROOT/bin:$PATH"
27 | ENV PYTHONPATH=/app
28 |
29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
30 |
31 | # App code is also included with docker-compose
32 |
33 | COPY quotaclimat ./quotaclimat
34 | COPY postgres ./postgres
35 | COPY pyproject.toml pyproject.toml
36 | COPY alembic/ ./alembic
37 | COPY alembic.ini ./alembic.ini
38 | COPY transform_program.py ./transform_program.py
39 |
40 | # healthcheck
41 | EXPOSE 5050
42 |
43 | # docker-entrypoint.sh is copied for reuse (e.g. compose overrides), but this image starts the S3 export directly
44 | COPY docker-entrypoint.sh ./docker-entrypoint.sh
45 | RUN chmod +x ./docker-entrypoint.sh
46 |
47 |
48 | ENTRYPOINT ["python", "quotaclimat/data_processing/mediatree/s3/api_to_s3.py"]
49 |
--------------------------------------------------------------------------------
/Dockerfile_api_import:
--------------------------------------------------------------------------------
1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0
2 | FROM python:3.12.10 AS builder
3 |
4 | ENV VIRTUAL_ENV=/app/.venv
5 |
6 | ENV POETRY_NO_INTERACTION=1 \
7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \
8 | POETRY_VIRTUALENVS_CREATE=1 \
9 | POETRY_CACHE_DIR=/tmp/poetry_cache
10 |
11 | WORKDIR /app
12 |
13 | COPY pyproject.toml poetry.lock ./
14 |
15 | RUN pip install poetry==2.1.3
16 |
17 | RUN poetry install --no-root
18 |
19 | # The runtime image, used to just run the code provided its virtual environment
20 | FROM python:3.12.10-slim AS runtime
21 |
22 | RUN apt-get update && apt-get install -y git
23 |
24 | WORKDIR /app
25 |
26 | ENV VIRTUAL_ENV=/app/.venv
27 | ENV PATH="/app/.venv/bin:$PATH"
28 | ENV PATH="$PYENV_ROOT/bin:$PATH"
29 | ENV PYTHONPATH=/app
30 |
31 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
32 |
33 | # App code is also included with docker-compose
34 |
35 | COPY quotaclimat ./quotaclimat
36 | COPY postgres ./postgres
37 | COPY pyproject.toml pyproject.toml
38 | COPY alembic/ ./alembic
39 | COPY alembic.ini ./alembic.ini
40 | COPY transform_program.py ./transform_program.py
41 | COPY my_dbt_project/ ./my_dbt_project
42 | COPY i8n/ ./i8n
43 | ENV DBT_PROFILES_DIR=/app/my_dbt_project/dbt
44 | ENV DBT_PROJECT_DIR=/app/my_dbt_project
45 |
46 | # healthcheck
47 | EXPOSE 5050
48 |
49 | # Use a separate script to handle migrations and start the application
50 | COPY docker-entrypoint.sh ./docker-entrypoint.sh
51 | RUN chmod +x ./docker-entrypoint.sh
52 |
53 | ENTRYPOINT ["./docker-entrypoint.sh"]
54 |
--------------------------------------------------------------------------------
/my_dbt_project/seeds/time_monitored.csv:
--------------------------------------------------------------------------------
1 | id,channel_name,start,duration_minutes,country
2 | f48e555ced0b59dc6016b9ed62e4ca0b630ff98d48ac459c8f3ae0945d81a534,daserste,"February 01, 2025, 12:00 AM",258,germany
3 | 3a6fd867f15cafbddc489509576a495b1794633e895ff0f18a48250bb6f1cf25,zdf-neo,"February 01, 2025, 12:00 AM",352,germany
4 | 31a2db38f49bd7b3d1689369a409bca7f031f2cab2c2d2c8715d367560651277,rtl-television,"February 01, 2025, 12:00 AM",294,germany
5 | 37d6723cd58f3b137045298c8b3dded8563da30df84e979cf27441808c7381ec,sat1,"February 01, 2025, 12:00 AM",222,germany
6 | f015abc528de99458ea833d94cdea466ab0e9c4445727a2d005bca9b2ea4adff,prosieben,"February 01, 2025, 12:00 AM",156,germany
7 | 143cfbae72cbf7c634645fe8f0b3dce52c3e95c0d27d01af10210252ec3e67e8,kabel-eins,"February 01, 2025, 12:00 AM",36,germany
8 | cf6d8f980175b1335583bce4a40595eca5886fcaa9ebeaf7611557fc41b6cf21,tf1,"February 01, 2025, 12:00 AM",258,france
9 | 6b7e0d69c3111ceb6b9f176f5c3748b5c9d44a898f5c2d9ecc7e3f0a37cb5adf,france2,"February 01, 2025, 12:00 AM",334,france
10 | 3b046c77314301e63bef3a4142eb9ac62b48fe52b72602de1ab3d93eb1c5d24b,fr3-idf,"February 01, 2025, 12:00 AM",240,france
11 | b51fe8a6a65b06ead17099a2eac4312b526f76f9b1f256d8d3779c76533a3b6a,m6,"February 01, 2025, 12:00 AM",316,france
12 | 9b1ebe8bc77b319560f91fc1c768079ff16e9f01f544b5aad25065d335c5f3f7,arte,"February 01, 2025, 12:00 AM",88,france
13 | 6aba0a0299934ed1a3411289a51ccbd11b6d9236ffef2adc8df0d76b003357f0,bfmtv,"February 01, 2025, 12:00 AM",1030,france
14 | 0bb8064e6500c8bc63e9e30f42d21d9ad5322d508f04dd024c1b76956f0d40c4,franceinfotv,"February 01, 2025, 12:00 AM",1030,france
--------------------------------------------------------------------------------
/test/sitemap/test_scrap_html.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, get_hat_20minutes, get_url_content
4 | from quotaclimat.data_ingestion.scrap_sitemap import get_description_article
5 | from bs4 import BeautifulSoup
6 | from test_utils import get_localhost, debug_df
7 |
8 | localhost = get_localhost()
9 |
10 | @pytest.mark.asyncio
11 | async def test_get_description_article():
12 | url_to_parse = f"{localhost}/mediapart_website.html"
13 | media = "Le Figaro"
14 | df_articles = pd.DataFrame([{
15 | "url" : url_to_parse,
16 | "news_title" :media,
17 | }])
18 |
19 | expected_result = pd.DataFrame([{
20 | "url" : url_to_parse,
21 | "news_title" :media,
22 | "news_description" : "description could be parsed with success"
23 | }])
24 |
25 | df_articles["news_description"] = await get_description_article(media, df_articles)
26 | debug_df(df_articles)
27 | pd.testing.assert_frame_equal(df_articles.reset_index(drop=True), expected_result.reset_index(drop=True))
28 |
29 | @pytest.mark.asyncio
30 | async def test_get_meta_news():
31 | url_to_parse = f"{localhost}/mediapart_website.html"
32 |
33 | output = await get_meta_news(url_to_parse, "media")
34 | assert output["description"] == "description could be parsed with success"
35 |
36 | @pytest.mark.asyncio
37 | async def test_get_hat_20minutes():
38 | url_to_parse = f"{localhost}/20minutes_website.html"
39 |
40 | response = await get_url_content(url_to_parse)
41 | hat = get_hat_20minutes(BeautifulSoup(response, "html.parser"))
42 | assert hat == "howdy there"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "quotaclimat"
3 | version = "2.0.74"
4 | description = ""
5 | authors = [
6 | {name = "Rambier Estelle", "email"="estelle.rambier@hotmail.fr"},
7 | {name = "Paul Leclercq", "email"="paul@epauler.fr"}
8 | ]
9 | readme = "README.md"
10 |
11 | [tool.pytest.ini_options]
12 | log_cli = 1
13 | log_cli_level = "INFO"
14 | testpaths = [
15 | "test"
16 | ]
17 |
18 | [tool.poetry.dependencies]
19 | s3fs = {extras = ["boto3"], version = ">=2023.12.0"}
20 | boto3 = "*"
21 | botocore = "*"
22 | python = ">=3.11,<=3.13"
23 | s3transfer = "0.10.4"
24 | pandas = "^2.2.3"
25 | advertools = "^0.14.1"
26 | xmltodict = "^0.13.0"
27 | sqlalchemy = "^2.0.35"
28 | psycopg2-binary = "^2.9.5"
29 | alembic = "^1.13.1"
30 | beautifulsoup4 = "^4.11.1"
31 | asyncio = "^3.4.3"
32 | tomli = "^2.0.1"
33 | aiohttp = "^3.10.8"
34 | pytest-asyncio = "^0.23.5"
35 | swifter = "^1.4.0"
36 | tenacity = "^8.2.3"
37 | sentry-sdk = ">=2.53.0"
38 | modin = {extras = ["ray"], version = "^0.32.0"}
39 | openpyxl = "^3.1.5"
40 | requests = "^2.32.3"
41 | thefuzz = "^0.22.1"
42 | dbt-core = "^1.9.2"
43 | dbt-postgres = "^1.9.0"
44 | ruff = "^0.13.3"
45 | graphviz = "^0.21"
46 | matplotlib = "^3.10.7"
47 | plotly = "^6.5.0"
48 | nbformat = "^5.10.4"
49 | kaleido = "^1.2.0"
50 |
51 | [build-system]
52 | requires = ["poetry-core>=1.1"]
53 | build-backend = "poetry.core.masonry.api"
54 |
56 | [tool.poetry.group.dev.dependencies]
57 | coverage = "^7.5.4"
58 | pytest = "^8.1.1"
59 | pytest-cov = "^5.0.0"
60 | poetry-bumpversion = "^0.3.1"
61 | pre-commit = "^2.18.1"
62 | black = "^22.3.0"
63 | isort = "^5.10.1"
64 | flake8 = "^4.0.1"
65 | invoke = "^1.7.3"
66 | deptry = "^0.20.0"
67 | graphviz = "^0.21"
68 | ipykernel = "^7.0.1"
69 |
--------------------------------------------------------------------------------
/my_dbt_project/models/analytics/environmental_shares_with_desinfo_counts.sql:
--------------------------------------------------------------------------------
1 | {{ config(
2 | materialized='incremental',
3 | unique_key=['start','channel_name','country']
4 | )
5 | }}
6 |
7 | with env_shares as (
8 | with name_map as (
9 | select
10 | channel_title,
11 | max(channel_name) channel_name
12 | from
13 | program_metadata pm
14 | where pm.country='france'
15 | group by
16 | channel_title
17 | )
18 | select
19 | start,
20 | cqes."Program Metadata - Channel Name__channel_title" as "channel_title",
21 | name_map.channel_name,
22 | cqes.sum_duration_minutes,
23 | cqes."% climat" as weekly_perc_climat,
24 | 'france' as country
25 | from
26 | public.core_query_environmental_shares cqes
27 | left join
28 | name_map
29 | on
30 | name_map.channel_title=cqes."Program Metadata - Channel Name__channel_title"
31 | union all
32 | select
33 | cqesin."start",
34 | cqesin.channel_title,
35 | cqesin.channel_name,
36 | cqesin.sum_duration_minutes,
37 | cqesin."% climat" as weekly_perc_climat,
38 | country
39 | from
40 | public.core_query_environmental_shares_i8n cqesin
41 | where country!='france'
42 | ),
43 | weekly_desinfo as (
44 | select
45 | date_trunc('week', tgc.data_item_start) week_start,
46 | tgc.data_item_channel_name,
47 | tgc.country,
48 | sum(coalesce(tgc.mesinfo_correct, 0)) as total_mesinfo
49 | from
50 | {{ ref("task_global_completion") }} tgc
51 | where tgc."Annotation Version"=1
52 | group by
53 | week_start,
54 | tgc.data_item_channel_name,
55 | tgc.country
56 | )
57 | select
58 | env_shares.*,
59 | coalesce(weekly_desinfo.total_mesinfo, 0) as total_mesinfo
60 | from
61 | env_shares
62 | left join
63 | weekly_desinfo
64 | on
65 | env_shares.start=weekly_desinfo.week_start
66 | and env_shares.channel_name=weekly_desinfo.data_item_channel_name
67 | and env_shares.country=weekly_desinfo.country
68 |
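
As docker-entrypoint.sh shows further down, this model is materialized against the `analytics` target together with `task_global_completion`; a one-off rebuild would look like:

```bash
poetry run dbt run --full-refresh --target analytics --select environmental_shares_with_desinfo_counts
```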
--------------------------------------------------------------------------------
/my_dbt_project/models/dashboards/core_query_causal_links.sql:
--------------------------------------------------------------------------------
1 | {{ config(
2 | materialized='incremental',
3 | incremental_strategy='append',
4 | on_schema_change='append_new_columns'
5 | )
6 | }}
7 |
8 | {% set process_month = var("process_month", run_started_at.strftime('%Y-%m-01')) %}
9 |
10 | SELECT
11 | public.keywords.id,
12 | public.keywords.channel_title,
13 | public.keywords.country,
14 | public.keywords.start,
15 | kw_consequence ->> 'keyword' AS keyword,
16 | CASE
17 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%climat%' THEN 'Crise climatique'
18 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité'
19 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources'
20 | ELSE 'Autre'
21 | END AS crise,
22 | (
23 | SELECT COUNT(*)
24 | FROM public.keywords k2
25 | WHERE k2.channel_title = public.keywords.channel_title
26 | AND k2.number_of_changement_climatique_constat_no_hrfp > 0
27 | AND k2.start BETWEEN public.keywords.start - interval '4 minutes' AND public.keywords.start + interval '4 minutes'
28 | and date_trunc('month', public.keywords.start) = cast('{{ process_month }}' as date)
29 | ) AS nb_constats_climat_neighbor,
30 | (
31 | SELECT COUNT(*)
32 | FROM public.keywords k3
33 | WHERE k3.channel_title = public.keywords.channel_title
34 | AND k3.number_of_biodiversite_concepts_generaux_no_hrfp > 0
35 | AND k3.start BETWEEN public.keywords.start - interval '4 minutes' AND public.keywords.start + interval '4 minutes'
36 | and date_trunc('month', public.keywords.start) = cast('{{ process_month }}' as date)
37 | ) AS nb_constats_biodiversite_neighbor
38 | FROM public.keywords
39 | CROSS JOIN LATERAL json_array_elements(public.keywords.keywords_with_timestamp::json) kw_consequence
40 | WHERE LOWER(kw_consequence ->> 'theme') LIKE '%consequence%'
41 | and date_trunc('month', public.keywords.start) = cast('{{ process_month }}' as date)
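
Because the model only processes one `process_month` per run, backfills pass the variable explicitly, as in the reparse loop of docker-entrypoint.sh; the month below is illustrative:

```bash
poetry run dbt run --select core_query_causal_links --vars "{\"process_month\": \"2024-05-01\"}"
```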
--------------------------------------------------------------------------------
/test/i8n/test_country.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from quotaclimat.data_processing.mediatree.i8n.country import *
4 |
5 | def test_validate_country_code_fra():
6 | france_code = validate_country_code("fra")
7 | assert france_code == FRANCE.code
8 |
9 | def test_validate_country_code_invalid():
10 | with pytest.raises(ValueError, match="Invalid country code: nz"):
11 | validate_country_code("nz")
12 |
13 | def test_get_country_from_code_fra():
14 | france = get_country_from_code("fra")
15 | assert france == FRANCE
16 |
17 | def test_get_channels_brazil():
18 | os.environ['ENV'] = 'prod'
19 | channels = get_channels(country_code=BRAZIL.code)
20 | assert channels == BRAZIL.channels
21 | os.environ['ENV'] = 'docker'
22 |
23 | def test_get_channels_default_docker():
24 | os.environ['ENV'] = 'docker'
25 | channels = get_channels()
26 | assert channels == ["france2"]
27 |
28 |
29 | def test_get_channels_default_prod():
30 | os.environ['ENV'] = 'prod'
31 | channels = get_channels()
32 | assert channels == FRANCE.channels
33 | os.environ['ENV'] = 'docker'
34 |
35 | def test_get_channel_title_for_name():
36 | assert get_channel_title_for_name("tf1") == "TF1"
37 |
38 | def test_get_channel_title_for_name_germany():
39 | assert get_channel_title_for_name("rtl-television", GERMANY) == "RTL"
40 |
41 | def test_get_channels_poland():
42 | os.environ['ENV'] = 'prod'
43 | channels = get_channels(country_code=POLAND.code)
44 | assert channels == POLAND.channels
45 | os.environ['ENV'] = 'docker'
46 |
47 | def test_get_channel_title_for_name_poland():
48 | assert get_channel_title_for_name("tvp", POLAND) == "TVP"
49 |
50 | def test_get_channels_spain():
51 | os.environ['ENV'] = 'prod'
52 | channels = get_channels(country_code=SPAIN.code)
53 | assert channels == SPAIN.channels
54 | os.environ['ENV'] = 'docker'
55 |
56 | def test_get_channel_title_for_name_spain():
57 | assert get_channel_title_for_name("antenna-3", SPAIN) == "Antenna 3"
--------------------------------------------------------------------------------
/alembic/versions/356882459cec_remove_category_keywords_change_columns_.py:
--------------------------------------------------------------------------------
1 | """Remove: category keywords / change columns names
2 |
3 | Revision ID: 356882459cec
4 | Revises: 2c48f626a749
5 | Create Date: 2024-04-29 10:14:27.240887
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '356882459cec'
16 | down_revision: Union[str, None] = '2c48f626a749'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 | # ### commands auto generated by Alembic - please adjust! ###
23 | op.add_column('keywords', sa.Column('number_of_ressources', sa.Integer(), nullable=True))
24 | op.add_column('keywords', sa.Column('number_of_ressources_solutions', sa.Integer(), nullable=True))
25 | op.drop_column('keywords', 'number_of_ressources_naturelles_causes')
26 | op.drop_column('keywords', 'number_of_ressources_naturelles_concepts_generaux')
27 | op.drop_column('keywords', 'category')
28 | op.drop_column('keywords', 'number_of_ressources_naturelles_solutions')
29 | # ### end Alembic commands ###
30 |
31 |
32 | def downgrade() -> None:
33 | # ### commands auto generated by Alembic - please adjust! ###
34 | op.add_column('keywords', sa.Column('number_of_ressources_naturelles_solutions', sa.INTEGER(), autoincrement=False, nullable=True))
35 | op.add_column('keywords', sa.Column('category', postgresql.JSON(astext_type=sa.Text()), autoincrement=False, nullable=True))
36 | op.add_column('keywords', sa.Column('number_of_ressources_naturelles_concepts_generaux', sa.INTEGER(), autoincrement=False, nullable=True))
37 | op.add_column('keywords', sa.Column('number_of_ressources_naturelles_causes', sa.INTEGER(), autoincrement=False, nullable=True))
38 | op.drop_column('keywords', 'number_of_ressources_solutions')
39 | op.drop_column('keywords', 'number_of_ressources')
40 | # ### end Alembic commands ###
41 |
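
A sketch of applying and rolling back this revision with alembic; the upgrade command matches docker-entrypoint.sh, and the downgrade target is this revision's parent from the header above:

```bash
poetry run alembic upgrade head
poetry run alembic downgrade 2c48f626a749  # back to the parent revision
```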
--------------------------------------------------------------------------------
/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Run migrations before starting the application
4 | echo "Running migrations with alembic if exists"
5 | poetry run alembic upgrade head
6 |
7 |
8 | echo "update program metadata file"
9 | poetry run python3 transform_program.py
10 | if [[ $? -eq 0 ]]; then
11 | echo "Command succeeded"
12 | else
13 | echo "Command failed"
14 | fi
15 | if [[ "${REPARSE_CAUSAL_LINKS:-0}" -eq 1 ]]; then
16 | echo "Reparsing core_query_causal_links"
17 | for year in $(seq 2022 2025); do
18 | for month in $(seq -w 1 12); do
19 | process_month="$year-$month-01"
20 | echo "Processing month: $process_month"
21 | poetry run dbt run --select core_query_causal_links --vars "{\"process_month\": \"$process_month\"}"
22 | done
23 | done
27 | else
28 | echo "starting mediatree import app"
29 | python quotaclimat/data_processing/mediatree/api_import.py
30 |
31 | echo "ingest labelstudio data into barometre database"
32 | poetry run python -m quotaclimat.data_ingestion.labelstudio.ingest_labelstudio
33 |
34 | echo "apply dbt models - except causal links and analytics tables"
35 | poetry run dbt run --full-refresh \
36 | --exclude core_query_causal_links \
37 | --exclude task_global_completion \
38 | --exclude environmental_shares_with_desinfo_counts
39 |
40 | echo "apply dbt models to build analytics tables in 'analytics' schema."
41 | poetry run dbt run --full-refresh --target analytics \
42 | --select task_global_completion \
43 | --select environmental_shares_with_desinfo_counts
44 |
45 | echo "Causal query case: Checking if today is the first of the month..."
46 | day=$(date +%d)
47 |
48 | if [ "$day" -eq 01 ]; then
49 | echo "✅ It's the 1st — running DBT for the previous month"
50 |
51 | # previous month (first day)
52 | prev_month=$(date -d "$(date +%Y-%m-01) -1 month" +%Y-%m-01)
53 |
54 | echo "Processing month: $prev_month"
55 | poetry run dbt run --select core_query_causal_links --vars "{\"process_month\": \"$prev_month\"}"
56 | else
57 | echo "⏭️ Not the 1st — skipping DBT run"
58 | fi
59 |
60 | fi
61 |
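
Usage sketch of the two branches above (environment value illustrative):

```bash
./docker-entrypoint.sh                          # normal run: migrations, import, labelstudio ingest, dbt models
REPARSE_CAUSAL_LINKS=1 ./docker-entrypoint.sh   # month-by-month backfill of core_query_causal_links only
```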
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/scrap_html/scrap_description_article.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import aiohttp
4 | from bs4 import BeautifulSoup
5 | import asyncio
6 | import re
7 |
8 | agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"}
9 | async def get_url_content(url_article: str):
10 | async with aiohttp.ClientSession() as session:
11 | async with session.get(url_article, headers=agent) as response:
12 | return await response.text()
13 |
14 | def get_hat_20minutes(soup, url_article = ""):
15 | hat = soup.select_one(".hat-summary")
16 | if hat is not None:
17 | return (hat.text).strip()
18 | else:
19 | logging.warning(f"could not get hat : {url_article}")
20 | return ""
21 |
22 | # parse <meta> tags, see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta
23 | async def get_meta_news(url_article, media):
24 | result = {
25 | "title": "",
26 | "description": "",
27 | }
28 |
29 | if media == "ouest-france": # anti-robot protection: skip fetching
30 | return result
31 |
32 | response = await get_url_content(str(url_article))
33 |
34 | soup = BeautifulSoup(response, "html.parser")
35 | soup_description = soup.find(name="meta", attrs={'name': 'description'})
36 | if soup_description is not None:
37 | description = (soup_description.get("content") or "").strip() # guard against meta tags without content
38 | logging.debug(f"description for {url_article} is \n {description}")
39 | result["description"] = description
40 | elif media == "20_minutes": # does not have meta description
41 | hat = get_hat_20minutes(soup, url_article)
42 | logging.info(f"reading hat for {media} - {hat}")
43 | result["description"] = hat
44 | else:
45 | logging.warning(f"could not find description for {url_article} - response \n {response}")
46 |
47 | # TODO : use it someday to parse missing data
48 | soup_title = soup.find(name="title")
49 | if soup_title is not None:
50 | result["title"] = (soup_title.string).strip()
51 |
52 | return result
53 |
54 |
55 |
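
A minimal usage sketch of the scraper; the URL and media name are placeholders:

```python
import asyncio

from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news

async def main():
    # returns {"title": ..., "description": ...}; empty strings when parsing fails
    meta = await get_meta_news("https://example.com/article.html", "some_media")
    print(meta["title"], meta["description"])

asyncio.run(main())
```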
--------------------------------------------------------------------------------
/postgres/database_connection.py:
--------------------------------------------------------------------------------
1 | import os
2 | from sqlalchemy import create_engine, URL, Engine
3 | from sqlalchemy.orm import sessionmaker, Session
4 | import logging
5 |
6 | logging.basicConfig(level=logging.INFO)
7 |
8 |
9 | def connect_to_db(
10 | database: str = os.environ.get("POSTGRES_DB", "barometre"),
11 | user: str = os.environ.get("POSTGRES_USER", "user"),
12 | host: str = os.environ.get("POSTGRES_HOST", "localhost"),
13 | port: int = int(os.environ.get("POSTGRES_PORT", 5432)),
14 | password: str = os.environ.get("POSTGRES_PASSWORD", "password"),
15 | ):
16 | """
17 | Connect to the PostgreSQL database using environment variables or provided parameters.
18 |
19 | Parameters:
20 | - database (str, optional): The name of the database. Defaults to 'barometre'.
21 | - user (str, optional): The username for accessing the database. Defaults to 'user'.
22 | - host (str, optional): The hostname of the database server. Defaults to 'localhost'.
23 | - port (int, optional): The port number on which the database server is listening. Defaults to 5432.
24 | - password (str, optional): The password for accessing the database. Defaults to 'password'.
25 |
26 | Returns:
27 | - Engine: The SQLAlchemy engine object representing the connection to the database.
28 | """
29 |
30 | logging.info("Connect to the host %s for DB %s" % (host, database))
31 |
32 | url = URL.create(
33 | drivername="postgresql",
34 | username=user,
35 | host=host,
36 | database=database,
37 | port=port,
38 | password=password,
39 | )
40 |
41 | engine = create_engine(url)
42 |
43 | return engine
44 |
45 |
46 | def get_db_session(engine: Engine = None) -> Session:
47 | """
48 | Create a session for interacting with the database using the provided engine.
49 |
50 | Parameters:
51 | - engine (Engine, optional): The SQLAlchemy engine object. If not provided, it calls `connect_to_db()` to obtain one.
52 |
53 | Returns:
54 | - Session: A SQLAlchemy session bound to the provided engine or created by calling `connect_to_db()`.
55 | """
56 | if engine is None:
57 | engine = connect_to_db()
58 |
59 | session_factory = sessionmaker(bind=engine)  # avoid shadowing the imported Session type
60 | return session_factory()
61 |
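
Typical usage, assuming the POSTGRES_* environment variables are set (or falling back to the defaults above):

```python
from postgres.database_connection import connect_to_db, get_db_session

engine = connect_to_db()           # reads POSTGRES_* env vars, returns an Engine
session = get_db_session(engine)   # omit the engine to let it call connect_to_db() itself
try:
    pass  # run queries through the session here
finally:
    session.close()
```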
--------------------------------------------------------------------------------
/quotaclimat/utils/sentry.py:
--------------------------------------------------------------------------------
1 |
2 | import ray
3 | import os
4 | import logging
5 | from quotaclimat.utils.healthcheck_config import get_app_version
6 | import sentry_sdk
7 | from sentry_sdk.integrations.logging import LoggingIntegration
8 |
9 | # read SENTRY_DSN from env
10 | functions_to_trace = [
11 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_cts_in_ms_for_keywords"},
12 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_keyword_with_same_timestamp"},
13 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_themes_keywords_duration"},
14 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.count_keywords_duration_overlap"},
15 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_and_tag_by_theme"},
16 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.add_primary_key"},
17 | {"qualified_name": "quotaclimat.data_processing.mediatree.api_import.extract_api_sub"},
18 | {"qualified_name": "quotaclimat.data_processing.mediatree.api_import.parse_reponse_subtitle"},
19 | ]
20 |
21 | def sentry_init():
22 | if os.environ.get("SENTRY_DSN") is not None:
23 | logging.info("Sentry init")
24 | logging_kwargs = {}
25 | if os.getenv("SENTRY_LOGGING") == "true":
26 | logging_kwargs = dict(
27 | enable_logs=True,
28 | integrations=[
29 | # Only send INFO (and higher) logs to Sentry,
30 | # even if the logger is set to a lower level.
31 | LoggingIntegration(sentry_logs_level=logging.INFO),
32 | ]
33 | )
34 | sentry_sdk.init(
35 | traces_sample_rate=0.3,
36 | # To set a uniform sample rate
37 | # Set profiles_sample_rate to 1.0 to profile 100%
38 | # of sampled transactions.
39 | # We recommend adjusting this value in production,
40 | profiles_sample_rate=0.3,
41 | release=get_app_version(),
42 | # functions_to_trace=functions_to_trace,
43 | # integrations=[ # TODO : https://docs.sentry.io/platforms/python/integrations/ray/
44 | # RayIntegration(),
45 | # ],
46 | **logging_kwargs
47 | )
48 | else:
49 | logging.info("Sentry not init - SENTRY_DSN not found")
--------------------------------------------------------------------------------
/.github/workflows/scaleway-start-import-job-update.yml:
--------------------------------------------------------------------------------
1 | name: Import job Scaleway
2 |
3 | on:
4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch
5 |
6 |
7 | jobs:
8 | start-job-image:
9 | strategy:
10 | matrix:
11 | dates: [
12 | {start_date: "2023-04-01", end_date: "2023-05-01"}
13 | ,{start_date: "2023-05-01", end_date: "2023-06-01"}
14 | ,{start_date: "2023-06-01", end_date: "2023-07-01"}
15 | ,{start_date: "2023-07-01", end_date: "2023-08-01"}
16 | ,{start_date: "2023-08-01", end_date: "2023-09-01"}
17 | ,{start_date: "2023-09-01", end_date: "2023-10-01"}
18 | ,{start_date: "2023-10-01", end_date: "2023-11-01"}
19 | ,{start_date: "2023-11-01", end_date: "2023-12-01"}
20 | ,{start_date: "2023-12-01", end_date: "2024-01-01"}
21 | ,{start_date: "2024-01-01", end_date: "2024-02-01"}
22 | ,{start_date: "2024-02-01", end_date: "2024-03-01"}
23 | ,{start_date: "2024-03-01", end_date: "2024-04-01"}
24 | ,{start_date: "2024-04-01", end_date: "2024-05-01"}
25 | ,{start_date: "2024-05-01", end_date: "2024-06-01"}
26 | ,{start_date: "2024-06-01", end_date: "2024-07-01"}
27 | ,{start_date: "2024-07-01", end_date: "2024-08-01"}
28 | ,{start_date: "2024-08-01", end_date: "2024-09-01"}
29 | ,{start_date: "2024-09-01", end_date: "2024-10-01"}
30 | ,{start_date: "2024-10-01", end_date: "2024-11-01"}
31 | ,{start_date: "2024-11-01", end_date: "2024-12-01"}
32 | ,{start_date: "2024-12-01", end_date: "2025-01-01"}
33 | ,{start_date: "2025-01-01", end_date: "2025-02-01"}
34 | ]
35 | runs-on: ubuntu-latest
36 | steps:
37 | - name: start import job to reapply logic to every date range in the matrix
38 | uses: jawher/action-scw@v2.34.0
39 | env:
40 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
41 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
42 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
43 | SCW_ZONE: ${{ secrets.SCW_ZONE }}
44 | with:
45 | args: jobs definition start ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} environment-variables.UPDATE=true environment-variables.BIODIVERSITY_ONLY=true environment-variables.START_DATE_UPDATE=${{ matrix.dates.start_date }} environment-variables.END_DATE=${{ matrix.dates.end_date }}
46 |
--------------------------------------------------------------------------------
/alembic/versions/4333bc46985d_keywords_program_id_foreign_key.py:
--------------------------------------------------------------------------------
1 | """keywords: program_id foreign key
2 |
3 | Revision ID: 4333bc46985d
4 | Revises: ac96222af6fe
5 | Create Date: 2025-03-21 14:25:06.180296
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '4333bc46985d'
16 | down_revision: Union[str, None] = 'ac96222af6fe'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 | # ### commands auto generated by Alembic - please adjust! ###
23 | op.add_column('keywords', sa.Column('program_metadata_id', sa.Text(), nullable=True))
24 | op.create_foreign_key(None, 'keywords', 'program_metadata', ['program_metadata_id'], ['id'])
25 | op.alter_column('sitemap_table', 'download_date',
26 | existing_type=postgresql.TIMESTAMP(timezone=True),
27 | type_=sa.DateTime(),
28 | existing_nullable=True)
29 | op.alter_column('sitemap_table', 'news_publication_date',
30 | existing_type=postgresql.TIMESTAMP(timezone=True),
31 | type_=sa.DateTime(),
32 | existing_nullable=True)
33 | op.alter_column('sitemap_table', 'updated_on',
34 | existing_type=postgresql.TIMESTAMP(timezone=True),
35 | type_=sa.DateTime(),
36 | existing_nullable=True)
37 | # ### end Alembic commands ###
38 |
39 |
40 | def downgrade() -> None:
41 | # ### commands auto generated by Alembic - please adjust! ###
42 | op.alter_column('sitemap_table', 'updated_on',
43 | existing_type=sa.DateTime(),
44 | type_=postgresql.TIMESTAMP(timezone=True),
45 | existing_nullable=True)
46 | op.alter_column('sitemap_table', 'news_publication_date',
47 | existing_type=sa.DateTime(),
48 | type_=postgresql.TIMESTAMP(timezone=True),
49 | existing_nullable=True)
50 | op.alter_column('sitemap_table', 'download_date',
51 | existing_type=sa.DateTime(),
52 | type_=postgresql.TIMESTAMP(timezone=True),
53 | existing_nullable=True)
54 | op.drop_constraint('keywords_program_metadata_id_fkey', 'keywords', type_='foreignkey')  # assumes the default Postgres FK name
55 | op.drop_column('keywords', 'program_metadata_id')
56 | # ### end Alembic commands ###
57 |
--------------------------------------------------------------------------------
/alembic/versions/44f13b7eebd4_dictionary_category.py:
--------------------------------------------------------------------------------
1 | """dictionary category
2 |
3 | Revision ID: 44f13b7eebd4
4 | Revises: 827fb6dde3bb
5 | Create Date: 2025-05-23 12:54:53.323525
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '44f13b7eebd4'
16 | down_revision: Union[str, None] = '827fb6dde3bb'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 | # ### commands auto generated by Alembic - please adjust! ###
23 | op.add_column('dictionary', sa.Column('category', sa.String(), nullable=True))
24 | op.add_column('dictionary', sa.Column('theme', sa.String(), nullable=True))
25 |
26 | op.drop_column('dictionary', 'categories')
27 | op.drop_column('dictionary', 'themes')
28 |
29 | op.drop_column('dictionary', 'solution')
30 | op.drop_column('dictionary', 'consequence')
31 | op.drop_column('dictionary', 'cause')
32 | op.drop_column('dictionary', 'general_concepts')
33 | op.drop_column('dictionary', 'statement')
34 |
35 | op.drop_column('dictionary', 'crisis_climate')
36 | op.drop_column('dictionary', 'crisis_biodiversity')
37 | op.drop_column('dictionary', 'crisis_resource')
39 | # ### end Alembic commands ###
40 |
41 |
42 | def downgrade() -> None:
43 | op.add_column('dictionary', sa.Column('categories', postgresql.ARRAY(sa.String()), nullable=True))
44 | op.add_column('dictionary', sa.Column('themes', postgresql.ARRAY(sa.String()), nullable=True))
45 | op.add_column('dictionary', sa.Column('solution', sa.Boolean(), nullable=True, server_default=sa.text('false')))
46 | op.add_column('dictionary', sa.Column('consequence', sa.Boolean(), nullable=True, server_default=sa.text('false')))
47 | op.add_column('dictionary', sa.Column('cause', sa.Boolean(), nullable=True, server_default=sa.text('false')))
48 | op.add_column('dictionary', sa.Column('general_concepts', sa.Boolean(), nullable=True, server_default=sa.text('false')))
49 | op.add_column('dictionary', sa.Column('statement', sa.Boolean(), nullable=True, server_default=sa.text('false')))
50 |
51 | op.add_column('dictionary', sa.Column('crisis_climate', sa.Boolean(), nullable=True, server_default=sa.text('true')))
52 | op.add_column('dictionary', sa.Column('crisis_biodiversity', sa.Boolean(), nullable=True, server_default=sa.text('true')))
53 | op.add_column('dictionary', sa.Column('crisis_resource', sa.Boolean(), nullable=True, server_default=sa.text('true')))
54 |
55 | op.drop_column('dictionary', 'category')
56 | op.drop_column('dictionary', 'theme')
--------------------------------------------------------------------------------
/mockwebsite/cnews_sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
3 | <url>
4 | <loc>https://www.cnews.fr/culture/2023-10-25/mort-de-richard-roundtree-samuel-l-jackson-gabrielle-union-carl-weathers-les</loc>
5 | <news:news>
6 | <news:publication>
7 | <news:name>CNEWS</news:name>
8 | <news:language>fr</news:language>
9 | </news:publication>
10 | <news:title>Mort de Richard Roundtree : Samuel L. Jackson, Gabrielle Union, Carl Weathers… Les stars rendent hommage à l’acteur de «Shaft»</news:title>
11 | <news:publication_date>2023-10-25T08:51:25+00:00</news:publication_date>
12 | <news:keywords>Cinéma, culture, Carnet noir, hommages, People</news:keywords>
13 | </news:news>
14 | <image:image>
15 | <image:loc>https://static.cnews.fr/sites/default/files/richard_roundtree_hommages_6538c96cd0e46_0.jpg</image:loc>
16 | </image:image>
17 | </url>
18 | <url>
19 | <loc>https://www.cnews.fr/france/2023-10-25/squat-de-saint-martin-du-touch-toulouse-pres-de-200-personnes-evacuees-1410951</loc>
20 | <news:news>
21 | <news:publication>
22 | <news:name>CNEWS</news:name>
23 | <news:language>fr</news:language>
24 | </news:publication>
25 | <news:title>Squat de Saint-Martin-du-Touch à Toulouse : près de 200 personnes évacuées</news:title>
26 | <news:publication_date>2023-10-25T08:47:27+00:00</news:publication_date>
27 | <news:keywords>Squat, Toulouse, Squatteurs</news:keywords>
28 | </news:news>
29 | <image:image>
30 | <image:loc>https://static.cnews.fr/sites/default/files/capture_decran_2023-10-25_a_10.10.05_6538ce23a0be6_0.png</image:loc>
31 | </image:image>
32 | </url>
33 | <url>
34 | <loc>https://www.cnews.fr/videos/monde/2023-10-25/israel-hamas-des-que-jai-vu-lhorreur-je-suis-monte-dans-le-premier-avion</loc>
35 | <news:news>
36 | <news:publication>
37 | <news:name>CNEWS</news:name>
38 | <news:language>fr</news:language>
39 | </news:publication>
40 | <news:title>Israël-Hamas : «Dès que j'ai vu l'horreur, je suis monté dans le premier avion», explique un soldat de la réserve de Tsahal</news:title>
41 | <news:publication_date>2023-10-25T08:29:51+00:00</news:publication_date>
42 | <news:keywords>Israël, Tsahal, Armée, Hamas</news:keywords>
43 | </news:news>
44 | <image:image>
45 | <image:loc>https://static.cnews.fr/sites/default/files/Video/x8p2xa3_6538a94a625ad_0.jpg</image:loc>
46 | <image:caption>Témoignage d'un réserviste mobilisé en Israël</image:caption>
47 | </image:image>
48 | </url>
49 | </urlset>
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 | import sys,time
4 | import os
5 | from postgres.insert_data import insert_data_in_sitemap_table
6 | from postgres.insert_existing_data_example import \
7 | transformation_from_dumps_to_table_entry
8 | from postgres.schemas.models import create_tables, connect_to_db, get_last_month_sitemap_id
9 | from quotaclimat.utils.healthcheck_config import run_health_check_server
10 | from quotaclimat.utils.logger import CustomFormatter
11 | import sentry_sdk
12 | from sentry_sdk.crons import monitor
13 | from quotaclimat.utils.sentry import sentry_init
14 | import asyncio
15 | from quotaclimat.data_ingestion.scrap_sitemap import \
16 | query_one_sitemap_and_transform, get_sitemap_list
17 |
18 |
19 |
20 | async def batch_sitemap(exit_event):
21 | create_tables()
22 |
23 | conn = connect_to_db()
24 | sitemap_list = get_sitemap_list().items()
25 | logging.info("Going to parse %s" % (sitemap_list))
26 | df_from_pg = get_last_month_sitemap_id(conn)
27 | for media, sitemap_conf in sitemap_list:
28 | try:
29 | df = await query_one_sitemap_and_transform(media, sitemap_conf, df_from_pg)
30 | df_to_insert = transformation_from_dumps_to_table_entry(df)
31 | await asyncio.to_thread(insert_data_in_sitemap_table, df_to_insert, conn)
32 | except TypeError as err:
33 | logging.debug("Asyncio error %s" % (err))
34 | continue
35 | except Exception as err:
36 | logging.error("Could not ingest data in db for media %s:(%s) %s" % (media,type(err).__name__, err))
37 | continue
38 |
39 | logging.info("finished")
40 | conn.dispose()
41 | exit_event.set()
42 | return
43 |
44 | async def main():
45 | with monitor(monitor_slug='sitemap'): #https://docs.sentry.io/platforms/python/crons/
46 | event_finish = asyncio.Event()
47 | # Start the health check server in the background
48 | health_check_task = asyncio.create_task(run_health_check_server())
49 |
50 | # Start batch job
51 | asyncio.create_task(batch_sitemap(event_finish))
52 |
53 | # Wait for both tasks to complete
54 | await event_finish.wait()
55 |
56 | health_check_task.cancel()
57 | logging.info("Exiting with success")
58 | sys.exit(0)
59 |
60 | if __name__ == "__main__":
61 | # configure the root logger
62 | logger = logging.getLogger()
63 | logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper())
64 | sentry_init()
65 | # replace any existing handlers with a console handler using the custom formatter
66 | if (logger.hasHandlers()):
67 | logger.handlers.clear()
68 | ch = logging.StreamHandler()
69 | ch.setFormatter(CustomFormatter())
70 | logger.addHandler(ch)
71 |
72 | asyncio.run(main())
73 | sys.exit(0)
74 |
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/api_import_utils/db.py:
--------------------------------------------------------------------------------
1 | from datetime import date
2 | import logging
3 | from typing import Tuple
4 | from quotaclimat.data_processing.mediatree.utils import *
5 | from quotaclimat.data_processing.mediatree.config import *
6 | from postgres.schemas.models import Keywords
7 | from sqlalchemy.orm import Session
8 | from sqlalchemy import Select, select, func, cast, Date, Integer, text, and_
9 | from quotaclimat.data_processing.mediatree.i8n.country import *
10 | from typing import NamedTuple
11 |
12 | class KeywordLastStats(NamedTuple):
13 | last_day_saved: date
14 | number_of_previous_days_from_yesterday: int
15 |
16 | # Security nets to catch up delays from production servers errors
17 |
18 | def get_last_date_and_number_of_delay_saved_in_keywords(session: Session, days_filter: int = 30, country = FRANCE) -> KeywordLastStats:
19 | logging.debug("get_last_date_and_number_of_delay_saved_in_keywords")
20 | try:
21 | source_subquery = (
22 | select(
23 | Keywords.start.label("start"),
24 | cast(
25 | func.extract(
26 | "day",
27 | func.date_trunc("day", (func.now() - text("INTERVAL '1 day'"))) - func.date_trunc("day", Keywords.start),
28 | ),
29 | Integer,
30 | ).label("previous_days"),
31 | )
32 | .select_from(Keywords)
33 | .where(
34 | and_(
35 | Keywords.start >= func.now() - text(f"INTERVAL '{days_filter} days'"),
36 | Keywords.country == country.name
37 | )
38 | )
39 | .subquery("source")
40 | )
41 |
42 | statement: Select[Tuple[date, int]] = (
43 | select(
44 | func.max(cast(source_subquery.c.start, Date)).label("last_day_saved"),
45 | func.min(source_subquery.c.previous_days).label("number_of_previous_days_from_yesterday"),
46 | )
47 | )
48 |
49 | result = session.execute(statement).fetchone()
50 | return KeywordLastStats(result[0], result[1])
51 | except Exception as err:
52 | logging.error("get_last_date_and_number_of_delay_saved_in_keywords crash (%s) %s" % (type(err).__name__, err))
53 | raise err
54 |
55 | def get_delay_date(lastSavedKeywordsDate: KeywordLastStats, normal_delay_in_days: int = 1):
56 | logging.warning(f"Delay detected: {lastSavedKeywordsDate.number_of_previous_days_from_yesterday} days, expected {normal_delay_in_days} day(s)")
57 | default_start_date = get_epoch_from_datetime(datetime(lastSavedKeywordsDate.last_day_saved.year,lastSavedKeywordsDate.last_day_saved.month,lastSavedKeywordsDate.last_day_saved.day))
58 | default_number_of_previous_days = lastSavedKeywordsDate.number_of_previous_days_from_yesterday
59 | return default_start_date, default_number_of_previous_days
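
A sketch of the catch-up flow these helpers support (session creation as in postgres/database_connection.py; the threshold is illustrative):

```python
from postgres.database_connection import get_db_session
from quotaclimat.data_processing.mediatree.api_import_utils.db import (
    get_delay_date,
    get_last_date_and_number_of_delay_saved_in_keywords,
)

session = get_db_session()
stats = get_last_date_and_number_of_delay_saved_in_keywords(session, days_filter=30)
if stats.number_of_previous_days_from_yesterday and stats.number_of_previous_days_from_yesterday > 1:
    # resume the import from the last day actually saved in keywords
    start_epoch, previous_days = get_delay_date(stats)
```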
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | secrets/pwd_api.txt
2 | secrets/username_api.txt
3 | secrets/*
4 | s3/*
5 | i8n/mediatree_output/
6 | i8n/csa-belge/
7 | documents-experts/*
10 |
11 | i8n/germany_big.parquet
12 | test/i8n
13 | llm/
14 | cc-bio.json
15 | *.xlsx
16 | coverage_re
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 |
22 | # C extensions
23 | *.so
24 |
25 | # Distribution / packaging
26 | .Python
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | wheels/
39 | pip-wheel-metadata/
40 | share/python-wheels/
41 | *.egg-info/
42 | .installed.cfg
43 | *.egg
44 | MANIFEST
45 |
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .nox/
60 | .coverage
61 | .coverage.*
62 | .cache
63 | nosetests.xml
64 | coverage.xml
65 | *.cover
66 | *.py,cover
67 | .hypothesis/
68 | .pytest_cache/
69 |
70 | # Translations
71 | *.mo
72 | *.pot
73 |
74 | # Django stuff:
75 | *.log
76 | local_settings.py
77 | db.sqlite3
78 | db.sqlite3-journal
79 |
80 | # Flask stuff:
81 | instance/
82 | .webassets-cache
83 |
84 | # Scrapy stuff:
85 | .scrapy
86 |
87 | # Sphinx documentation
88 | docs/_build/
89 |
90 | # PyBuilder
91 | target/
92 |
93 | # Jupyter Notebook
94 | .ipynb_checkpoints
95 |
96 | # IPython
97 | profile_default/
98 | ipython_config.py
99 |
100 | # pyenv
101 | .python-version
102 |
103 | # pipenv
104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
107 | # install all needed dependencies.
108 | #Pipfile.lock
109 |
110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
111 | __pypackages__/
112 |
113 | # Celery stuff
114 | celerybeat-schedule
115 | celerybeat.pid
116 |
117 | # SageMath parsed files
118 | *.sage.py
119 |
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 |
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 |
133 | # Rope project settings
134 | .ropeproject
135 |
136 | # mkdocs documentation
137 | /site
138 |
139 | # mypy
140 | .mypy_cache/
141 | .dmypy.json
142 | dmypy.json
143 |
144 | # Pyre type checker
145 | .pyre/
146 | data/*
147 | .vscode/settings.json
148 | notebooks/nlp/df_all.csv
149 | notebooks/nlp/df_X_tfidf.pkl
151 |
152 | .DS_Store
153 | pgdata
154 | mb-data
155 | .idea
156 | pgdump/
--------------------------------------------------------------------------------
/alembic/versions/ac96222af6fe_hrfp_counters.py:
--------------------------------------------------------------------------------
1 | """hrfp counters
2 |
3 | Revision ID: ac96222af6fe
4 | Revises: 30abfd828007
5 | Create Date: 2024-12-02 14:36:21.970968
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = 'ac96222af6fe'
16 | down_revision: Union[str, None] = '30abfd828007'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 | # ### commands auto generated by Alembic - please adjust! ###
23 | op.add_column('keywords', sa.Column('number_of_changement_climatique_constat_no_hrfp', sa.Integer(), nullable=True))
24 | op.add_column('keywords', sa.Column('number_of_changement_climatique_causes_no_hrfp', sa.Integer(), nullable=True))
25 | op.add_column('keywords', sa.Column('number_of_changement_climatique_consequences_no_hrfp', sa.Integer(), nullable=True))
26 | op.add_column('keywords', sa.Column('number_of_attenuation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True))
27 | op.add_column('keywords', sa.Column('number_of_adaptation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True))
28 | op.add_column('keywords', sa.Column('number_of_ressources_no_hrfp', sa.Integer(), nullable=True))
29 | op.add_column('keywords', sa.Column('number_of_ressources_solutions_no_hrfp', sa.Integer(), nullable=True))
30 | op.add_column('keywords', sa.Column('number_of_biodiversite_concepts_generaux_no_hrfp', sa.Integer(), nullable=True))
31 | op.add_column('keywords', sa.Column('number_of_biodiversite_causes_no_hrfp', sa.Integer(), nullable=True))
32 | op.add_column('keywords', sa.Column('number_of_biodiversite_consequences_no_hrfp', sa.Integer(), nullable=True))
33 | op.add_column('keywords', sa.Column('number_of_biodiversite_solutions_no_hrfp', sa.Integer(), nullable=True))
34 | # ### end Alembic commands ###
35 |
36 |
37 | def downgrade() -> None:
38 | # ### commands auto generated by Alembic - please adjust! ###
39 | op.drop_column('keywords', 'number_of_biodiversite_solutions_no_hrfp')
40 | op.drop_column('keywords', 'number_of_biodiversite_consequences_no_hrfp')
41 | op.drop_column('keywords', 'number_of_biodiversite_causes_no_hrfp')
42 | op.drop_column('keywords', 'number_of_biodiversite_concepts_generaux_no_hrfp')
43 | op.drop_column('keywords', 'number_of_ressources_solutions_no_hrfp')
44 | op.drop_column('keywords', 'number_of_ressources_no_hrfp')
45 | op.drop_column('keywords', 'number_of_adaptation_climatique_solutions_no_hrfp')
46 | op.drop_column('keywords', 'number_of_attenuation_climatique_solutions_no_hrfp')
47 | op.drop_column('keywords', 'number_of_changement_climatique_consequences_no_hrfp')
48 | op.drop_column('keywords', 'number_of_changement_climatique_causes_no_hrfp')
49 | op.drop_column('keywords', 'number_of_changement_climatique_constat_no_hrfp')
50 | # ### end Alembic commands ###
51 |
--------------------------------------------------------------------------------
/mockwebsite/lefigaro_localhost_sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
3 | <url>
4 | <loc>http://localhost:8000/mediapart_website.html</loc>
5 | <lastmod>2023-10-12T17:34:28+02:00</lastmod>
6 | <news:news>
7 | <news:publication>
8 | <news:name>Le Figaro</news:name>
9 | <news:language>fr</news:language>
10 | </news:publication>
11 | <news:publication_date>2023-10-12T06:13:00+02:00</news:publication_date>
12 | <news:title>EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi</news:title>
13 | <news:keywords>Israël, Hamas, conflit israélo-palestinien, International, actualité internationale, affaires étrangères, ministère des affaires étrangères, politique étrangère</news:keywords>
14 | <news:genres>Blog</news:genres>
15 | </news:news>
16 | <image:image>
17 | <image:loc>https://i.f1g.fr/media/cms/orig/2023/10/12/eccf7495cede8869a8a35d6fd70a1635759a12dbef68dd16e82e34162f69ec4f.jpg</image:loc>
18 | <image:caption>Explosion dans le centre de la ville de Gaza ce jeudi 12 octobre.</image:caption>
19 | </image:image>
20 | </url>
21 | <url>
22 | <loc>http://localhost:8000/20minutes_website.html</loc>
23 | <lastmod>2023-10-12T17:34:21+02:00</lastmod>
24 | <news:news>
25 | <news:publication>
26 | <news:name>Le Figaro</news:name>
27 | <news:language>fr</news:language>
28 | </news:publication>
29 | <news:publication_date>2023-10-11T16:16:00+02:00</news:publication_date>
30 | <news:title>Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir</news:title>
31 | <news:keywords>grève, salaires, social, RH, ressources humaines, primes, conjoncture, entreprise, œuvres sociales, trséorerie, finance, comoité d'entreprise, elections syndicales, gestion entreprise, TPE, PME, PMI, CAC 40, fiscalité des entreprises, actualités sociales</news:keywords>
32 | </news:news>
33 | <image:image>
34 | <image:loc>https://i.f1g.fr/media/cms/orig/2023/10/09/8f1062e1948f5c0abb930b0665ec4958613a74853c8fba9dfb7f374b3ec82065.jpg</image:loc>
35 | <image:caption>Grève: à quoi faut-il s’attendre ce 13 octobre ?</image:caption>
36 | </image:image>
37 | </url>
38 | </urlset>
--------------------------------------------------------------------------------
/mockwebsite/20minutes_sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
3 | <url>
4 | <loc>https://www.20minutes.fr/justice/4059662-20231027-prisons-proces-rugby-re-passe-heure-voiture-eric-dupond-moretti</loc>
5 | <news:news>
6 | <news:publication>
7 | <news:name>20minutes.fr</news:name>
8 | <news:language>fr</news:language>
9 | </news:publication>
10 | <news:publication_date>2023-10-27T10:07:37+02:00</news:publication_date>
11 | <news:title>Prisons, procès, rugby… On a (re) passé une heure en voiture avec Éric Dupond-Moretti</news:title>
12 | </news:news>
13 | <image:image>
14 | <image:loc>https://img.20mn.fr/DWn2CVxERkK9ZEKE_2ASMyk/1200x768_eric-dupond-moretti-au-centre-a-inaugure-mercredi</image:loc>
15 | <image:caption>Eric Dupond-Moretti (au centre) a inauguré mercredi le centre pénitentiaire de Troyes-Lavau, dans l'Aube, aux côtés du maire de Troyes, François Baroin (à droiteà, et celui de Lavau, Jacques Gachowski (à gauche)</image:caption>
16 | </image:image>
17 | </url>
18 | <url>
19 | <loc>https://www.20minutes.fr/guide-achat/guide-achat-bon-plan-cdiscount/4059580-20231026-top-5-meilleures-trottinettes-electriques-petit-prix-chez-cdiscount</loc>
20 | <news:news>
21 | <news:publication>
22 | <news:name>20minutes.fr</news:name>
23 | <news:language>fr</news:language>
24 | </news:publication>
25 | <news:publication_date>2023-10-27T10:05:36+02:00</news:publication_date>
26 | <news:title>Top 5 des meilleures trottinettes électriques à petit prix chez Cdiscount</news:title>
27 | </news:news>
28 | <image:image>
29 | <image:loc>https://img.20mn.fr/ilZnoCiMQsyvdlq67n7upyk/1200x768_top-5-des-meilleures-trottinettes-electriques-a-petit-prix-chez-cdiscount</image:loc>
30 | <image:caption>Top 5 des meilleures trottinettes électriques à petit prix chez Cdiscount</image:caption>
31 | </image:image>
32 | </url>
33 | <url>
34 | <loc>https://www.20minutes.fr/monde/etats-unis/4059735-20231027-fusillades-etats-unis-direct-police-americaine-toujours-recherche-robert-card</loc>
35 | <news:news>
36 | <news:publication>
37 | <news:name>20minutes.fr</news:name>
38 | <news:language>fr</news:language>
39 | </news:publication>
40 | <news:publication_date>2023-10-27T10:04:16+02:00</news:publication_date>
41 | <news:title>Fusillades aux Etats-Unis EN DIRECT : La police américaine toujours à la recherche de Robert Card…</news:title>
42 | </news:news>
43 | <image:image>
44 | <image:loc>https://img.20mn.fr/OB_g4z-PQ6yJwXKhJBgf5yk/1200x768_oct-26-2023-bowdoin-maine-usa-law-enforcement-officers-search-the-area-of-bowdoin-maine-the-day-after-a-suspect-killed-at-least-18-people-during-multiple-shootings-in-the-lewiston-area-mandatory-credit-camille-fine-usa-today-sipa-usa-49221769-zd5-2310270429</image:loc>
45 | <image:caption>Des agents des forces de l'ordre fouillent la zone de Bowdoin, dans le Maine, au lendemain du jour où un suspect a tué au moins 18 personnes lors de multiples fusillades dans la région de Lewiston.</image:caption>
46 | </image:image>
47 | </url>
48 | </urlset>
--------------------------------------------------------------------------------
/mockwebsite/lefigaro_sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
3 | <url>
4 | <loc>http://nginxtest:80/mediapart_website.html</loc>
5 | <lastmod>2023-10-12T17:34:28+02:00</lastmod>
6 | <news:news>
7 | <news:publication>
8 | <news:name>Le Figaro</news:name>
9 | <news:language>fr</news:language>
10 | </news:publication>
11 | <news:publication_date>2023-10-12T06:13:00+02:00</news:publication_date>
12 | <news:title>EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi</news:title>
13 | <news:keywords>Israël, Hamas, conflit israélo-palestinien, International, actualité internationale, affaires étrangères, ministère des affaires étrangères, politique étrangère</news:keywords>
14 | <news:genres>Blog</news:genres>
15 | </news:news>
16 | <image:image>
17 | <image:loc>https://i.f1g.fr/media/cms/orig/2023/10/12/eccf7495cede8869a8a35d6fd70a1635759a12dbef68dd16e82e34162f69ec4f.jpg</image:loc>
18 | <image:caption>Explosion dans le centre de la ville de Gaza ce jeudi 12 octobre.</image:caption>
19 | </image:image>
20 | </url>
21 | <url>
22 | <loc>http://nginxtest:80/20minutes_website.html</loc>
23 | <lastmod>2023-10-12T17:34:21+02:00</lastmod>
24 | <news:news>
25 | <news:publication>
26 | <news:name>Le Figaro</news:name>
27 | <news:language>fr</news:language>
28 | </news:publication>
29 | <news:publication_date>2023-10-11T16:16:00+02:00</news:publication_date>
30 | <news:title>Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir</news:title>
31 | <news:keywords>grève, salaires, social, RH, ressources humaines, primes, conjoncture, entreprise, œuvres sociales, trséorerie, finance, comoité d'entreprise, elections syndicales, gestion entreprise, TPE, PME, PMI, CAC 40, fiscalité des entreprises, actualités sociales</news:keywords>
32 | </news:news>
33 | <image:image>
34 | <image:loc>https://i.f1g.fr/media/cms/orig/2023/10/09/8f1062e1948f5c0abb930b0665ec4958613a74853c8fba9dfb7f374b3ec82065.jpg</image:loc>
35 | <image:caption>Grève: à quoi faut-il s’attendre ce 13 octobre ?</image:caption>
36 | </image:image>
37 | </url>
38 | </urlset>
--------------------------------------------------------------------------------
/my_dbt_project/models/dashboards/core_query_thematics_keywords_i8n.sql:
--------------------------------------------------------------------------------
1 | {{ config(
2 | materialized='incremental'
3 | ,unique_key=['week','channel_title']
4 | )
5 | }}
11 |
12 | WITH keyword_occurrences AS (
13 | SELECT DISTINCT
14 | COALESCE(pm.channel_title, k.channel_title) AS channel_title,
15 | DATE_TRUNC('week', k.start)::date AS week,
16 | k.start AS occurrence_time,
17 | k.country AS country,
18 | -- Semantic tags
19 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%solution%' THEN TRUE ELSE FALSE END AS is_solution,
20 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%consequence%' THEN TRUE ELSE FALSE END AS is_consequence,
21 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%cause%' THEN TRUE ELSE FALSE END AS is_cause,
22 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%concepts_generaux%' THEN TRUE ELSE FALSE END AS is_general_concepts,
23 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%constat%' THEN TRUE ELSE FALSE END AS is_statement,
24 | -- Crisis type
25 | CASE
26 | WHEN LOWER(kw ->> 'theme') LIKE '%climat%' THEN 'Crise climatique'
27 | WHEN LOWER(kw ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité'
28 | WHEN LOWER(kw ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources'
29 | ELSE 'Autre'
30 | END AS crise_type,
31 | kw ->> 'theme' AS theme,
32 | kw ->> 'keyword' AS keyword
33 | FROM public.keywords k
34 | LEFT JOIN public.program_metadata pm
35 | ON k.channel_program = pm.channel_program
36 | AND k.channel_name = pm.channel_name
37 | AND (
38 | (
39 | CASE
40 | WHEN ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7) = 0 THEN 7
41 | ELSE ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7)
42 | END = pm.weekday
43 | )
44 | )
45 | -- AND k.country = pm.country
46 | AND CAST(k.start AS date) BETWEEN CAST(pm.program_grid_start AS date)
47 | AND CAST(pm.program_grid_end AS date)
48 | , json_array_elements(k.keywords_with_timestamp::json) AS kw
49 | WHERE
50 | LOWER(kw ->> 'theme') NOT LIKE '%indirect%'
51 | )
52 |
53 | SELECT
54 | ko.channel_title,
55 | ko.country,
56 | ko.week,
57 | COALESCE(NULLIF(d.category, ''), 'Transversal') AS category,
58 | d.high_risk_of_false_positive,
59 | ko.is_solution,
60 | ko.is_consequence,
61 | ko.is_cause,
62 | ko.is_general_concepts,
63 | ko.is_statement,
64 | ko.crise_type,
65 | ko.theme,
66 | ko.keyword,
67 | COUNT(*) AS count
68 | FROM keyword_occurrences ko
69 | LEFT JOIN public.dictionary d
70 | ON d.keyword = ko.keyword AND d.theme LIKE ko.theme || '%' -- ensure match with the indirect themes stored in the dictionary table
71 | GROUP BY
72 | ko.country,
73 | ko.channel_title,
74 | ko.week,
75 | d.high_risk_of_false_positive,
76 | COALESCE(NULLIF(d.category, ''), 'Transversal'),
77 | ko.is_solution,
78 | ko.is_consequence,
79 | ko.is_cause,
80 | ko.is_general_concepts,
81 | ko.is_statement,
82 | ko.crise_type,
83 | ko.theme,
84 | ko.keyword
85 | ORDER BY
86 | ko.channel_title, ko.week, ko.crise_type
--------------------------------------------------------------------------------
/test/sitemap/test_mediatree_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 |
4 | from test_utils import get_localhost
5 | from quotaclimat.data_processing.mediatree.utils import *
6 |
7 | import logging
8 | from time import strftime,localtime
9 |
10 | localhost = get_localhost()
11 |
12 | def test_get_date_sql_query():
13 | date = datetime(2024, 12, 12, 0, 0, 0)
14 | expected = "'2024-12-12 00:00:00.000 +00:00'"
15 |
16 | assert get_date_sql_query(date) == expected
17 |
18 | def test_get_yesterday():
19 | yesterday = get_yesterday()
20 | yesterday_string = strftime('%Y-%m-%d %H:%M:%S', localtime(yesterday))
21 | logging.info(f"yesterday_string {yesterday_string}")
22 | assert '00:00:00' in yesterday_string
23 |
24 | def test_is_it_tuesday():
25 | date = pd.Timestamp("2024-02-13 15:34:28")
26 | assert is_it_tuesday(date)
27 |
28 | date = pd.Timestamp("2024-01-01 15:34:28")
29 | assert not is_it_tuesday(date)
30 |
31 | def test_get_end_of_month():
32 | assert get_end_of_month("2024-04-01") == "2024-04-30"
33 | assert get_end_of_month("2024-02-01") == "2024-02-29"
34 | assert get_end_of_month("2024-02-15") == "2024-02-29"
35 |
36 | def test_get_first_of_month():
37 | date = datetime(2024, 12, 12, 0, 0, 0)
38 | assert get_first_of_month(date) == "2024-12-01"
39 |
40 | def test_get_date_now_minus_days():
41 | date = datetime(2024, 12, 12, 0, 0, 0)
42 | assert get_date_now_minus_days(start=date, minus_days=6) == "2024-12-06"
43 | assert get_date_now_minus_days(start=date, minus_days=13) == "2024-11-29"
44 |
45 |
46 | def test_get_start_end_date_env_variable_with_default():
47 | start_date = 0
48 |
49 | assert get_start_end_date_env_variable_with_default(start_date, minus_days=1) == (get_yesterday(), None)
50 |
51 | def test_get_start_end_date_env_variable_with_start_date_value():
52 | start_date = 1734508085
53 | number_of_previous_days = 7
54 | start_date_minus_days = start_date - (number_of_previous_days * 24 * 60 * 60)
55 |
56 | assert get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days) == (int(start_date), start_date_minus_days)
57 |
58 | def test_get_start_end_date_with_get_date_range():
59 | start_date = 1734508085
60 | number_of_previous_days = 7
61 | (start,end) = get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days)
62 |
63 | expected = pd.DatetimeIndex(['2024-12-11', '2024-12-12', '2024-12-13', '2024-12-14', '2024-12-15', '2024-12-16', '2024-12-17', '2024-12-18'],
64 | dtype='datetime64[ns]', freq='D')
65 |
66 | output = get_date_range(start,end)
67 | assert len(output) == number_of_previous_days + 1
68 | pd.testing.assert_index_equal(output, expected)
69 |
70 | def test_get_start_end_date_with_get_date_range_default():
71 | start_date = 0
72 | number_of_previous_days = 7
73 | (start,end) = get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days)
74 |
75 |
76 | output = get_date_range(start,end, minus_days=number_of_previous_days)
77 | assert len(output) == number_of_previous_days
--------------------------------------------------------------------------------
/test/s3/test_s3.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from quotaclimat.data_processing.mediatree.s3.api_to_s3 import get_bucket_key, get_bucket_key_folder, get_partition_s3
4 | from quotaclimat.data_processing.mediatree.s3.s3_utils import read_folder_from_s3, transform_raw_keywords
5 | from quotaclimat.data_processing.mediatree.channel_program import *
6 | from quotaclimat.data_processing.mediatree.i8n.country import *
7 |
8 | def test_get_bucket_key_default():
9 | friday_6h26 = 1726719981
10 | date = pd.to_datetime(friday_6h26, unit='s', utc=True)
11 | channel = "tf1"
12 | assert get_bucket_key(date, channel) == "year=2024/month=9/day=19/channel=tf1/*.parquet"
13 |
14 | def test_get_bucket_key_france():
15 | friday_6h26 = 1726719981
16 | date = pd.to_datetime(friday_6h26, unit='s', utc=True)
17 | channel = "tf1"
18 | assert get_bucket_key(date, channel, country=FRANCE) == "year=2024/month=9/day=19/channel=tf1/*.parquet"
19 |
20 | def test_get_bucket_key_country():
21 | friday_6h26 = 1726719981
22 | date = pd.to_datetime(friday_6h26, unit='s', utc=True)
23 | channel = "tf1"
24 | assert get_bucket_key(date, channel, country=GERMANY) == f"country={GERMANY.name}/year=2024/month=9/day=19/channel=tf1/*.parquet"
25 |
26 | def test_get_bucket_key_first_of_the_month():
27 | first_december = 1733040125
28 | date = pd.to_datetime(first_december, unit='s', utc=True)
29 | channel = "tf1"
30 | assert get_bucket_key(date, channel) == "year=2024/month=12/day=1/channel=tf1/*.parquet"
31 |
32 | def test_get_bucket_key_first_of_the_month_default():
33 | first_december = 1733040125
34 | date = pd.to_datetime(first_december, unit='s', utc=True)
35 | channel = "tf1"
36 | assert get_bucket_key_folder(date, channel) == "year=2024/month=12/day=1/channel=tf1/"
37 |
38 | def test_get_bucket_key_first_of_the_month_france():
39 | first_december = 1733040125
40 | date = pd.to_datetime(first_december, unit='s', utc=True)
41 | channel = "tf1"
42 |     key_folder = "year=2024/month=12/day=1/channel=tf1/"
43 | assert get_bucket_key_folder(date, channel, country=FRANCE) == key_folder
44 |
45 | def test_get_bucket_key_first_of_the_month_brazil():
46 | first_december = 1733040125
47 | date = pd.to_datetime(first_december, unit='s', utc=True)
48 | channel = "tf1"
49 | key_folder = f"country={BRAZIL.name}/year=2024/month=12/day=1/channel=tf1/"
50 | assert get_bucket_key_folder(date, channel, country=BRAZIL) == key_folder
51 |
52 | def test_get_partition_s3_france_legacy():
53 | assert get_partition_s3(FRANCE) == ['year', 'month', 'day', 'channel']
54 |
55 | def test_get_partition_s3_other_country_than_france():
56 | assert get_partition_s3(GERMANY) == ['country','year', 'month', 'day', 'channel']
57 | assert get_partition_s3(BRAZIL) == ['country','year', 'month', 'day', 'channel']
58 |
59 | # TODO need to mock s3 reads (a hedged sketch is appended at the end of this file)
60 | # def test_read_folder_from_s3():
61 | # first_december = 1733040125
62 | # date = pd.to_datetime(first_december, unit='s', utc=True)
63 | # read_folder_from_s3(date=date, channel="tf1", storage_options=None)
64 |
65 | # assert False == True
66 |
67 | def test_transform_raw_keywords():
68 |     df = pd.read_parquet(path="test/s3/one-day-one-channel.parquet")
69 | df_programs = get_programs()
70 | output = transform_raw_keywords(df, df_programs=df_programs)
71 |
72 | assert len(output) == 31
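73 |
74 | # A minimal sketch for the "mock s3 reads" TODO above (assumption: read_folder_from_s3
75 | # ultimately loads parquet via pandas, so pytest's monkeypatch fixture can stand in for
76 | # S3; names and values below are illustrative only, hence the sketch stays commented out):
77 | #
78 | # def test_read_folder_from_s3(monkeypatch):
79 | #     fake_df = pd.read_parquet("test/s3/one-day-one-channel.parquet")
80 | #     monkeypatch.setattr(pd, "read_parquet", lambda *args, **kwargs: fake_df)
81 | #     date = pd.to_datetime(1733040125, unit='s', utc=True)
82 | #     output = read_folder_from_s3(date=date, channel="tf1", storage_options=None)
83 | #     assert len(output) > 0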
--------------------------------------------------------------------------------
/mockwebsite/lacroix_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 | https://www.la-croix.com/Evasion-Reau-helicoptere-heure-verdict-Redoine-Faid-2023-10-24-13012881582023-10-25T09:49:48+01:00La Croixfr2023-10-24T23:56:04+01:00Evasion par hélicoptère de Rédoine Faïd: le verdict attendu en fin d'après-midiprocès, prison, prisonniers, évasion, assises, 75http://i.la-croix.com/x/2023/10/24/1301288158/Croquis-audience-Redoine-Faid-ouverture-proces-devant-assises-Paris-5-septembre-2023_0.jpgCroquis d'audience de Rédoine Faïd à l'ouverture de son procès devant la cour d'assises de Paris, le 5 septembre 2023
3 | https://www.la-croix.com/international/guerre-israel-hamas-jour-19-attaque-bande-gaza-otages-liban-resume-2023-10-25-12012881672023-10-25T09:36:14+01:00La Croixfr2023-10-25T05:16:56+01:00Guerre Israël-Hamas : Macron à Amman puis au Caire, 80 morts à Gaza selon le Hamasconflit israélo-palestinien, Israël, Hamas, Moyen-Orienthttp://i.la-croix.com/x/2023/10/25/1201288167/camions-daide-humanitaire-attendent-pouvoir-franchir-passage-Rafah-permettant-dacceder-bande-Gaza-Egypte-24-octobre-2023_0.jpgDes camions d’aide humanitaire attendent de pouvoir franchir le passage de Rafah permettant d’accéder à la bande de Gaza, en Égypte, le 24 octobre 2023.
4 | https://www.la-croix.com/debat/Vie-destin-saint-Crepin-2023-10-25-12012881852023-10-25T09:29:28+01:00La Croixfr2023-10-25T09:29:28+01:00Vie et destin de saint CrépinAlain Rémond, Chroniqueshttp://i.la-croix.com/x/2023/10/25/1201288185/Alain-Remond_0.jpgAlain Rémond.
5 | https://www.la-croix.com/Boxe-Naoya-Inoue-defier-Marlon-Tapales-devenir-roi-inconteste-super-coqs-2023-10-25-13012881842023-10-25T09:26:10+01:00La Croixfr2023-10-25T09:26:10+01:00Boxe: Naoya Inoue va défier Marlon Tapales pour devenir le roi incontesté des super-coqsBox, JPN, Inoue, PHI, Tapaleshttp://i.la-croix.com/x/2023/10/25/1301288184/boxeur-japonais-Naoya-Inoue-25-octobre-2023-Yokohama_0.jpgLe boxeur japonais Naoya Inoue, le 25 octobre 2023 à Yokohama
--------------------------------------------------------------------------------
/mockwebsite/midilibre_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | https://www.midilibre.fr/2023/10/24/emmanuel-macron-en-israel-le-president-annonce-que-les-sept-kidnappes-francais-sont-bien-vivantss-11539314.php
5 |
6 |
7 | Midi Libre
8 | fr
9 |
10 | 2023-10-24T10:01:57+02:00
11 | Les neuf "kidnappés Français" par le Hamas sont "bien vivants", annonce Emmanuel Macron en visite en Israël
12 | Attaque du Hamas contre Israël, Emmanuel Macron
13 |
14 |
15 | https://images.midilibre.fr/api/v1/images/view/653760e38756005f7e7a81d9/hd/image.jpg?v=1
16 | Les neuf "kidnappés Français" par le Hamas sont "bien vivants", annonce Emmanuel Macron en visite en Israël
17 |
18 |
19 |
20 | https://www.midilibre.fr/2023/10/24/controle-technique-des-deux-roues-motos-scooters-comment-la-mesure-va-t-elle-etre-mise-en-place-a-partir-de-2024-11539363.php
21 |
22 |
23 | Midi Libre
24 | fr
25 |
26 | 2023-10-24T10:01:03+02:00
27 | Contrôle technique des deux roues : motos, scooters... comment la mesure va-t-elle être mise en place à partir de 2024
28 | Auto-moto
29 |
30 |
31 | https://images.midilibre.fr/api/v1/images/view/6537772054da116cc865b469/hd/image.jpg?v=1
32 | Contrôle technique des deux roues : motos, scooters... comment la mesure va-t-elle être mise en place à partir de 2024
33 |
34 |
35 |
36 | https://www.midilibre.fr/2023/10/24/podcast-comment-les-caves-cooperatives-viticoles-sont-nees-et-quel-avenir-pour-ces-structures-aujourdhui-11532063.php
37 |
38 |
39 | Midi Libre
40 | fr
41 |
42 | 2023-10-24T10:06:02+02:00
43 | PODCAST. Comment les caves coopératives viticoles sont nées et quel avenir pour ces structures aujourd'hui
44 | Podcasts, Viticulture, Aude
45 |
46 |
47 | https://images.midilibre.fr/api/v1/images/view/6530efb5eea84505924071ba/hd/image.jpg?v=1
48 | PODCAST. Comment les caves coopératives viticoles sont nées et quel avenir pour ces structures aujourd'hui
49 |
50 |
51 |
--------------------------------------------------------------------------------
/postgres/insert_data.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | import pandas as pd
5 | from sqlalchemy import DateTime
6 | from sqlalchemy.dialects.postgresql import insert
7 | from sqlalchemy import JSON
8 | from postgres.schemas.models import sitemap_table, Keywords, Stop_Word, keywords_table
9 | from datetime import datetime
10 |
11 | def clean_data(df: pd.DataFrame):
12 | df = df.drop_duplicates(subset="id")
13 | return df.query("id != 'empty'") # TODO improve - should be a None ?
14 |
15 | ## UPSERT
16 | def insert_or_update_on_conflict(table, conn, keys, data_iter):
17 | data = [dict(zip(keys, row)) for row in data_iter]
18 | insert_stmt = insert(table.table).values(data)
19 | # pk for tables
20 | if table.table.name == keywords_table:
21 | pk = ("id", "start") # pk of keywords
22 | else:
23 | pk = ("id",)
24 |
25 | upsert_stmt = insert_stmt.on_conflict_do_update(
26 | index_elements=list(pk),
27 | set_={k: insert_stmt.excluded[k] for k in keys if k not in pk}
28 | )
29 |
30 | return conn.execute(upsert_stmt)
31 |
32 | # upsert on duplicate primary key: despite the function name, the existing row is
33 | # updated with the excluded values - adapted from https://stackoverflow.com/a/69421596/3535853
34 | def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter):
35 | data = [dict(zip(keys, row)) for row in data_iter]
36 |
37 | insert_statement = insert(table.table).values(data)
38 |
39 | on_duplicate_key_stmt = insert_statement.on_conflict_do_update(
40 | constraint=f"{table.table.name}_pkey",
41 | set_={c.key: c for c in insert_statement.excluded},
42 | )
43 |
44 | return conn.execute(on_duplicate_key_stmt)
45 |
46 | def show_sitemaps_dataframe(df: pd.DataFrame):
47 | try:
48 | df_tmp = df.groupby(by="id").size().reset_index(name="count").nlargest(5, "count")
49 | df_final = df_tmp[df_tmp['count'] > 1]
50 | if df_final.empty:
51 | logging.debug("No duplicates detected")
52 | else:
53 | logging.warning("Duplicates to remove : %s out of %s" % (len(df_final), len(df)))
54 | except Exception as err:
55 | logging.warning("Could show sitemap before saving : \n %s \n %s" % (err, df.head(1).to_string()))
56 |
57 |
58 | def save_to_pg(df, table, conn):
59 | number_of_elements = len(df)
60 | logging.info(f"Saving {number_of_elements} elements to PG table '{table}'")
61 |
62 | try:
63 | logging.debug("Schema before saving\n%s", df.dtypes)
64 | if table == keywords_table:
65 | df['updated_at'] = datetime.now()
66 |
67 | df.to_sql(
68 | table,
69 | index=False,
70 | con=conn,
71 | if_exists="append",
72 | chunksize=1000,
73 | method=insert_or_update_on_conflict, # TODO upsert
74 | dtype={"keywords_with_timestamp": JSON, "theme": JSON, "srt": JSON}, # only for keywords
75 | )
76 | logging.info("Saved dataframe to PG")
77 | return len(df)
78 | except Exception as err:
79 | logging.error("Could not save : \n %s" % (err))
80 | raise err
81 |
82 | def insert_data_in_sitemap_table(df: pd.DataFrame, conn):
83 | number_of_rows = len(df)
84 |     if number_of_rows == 0:
85 | logging.warning("0 elements to parse")
86 | else:
87 | logging.info("Received %s elements", number_of_rows)
88 |
89 | show_sitemaps_dataframe(df)
90 |
91 | df = clean_data(df)
92 | save_to_pg(df, sitemap_table, conn)
93 |
94 |
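95 | # For reference, insert_or_update_on_conflict renders roughly as the following
96 | # PostgreSQL (assuming the keywords table with its composite primary key):
97 | #   INSERT INTO keywords (...) VALUES (...)
98 | #   ON CONFLICT (id, start) DO UPDATE SET <col> = EXCLUDED.<col>, ...
99 | # so re-imported rows overwrite their previous values instead of raising a
100 | # duplicate-key error.
101 | #
102 | # Minimal usage sketch (hypothetical dataframe values; real data comes from the
103 | # sitemap ingestion pipeline):
104 | #
105 | # from postgres.schemas.models import connect_to_db
106 | # conn = connect_to_db()
107 | # df = pd.DataFrame([{"id": "a1b2", "url": "https://example.com/article"}])
108 | # insert_data_in_sitemap_table(df, conn)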
--------------------------------------------------------------------------------
/test/mediatree/test_mediatree_queries.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from sqlalchemy import Engine
4 |
5 | from quotaclimat.data_processing.mediatree.stop_word.main import *
6 | from postgres.schemas.models import get_db_session, connect_to_db, drop_tables
7 | from quotaclimat.data_processing.mediatree.api_import_utils.db import *
8 | from postgres.insert_data import save_to_pg
9 | from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db, drop_tables, empty_tables,keywords_table
10 | from datetime import date
11 | from quotaclimat.data_processing.mediatree.update_pg_keywords import *
12 |
13 | conn = connect_to_db()
14 | session = get_db_session(conn)
15 |
16 |
17 |
18 | def test_mediatree_get_last_date_and_number_of_delay_saved_in_keywords():
19 | conn: Engine = connect_to_db()
20 | create_tables(conn)
21 | session = get_db_session(conn)
22 | start = pd.to_datetime("2025-01-26 12:18:54", utc=True).tz_convert('Europe/Paris')
23 | wrong_value = 1
24 | pk = "delete_me"
25 | df = pd.DataFrame([{
26 | "id" : pk,
27 | "start": start,
28 | "plaintext": "test",
29 | "channel_name": "test",
30 | "channel_radio": False,
31 | "theme":[],
32 | "keywords_with_timestamp": [],
33 | "srt": [],
34 | "number_of_keywords": wrong_value, # wrong data to reapply our custom logic for "new_value"
35 | "number_of_changement_climatique_constat": wrong_value,
36 | "number_of_changement_climatique_causes_directes": wrong_value,
37 | "number_of_changement_climatique_consequences": wrong_value,
38 | "number_of_attenuation_climatique_solutions_directes": wrong_value,
39 | "number_of_adaptation_climatique_solutions_directes": wrong_value,
40 | "number_of_ressources": wrong_value,
41 | "number_of_ressources_solutions": wrong_value,
42 | "number_of_biodiversite_concepts_generaux": wrong_value,
43 | "number_of_biodiversite_causes_directes": wrong_value,
44 | "number_of_biodiversite_consequences": wrong_value,
45 | "number_of_biodiversite_solutions_directes" : wrong_value,
46 | "channel_program_type": "to change",
47 | "channel_program":"to change"
48 | ,"program_metadata_id":"336643dc7fa09ac7335a4ceba43270ed3f553be3383a9b3b6e3cced101f2a87a"
49 | ,"channel_title":"channel_title"
50 | ,"number_of_keywords_climat": wrong_value
51 | ,"number_of_keywords_biodiversite": wrong_value
52 | ,"number_of_keywords_ressources": wrong_value
53 | ,"country" :"france"
54 | }])
55 |
56 | save_to_pg(df, keywords_table, conn)
57 |
58 | keywordStats = get_last_date_and_number_of_delay_saved_in_keywords(session, days_filter=3000)
59 | expected_max_date = KeywordLastStats(date(2025, 1, 26), 2)
60 |
61 | assert expected_max_date.last_day_saved == keywordStats.last_day_saved
62 | assert keywordStats.number_of_previous_days_from_yesterday > 1
63 | delete_keywords_id(session, pk)
64 | session.commit()
65 | session.close()
66 |
67 |
68 | def test_get_delay_date():
69 | unixtimestamp_2025_01_26 = 1737849600
70 | expected_max_date = KeywordLastStats(date(2025, 1, 26), 2)
71 | default_start_date, default_number_of_previous_days = get_delay_date(expected_max_date, normal_delay_in_days=1)
72 |
73 | assert default_start_date == unixtimestamp_2025_01_26
74 | assert default_number_of_previous_days == 2
--------------------------------------------------------------------------------
/alembic/env.py:
--------------------------------------------------------------------------------
1 | from logging.config import fileConfig
2 |
3 | from sqlalchemy import create_engine
4 | from postgres.schemas.base import Base
5 | from quotaclimat.data_ingestion.labelstudio.models import TargetBase
6 | from alembic import context
7 |
8 | import re
9 | import os
10 |
11 | # this is the Alembic Config object, which provides
12 | # access to the values within the .ini file in use.
13 | config = context.config
14 |
15 | # Interpret the config file for Python logging.
16 | # This line sets up loggers basically.
17 | if config.config_file_name is not None:
18 | fileConfig(config.config_file_name)
19 |
20 | # add your model's MetaData object here
21 | # for 'autogenerate' support
22 | # from myapp import mymodel
23 | # target_metadata = mymodel.Base.metadata
24 | target_metadata = [Base.metadata, TargetBase.metadata]
25 |
26 | # from https://stackoverflow.com/a/63672522/3535853
27 | # https://alembic.sqlalchemy.org/en/latest/cookbook.html#don-t-generate-any-drop-table-directives-with-autogenerate
28 | def include_object(object, name, type_, reflected, compare_to):
29 | if type_ == "table" and reflected and compare_to is None:
30 | return False
31 | else:
32 | return True
33 |
34 | # other values from the config, defined by the needs of env.py,
35 | # can be acquired:
36 | # my_important_option = config.get_main_option("my_important_option")
37 | # ... etc.
38 |
39 |
40 | def run_migrations_offline() -> None:
41 | """Run migrations in 'offline' mode.
42 |
43 | This configures the context with just a URL
44 | and not an Engine, though an Engine is acceptable
45 | here as well. By skipping the Engine creation
46 | we don't even need a DBAPI to be available.
47 |
48 | Calls to context.execute() here emit the given string to the
49 | script output.
50 |
51 | """
52 | url = config.get_main_option("sqlalchemy.url")
53 | context.configure(
54 | url=url,
55 | target_metadata=target_metadata,
56 | literal_binds=True,
57 | dialect_opts={"paramstyle": "named"},
58 | include_object=include_object
59 | )
60 |
61 | with context.begin_transaction():
62 | context.run_migrations()
63 |
64 |
65 | def run_migrations_online() -> None:
66 | """Run migrations in 'online' mode.
67 |
68 | In this scenario we need to create an Engine
69 | and associate a connection with the context.
70 |
71 | """
72 | url_tokens = {
73 | "POSTGRES_USER": os.getenv("POSTGRES_USER",""),
74 | "POSTGRES_DB": os.getenv("POSTGRES_DB",""),
75 | "POSTGRES_PASSWORD": os.getenv("POSTGRES_PASSWORD",""),
76 | "POSTGRES_HOST": os.getenv("POSTGRES_HOST",""),
77 | "POSTGRES_PORT": os.getenv("POSTGRES_PORT","")
78 | }
79 |
80 | url = config.get_main_option("sqlalchemy.url")
81 |
82 |     url = re.sub(r"\${(.+?)}", lambda m: url_tokens[m.group(1)], url)  # expand ${VAR} tokens from alembic.ini using environment variables
83 |
84 | connectable = create_engine(url)
85 |
86 | with connectable.connect() as connection:
87 | context.configure(
88 | connection=connection,
89 | target_metadata=target_metadata,
90 | compare_type=True,
91 | compare_server_default=True,
92 | include_object=include_object
93 | )
94 |
95 | with context.begin_transaction():
96 | context.run_migrations()
97 |
98 | if context.is_offline_mode():
99 | run_migrations_offline()
100 | else:
101 | run_migrations_online()
102 |
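103 | # Example of the ${VAR} expansion in run_migrations_online(), assuming hypothetical
104 | # values POSTGRES_USER=qc, POSTGRES_PASSWORD=secret, POSTGRES_HOST=localhost,
105 | # POSTGRES_PORT=5432, POSTGRES_DB=quotaclimat:
106 | #   postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}
107 | #   becomes postgresql://qc:secret@localhost:5432/quotaclimat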
--------------------------------------------------------------------------------
/analyse/mediatree/test_program_durations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "fa23a75a",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "import matplotlib.pyplot as plt"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "id": "ce7a2095",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "df = pd.read_csv(\"data/mediatree_channel_coverages_2025-12-15\")\n",
22 | "df.date = pd.to_datetime(df.date)"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "44b06fa4",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "df.date.max().strftime(\"%d %b\")"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "a36a6874",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": []
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "id": "e638c622",
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "import plotly.express as px\n",
51 | "import os\n",
52 | "\n",
53 | "for country, group in df.groupby(\"country\"):\n",
54 | " start_date = group.date.min().strftime(\"%d %B\")\n",
55 | " end_date = group.date.max().strftime(\"%d %B\")\n",
56 | " fig = px.line(group, x=\"date\", y=\"coverage\", color='channel_name', title=f\"{country.title()}: {start_date} - {end_date}\")\n",
57 | " os.makedirs(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}\", exist_ok=True)\n",
58 | " fig.write_image(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}/coverage_{country}_chains_{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}.png\")\n",
59 | " fig.show()\n"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "id": "45d55028",
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "for country, group in df.groupby(\"country\"):\n",
70 | " start_date = group.date.min().strftime(\"%d %B\")\n",
71 | " end_date = group.date.max().strftime(\"%d %B\")\n",
72 | " df_mean = group.groupby(\"date\").agg({\"coverage\": \"mean\"})\n",
73 | " fig = px.line(df_mean, y=\"coverage\", title=f\"Mean Coverage {country.title()}: {start_date} - {end_date}\")\n",
74 | " fig.write_image(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}/coverage_{country}_mean_{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}.png\")\n",
75 | " fig.show()"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "id": "727893ea",
82 | "metadata": {},
83 | "outputs": [],
84 | "source": []
85 | }
86 | ],
87 | "metadata": {
88 | "kernelspec": {
89 | "display_name": ".venv",
90 | "language": "python",
91 | "name": "python3"
92 | },
93 | "language_info": {
94 | "codemirror_mode": {
95 | "name": "ipython",
96 | "version": 3
97 | },
98 | "file_extension": ".py",
99 | "mimetype": "text/x-python",
100 | "name": "python",
101 | "nbconvert_exporter": "python",
102 | "pygments_lexer": "ipython3",
103 | "version": "3.11.6"
104 | }
105 | },
106 | "nbformat": 4,
107 | "nbformat_minor": 5
108 | }
109 |
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/time_monitored/models.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column, DateTime, String, Text, Boolean, ARRAY, JSON, Integer, Table, MetaData, ForeignKey, PrimaryKeyConstraint
5 | from sqlalchemy.orm import declarative_base, sessionmaker, relationship
6 | from sqlalchemy.exc import SQLAlchemyError
7 | from sqlalchemy.dialects.postgresql import insert
8 | import pandas as pd
9 | from sqlalchemy import text
10 | from postgres.database_connection import connect_to_db, get_db_session
11 | from postgres.schemas.base import Base
12 | from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS
13 | from quotaclimat.data_processing.mediatree.i8n.country import FRANCE
14 | from quotaclimat.data_ingestion.scrap_sitemap import get_consistent_hash
15 | import os
16 | import json
17 | from json import JSONDecodeError
18 |
19 |
20 | import traceback
21 |
22 | # The duration in minutes of media monitoring based on number of chunks of 2 minutes saved in S3
23 | class Time_Monitored(Base):
24 | __tablename__ = "time_monitored"
25 | id = Column(Text, primary_key=True)
26 | channel_name = Column(String, nullable=False)
27 | start = Column(DateTime(), nullable=False)
28 |     duration_minutes = Column(Integer)
29 | country = Column(String, nullable=False)
30 |
31 | def get_time_monitored(id: str):
32 | session = get_db_session()
33 | return session.get(Time_Monitored, id)
34 |
35 | # count how many 2-minute chunks are in the dataframe and save the total to the time_monitored table in PostgreSQL
36 | def save_time_monitored(number_of_rows: int, day: datetime, channel: str, country: str, session=None):
37 | """
38 | Save the number of rows (chunk) to the time_monitor table in PostgreSQL.
39 |
40 | Args:
41 |         number_of_rows (int): The number of rows (2-minute chunks) to save.
42 | day (datetime): The date of the monitoring.
43 | channel (str): The name of the channel.
44 | country (str): The country name.
45 | """
46 | try:
47 | duration_minutes = number_of_rows * 2 # 2 minutes per chunk
48 | logging.info(f"Saving time monitored of {duration_minutes} minutes ({number_of_rows} chunks of 2 minutes) for {day} - {channel} - {country}")
49 | max_hours = 23
50 | if duration_minutes / 60 > max_hours:
51 | logging.error(f"Duration of {duration_minutes / 60} hours is above {max_hours} hours. Please check the data.")
52 |
53 | if session is None:
54 | session = get_db_session()
55 |
56 | stmt = insert(Time_Monitored).values(
57 | id=get_consistent_hash(f"{channel}_{day}_{country}"),
58 | channel_name=channel,
59 | start=day,
60 | duration_minutes=duration_minutes,
61 | country=country
62 | )
63 | # upsert
64 | stmt = stmt.on_conflict_do_update(
65 | index_elements=['id'], # Use the 'id' column as the conflict target
66 | set_={
67 | 'channel_name': stmt.excluded.channel_name,
68 | 'start': stmt.excluded.start,
69 | 'duration_minutes': stmt.excluded.duration_minutes,
70 | 'country': stmt.excluded.country
71 | }
72 | )
73 |
74 | # Execute the statement
75 | session.execute(stmt)
76 |
77 | session.commit()
78 | logging.info("Saved time monitored")
79 | except SQLAlchemyError as e:
80 | logging.error(f"Error saving time monitored data: {e}")
81 | logging.error(traceback.format_exc())
82 | finally:
83 | session.close()
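84 |
85 | # Example of the chunk accounting above (hypothetical values): 660 two-minute chunks
86 | # give 660 * 2 = 1320 minutes (22 hours), under the 23-hour sanity check, while 720
87 | # chunks (24 hours) would trigger the error log:
88 | # save_time_monitored(number_of_rows=660, day=datetime(2025, 1, 26), channel="tf1", country="france")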
--------------------------------------------------------------------------------
/test/sitemap/test_keywords.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import pandas as pd
4 | from quotaclimat.data_processing.mediatree.utils import *
5 | from quotaclimat.data_processing.mediatree.detect_keywords import *
6 | from quotaclimat.data_processing.mediatree.keyword.stop_words import STOP_WORDS
7 |
8 | def test_get_remove_stopwords_recycler():
9 | stop_words_list = [
10 | "recycler"
11 | ]
12 | ad = "nous les recycler pour en faire de nouvelles en fabriquant nous-mêmes du plastique recyclé pour cela nous avons créé trois usines exclusivement dédié au recyclage dès cette année cristallines est capable de recycler autant de bouteilles"
13 |
14 | assert remove_stopwords(ad, stop_words_list) == "nous les pour en faire de nouvelles en fabriquant nous-mêmes du plastique recyclé pour cela nous avons créé trois usines exclusivement dédié au recyclage dès cette année cristallines est capable de autant de bouteilles"
15 |
16 | def test_get_remove_stopwords_no_modification():
17 | stop_words_list = [
18 | "recycler"
19 | ]
20 | ad = "no keywords"
21 |
22 | assert remove_stopwords(ad, stop_words_list) == ad
23 |
24 | def test_remove_stopwords_huile():
25 | stop_words_list = [
26 | "recycler",
27 | "huile de coude était aussi une énergie renouvelable",
28 | "est à fond sur le tri sélectif"
29 | ]
30 | assert remove_stopwords("l' huile de coude était aussi une énergie renouvelable stéphane est à fond sur le tri sélectif",stop_words_list) \
31 | == "l' stéphane "
32 |
33 |
34 | def test_remove_stopwords_energie():
35 | plaintext = "quand le prix de l' énergie augmente il y a ceux qui se couvre plus ceux qui sortent moins et il y a ceux qui choisissent d' optimiser leurs énergies panneaux solaires isolations thermique pompes à chaleur chaque jour fleuron industrie parcourt la france pour vous aider à optimiser votre énergie florent industries point com en ce moment la centrale photovoltaïque de trois kilowatts et deux mille cinq cents euros et oui deux deux mille cinq cents euros cents dépêchez euros vous dépêchez vous de réserver votre kit sur fleuron industries point com la rénovation énergétique avec ici pour changer de maison sans changer de maison isolation chauffage solaire plus de confort et d' économie avec ici pas à mal casser pas mal vous avez fait une toute la pâte à modeler la je fais comment une tartine de pâte à modeler sans pâte à modeler c' est pas interdit ça s' appelle dupin juste merci pour le partage le jour où vous aimerez la pâte"
36 | output = remove_stopwords(plaintext,STOP_WORDS)
37 |     # after stop word removal, the output should no longer contain these terms
38 | assert "photovoltaïque" not in output
39 | assert "rénovation énergetique" not in output
40 | assert "chauffage" not in output
41 |
42 | def test_remove_stopwords_fleuron():
43 | plaintext = "chaque jour fleuron industrie parcourt"
44 | output = remove_stopwords(plaintext,STOP_WORDS)
45 |     # the whole ad sentence is a stop word, so nothing remains
46 | assert output == ""
47 |
48 | def test_remove_stopwords_photovoltaique():
49 | plaintext = "point com en ce moment la centrale photovoltaïque de trois kilowatt et à deux m"
50 | output = remove_stopwords(plaintext,STOP_WORDS)
51 |     # after stop word removal, the output should no longer contain photovoltaïque
52 | assert "photovoltaïque" not in output
53 | assert len(output) == 0
54 |
55 |
56 | def test_replace_word_with_context_unk():
57 | plaintext=" quand le prix de l' énergie augmente il y a ceux qui se couvren"
58 | output = replace_word_with_context(text=plaintext, word=" ", length_to_remove=0)
59 | assert output == "quand le prix de l' énergie augmente il y a ceux qui se couvren"
60 |
--------------------------------------------------------------------------------
/mockwebsite/franceinter_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 | https://www.radiofrance.fr/franceinter/attentat-de-l-opera-en-2018-le-meilleur-ami-du-terroriste-dans-le-box-des-accuses-7790800France Interfr2023-10-25T04:22:39+00:00Attentat de l'Opéra en 2018 : le meilleur ami du terroriste dans le box des accusésJustice, Attentats en France, Djihadisme, Terrorisme, Société, https://www.radiofrance.fr/s3/cruiser-production/2023/10/609eccc2-ca90-4694-a42e-9175f318a68a/1200x680_sc_maxnewsworldfour522282.jpgUne personne est décédée, quatre autres ont été blessées, lors de l'attaque dans le quartier Opéra de Paris en mai 2018. - Nicolas Joubert
3 | https://www.radiofrance.fr/franceinter/bronchiolite-par-manque-de-traitements-des-maternites-obligees-de-trier-les-bebes-eligibles-au-beyfortus-6279386France Interfr2023-10-25T04:16:27+00:00Bronchiolite : par manque de traitements, des maternités obligées de trier les bébés éligibles au BeyfortusSanté, Maternité, Enfance, Sociétéhttps://www.radiofrance.fr/s3/cruiser-production/2023/10/06db6c5f-d163-4f1d-af2a-f44283e87ff4/1200x680_sc_080-hl-amorcillo-2084694.jpgLes bébés peuvent bénéficier d'un traitement, le Beyfortus, permettant d'éviter les formes graves de la bronchiolite. - Aline Morcillo
4 | https://www.radiofrance.fr/franceinter/sur-tik-tok-des-influenceurs-soutirent-des-milliers-d-euros-a-leurs-abonnes-pour-des-cadeaux-virtuels-9624456France Interfr2023-10-25T04:12:31+00:00Sur TikTok, des influenceurs soutirent des milliers d'euros à leurs abonnés pour des cadeaux virtuelsTech – Web, Applications mobiles, Société, https://www.radiofrance.fr/s3/cruiser-production/2023/10/3f085f10-3a39-43cd-82ad-617fc92b5e3c/1200x680_sc_illustration-tiktok.jpgCapture d'écran d'un "live" TikTok, au cours duquel sont proposés des cadeaux virtuels - Xavier Demagny
5 | https://www.radiofrance.fr/franceinter/feminisation-attractivite-et-creativite-six-choses-a-savoir-sur-l-industrie-francaise-du-jeu-video-7091011France Interfr2023-10-24T15:50:56+00:00Féminisation, attractivité et créativité : six choses à savoir sur l'industrie française du jeu vidéo en 2023Entreprises – Marchés, Jeux vidéo, Économie, Arts et Divertissementhttps://www.radiofrance.fr/s3/cruiser-production/2023/10/9c284c6e-8797-47c7-b69a-2cab32e9917a/1200x680_sc_maxnewsfrfive059827.jpgStand d'Ubisoft, un des poids lourds du jeu vidéo français, lors de la Paris Games Week 2022 - Bruno Levesque / IP3
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/spain/channel_program.py:
--------------------------------------------------------------------------------
1 | channels_programs_spain = [
2 | {"channel_name": "antenna-3", "start": "06:15", "end": "08:50", "weekday": "weekday", "program_name": "Noticia de la manana", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
3 | {"channel_name": "antenna-3", "start": "15:00", "end": "15:30", "weekday": "weekday", "program_name": "Noticias 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
4 | {"channel_name": "antenna-3", "start": "21:00", "end": "21:30", "weekday": "weekday", "program_name": "Noticias", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
5 |
6 | {"channel_name": "rtve-la-1", "start": "06:00", "end": "06:30", "weekday": "weekday", "program_name": "Telediaro 06:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
7 | {"channel_name": "rtve-la-1", "start": "15:00", "end": "15:40", "weekday": "weekday", "program_name": "Telediaro 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
8 | {"channel_name": "rtve-la-1", "start": "21:00", "end": "21:30", "weekday": "weekday", "program_name": "Telediaro", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
9 | {"channel_name": "rtve-la-1", "start": "15:00", "end": "15:40", "weekday": "weekend", "program_name": "Telediaro fin de semana", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
10 |
11 | {"channel_name": "rtve-24h", "start": "14:00", "end": "14:45", "weekday": "*", "program_name": "Information 24 horas 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
12 | {"channel_name": "rtve-24h", "start": "20:00", "end": "20:45", "weekday": "*", "program_name": "Information 24 horas", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
13 |
14 | {"channel_name": "lasexta-news", "start": "11:00", "end": "15:00", "weekday": "weekday", "program_name": "Al Rojo Vivo", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
15 | {"channel_name": "lasexta-news", "start": "14:00", "end": "14:45", "weekday": "*", "program_name": "La Sexta Noticias 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
16 | {"channel_name": "lasexta-news", "start": "20:00", "end": "20:45", "weekday": "*", "program_name": "La Sexta Noticias", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
17 |
18 | {"channel_name": "telecinco-news", "start": "07:00", "end": "09:00", "weekday": "weekday", "program_name": "El Matinal 07:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
19 | {"channel_name": "telecinco-news", "start": "15:00", "end": "15:30", "weekday": "weekday", "program_name": "El Matinal 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
20 | {"channel_name": "telecinco-news", "start": "21:00", "end": "21:40", "weekday": "weekday", "program_name": "El Matinal", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
21 |
22 | {"channel_name": "cuatro-news", "start": "14:00", "end": "14:55", "weekday": "weekday", "program_name": "Noticias Cuatro 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
23 | {"channel_name": "cuatro-news", "start": "20:00", "end": "20:40", "weekday": "weekday", "program_name": "Noticias Cuatro", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
24 | {"channel_name": "cuatro-news", "start": "10:30", "end": "14:00", "weekday": "weekday", "program_name": "En Boca de Todos", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",},
25 |
26 | ]
27 |
--------------------------------------------------------------------------------
/alembic/versions/a578d21d7aee_add_tables_labelstudio.py:
--------------------------------------------------------------------------------
1 | """Add tables labelstudio
2 |
3 | Revision ID: a578d21d7aee
4 | Revises: 44f13b7eebd4
5 | Create Date: 2025-10-09 14:18:14.410103
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 |
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = 'a578d21d7aee'
16 | down_revision: Union[str, None] = '44f13b7eebd4'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 | # ### commands auto generated by Alembic - please adjust! ###
23 | op.create_table('labelstudio_task_aggregate',
24 | sa.Column('task_aggregate_id', sa.String(), nullable=False),
25 | sa.Column('id', sa.Integer(), nullable=False),
26 | sa.Column('data', sa.JSON(), nullable=False),
27 | sa.Column('created_at', sa.DateTime(), nullable=False),
28 | sa.Column('updated_at', sa.DateTime(), nullable=False),
29 | sa.Column('is_labeled', sa.Boolean(), nullable=False),
30 | sa.Column('project_id', sa.Integer(), nullable=True),
31 | sa.Column('meta', sa.JSON(), nullable=True),
32 | sa.Column('overlap', sa.Integer(), nullable=False),
33 | sa.Column('file_upload_id', sa.Integer(), nullable=True),
34 | sa.Column('updated_by_id', sa.Integer(), nullable=True),
35 | sa.Column('inner_id', sa.BigInteger(), nullable=True),
36 | sa.Column('total_annotations', sa.Integer(), nullable=False),
37 | sa.Column('cancelled_annotations', sa.Integer(), nullable=False),
38 | sa.Column('total_predictions', sa.Integer(), nullable=False),
39 | sa.Column('comment_count', sa.Integer(), nullable=False),
40 | sa.Column('last_comment_updated_at', sa.DateTime(), nullable=True),
41 | sa.Column('unresolved_comment_count', sa.Integer(), nullable=False),
42 | sa.Column('country', sa.String(), nullable=False),
43 | sa.PrimaryKeyConstraint('task_aggregate_id')
44 | )
45 | op.create_table('labelstudio_task_completion_aggregate',
46 | sa.Column('task_completion_aggregate_id', sa.String(), nullable=False),
47 | sa.Column('task_aggregate_id', sa.String(), nullable=False),
48 | sa.Column('id', sa.Integer(), nullable=False),
49 | sa.Column('result', sa.JSON(), nullable=True),
50 | sa.Column('was_cancelled', sa.Boolean(), nullable=False),
51 | sa.Column('ground_truth', sa.Boolean(), nullable=False),
52 | sa.Column('created_at', sa.DateTime(), nullable=False),
53 | sa.Column('updated_at', sa.DateTime(), nullable=False),
54 | sa.Column('task_id', sa.Integer(), nullable=True),
55 | sa.Column('prediction', sa.JSON(), nullable=True),
56 | sa.Column('lead_time', sa.Double(), nullable=True),
57 | sa.Column('result_count', sa.Integer(), nullable=False),
58 | sa.Column('completed_by_id', sa.Integer(), nullable=True),
59 | sa.Column('parent_prediction_id', sa.Integer(), nullable=True),
60 | sa.Column('parent_annotation_id', sa.Integer(), nullable=True),
61 | sa.Column('last_action', sa.Text(), nullable=True),
62 | sa.Column('last_created_by_id', sa.Integer(), nullable=True),
63 | sa.Column('project_id', sa.Integer(), nullable=True),
64 | sa.Column('updated_by_id', sa.Integer(), nullable=True),
65 | sa.Column('unique_id', sa.Uuid(), nullable=True),
66 | sa.Column('draft_created_at', sa.DateTime(), nullable=True),
67 | sa.Column('import_id', sa.BigInteger(), nullable=True),
68 | sa.Column('bulk_created', sa.Boolean(), nullable=True),
69 | sa.Column('country', sa.String(), nullable=False),
70 | sa.ForeignKeyConstraint(['task_aggregate_id'], ['labelstudio_task_aggregate.task_aggregate_id'], ),
71 | sa.PrimaryKeyConstraint('task_completion_aggregate_id')
72 | )
73 | # ### end Alembic commands ###
74 |
75 |
76 | def downgrade() -> None:
77 | # ### commands auto generated by Alembic - please adjust! ###
78 | op.drop_table('labelstudio_task_completion_aggregate')
79 | op.drop_table('labelstudio_task_aggregate')
80 | # ### end Alembic commands ###
81 |
--------------------------------------------------------------------------------
/my_dbt_project/models/dashboards/thematic_query_ocean.sql:
--------------------------------------------------------------------------------
1 | {{ config(
2 | materialized='incremental'
3 | ,unique_key=['id']
4 | )
5 | }}
6 |
7 | with clean_keywords AS (
8 | SELECT
9 | "public"."keywords"."id" AS "id",
10 | json_array_elements(
11 | "public"."keywords"."keywords_with_timestamp" :: json
12 | ) AS kw
13 | FROM
14 | "public"."keywords"
15 | WHERE
16 | "public"."keywords"."start" >= '2025-01-01'
17 | AND "public"."keywords"."number_of_keywords" > 0
18 | AND "public"."keywords"."country" = 'france'
19 | AND "public"."keywords"."channel_title" <> 'C8'
20 | ),
21 |
22 | filtered_keywords AS (
23 | SELECT
24 | *
25 | FROM clean_keywords
26 | INNER JOIN "public"."dictionary"
27 | ON "public"."dictionary"."keyword" = clean_keywords.kw ->> 'keyword'
28 | AND "public"."dictionary"."theme" LIKE clean_keywords.kw ->> 'theme' || '%' -- ensure matc with indirect theme inside the dictionary table
29 | WHERE
30 | "public"."dictionary"."keyword" IN (
31 | 'acidification des océans',
32 | 'acidification des oceans',
33 | 'algues vertes',
34 | 'aménagement résilient',
35 | 'chalut',
36 | 'chalutage',
37 | 'chalutier',
38 | 'conservation marine',
39 | 'deep sea mining',
40 | 'dessalement de l’eau de mer',
41 | 'élévation du niveau de la mer',
42 | 'élévation du niveau des océans',
43 | 'érosion des côtes',
44 | 'érosion du littoral',
45 | 'exploitation fonds marins',
46 | 'exploitation gazière',
47 | 'exploitation pétrolière',
48 | 'filets de pêche',
49 | 'filets maillants',
50 | 'gestion du littoral',
51 | 'halieutique',
52 | 'hausse du niveau de la mer',
53 | 'hausse du niveau des océans',
54 | 'industrie de la pêche',
55 | 'journée mondiale des océans',
56 | 'limiter l’érosion des côtes',
57 | 'littoral',
58 | 'macro déchet plastique',
59 | 'mer',
60 | 'micro déchet plastique',
61 | 'montée du niveau de la mer',
62 | 'montée du niveau des océans',
63 | 'nano plastique',
64 | 'océan',
65 | 'océanographe',
66 | 'palangre',
67 | 'parc naturel marin',
68 | 'pêche artisanale',
69 | 'pêche au large',
70 | 'pêche côtière',
71 | 'pêche durable',
72 | 'pêche industrielle',
73 | 'pêche professionnelle',
74 | 'pêche responsable',
75 | 'pêcheur',
76 | 'petite pêche',
77 | 'plan de prévention des risques littoraux',
78 | 'pollution de la mer',
79 | 'protection des côtes',
80 | 'protection des océans',
81 | 'quota de pêche',
82 | 'réchauffement des océans',
83 | 'recul du trait de côte',
84 | 'septième continent',
85 | 'stress thermique',
86 | 'système de drainage',
87 | 'surpêche',
88 | 'the metals company',
89 | 'zone marine protégée',
90 | 'zone maritime'
91 | )
92 | ),
93 |
94 | distinct_kw AS (
95 | SELECT
96 | DISTINCT(id) AS "distinct_id"
97 | FROM
98 | filtered_keywords
99 | )
100 |
101 | SELECT
102 | "public"."keywords"."id",
103 | "public"."keywords"."start",
104 | "public"."keywords"."channel_title",
105 | "public"."keywords"."plaintext",
106 | "public"."keywords"."number_of_keywords",
107 | "public"."keywords"."keywords_with_timestamp",
108 | "public"."keywords"."country",
109 | "public"."keywords"."channel_name"
110 | FROM
111 | "public"."keywords"
112 | INNER JOIN distinct_kw ON distinct_kw.distinct_id = "public"."keywords".id
113 | WHERE
114 | "public"."keywords"."start" >= '2025-01-01'
115 | AND "public"."keywords"."number_of_keywords" > 0
116 | AND "public"."keywords"."country" = 'france'
117 | AND "public"."keywords"."channel_title" <> 'C8'
118 | AND "public"."keywords"."channel_title" IS NOT NULL
119 | AND "public"."keywords"."channel_title" <> ''
--------------------------------------------------------------------------------
/mockwebsite/republiquepyrenees_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 | https://www.larepubliquedespyrenees.fr/pyrenees-atlantiques/pontiacq-viellepinte/pontiacq-lamayou-c-est-parti-pour-le-36e-tournoi-de-pala-17196604.php2023-10-25T11:00:49+02:00https://images.larepubliquedespyrenees.fr/17196604/1200x-1/morlaasvic-bilh-0e06cc7df3274c2190c3d751fbe2f787-151648-ph0.jpgLe premier match du tournoi a vu la victoire de Sébastien Pina et Fabrice Lajus contre Romain Tillet et Maxime Delas.La République des Pyrénéesfr2023-10-25T11:00:49+02:00Pontiacq-Lamayou : c’est parti pour le 36e tournoi de pala !
3 | https://www.larepubliquedespyrenees.fr/pyrenees-atlantiques/vallee-d-aspe/vallee-d-aspe-des-changements-au-1er-novembre-pour-le-transport-a-la-demande-17196907.php2023-10-25T10:55:33+02:00https://images.larepubliquedespyrenees.fr/17196907/1200x-1/oloronvalleesbearnaises-6b1cd659e6db43dda3cc25d5e4b7efaa-154147-ph0.jpgLes panneaux signalétiques jaune et blanc ont fleuri dans chaque commune.La République des Pyrénéesfr2023-10-25T10:55:33+02:00Vallée d’Aspe : des changements au 1er novembre pour le transport à la demande
4 | https://www.larepubliquedespyrenees.fr/sport/equitation/le-concours-5-etoiles-de-pau-devient-un-evenement-familial-17147228.php2023-10-25T10:55:01+02:00https://images.larepubliquedespyrenees.fr/17147228/1200x-1/rep-10211-hd141476.jpgL’an dernier, 40 000 personnes ont assisté au concours.La République des Pyrénéesfr2023-10-25T10:55:01+02:00Le concours 5 étoiles de Pau devient un événement familial
5 | https://www.larepubliquedespyrenees.fr/societe/afp/evasion-par-helicoptere-de-redoine-faid-le-verdict-attendu-en-fin-d-apres-midi-17205823.php2023-10-25T10:49:48+02:00https://images.larepubliquedespyrenees.fr/17205823/1200x-1/pp-6538d860a43f5e284d9c2bef-ph0.jpgCroquis d'audience de Rédoine Faïd à l'ouverture de son procès devant la cour d'assises de Paris, le 5 septembre 2023La République des Pyrénéesfr2023-10-25T10:49:48+02:00Evasion par hélicoptère de Rédoine Faïd: le verdict attendu en fin d'après-midi
6 | https://www.larepubliquedespyrenees.fr/culture-et-loisirs/pyrenees-gaming-notre-jeu-du-mois-assassin-s-creed-mirage-un-retour-aux-sources-de-la-saga-17170841.php2023-10-25T10:49:27+02:00https://images.larepubliquedespyrenees.fr/17170841/1200x-1/lcl3ybzh.jpg« Assassin’s Creed Mirage » a été développé par Ubisoft Bordeaux.La République des Pyrénéesfr2023-10-25T10:49:27+02:00▶️ Pyrénées Gaming. Notre jeu du mois : « Assassin’s Creed Mirage », « un retour aux sources, de la saga »
--------------------------------------------------------------------------------
/mockwebsite/liberation_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 | https://www.liberation.fr/international/moyen-orient/en-direct-guerre-hamas-israel-otages-liberees-macron-a-tel-aviv-bombardements-sur-gaza-crainte-dun-embrasement-regional-aide-humanitaire-retrouvez-toutes-les-informations-de-ce-mardi-24-octobre-20231024_6DU6EBVRLZAELLAYU47IAHF6Z4/2023-10-24T08:27:52.306Zalways0.5Libérationfr2023-10-24T08:27:52.306Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/SKQApBHpBaSJVcpqIDj1h4O-sfU=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/5RDM4TAUGFEZPIHYWG3CVDUR7Y.jpg
3 | https://www.liberation.fr/politique/elections/le-gros-bobard-de-jean-philippe-tanguy-sur-le-gud-ennemi-historique-du-rassemblement-national-20231023_EWA5NEN4QFEUXHLIN74PSPEDJ4/2023-10-23T15:15:23.928Zalways0.5Libérationfr2023-10-23T15:15:23.928Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/3hZXTi8Ccr2O3s6zyYqk8-Us3Qw=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/AREAUQIPLZCWFKB6HUTFE7VQ24.jpg
4 | https://www.liberation.fr/societe/police-justice/chateau-de-versailles-un-lanceur-de-fausse-alerte-condamne-a-huit-mois-de-prison-avec-sursis-20231023_F2KK3TWLVVGSDAJOPW4KM6OCZQ/2023-10-23T17:16:09.315Zalways0.5Libérationfr2023-10-23T17:16:09.315Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/cJtbpHCkwdNZbFOVGCSkmRz9FUs=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/DNYUPQGQ2JE2NEWLG4UQLCIYAY.jpg
5 | https://www.liberation.fr/international/europe/plusieurs-disparus-apres-une-collision-entre-deux-cargos-en-mer-du-nord-20231024_325S36NYBRGRLJ7LUAYQ2K5TKQ/2023-10-24T07:57:03.897Zalways0.5Libérationfr2023-10-24T07:57:03.897Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/nIoB0Sv-h1lexX5KgABQaf4px5Y=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/KIBODBZQNREILHF6YWE7KFF4Z4.jpg
--------------------------------------------------------------------------------
/alembic.ini:
--------------------------------------------------------------------------------
1 | # A generic, single database configuration.
2 |
3 | [alembic]
4 | # path to migration scripts
5 | script_location = alembic
6 |
7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
8 | # Uncomment the line below if you want the files to be prepended with date and time
9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
10 | # for all available tokens
11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
12 |
13 | # sys.path path, will be prepended to sys.path if present.
14 | # defaults to the current working directory.
15 | prepend_sys_path = .
16 |
17 | # timezone to use when rendering the date within the migration file
18 | # as well as the filename.
19 | # If specified, requires the python>=3.9 or backports.zoneinfo library.
20 | # Any required deps can installed by adding `alembic[tz]` to the pip requirements
21 | # string value is passed to ZoneInfo()
22 | # leave blank for localtime
23 | # timezone =
24 |
25 | # max length of characters to apply to the
26 | # "slug" field
27 | # truncate_slug_length = 40
28 |
29 | # set to 'true' to run the environment during
30 | # the 'revision' command, regardless of autogenerate
31 | # revision_environment = false
32 |
33 | # set to 'true' to allow .pyc and .pyo files without
34 | # a source .py file to be detected as revisions in the
35 | # versions/ directory
36 | # sourceless = false
37 |
38 | # version location specification; This defaults
39 | # to alembic/versions. When using multiple version
40 | # directories, initial revisions must be specified with --version-path.
41 | # The path separator used here should be the separator specified by "version_path_separator" below.
42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
43 |
44 | # version path separator; As mentioned above, this is the character used to split
45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
47 | # Valid values for version_path_separator are:
48 | #
49 | # version_path_separator = :
50 | # version_path_separator = ;
51 | # version_path_separator = space
52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
53 |
54 | # set to 'true' to search source files recursively
55 | # in each "version_locations" directory
56 | # new in Alembic version 1.10
57 | # recursive_version_locations = false
58 |
59 | # the output encoding used when revision files
60 | # are written from script.py.mako
61 | # output_encoding = utf-8
62 | sqlalchemy.url = postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}
63 |
64 | [post_write_hooks]
65 | # post_write_hooks defines scripts or Python functions that are run
66 | # on newly generated revision scripts. See the documentation for further
67 | # detail and examples
68 |
69 | # format using "black" - use the console_scripts runner, against the "black" entrypoint
70 | # hooks = black
71 | # black.type = console_scripts
72 | # black.entrypoint = black
73 | # black.options = -l 79 REVISION_SCRIPT_FILENAME
74 |
75 | # lint with attempts to fix using "ruff" - use the exec runner, execute a binary
76 | # hooks = ruff
77 | # ruff.type = exec
78 | # ruff.executable = %(here)s/.venv/bin/ruff
79 | # ruff.options = --fix REVISION_SCRIPT_FILENAME
80 |
81 | # Logging configuration
82 | [loggers]
83 | keys = root,sqlalchemy,alembic
84 |
85 | [handlers]
86 | keys = console
87 |
88 | [formatters]
89 | keys = generic
90 |
91 | [logger_root]
92 | level = WARN
93 | handlers = console
94 | qualname =
95 |
96 | [logger_sqlalchemy]
97 | level = WARN
98 | handlers =
99 | qualname = sqlalchemy.engine
100 |
101 | [logger_alembic]
102 | level = INFO
103 | handlers =
104 | qualname = alembic
105 |
106 | [handler_console]
107 | class = StreamHandler
108 | args = (sys.stderr,)
109 | level = NOTSET
110 | formatter = generic
111 |
112 | [formatter_generic]
113 | format = %(levelname)-5.5s [%(name)s] %(message)s
114 | datefmt = %H:%M:%S
115 |
--------------------------------------------------------------------------------
/quotaclimat/data_processing/mediatree/i8n/germany/channel_program.py:
--------------------------------------------------------------------------------
1 | channels_programs_germany = [
2 | {"channel_name": "daserste", "start": "05:30", "end": "09:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "ZDF-Morgenmagazin", "program_type": "Information - Magazine"},
3 | {"channel_name": "daserste", "start": "12:00", "end": "14:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Mittagsmagazin", "program_type": "Information - Magazine"},
4 | {"channel_name": "daserste", "start": "17:00", "end": "18:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Tagesschau", "program_type": "Information - Journal"},
5 | {"channel_name": "daserste", "start": "19:30", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Prime Time", "program_type": "Entertainment - Various"},
6 | {"channel_name": "daserste", "start": "21:45", "end": "00:00", "weekday": "6", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Sunday Special", "program_type": "Information - Magazine"},
7 |
8 | {"channel_name": "zdf-neo", "start": "00:00", "end": "01:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Nighttime Programming", "program_type": "Entertainment - Talk Show"},
9 | {"channel_name": "zdf-neo", "start": "05:30", "end": "11:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "ZDF-Morgenmagazin", "program_type": "Information - Journal"},
10 | {"channel_name": "zdf-neo", "start": "12:00", "end": "14:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Mittagsmagazin", "program_type": "Information - Magazine"},
11 | {"channel_name": "zdf-neo", "start": "21:30", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Late Evening Show", "program_type": "Entertainment - Various"},
12 |
13 | {"channel_name": "rtl-television", "start": "00:00", "end": "01:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "RTL Late Night", "program_type": "Entertainment - Talk Show"},
14 | {"channel_name": "rtl-television", "start": "06:00", "end": "09:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Guten Morgen Deutschland ", "program_type": "Information - Magazine"},
15 | {"channel_name": "rtl-television", "start": "12:00", "end": "15:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Midday Show", "program_type": "Entertainment - Various"},
16 | {"channel_name": "rtl-television", "start": "18:30", "end": "20:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "RTL Aktuell", "program_type": "Information - Journal"},
17 | {"channel_name": "rtl-television", "start": "22:00", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Primetime Shows", "program_type": "Entertainment - Various"},
18 |
19 | {"channel_name": "sat1", "start": "05:30", "end": "10:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Sat.1 Frühstücksfernsehen", "program_type": "Information - Magazine"},
20 | {"channel_name": "sat1", "start": "19:30", "end": "20:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Prime Time Show", "program_type": "Entertainment - Various"},
21 |
22 | {"channel_name": "prosieben", "start": "17:00", "end": "20:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "taff & Galileo", "program_type": "Information - Magazine"},
23 |
24 | {"channel_name": "kabel-eins", "start": "16:30", "end": "18:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Abenteuer Leben täglich ", "program_type": "Information - Magazine"},
25 | ]
26 |
--------------------------------------------------------------------------------
/mockwebsite/nicematin_sitemap.xml:
--------------------------------------------------------------------------------
1 | https://www.nicematin.com/education/cest-un-scandale-une-mere-de-famille-en-colere-apres-avoir-mis-de-longs-mois-a-trouver-un-mode-de-garde-pour-sa-fille-sur-la-cote-dazur-881497
2 | Nice-Matin
3 | fr
4 | 2023-10-25T10:55:00+02:00
5 | "C'est un scandale": une mère de famille en colère après avoir mis de longs mois à trouver un mode de garde pour sa fille sur la Côte-d'Azur
6 | https://fyooyzbm.filerobot.com/v7/nounou2-C8iVj9UI.jpg?vh=bb8c9a&ci_seal=1795970eb9&w=750&h=375&gravity=auto&func=crop
7 | Après de long mois, Emilie a fini par trouver une solution de garde pour sa fille Mélina. Non sans répercussions sur sa vie professionnelle.
8 | 
9 | https://www.nicematin.com/faits-divers/a-nice-la-replique-dun-gilet-tactique-abandonne-avec-une-grenade-provoque-lintervention-de-la-police-881520
10 | Nice-Matin
11 | fr
12 | 2023-10-25T10:49:00+02:00
13 | À Nice, la réplique d'un gilet tactique abandonné avec une grenade provoque l'intervention de la police
14 | https://fyooyzbm.filerobot.com/v7/maxmatinarch530448-Zr26gJZK.jpg?vh=9bf068&ci_seal=812f0dc672&w=750&h=375&gravity=auto&func=crop
15 | L'intervention a eu lieu rue Delille à Nice.
16 | 
17 | https://www.nicematin.com/temoignage/rien-que-par-le-bouche-a-oreille-dans-ma-residence-jai-deja-des-appels-a-51-ans-elle-plaque-tout-pour-devenir-assistante-maternelle-a-nice-881495
18 | Nice-Matin
19 | fr
20 | 2023-10-25T10:35:00+02:00
21 | "Rien que par le bouche-à-oreille dans ma résidence, j’ai déjà des appels": à 51 ans, elle plaque tout pour devenir assistante maternelle à Nice
22 | https://fyooyzbm.filerobot.com/v7/assistantenounou+%281%29-cHTI0xtv.webp?ci_seal=30e64b9995&tl_px=6,9&br_px=1270,735&w=750&h=375&gravity=auto&func=crop
23 | Dans les Alpes-Maritimes, près de 500 assistants maternels ont quitté leur fonction depuis quatre ans.
24 | 
25 | https://www.nicematin.com/environnement/totalenergies-accuse-par-greenpeace-detre-implique-dans-33-projets-fossiles-catastrophiques-pour-le-climat-881516
26 | Nice-Matin
27 | fr
28 | 2023-10-25T10:35:00+02:00
29 | TotalEnergies accusé par Greenpeace d'être impliqué dans 33 projets fossiles "catastrophiques pour le climat"
30 | https://fyooyzbm.filerobot.com/v7/000_33A94W3-g3Zrfh2z.jpg?vh=7d1a0a&ci_seal=df517fd0af&w=750&h=375&gravity=auto&func=crop
31 | TotalEnergies participe à 33 projets de gaz et de pétrole "super-émetteurs" en gaz à effet de serre, accuse mercredi l'ONG Greenpeace dans une étude visant à démontrer une "logique d'expansion fossile" en contradiction avec les objectifs climatiques.
--------------------------------------------------------------------------------
/mockwebsite/letelegramme_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | https://www.letelegramme.fr/monde/coups-de-feu-a-bruxelles-deux-morts-le-suspect-en-fuite-6450252.php
5 |
6 | https://media.letelegramme.fr/api/v1/images/view/652d8f34710625629665f40a/web_golden_xxl/652d8f34710625629665f40a.1
7 | Un périmètre de sécurité a été installé autour de la place Sainctelette. (Hatim Kaghat/AFP)
8 | Un périmètre de sécurité a été installé autour de la place Sainctelette.
9 |
10 |
11 |
12 | Le Télégramme
13 | fr
14 |
15 | 2023-10-16T19:29:56+00:00
16 | Coups de feu à Bruxelles : deux morts, le suspect en fuite, la piste terroriste évoquée
17 |
18 |
19 |
20 | https://www.letelegramme.fr/monde/mali-la-mission-de-lonu-engage-sous-tension-une-nouvelle-phase-de-son-retrait-6450249.php
21 |
22 |
23 | Le Télégramme
24 | fr
25 |
26 | 2023-10-16T19:18:00+00:00
27 | Mali : la mission de l’Onu engage sous tension une nouvelle phase de son retrait
28 |
29 |
30 |
31 | https://www.letelegramme.fr/morbihan/vannes-56000/circulation-et-stationnement-a-la-gare-de-vannes-ca-va-etre-tres-complique-pendant-deux-ans-6450250.php
32 |
33 | https://media.letelegramme.fr/api/v1/images/view/652d8d905a16a826de416f33/web_golden_xxl/652d8d905a16a826de416f33.1
34 | Le côté sud de la gare vu d’en haut. L’avenue Favrel et Lincy deviendra une voie de bus dans le sens descendant, une voie pour les voitures et bus dans le sens montant. Le parvis de la gare sera élargi et végétalisé. Les vélos y trouveront leur place. (Image : Villes et paysages)
35 | Le côté sud de la gare vu d’en haut. L’avenue Favrel et Lincy deviendra une voie de bus dans le sens descendant, une voie pour les voitures et bus dans le sens montant. Le parvis de la gare sera élargi et végétalisé. Les vélos y trouveront leur place.
36 |
37 |
38 |
39 | Le Télégramme
40 | fr
41 |
42 | 2023-10-16T19:13:00+00:00
43 | Circulation et stationnement à la gare de Vannes : « Ça va être très compliqué pendant deux ans »
44 | Futur quartier de la gare de Vannes,Gare
45 |
46 |
47 |
48 | https://www.letelegramme.fr/finistere/ergue-gaberic-29500/cinq-blesses-dans-un-accident-de-circulation-a-ergue-gaberic-6450248.php
49 |
50 | https://media.letelegramme.fr/api/v1/images/view/652d8cd651450e731a713e6a/web_golden_xxl/652d8cd651450e731a713e6a.1
51 | Un homme a été transporté dans un état critique à l’hôpital de Quimper. (Photo d’illustration Lionel Le Saux/Le Télégramme)
52 | Un homme a été transporté dans un état critique à l’hôpital de Quimper.
53 |
54 |
55 |
56 | Le Télégramme
57 | fr
58 |
59 | 2023-10-16T19:08:00+00:00
60 | Cinq blessés dans un accident de la circulation à Ergué-Gabéric
61 | Faits divers,Accident
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/my_dbt_project/pytest_tests/test_dbt_model_analytics.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import os
6 |
7 | import psycopg2
8 | import pytest
9 |
10 | from my_dbt_project.pytest_tests.test_dbt_model_homepage import run_dbt_command
11 |
12 |
13 | @pytest.fixture(scope="module")
14 | def db_connection():
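   |     """Module-scoped Postgres connection built from POSTGRES_* env vars, closed after the module's tests."""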
15 | conn = psycopg2.connect(
16 | dbname=os.getenv("POSTGRES_DB", ""),
17 | user=os.getenv("POSTGRES_USER", ""),
18 | password=os.getenv("POSTGRES_PASSWORD", ""),
19 | host=os.getenv("POSTGRES_HOST", ""),
20 | port=os.getenv("POSTGRES_PORT", ""),
21 | )
22 | yield conn
23 | conn.close()
24 |
25 |
26 | def seed_dbt_labelstudio():
27 | """Run dbt seed once before any test."""
28 | commands = [
29 | "seed",
30 | "--select",
31 | "labelstudio_task_aggregate",
32 | "--select",
33 | "labelstudio_task_completion_aggregate",
34 | "--full-refresh",
35 | ]
36 |     logging.info(f"pytest running dbt seed: {commands}")
37 | run_dbt_command(commands)
38 | # seed and dbt run upstream tables
39 | commands = [
40 | "seed",
41 | "--select",
42 | "program_metadata",
43 | "--select",
44 | "time_monitored",
45 | "--select",
46 | "keywords",
47 | "--select",
48 | "dictionary",
49 | "--select",
50 | "keyword_macro_category",
51 | "--full-refresh",
52 | ]
53 | run_dbt_command(commands)
54 |
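   | # Runs at import time so the seeds exist before the autouse fixture below triggers the dbt runs.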
55 | seed_dbt_labelstudio()
56 |
57 | @pytest.fixture(scope="module", autouse=True)
58 | def run_analytics():
59 |     logging.info("Run dbt models once before related tests (analytics-target models are built in a second pass).")
60 | run_dbt_command(
61 | [
62 | "run",
63 | "--exclude",
64 | "core_query_causal_links",
65 | "--exclude",
66 | "task_global_completion",
67 | "--exclude",
68 | "environmental_shares_with_desinfo_counts",
69 | "--full-refresh",
70 | ]
71 | )
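   |     # Second pass: the models excluded above need their own run with --target analytics.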
72 | logging.info("pytest running dbt task_global_completion")
73 | run_dbt_command(
74 | [
75 | "run",
76 | "--select",
77 | "task_global_completion",
78 | "--select",
79 | "environmental_shares_with_desinfo_counts",
80 | "--target",
81 | "analytics",
82 | "--full-refresh",
83 | ]
84 | )
85 |
86 |
87 | def test_task_global_completion(db_connection):
88 | with db_connection.cursor() as cur:
89 | cur.execute("""
90 | SELECT
91 | "analytics"."task_global_completion"."task_completion_aggregate_id",
92 | "analytics"."task_global_completion"."country",
93 | "analytics"."task_global_completion"."data_item_channel_name",
94 | "analytics"."task_global_completion"."mesinfo_choice",
95 | "analytics"."task_global_completion"."sum_duration_minutes"
96 | FROM analytics.task_global_completion
97 | ORDER BY analytics.task_global_completion.task_completion_aggregate_id
98 | LIMIT 1
99 | """)
100 | row = cur.fetchone()
101 |
102 | expected = (
103 | "0e7ee7f70a223e21b10c0dad27464bebb8cc6a7f4bd5f5b7746c661a44ec7b45",
104 | "france",
105 | "europe1",
106 | "Correct",
107 | None,
108 | )
109 |
110 | assert row == expected, f"Unexpected values: {row}"
111 |
112 | def test_environmental_shares_desinfo(db_connection):
113 | with db_connection.cursor() as cur:
114 | cur.execute("""
115 | SELECT
116 | "analytics"."environmental_shares_with_desinfo_counts"."start",
117 | "analytics"."environmental_shares_with_desinfo_counts"."channel_name",
118 | "analytics"."environmental_shares_with_desinfo_counts"."sum_duration_minutes",
119 | "analytics"."environmental_shares_with_desinfo_counts"."weekly_perc_climat",
120 | "analytics"."environmental_shares_with_desinfo_counts"."total_mesinfo"
121 | FROM analytics.environmental_shares_with_desinfo_counts
122 | ORDER BY analytics.environmental_shares_with_desinfo_counts.start
123 | LIMIT 1
124 | """)
125 | row = cur.fetchone()
126 | expected = (
127 | datetime.datetime(2025, 1, 27, 0, 0),
128 | "arte",
129 | 65,
130 | 0.13846153846153847,
131 | 0,
132 | )
133 | assert row == expected
--------------------------------------------------------------------------------
/my_dbt_project/models/dashboards/core_query_thematics_keywords.sql:
--------------------------------------------------------------------------------
1 | {{ config(
2 | materialized='incremental',
3 | unique_key=['week','channel_title'],
4 | on_schema_change='append_new_columns'
5 | )
6 | }}
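   | -- Incremental model keyed on (week, channel_title): reruns upsert matching rows,
   | -- and append_new_columns adds any new model columns to the existing table.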
7 |
8 | -- Core Query Thematics Keywords only makes sense when querying keyword, theme, and category together (otherwise rows duplicate:
9 | -- a keyword inside keywords_with_timestamp appears 4 times if the keyword has 4 themes)
10 |
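   | -- Pipeline: expand each program grid into one row per broadcast week, derive the
   | -- actual airing dates, sum weekly minutes per channel, then join keyword
   | -- occurrences against those weekly totals.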
11 | WITH program_durations AS (
12 | SELECT
13 | pm.channel_title,
14 | pm.channel_program,
15 | pm.weekday,
16 | CAST(pm.program_grid_start AS date) AS program_start,
17 | CAST(pm.program_grid_end AS date) AS program_end,
18 | pm.duration_minutes
19 | FROM public.program_metadata pm
20 | WHERE pm.country = 'france'
21 | ),
22 | program_weeks AS (
23 | SELECT
24 | pd.channel_title,
25 | pd.channel_program,
26 | pd.duration_minutes,
27 | pd.weekday,
28 | generate_series(
29 | date_trunc('week', pd.program_start),
30 | date_trunc('week', pd.program_end),
31 | interval '1 week'
32 | )::date AS week_start
33 | FROM program_durations pd
34 | ),
35 | program_airings AS (
36 | SELECT
37 | channel_title,
38 | channel_program,
39 | duration_minutes,
40 | -- calculate actual airing date per week + weekday offset
41 | (week_start + (weekday - 1) * INTERVAL '1 day')::date AS airing_date,
42 | week_start
43 | FROM program_weeks
44 | ),
45 | weekly_program_durations AS (
46 | SELECT
47 | channel_title,
48 | week_start AS week,
49 | SUM(duration_minutes) AS weekly_duration_minutes
50 | FROM program_airings
51 | GROUP BY channel_title, week_start
52 | ),
53 | keyword_occurrences AS (
54 | SELECT DISTINCT
55 | COALESCE(pm.channel_title, k.channel_title) AS channel_title,
56 | DATE_TRUNC('week', k.start)::date AS week,
57 | k.start AS occurrence_time,
58 | -- Semantic tags
59 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%solution%' THEN TRUE ELSE FALSE END AS is_solution,
60 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%consequence%' THEN TRUE ELSE FALSE END AS is_consequence,
61 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%cause%' THEN TRUE ELSE FALSE END AS is_cause,
62 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%concepts_generaux%' THEN TRUE ELSE FALSE END AS is_general_concepts,
63 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%constat%' THEN TRUE ELSE FALSE END AS is_statement,
64 | -- Crisis type
65 | CASE
66 | WHEN LOWER(kw ->> 'theme') LIKE '%climat%' THEN 'Crise climatique'
67 | WHEN LOWER(kw ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité'
68 | WHEN LOWER(kw ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources'
69 | ELSE 'Autre'
70 | END AS crise_type,
71 | kw ->> 'theme' AS theme,
72 | kw ->> 'keyword' AS keyword
73 | FROM public.keywords k
74 | LEFT JOIN public.program_metadata pm
75 | ON k.channel_program = pm.channel_program
76 | AND k.channel_name = pm.channel_name
77 |         -- EXTRACT(ISODOW ...) yields Monday=1 .. Sunday=7, the same mapping the
78 |         -- previous DOW-based CASE expression produced for pm.weekday
79 |         AND EXTRACT(ISODOW FROM k.start)::int = pm.weekday
85 | AND CAST(k.start AS date) BETWEEN CAST(pm.program_grid_start AS date)
86 | AND CAST(pm.program_grid_end AS date)
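   | -- implicit LATERAL join: one row per element of the keywords_with_timestamp JSON array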
87 | , json_array_elements(k.keywords_with_timestamp::json) AS kw
88 | WHERE
89 | LOWER(kw ->> 'theme') NOT LIKE '%indirect%'
90 | AND k.country = 'france'
91 | )
92 | SELECT
93 | ko.channel_title,
94 | ko.week,
95 | COALESCE(NULLIF(d.category, ''), 'Transversal') AS category,
96 | d.high_risk_of_false_positive,
97 | ko.is_solution,
98 | ko.is_consequence,
99 | ko.is_cause,
100 | ko.is_general_concepts,
101 | ko.is_statement,
102 | ko.crise_type,
103 | ko.theme,
104 | ko.keyword,
105 | kmc.general,
106 | kmc.agriculture,
107 | kmc.transport,
108 | kmc.batiments,
109 | kmc.energie,
110 | kmc.industrie,
111 | kmc.eau,
112 | kmc.ecosysteme,
113 | kmc.economie_ressources,
114 | COUNT(*) AS count,
115 | COALESCE(wpd.weekly_duration_minutes, 0) AS sum_duration_minutes
116 | FROM keyword_occurrences ko
117 | LEFT JOIN public.dictionary d
118 |     ON d.keyword = ko.keyword AND d.theme LIKE ko.theme || '%' -- ensure match with indirect themes inside the dictionary table
119 | LEFT JOIN weekly_program_durations wpd
120 | ON wpd.channel_title = ko.channel_title AND wpd.week = ko.week
121 | LEFT JOIN public.keyword_macro_category kmc
122 | ON kmc.keyword = ko.keyword
123 | GROUP BY
124 | ko.channel_title,
125 | ko.week,
126 | d.high_risk_of_false_positive,
127 | COALESCE(NULLIF(d.category, ''), 'Transversal'),
128 | ko.is_solution,
129 | ko.is_consequence,
130 | ko.is_cause,
131 | ko.is_general_concepts,
132 | ko.is_statement,
133 | ko.crise_type,
134 | ko.theme,
135 | ko.keyword,
136 | kmc.general,
137 | kmc.agriculture,
138 | kmc.transport,
139 | kmc.batiments,
140 | kmc.energie,
141 | kmc.industrie,
142 | kmc.eau,
143 | kmc.ecosysteme,
144 | kmc.economie_ressources,
145 | wpd.weekly_duration_minutes
146 | ORDER BY
147 | ko.channel_title, ko.week, ko.crise_type
--------------------------------------------------------------------------------
/test/sitemap/test_main_import_api.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from modin.pandas.dataframe import DataFrame
4 |
5 | from quotaclimat.data_processing.mediatree.update_pg_keywords import *
6 |
7 | from postgres.insert_data import (clean_data,
8 | insert_data_in_sitemap_table)
9 |
10 | from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db, drop_tables, empty_tables
11 | from postgres.insert_data import save_to_pg
12 | from quotaclimat.data_processing.mediatree.detect_keywords import *
13 | from quotaclimat.data_processing.mediatree.api_import import *
14 | from quotaclimat.data_processing.mediatree.keyword.stop_words import STOP_WORDS
15 | from quotaclimat.data_processing.mediatree.stop_word.main import save_append_stop_word
16 | from quotaclimat.data_processing.mediatree.s3.api_to_s3 import parse_reponse_subtitle
17 | from test_utils import get_localhost, debug_df, compare_unordered_lists_of_dicts
18 |
19 | import time as t
20 |
21 |
22 | def insert_mediatree_json(conn, json_file_path='test/sitemap/mediatree.json'):
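   |     """Parse a mediatree JSON fixture, tag themes, and save the keyword rows to Postgres; returns the row count."""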
23 | create_tables(conn)
24 | empty_tables(get_db_session(conn), stop_word=False)
25 | logging.info(f"reading {json_file_path}")
26 | with open(json_file_path, 'r') as file:
27 | json_response = json.load(file)
28 | start_time = t.time()
29 | df = parse_reponse_subtitle(json_response)
30 | df = filter_and_tag_by_theme(df)
31 | df["id"] = df.apply(lambda x: add_primary_key(x), axis=1)
32 | end_time = t.time()
33 | logging.info(f"Elapsed time for api import {end_time - start_time}")
34 |
35 | # must df._to_pandas() because to_sql does not handle modin dataframe
36 | save_to_pg(df._to_pandas(), keywords_table, conn)
37 |
38 | return len(df)
39 |
40 | def insert_stop_word(conn):
41 | logging.info("test saving stop words")
42 | to_save = []
43 | for stop in STOP_WORDS:
44 | stop_word = dict()
45 | stop_word['id'] = stop
46 | stop_word['context'] = stop
47 | to_save.append(stop_word)
48 |
49 | save_append_stop_word(conn, to_save)
50 |
51 | def test_main_api_import():
52 | conn = connect_to_db()
53 | drop_tables(conn)
54 | create_tables(conn)
55 | insert_stop_word(conn)
56 | len_df = insert_mediatree_json(conn, json_file_path="test/sitemap/light.json")
57 |
58 | session = get_db_session(conn)
59 | saved_keywords = get_keywords_columns(session, start_date="2024-02-01", end_date="2024-02-29")
60 | assert len(saved_keywords) != 0
61 | assert len(saved_keywords) == len_df
62 |
63 | def test_first_row_api_import():
64 | primary_key = "29d2b1f8267b206cb62e475b960de3247e835273f396af012f5ce21bf3056472"
65 |
66 | specific_keyword = get_keyword(primary_key)
67 | logging.info(f"Getting {primary_key} :\n {specific_keyword}")
68 | assert set(specific_keyword.theme) == set([
69 | 'biodiversite_concepts_generaux_indirectes',
70 | 'changement_climatique_consequences_indirectes',
71 | 'changement_climatique_constat_indirectes'
72 | ])
73 |
74 | assert specific_keyword.number_of_keywords == 0
75 |
76 | def test_second_row_api_import():
77 |
78 | primary_key = "9f0fb1987371c1dc0b4a165a11feb7ca7ed9b6f9f40d3d6b4fc0748e2ca59c3f"
79 | specific_keyword = get_keyword(primary_key)
80 | assert len(set(specific_keyword.theme)) > 0
81 | assert specific_keyword.number_of_keywords > 0
82 |
83 |
84 | def test_third_row_api_import():
85 | primary_key = "32cb864fe56a4436151bcf78c385a7cc4226316e0563a298ac6988d1b8ee955b"
86 |
87 | specific_keyword = get_keyword(primary_key)
88 | assert len(set(specific_keyword.theme)) > 0
89 |
90 | assert specific_keyword.number_of_keywords == 1
91 |
92 | def test_get_api_stop():
93 | conn = connect_to_db()
94 | session = get_db_session(conn)
95 | stopwords = get_stop_words(session, country=None)
96 |     assert isinstance(stopwords[0], str)
97 |
98 | def test_transform_raw_keywords_srt_to_mediatree():
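   |     # Transforms one day of raw SRT parquet for channel LAUNE into keyword rows tagged country=belgium.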
99 | conn = connect_to_db()
100 |
101 | channel = "LAUNE"
102 | primary_key = "df0d86983f0c4ed074800f5cdabbd577671b90845fb6208a5de1ae3802fb10e0"
103 |     df: DataFrame = pd.read_parquet(path=f"i8n/mediatree_output/year=2024/month=10/day=1/channel={channel}")
104 |     df_programs = get_programs()
105 |     output = transform_raw_keywords(df, df_programs=df_programs, country=BELGIUM)
106 |
108 | filtered = output[output["id"] == primary_key]
109 | row_dict = filtered.iloc[0].to_dict()
110 | assert row_dict["country"] == "belgium"
111 | assert row_dict["channel_name"] == channel
112 |
113 | assert len(output) == 29
114 |     save_to_pg(df=output, conn=conn, table=keywords_table)
115 | specific_keyword = get_keyword(primary_key)
116 | assert set(specific_keyword.theme) == set([
117 | 'changement_climatique_causes_indirectes',
118 | ])
119 |
120 | assert specific_keyword.number_of_keywords == 0
--------------------------------------------------------------------------------
/mockwebsite/leparisien_sitemap.xml:
--------------------------------------------------------------------------------
1 | https://www.leparisien.fr/sports/cyclisme/tour-de-france/direct-tour-de-france-lannonce-des-parcours-2024-a-suivre-en-live-25-10-2023-SGPV57QEYVAOJKR2VTRVETMVSY.php
2 | 2023-10-25T08:53:25.512Z
3 | Le Parisien
4 | fr
5 | 2023-10-25T08:53:26.556Z
6 | https://www.leparisien.fr/resizer/pZgWLK34dnSm3PnePH4YT7PDeLI=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/RBJCS5CIAVHK3IG3DKVEMSO56Y.jpg
7 | 
8 | https://www.leparisien.fr/sports/football/sadio-mane-arrive-aux-commandes-de-bourges-foot-18-club-de-national-2-25-10-2023-Z7GNAIUG65ECXC33R7XZPG6V2E.php
9 | 2023-10-25T08:52:34.982Z
10 | Le Parisien
11 | fr
12 | 2023-10-25T08:52:35.420Z
13 | https://www.leparisien.fr/resizer/oYbqphCAWq15Lf1aZAo4uO651ZI=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/XC6BTYGH3VENLEONZWY7LRTYOY.jpg
14 | 
15 | https://www.leparisien.fr/faits-divers/le-pilote-americain-qui-a-tente-de-couper-les-moteurs-dun-avion-avait-consomme-des-hallucinogenes-25-10-2023-OBK4GDNF4NFN7MXLD4NY4FVUEU.php
16 | 2023-10-25T08:50:28.302Z
17 | Le Parisien
18 | fr
19 | 2023-10-25T08:50:28.762Z
20 | https://www.leparisien.fr/resizer/dpRGItWIAA5vHv2D6cmBGmOff7U=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/TAPHNZFSIRBAVHYEOIIN3U7AP4.jpg
21 | 
22 | https://www.leparisien.fr/futurs/punaises-de-lit-comment-sen-debarrasser-les-reconnaitre-dou-viennent-elles-posez-nous-vos-questions-25-10-2023-A5ZSPB6LSBBVLHCFIX4OIAZFWA.php
23 | 2023-10-25T08:49:39.415Z
24 | Le Parisien
25 | fr
26 | 2023-10-25T08:49:40.613Z
27 | https://www.leparisien.fr/resizer/02sdSrjueqCNoNETKkV7cTpDO_0=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/4QGY2FZT2JF4XFUFQGSNWNZ5WQ.jpg
28 | 
29 | https://www.leparisien.fr/culture-loisirs/cinema/le-syndrome-des-amours-passees-mais-pourquoi-couchent-ils-avec-leurs-ex-25-10-2023-SOCRYLNKVBHH5N7RZONQFRQLII.php
30 | 2023-10-25T08:49:30.367Z
31 | Le Parisien
32 | fr
33 | 2023-10-25T08:49:30.814Z
34 | https://www.leparisien.fr/resizer/yk9qwslNqiBUhMh5EhTUDc5JoRc=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/7W2I45MNRNHLRHKSYZK2BZ6IMI.jpg
--------------------------------------------------------------------------------
/.github/workflows/deploy-main.yml:
--------------------------------------------------------------------------------
1 | name: Build & Deploy to Scaleway
2 |
3 | on:
4 | push:
5 | # Sequence of patterns matched against refs/heads
6 | branches:
7 | - main
8 |
9 | # to be able to force deploy
10 | workflow_dispatch:
11 |
12 |
13 | env:
14 | PYTHON_VERSION: '3.12'
15 | POETRY_VERSION: '2.1.3'
16 |
17 | jobs:
18 | build:
19 | runs-on: ubuntu-latest
20 | steps:
21 | - uses: actions/checkout@v4
22 | - uses: actions/setup-python@v5
23 | with:
24 | python-version: ${{ env.PYTHON_VERSION }}
26 | - name: Install Poetry
27 | uses: snok/install-poetry@v1
28 | with:
29 | version: ${{ env.POETRY_VERSION }}
30 | virtualenvs-create: true
31 | virtualenvs-in-project: true
32 | installer-parallel: true
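   | # Bump the patch version and commit it back to main; "[no ci]" in the commit
   | # message keeps that push from retriggering this workflow.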
33 | - name: Poetry install & bump version
34 | run: |
35 | poetry install --only dev
36 | poetry version patch
37 | PROJECT_VERSION=$(poetry version --short)
38 | echo "PROJECT_VERSION=$PROJECT_VERSION" >> $GITHUB_ENV
39 | git config user.name barometre-github-actions
40 | git config user.email barometre-github-actions@github.com
41 | git add pyproject.toml
42 | git commit -m "[no ci]: $PROJECT_VERSION bumping version"
43 | git push origin main
44 | - name: Login to Scaleway Container Registry
45 | uses: docker/login-action@v3
46 | with:
47 | username: nologin
48 | password: ${{ secrets.SCALEWAY_API_KEY }}
49 | registry: ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}
50 |
51 | - name: Build mediatree_import image
52 | run: docker build -f Dockerfile_api_import . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }}
53 | - name: Tag mediatree_import latest image
54 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:latest
55 | - name: Push mediatree_import Image
56 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import
57 |
58 | - name: update scaleway job definition with version mediatree_import
59 | uses: jawher/action-scw@v2.34.0
60 | env:
61 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
62 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
63 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
64 | SCW_ZONE: ${{ secrets.SCW_ZONE }}
65 | with:
66 | args: jobs definition update ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }}
67 |
68 | - name: Build s3 image
69 | run: docker build -f Dockerfile_api_to_s3 . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }}
70 | - name: Tag s3 latest image
71 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:latest
72 | - name: Push s3 Image
73 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3
74 |
75 | - name: update scaleway job definition with version s3
76 | uses: jawher/action-scw@v2.34.0
77 | env:
78 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
79 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
80 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
81 | SCW_ZONE: ${{ secrets.SCW_ZONE }}
82 | with:
83 | args: jobs definition update ${{ secrets.SCALEWAY_JOB_S3_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }}
84 |
85 | - name: Build stop_word image
86 | run: docker build -f Dockerfile_stop_word . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }}
87 | - name: Tag stop_word latest image
88 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:latest
89 | - name: Push stop_word Image
90 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word
91 |
92 | - name: update scaleway job definition with version stopwords
93 | uses: jawher/action-scw@v2.34.0
94 | env:
95 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
96 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
97 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
98 | SCW_ZONE: ${{ secrets.SCW_ZONE }}
99 | with:
100 | args: jobs definition update ${{ secrets.SCALEWAY_STOP_WORDS_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }}
101 |
102 | - name: update scaleway job definition with version for the update job
103 | uses: jawher/action-scw@v2.34.0
104 | env:
105 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }}
106 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
107 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
108 | SCW_ZONE: ${{ secrets.SCW_ZONE }}
109 | with:
110 | args: jobs definition update ${{ secrets.SCALEWAY_UPDATE_JOB_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }}
111 |
--------------------------------------------------------------------------------
/mockwebsite/lexpress_sitemap.xml:
--------------------------------------------------------------------------------
1 | https://www.lexpress.fr/societe/evasion-de-reau-par-helicoptere-lheure-du-verdict-pour-redoine-faid-SYPRU6BXSRB27DFSOLH23QSCRY/
2 | 2023-10-25T10:49:48.000+02:00
3 | always
4 | 0.5
5 | L'Express
6 | fr
7 | 2023-10-25T08:49:48Z
8 | https://www.lexpress.fr/resizer/rBes-Zxn7XqcPvpVdnoTR_0vEIM=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/4WE5RWALHVECPOS3OAQ4U6MGUI.jpg
9 | 
10 | https://www.lexpress.fr/monde/europe/le-ministre-russe-de-la-defense-sur-la-zone-de-loperation-militaire-en-ukraine-JBK5YZUYZZFNLIDRKF54LHHEAE/
11 | 2023-10-25T10:31:43.539+02:00
12 | always
13 | 0.5
14 | L'Express
15 | fr
16 | 2023-10-25T08:31:43.539Z
17 | https://www.lexpress.fr/resizer/5OIiTmRnwqg0l6dHTKEoovcmCCM=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/UAC35INRQZFKFF6GIWQRWM6RJU.jpg
18 | 
19 | https://www.lexpress.fr/monde/proche-moyen-orient/guerre-hamas-israel-macron-va-rencontrer-le-roi-de-jordanie-a-amman-ZR4BAAKC45FRRE7O4JV454AUKY/
20 | 2023-10-25T10:17:31.494+02:00
21 | always
22 | 0.5
23 | L'Express
24 | fr
25 | 2023-10-25T08:17:31.494Z
26 | https://www.lexpress.fr/resizer/cqcHx_xhgHOc6D2tPodgor6yp5M=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/CHZX6RK67VB5TGGYQJDI4FD5K4.jpg
27 | 
28 | https://www.lexpress.fr/monde/japon-decision-de-justice-tres-attendue-sur-le-changement-detat-civil-des-personnes-transgenres-TDU6FGHBANHVPNU5ZM5FI75KIU/
29 | 2023-10-25T10:02:23.000+02:00
30 | always
31 | 0.5
32 | L'Express
33 | fr
34 | 2023-10-25T08:02:23Z
35 | https://www.lexpress.fr/resizer/tDdXsSjhswEleE1mcoIdChtNwTw=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/VCTHD6G5MJAA3NYI3BDYZ6X2VI.jpg
36 | 
37 | https://www.lexpress.fr/societe/deserts-medicaux-le-senat-retoque-la-repartition-des-medecins-VEF4G4QZDZFFRLBAAWP4UP57JU/
38 | 2023-10-25T09:59:37.725+02:00
39 | always
40 | 0.5
41 | L'Express
42 | fr
43 | 2023-10-25T07:59:37.725Z
44 | https://www.lexpress.fr/resizer/NSdrjfJ2Na62498cuhWsqyqjRuk=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/XSQJAM2USRBMZFKDB2AFJ3MMAM.jpg
45 | 
46 | https://www.lexpress.fr/monde/laide-de-lonu-a-gaza-menacee-de-paralysie-discussions-autour-dune-pause-humanitaire-XV7BLYTMLVGJRI2XPLSREFVC2Q/
47 | 2023-10-25T09:20:53.000+02:00
48 | always
49 | 0.5
50 | L'Express
51 | fr
52 | 2023-10-25T07:20:53Z
53 | https://www.lexpress.fr/resizer/l-bVyVd-EWXxoZLn5QoX6xp5cAI=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/SG5MCUKVT5EDHN4HJYAONCMGDM.jpg
--------------------------------------------------------------------------------
/mockwebsite/francebleu_sitemap.xml:
--------------------------------------------------------------------------------
1 | https://www.francebleu.fr/infos/faits-divers-justice/accident-villognon-le-maitre-d-hotel-blesse-au-pied-droit-sera-indemnise-1322246
2 | France Bleu
3 | fr
4 | 2023-10-26T09:57:37+00:00
5 | Accident Villognon : le maître d'hôtel blessé au pied droit sera indemnisé
6 | Faits divers - Justice, François Hollande, justice, Infos
7 | 
8 | https://www.francebleu.fr/infos/culture-loisirs/en-route-pour-une-nouvelle-semaine-de-cadeaux-avec-france-bleu-vaucluse-9545500
9 | France Bleu
10 | fr
11 | 2023-10-26T09:55:40+00:00
12 | En route pour une nouvelle semaine de cadeaux avec France Bleu Vaucluse !
13 | Culture - Loisirs, Infos
14 | 
15 | https://www.francebleu.fr/infos/economie-social/dans-les-deux-charentes-payez-vos-factures-d-electricite-moins-cheres-grace-a-l-achat-groupe-d-energie-9597185
16 | France Bleu
17 | fr
18 | 2023-10-26T09:51:10+00:00
19 | Dans les deux Charentes : payez vos factures d'électricité moins chères grâce à l'achat groupé d'énergie
20 | Économie - Social, Énergie, UFC Que Choisir, Électricité, Inflation, Économies d'énergie – Éco-gestes, Infos
21 | 
22 | https://www.francebleu.fr/infos/faits-divers-justice/cette-histoire-m-a-traumatise-francis-nachbar-ancien-magistrat-publie-un-livre-sur-les-affaires-fourniret-6174798
23 | France Bleu
24 | fr
25 | 2023-10-26T09:47:09+00:00
26 | "Cette histoire m'a traumatisé", Francis Nachbar, ancien magistrat publie un livre sur les affaires Fourniret
27 | Faits divers - Justice, Les affaires Fourniret, justice, Monique Olivier, Enquêtes – Investigation, Infos
28 | 
29 | https://www.francebleu.fr/sports/football/liverpool-tfc-gagner-a-anfield-c-est-si-rare-pour-un-club-francais-5163393
30 | France Bleu
31 | fr
32 | 2023-10-26T09:46:03+00:00
33 | Liverpool-TFC : gagner à Anfield, c'est si rare pour un club français
34 | Football, TFC - Toulouse Football Club, Europa League, Toulouse, Sports
35 | 
36 | https://www.francebleu.fr/infos/faits-divers-justice/caen-coups-de-marteau-menaces-de-mort-et-videos-humiliantes-un-jeune-homme-condamne-a-2-ans-de-prison-ferme-4264720
37 | France Bleu
38 | fr
39 | 2023-10-26T09:43:03+00:00
40 | Caen: coups de marteau, menaces de mort et vidéos humiliantes: un jeune homme condamné à 2 ans de prison ferme
41 | Faits divers - Justice, Infos
42 | 
43 | https://www.francebleu.fr/infos/faits-divers-justice/seine-maritime-un-jeune-homme-de-21-ans-tue-par-balles-a-maromme-l-auteur-en-fuite-5680400
44 | France Bleu
45 | fr
46 | 2023-10-26T09:40:36+00:00
47 | Seine-Maritime : un jeune homme de 21 ans tué par balles à Maromme, l'auteur en fuite
48 | Faits divers - Justice, Armes à feu, Agression, Police nationale, Enquêtes – Investigation, Infos
49 | 
50 | https://www.francebleu.fr/infos/societe/ehpad-une-enquete-de-60-millions-de-consommateurs-pointe-du-doigt-la-qualite-des-repas-servis-6540286
51 | France Bleu
52 | fr
53 | 2023-10-26T09:39:07+00:00
54 | Ehpad : une enquête de 60 millions de consommateurs pointe du doigt la qualité des repas servis
55 | Société, Maisons de retraite – Ehpad, Alimentation, Infos
56 | 
57 | https://www.francebleu.fr/infos/societe/poule-cherche-nouvelle-maison-a-pia-un-eleveur-met-a-la-vente-ses-pondeuses-avec-poule-pour-tous-4774679
58 | France Bleu
59 | fr
60 | 2023-10-26T09:31:46+00:00
61 | Poule cherche nouvelle maison, à Pia un éleveur met à la vente ses pondeuses avec Poule pour tous
62 | Société, poulet, animaux, Élevage, Infos
63 | 
64 | https://www.francebleu.fr/infos/faits-divers-justice/cannabis-cocaine-ecstasy-1-homme-et-1-femme-arretes-a-bagnols-sur-ceze-5907058
65 | France Bleu
66 | fr
67 | 2023-10-26T09:27:41+00:00
68 | Cannabis, cocaïne, ecstasy : un homme et une femme arrêtés à Bagnols-sur-Cèze
69 | Faits divers - Justice, Gard, Drogues, Police nationale, Infos
70 | 
71 | https://www.francebleu.fr/infos/societe/une-cinquantaine-d-habitants-de-mourenx-denoncent-les-odeurs-intenables-du-methaniseur-biobearn-2192014
72 | France Bleu
73 | fr
74 | 2023-10-26T09:12:24+00:00
75 | Une cinquantaine d'habitants de Mourenx dénoncent les odeurs "intenables" du méthaniseur Biobéarn
76 | Société, Entreprises, Infos
--------------------------------------------------------------------------------