├── s3 └── .empty ├── secrets └── .empty ├── alembic ├── versions │ ├── .keep │ ├── 2450da0e6c60_number_of_keywords_20_30_40.py │ ├── 5cc9e1ec5362_add_info_public_to_program_metadata.py │ ├── c1d78b9968fe_add_info_public_to_program_metadata.py │ ├── a5c39db3c8e9_add_new_column_test_for_table_keywords.py │ ├── 43103d5b49c9_program_add_start_end_date_for_grid_.py │ ├── 5ccd746ee292_add_updated_at.py │ ├── 055173743036_keywords_add_channel_title.py │ ├── a0a707673259_add_radio_to_program_metadata.py │ ├── 30abfd828007_program_metadata.py │ ├── 5bff4dceda53_add_info_public_to_program_metadata.py │ ├── 827fb6dde3bb_time_monitored_new_table.py │ ├── c08231a9eb37_program_add_created_at_updated_at.py │ ├── 2c48f626a749_keywords_program_name.py │ ├── 4ccd746ee291_add_20_30.py │ ├── af956a85658f_add_new_column_number_of_keywords_.py │ ├── 356882459cec_remove_category_keywords_change_columns_.py │ ├── 4333bc46985d_keywords_program_id_foreign_key.py │ ├── 44f13b7eebd4_dictionary_category.py │ ├── ac96222af6fe_hrfp_counters.py │ └── a578d21d7aee_add_tables_labelstudio.py ├── script.py.mako └── env.py ├── my_dbt_project ├── analyses │ └── .gitkeep ├── macros │ └── .gitkeep ├── seeds │ ├── .gitkeep │ └── time_monitored.csv ├── snapshots │ └── .gitkeep ├── tests │ └── .gitkeep ├── pytest_tests │ ├── .gitkeep │ └── test_dbt_model_analytics.py ├── .gitignore ├── dbt │ ├── .user.yml │ └── profiles.yml ├── README.md └── models │ ├── analytics │ └── environmental_shares_with_desinfo_counts.sql │ └── dashboards │ ├── core_query_causal_links.sql │ ├── core_query_thematics_keywords_i8n.sql │ ├── thematic_query_ocean.sql │ └── core_query_thematics_keywords.sql ├── quotaclimat ├── utils │ ├── __init__.py │ ├── coverquotaclimat.png │ ├── logger.py │ ├── healthcheck_config.py │ └── sentry.py ├── data_ingestion │ ├── __init__.py │ ├── ingest_db │ │ ├── __init__.py │ │ └── ingest_sitemap_in_db.py │ ├── labelstudio │ │ └── configs.py │ └── scrap_html │ │ └── scrap_description_article.py ├── data_processing │ ├── __init__.py │ └── mediatree │ │ ├── i8n │ │ ├── dictionary.py │ │ ├── brazil │ │ │ ├── __init__.py │ │ │ └── channel_titles.py │ │ ├── france │ │ │ ├── __init__.py │ │ │ └── channel_titles.py │ │ ├── poland │ │ │ ├── __init__.py │ │ │ └── channel_titles.py │ │ ├── spain │ │ │ ├── __init__.py │ │ │ ├── channel_titles.py │ │ │ └── channel_program.py │ │ └── germany │ │ │ ├── __init__.py │ │ │ ├── channel_titles.py │ │ │ └── channel_program.py │ │ ├── config.py │ │ ├── api_import_utils │ │ └── db.py │ │ └── time_monitored │ │ └── models.py └── __init__.py ├── document-experts └── .download-from-gdrive.empty ├── .dockerignore ├── postgres ├── schemas │ ├── base.py │ └── sitemap.pgsql ├── insert_existing_data_example.py ├── database_connection.py └── insert_data.py ├── docs └── images │ └── data_tiers.png ├── mockwebsite ├── README.md ├── cnews_sitemap.xml ├── lefigaro_localhost_sitemap.xml ├── 20minutes_sitemap.xml ├── lefigaro_sitemap.xml ├── lacroix_sitemap.xml ├── midilibre_sitemap.xml ├── franceinter_sitemap.xml ├── republiquepyrenees_sitemap.xml ├── liberation_sitemap.xml ├── nicematin_sitemap.xml ├── letelegramme_sitemap.xml ├── leparisien_sitemap.xml ├── lexpress_sitemap.xml └── francebleu_sitemap.xml ├── test ├── s3 │ ├── one-day-one-channel.parquet │ └── test_s3.py ├── sitemap │ ├── test_utils.py │ ├── test_scrap_html.py │ ├── test_mediatree_utils.py │ ├── test_keywords.py │ └── test_main_import_api.py ├── time_monitored │ └── test_time_monitored.py ├── i8n │ └── test_country.py └── mediatree │ └── 
test_mediatree_queries.py ├── .flake8 ├── i8n └── mediatree_output │ └── year=2024 │ └── month=10 │ └── day=1 │ └── channel=LAUNE │ └── data.parquet ├── docker-entrypoint_stop_word.sh ├── .github ├── dependabot.yml └── workflows │ ├── docker-compose.yml │ ├── dependabot-auto-approve.yml │ ├── scaleway-down.yml │ ├── scaleway-up.yml │ ├── scaleway-start-import-job-update.yml │ └── deploy-main.yml ├── .vscode └── launch.json ├── Dockerfile_ingest ├── LICENSE ├── Dockerfile ├── Dockerfile_stop_word ├── Dockerfile_api_to_s3 ├── Dockerfile_api_import ├── pyproject.toml ├── docker-entrypoint.sh ├── .gitignore ├── analyse └── mediatree │ └── test_program_durations.ipynb └── alembic.ini /s3/.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /secrets/.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /alembic/versions/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/pytest_tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /document-experts/.download-from-gdrive.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_ingestion/ingest_db/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/dictionary.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /my_dbt_project/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /my_dbt_project/dbt/.user.yml: -------------------------------------------------------------------------------- 1 | id: e72efce9-d03e-4b9f-b04b-c919cc719b38 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | pgdata 2 | .git 3 | .venv 4 | venv 5 | .vscode 6 | notebooks 7 | LICENSE 8 | .idea 9 | -------------------------------------------------------------------------------- /postgres/schemas/base.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import declarative_base 2 | 3 | Base = declarative_base() -------------------------------------------------------------------------------- /docs/images/data_tiers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/docs/images/data_tiers.png -------------------------------------------------------------------------------- /mockwebsite/README.md: -------------------------------------------------------------------------------- 1 | Everything in this folder is served by an nginx Docker image so it can be tested locally. -------------------------------------------------------------------------------- /test/s3/one-day-one-channel.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/test/s3/one-day-one-channel.parquet -------------------------------------------------------------------------------- /quotaclimat/utils/coverquotaclimat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/quotaclimat/utils/coverquotaclimat.png -------------------------------------------------------------------------------- /quotaclimat/__init__.py: -------------------------------------------------------------------------------- 1 | # Useless in the current structure 2 | # from quotaclimat.ui.streamlit_dashboard import main as build_dashboard 3 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/brazil/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_brazil 2 | from .channel_titles import channel_titles_brazil -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/france/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_france 2 | from .channel_titles import channel_titles_france -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/poland/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_poland 2 | 
from .channel_titles import channel_titles_poland -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/spain/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_spain 2 | from .channel_titles import channel_titles_spain -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/germany/__init__.py: -------------------------------------------------------------------------------- 1 | from .channel_program import channels_programs_germany 2 | from .channel_titles import channel_titles_germany -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | extend-ignore = E203,E501,F401 4 | exclude = 5 | .venv, 6 | .git 7 | per-file-ignores = 8 | */__init__.py:F403,F401 9 | -------------------------------------------------------------------------------- /i8n/mediatree_output/year=2024/month=10/day=1/channel=LAUNE/data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataforgoodfr/quotaclimat/HEAD/i8n/mediatree_output/year=2024/month=10/day=1/channel=LAUNE/data.parquet -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/germany/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_germany = { 2 | "daserste":"Das Erste", 3 | "zdf-neo":"ZDFneo", 4 | "zdf":"ZDF", 5 | "rtl-television":"RTL", 6 | "sat1":"Sat.1", 7 | "prosieben":"ProSieben", 8 | "kabel-eins":"Kabel Eins", 9 | } -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/brazil/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_brazil = { 2 | "tvbrasil":"TV Brasil", 3 | "tvglobo":"TV Globo", 4 | "tvrecord":"TV Record", 5 | "sbt":"SBT", 6 | "redebandeirantes":"Band", 7 | "jovempan":"Jovem Pan", 8 | "cnnbrasil":"CNN Brasil", 9 | } -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/spain/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_spain = { 2 | "antenna-3": "Antenna 3", 3 | "rtve-la-1": "RTVE La 1", 4 | "rtve-24h": "RTVE 24h", 5 | "lasexta-news": "LaSexta News", 6 | "telecinco-news": "Telecinco News", 7 | "cuatro-news": "Cuatro News", 8 | } -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/poland/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_poland = { 2 | "tvp": "TVP", 3 | "polsat": "Polsat", 4 | "tvn": "TVN", 5 | "polskie-radio": "Polskie Radio", 6 | "tofkm": "TOFKM", 7 | "radio-zet": "Radio Zet", 8 | "eska": "Eska", 9 | "tokfm": "TOKFM", 10 | } -------------------------------------------------------------------------------- /quotaclimat/data_ingestion/labelstudio/configs.py: -------------------------------------------------------------------------------- 1 | db_config = [ 2 | {"database": "labelstudio", "countries": {6: "france", 9: 
"brazil", 20: "germany"}}, 3 | {"database": "labelstudio-climate-poland-prod-db", "countries": {1: "poland"}}, 4 | {"database": "labelstudio-climate-spain-prod-db", "countries": {1: "spain"}}, 5 | ] -------------------------------------------------------------------------------- /postgres/schemas/sitemap.pgsql: -------------------------------------------------------------------------------- 1 | CREATE TABLE sitemap_table( 2 | publication_name VARCHAR(255) NOT NULL, 3 | news_title TEXT NOT NULL, 4 | download_date DATE NOT NULL, 5 | news_publication_date DATE NOT NULL, 6 | news_keywords TEXT, 7 | section TEXT, 8 | image_caption TEXT, 9 | media_type VARCHAR(255) 10 | ) 11 | 12 | -------------------------------------------------------------------------------- /docker-entrypoint_stop_word.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run migrations before starting the application 4 | echo "Running migrations with alembic if exists" 5 | poetry run alembic upgrade head 6 | 7 | if [[ $? -eq 0 ]]; then 8 | echo "Command succeeded" 9 | else 10 | echo "Command failed" 11 | fi 12 | 13 | echo "starting stop_word import app" 14 | python quotaclimat/data_processing/mediatree/stop_word/main.py -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/docker-compose.yml: -------------------------------------------------------------------------------- 1 | name: Docker Compose CI 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | - name: init and load data 12 | run: docker compose up -d 13 | - name: sleep 14 | run: sleep 60 15 | - name: log sitemap 16 | run: docker logs sitemap 17 | - name: log db ingestion 18 | run: docker logs ingest_to_db 19 | - name: log streamlit 20 | run: docker logs streamlit -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/france/channel_titles.py: -------------------------------------------------------------------------------- 1 | channel_titles_france = { 2 | "tf1": "TF1", 3 | "france2": "France 2", 4 | "fr3-idf": "France 3-idf", 5 | "m6": "M6", 6 | "arte": "Arte", 7 | "d8": "C8", 8 | "bfmtv": "BFM TV", 9 | "lci": "LCI", 10 | "franceinfotv": "France Info TV", 11 | "itele": "CNews", 12 | "europe1": "Europe 1", 13 | "france-culture": "France Culture", 14 | "france-inter": "France Inter", 15 | "sud-radio": "Sud Radio", 16 | "rmc": "RMC", 17 | "rtl": "RTL", 18 | "france24": "France 24", 19 | "france-info": "FranceinfoRadio", 20 | "rfi": "RFI", 21 | } 
-------------------------------------------------------------------------------- /my_dbt_project/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - cd my_dbt_project 7 | - dbt debug 8 | - dbt run 9 | - dbt test 10 | 11 | 12 | ### Resources: 13 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 14 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 15 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 16 | - Find [dbt events](https://events.getdbt.com) near you 17 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 18 | -------------------------------------------------------------------------------- /.github/workflows/dependabot-auto-approve.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot auto-approve 2 | on: pull_request 3 | 4 | permissions: 5 | pull-requests: write 6 | 7 | jobs: 8 | dependabot: 9 | runs-on: ubuntu-latest 10 | if: github.event.pull_request.user.login == 'dependabot[bot]' && github.repository == 'dataforgoodfr/quotaclimat' 11 | steps: 12 | - name: Dependabot metadata 13 | id: metadata 14 | uses: dependabot/fetch-metadata@v2 15 | with: 16 | github-token: "${{ secrets.GITHUB_TOKEN }}" 17 | - name: Approve a PR 18 | run: gh pr review --approve "$PR_URL" 19 | env: 20 | PR_URL: ${{github.event.pull_request.html_url}} 21 | GH_TOKEN: ${{secrets.GITHUB_TOKEN}} 22 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # if the env var holds the Docker secret path, read the whole secret file into a string 4 | def get_password(): 5 | password = os.environ.get("MEDIATREE_PASSWORD") 6 | if(password == '/run/secrets/pwd_api'): 7 | password = open("/run/secrets/pwd_api", "r").read() 8 | return password 9 | 10 | def get_auth_url(): 11 | return os.environ.get("MEDIATREE_AUTH_URL") 12 | 13 | def get_user(): 14 | USER = os.environ.get("MEDIATREE_USER") 15 | if(USER == '/run/secrets/username_api'): 16 | USER = open("/run/secrets/username_api", "r").read() 17 | return USER 18 | 19 | # https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList 20 | def get_keywords_url(): 21 | return os.environ.get("KEYWORDS_URL")
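# NOTE: illustrative sketch of the secret-handling pattern above, not part of the original
# file. With Docker secrets, the environment variable may hold the literal path
# "/run/secrets/<name>" while the secret's content is mounted at that path, hence the
# equality check before reading the file. With a hypothetical secret name:
#
#     value = os.environ.get("MY_SECRET")  # may be the path "/run/secrets/my_secret"
#     if value == "/run/secrets/my_secret":
#         value = open("/run/secrets/my_secret", "r").read()  # the actual secret content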
-------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | ${imports if imports else ""} 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = ${repr(up_revision)} 16 | down_revision: Union[str, None] = ${repr(down_revision)} 17 | branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} 18 | depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} 19 | 20 | 21 | def upgrade() -> None: 22 | ${upgrades if upgrades else "pass"} 23 | 24 | 25 | def downgrade() -> None: 26 | ${downgrades if downgrades else "pass"} 27 | -------------------------------------------------------------------------------- /my_dbt_project/dbt/profiles.yml: -------------------------------------------------------------------------------- 1 | my_dbt_project: 2 | outputs: 3 | docker: 4 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 5 | port: "{{ env_var('POSTGRES_PORT') | as_number }}" 6 | schema: public 7 | threads: 4 8 | type: postgres 9 | user: "{{ env_var('POSTGRES_USER') }}" 10 | dbname: "{{ env_var('POSTGRES_DB') }}" 11 | host: "{{ env_var('POSTGRES_HOST') }}" 12 | analytics: 13 | pass: "{{ env_var('POSTGRES_PASSWORD') }}" 14 | port: "{{ env_var('POSTGRES_PORT') | as_number }}" 15 | schema: analytics 16 | threads: 4 17 | type: postgres 18 | user: "{{ env_var('POSTGRES_USER') }}" 19 | dbname: "{{ env_var('POSTGRES_DB') }}" 20 | host: "{{ env_var('POSTGRES_HOST') }}" 21 | target: docker 22 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Python: Current File", 6 | "type": "python", 7 | "request": "launch", 8 | "program": "${file}", 9 | "console": "integratedTerminal", 10 | "justMyCode": true 11 | }, 12 | { 13 | "name": "Python: File", 14 | "type": "python", 15 | "request": "launch", 16 | "program": "${file}", 17 | "justMyCode": true 18 | }, 19 | { 20 | "name": "Python data: Current File", 21 | "type": "python", 22 | "request": "launch", 23 | "program": "${file}", 24 | "console": "integratedTerminal" 25 | } 26 | ] 27 | } -------------------------------------------------------------------------------- /postgres/insert_existing_data_example.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pandas as pd 5 | 6 | from quotaclimat.data_ingestion.scrap_sitemap import get_sitemap_cols 7 | 8 | 9 | def parse_section(section: str): 10 | logging.debug(section) 11 | if "," not in section: 12 | return section 13 | else: 14 | return ",".join(map(str, section)) 15 | 16 | def transformation_from_dumps_to_table_entry(df: pd.DataFrame): 17 | try: 18 | cols = get_sitemap_cols() 19 | df_template_db = pd.DataFrame(columns=cols) 20 | df_consistent = pd.concat([df, df_template_db]) 21 | 22 | df_consistent.section = df_consistent.section.apply(parse_section) 23 | 24 | return df_consistent[cols] 25 | except Exception as err: 26 | logging.error("Could not transform %s" % (err)) 27 | return None -------------------------------------------------------------------------------- /alembic/versions/2450da0e6c60_number_of_keywords_20_30_40.py: -------------------------------------------------------------------------------- 1 | """number of keywords 20,30,40 2 | 3 | Revision ID: 2450da0e6c60 4 | Revises: 055173743036 5 | Create Date: 2024-06-19 10:21:34.624231 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic.
15 | revision: str = '2450da0e6c60' 16 | down_revision: Union[str, None] = '055173743036' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/5cc9e1ec5362_add_info_public_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add info/public to program metadata 2 | 3 | Revision ID: 5cc9e1ec5362 4 | Revises: 356882459cec 5 | Create Date: 2024-05-03 08:54:16.764307 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '5cc9e1ec5362' 16 | down_revision: Union[str, None] = '356882459cec' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/c1d78b9968fe_add_info_public_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add info/public to program metadata 2 | 3 | Revision ID: c1d78b9968fe 4 | Revises: 5cc9e1ec5362 5 | Create Date: 2024-05-03 08:56:47.087189 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'c1d78b9968fe' 16 | down_revision: Union[str, None] = '5cc9e1ec5362' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/a5c39db3c8e9_add_new_column_test_for_table_keywords.py: -------------------------------------------------------------------------------- 1 | """Add new column test for table keywords 2 | 3 | Revision ID: a5c39db3c8e9 4 | Revises: 5ccd746ee292 5 | Create Date: 2024-09-12 14:10:26.305593 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'a5c39db3c8e9' 16 | down_revision: Union[str, None] = '5ccd746ee292' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! 
### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/43103d5b49c9_program_add_start_end_date_for_grid_.py: -------------------------------------------------------------------------------- 1 | """program: add start/end date for grid evolution 2 | 3 | Revision ID: 43103d5b49c9 4 | Revises: af956a85658f 5 | Create Date: 2024-10-02 13:18:56.251135 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '43103d5b49c9' 16 | down_revision: Union[str, None] = 'af956a85658f' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | pass 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/5ccd746ee292_add_updated_at.py: -------------------------------------------------------------------------------- 1 | """add updated_at 2 | 3 | Revision ID: 5ccd746ee292 4 | Revises: 4ccd746ee291 5 | Create Date: 2024-07-03 06:35:00.316441 6 | """ 7 | from typing import Sequence, Union 8 | 9 | from alembic import op 10 | import sqlalchemy as sa 11 | from sqlalchemy.dialects import postgresql 12 | 13 | # revision identifiers, used by Alembic. 14 | revision: str = '5ccd746ee292' 15 | down_revision: Union[str, None] = '4ccd746ee291' 16 | branch_labels: Union[str, Sequence[str], None] = None 17 | depends_on: Union[str, Sequence[str], None] = None 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('keywords', sa.Column('updated_at',sa.DateTime(), nullable=True)) 22 | # ### end Alembic commands ### 23 | 24 | 25 | def downgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! ### 27 | op.drop_column('keywords', 'updated_at') 28 | # ### end Alembic commands ### 29 | -------------------------------------------------------------------------------- /alembic/versions/055173743036_keywords_add_channel_title.py: -------------------------------------------------------------------------------- 1 | """keywords: add channel_title 2 | 3 | 4 | Revision ID: 055173743036 5 | Revises: a0a707673259 6 | Create Date: 2024-06-05 11:43:22.071610 7 | 8 | """ 9 | from typing import Sequence, Union 10 | 11 | from alembic import op 12 | import sqlalchemy as sa 13 | 14 | 15 | # revision identifiers, used by Alembic. 16 | revision: str = '055173743036' 17 | down_revision: Union[str, None] = 'a0a707673259' 18 | branch_labels: Union[str, Sequence[str], None] = None 19 | depends_on: Union[str, Sequence[str], None] = None 20 | 21 | 22 | def upgrade() -> None: 23 | # ### commands auto generated by Alembic - please adjust! ### 24 | op.add_column('keywords', sa.Column('channel_title', sa.String(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('keywords','channel_title') 31 | # ### end Alembic commands ### 32 | -------------------------------------------------------------------------------- /alembic/versions/a0a707673259_add_radio_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add radio to program metadata 2 | 3 | Revision ID: a0a707673259 4 | Revises: 5bff4dceda53 5 | Create Date: 2024-05-03 09:36:04.954535 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'a0a707673259' 16 | down_revision: Union[str, None] = '5bff4dceda53' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('radio', sa.Boolean(), nullable=True)) 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | op.drop_column('program_metadata', 'radio') 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /test/sitemap/test_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pandas as pd 4 | def get_localhost(): 5 | localhost = "" 6 | if(os.environ.get("ENV") == "docker"): 7 | localhost = "http://nginxtest:80" 8 | else: 9 | localhost = "http://localhost:8000" 10 | return localhost 11 | 12 | def debug_df(df: pd.DataFrame): 13 | pd.set_option('display.max_columns', None) 14 | logging.warning("--------------------DEBUG DF-------------------") 15 | logging.info(df.dtypes) 16 | logging.info(df.head(3)) 17 | logging.warning("--------------------DEBUG DF-------------------") 18 | 19 | 20 | def list_of_dicts_to_set_of_frozensets(list_of_dicts): 21 | # Convert each dictionary to a frozenset to make it hashable 22 | return {frozenset(d.items()) for d in list_of_dicts} 23 | 24 | def compare_unordered_lists_of_dicts(list1, list2): 25 | # Convert each list of dictionaries to a set of frozensets 26 | set1 = list_of_dicts_to_set_of_frozensets(list1) 27 | set2 = list_of_dicts_to_set_of_frozensets(list2) 28 | 29 | # Check if the sets are equal 30 | return set1 == set2
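# NOTE: illustrative sketch, not part of the original file. The frozenset-based comparison
# above ignores ordering and collapses duplicates, and it requires every dict value to be
# hashable. Hypothetical usage:
#
#     assert compare_unordered_lists_of_dicts(
#         [{"a": 1}, {"b": 2}],
#         [{"b": 2}, {"a": 1}],
#     )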
-------------------------------------------------------------------------------- /Dockerfile_ingest: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.11 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.11-slim as runtime 21 | 22 | WORKDIR /app 23 | 24 | ENV VIRTUAL_ENV=/app/.venv 25 | ENV PATH="/app/.venv/bin:$PATH" 26 | ENV PATH="$PYENV_ROOT/bin:$PATH" 27 | ENV PYTHONPATH=/app 28 | 29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 30 | 31 | # App code is included with docker-compose as well 32 | 33 | 
COPY quotaclimat ./quotaclimat 34 | COPY postgres ./postgres 35 | COPY pyproject.toml pyproject.toml 36 | 37 | # healthcheck 38 | EXPOSE 5000 39 | 40 | 41 | ENTRYPOINT ["python", "quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py"] 42 | -------------------------------------------------------------------------------- /test/time_monitored/test_time_monitored.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | import pandas as pd 4 | 5 | from postgres.schemas.models import get_db_session, connect_to_db, create_tables 6 | from quotaclimat.data_processing.mediatree.time_monitored.models import * 7 | import zoneinfo 8 | 9 | @pytest.fixture(scope="module", autouse=True) 10 | def init_tables(): 11 | create_tables() 12 | 13 | def test_save_time_monitored(): 14 | start = datetime(2025, 1, 14, 15, 18, 43, 807525, tzinfo=zoneinfo.ZoneInfo(key='Europe/Paris')) 15 | channel_name = "test_channel" 16 | country = "france" 17 | id = get_consistent_hash(f"{channel_name}_{start}_{country}") 18 | duration_minutes = 30 19 | 20 | time_monitored = Time_Monitored( 21 | id=id, 22 | channel_name=channel_name, 23 | start=start, 24 | duration_minutes=duration_minutes, 25 | country=country 26 | ) 27 | save_time_monitored(number_of_rows=int(duration_minutes/2), day=start, channel=channel_name, country=country) 28 | 29 | output = get_time_monitored(id) 30 | assert output.duration_minutes == time_monitored.duration_minutes -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Data For Good France 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /alembic/versions/30abfd828007_program_metadata.py: -------------------------------------------------------------------------------- 1 | """program metadata 2 | 3 | Revision ID: 30abfd828007 4 | Revises: 43103d5b49c9 5 | Create Date: 2024-10-03 14:18:09.874225 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = '30abfd828007' 16 | down_revision: Union[str, None] = '43103d5b49c9' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('program_grid_start', sa.DateTime(), nullable=True)) 24 | op.add_column('program_metadata', sa.Column('program_grid_end', sa.DateTime(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('program_metadata', 'program_grid_end') 31 | op.drop_column('program_metadata', 'program_grid_start') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /alembic/versions/5bff4dceda53_add_info_public_to_program_metadata.py: -------------------------------------------------------------------------------- 1 | """Add info/public to program metadata 2 | 3 | Revision ID: 5bff4dceda53 4 | Revises: c1d78b9968fe 5 | Create Date: 2024-05-03 09:09:44.751432 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '5bff4dceda53' 16 | down_revision: Union[str, None] = 'c1d78b9968fe' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('public', sa.Boolean(), nullable=True)) 24 | op.add_column('program_metadata', sa.Column('infocontinue', sa.Boolean(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('program_metadata', 'infocontinue') 31 | op.drop_column('program_metadata', 'public') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /alembic/versions/827fb6dde3bb_time_monitored_new_table.py: -------------------------------------------------------------------------------- 1 | """time monitored new table 2 | 3 | Revision ID: 827fb6dde3bb 4 | Revises: c08231a9eb37 5 | Create Date: 2025-04-29 13:29:54.299095 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = '827fb6dde3bb' 16 | down_revision: Union[str, None] = 'c08231a9eb37' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | def upgrade() -> None: 21 | # Create the time_monitored table 22 | op.create_table( 23 | 'time_monitored', 24 | sa.Column('id', sa.String(), nullable=False), 25 | sa.Column('channel_name', sa.String(), nullable=False), 26 | sa.Column('start', sa.DateTime(), nullable=False), 27 | sa.Column('duration_minutes', sa.Integer(), nullable=True), 28 | sa.Column('country', sa.String(), nullable=False), 29 | sa.PrimaryKeyConstraint('id') 30 | ) 31 | 32 | 33 | def downgrade() -> None: 34 | # Drop the time_monitored table 35 | op.drop_table('time_monitored') -------------------------------------------------------------------------------- /alembic/versions/c08231a9eb37_program_add_created_at_updated_at.py: -------------------------------------------------------------------------------- 1 | """program: add created_at updated_at 2 | 3 | Revision ID: c08231a9eb37 4 | Revises: 4333bc46985d 5 | Create Date: 2025-03-29 08:17:51.997077 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'c08231a9eb37' 16 | down_revision: Union[str, None] = '4333bc46985d' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('program_metadata', sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text("(now() at time zone 'utc')"), nullable=True)) 24 | op.add_column('program_metadata', sa.Column('updated_at', sa.DateTime(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('program_metadata', 'updated_at') 31 | op.drop_column('program_metadata', 'created_at') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /alembic/versions/2c48f626a749_keywords_program_name.py: -------------------------------------------------------------------------------- 1 | """keywords: program name 2 | 3 | Revision ID: 2c48f626a749 4 | Revises: 5 | Create Date: 2024-04-12 12:44:23.512407 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '2c48f626a749' 16 | down_revision: Union[str, None] = None 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('channel_program', sa.String(), nullable=True)) 24 | op.add_column('keywords', sa.Column('channel_program_type', sa.String(), nullable=True)) 25 | op.add_column('keywords', sa.Column('category', sa.JSON(), nullable=True)) 26 | # ### end Alembic commands ### 27 | 28 | 29 | def downgrade() -> None: 30 | # ### commands auto generated by Alembic - please adjust! 
### 31 | op.drop_column('keywords', 'category') 32 | op.drop_column('keywords', 'channel_program_type') 33 | op.drop_column('keywords', 'channel_program') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /quotaclimat/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | class CustomFormatter(logging.Formatter): 4 | 5 | grey = "\x1b[38;20m" 6 | yellow = "\x1b[33;20m" 7 | red = "\x1b[31;20m" 8 | bold_red = "\x1b[31;1m" 9 | reset = "\x1b[0m" 10 | light_blue = "\x1b[36m" 11 | format = "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d | %(message)s" 12 | 13 | FORMATS = { 14 | logging.DEBUG: grey + format + reset, 15 | logging.INFO: light_blue + format + reset, 16 | logging.WARNING: yellow + format + reset, 17 | logging.ERROR: red + format + reset, 18 | logging.CRITICAL: bold_red + format + reset 19 | } 20 | 21 | def format(self, record): 22 | log_fmt = self.FORMATS.get(record.levelno) 23 | formatter = logging.Formatter(log_fmt) 24 | return formatter.format(record) 25 | 26 | def getLogger(): 27 | # get the root logger and set its level from the LOGLEVEL env var 28 | logger = logging.getLogger() 29 | logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper()) 30 | # replace any existing handlers with a console handler using the custom formatter 31 | if (logger.hasHandlers()): 32 | logger.handlers.clear() 33 | ch = logging.StreamHandler() 34 | ch.setFormatter(CustomFormatter()) 35 | logger.addHandler(ch) 36 | 37 | return logger -------------------------------------------------------------------------------- /alembic/versions/4ccd746ee291_add_20_30.py: -------------------------------------------------------------------------------- 1 | """add 20/30 2 | 3 | Revision ID: 4ccd746ee291 4 | Revises: 2450da0e6c60 5 | Create Date: 2024-06-20 06:35:00.316441 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | from sqlalchemy.dialects import postgresql 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = '4ccd746ee291' 16 | down_revision: Union[str, None] = '2450da0e6c60' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('number_of_keywords_20', sa.Integer(), nullable=True)) 24 | op.add_column('keywords', sa.Column('number_of_keywords_30', sa.Integer(), nullable=True)) 25 | op.add_column('keywords', sa.Column('number_of_keywords_40', sa.Integer(), nullable=True)) 26 | # ### end Alembic commands ### 27 | 28 | 29 | def downgrade() -> None: 30 | # ### commands auto generated by Alembic - please adjust! 
### 31 | op.drop_column('keywords', 'number_of_keywords_20') 32 | op.drop_column('keywords', 'number_of_keywords_30') 33 | op.drop_column('keywords', 'number_of_keywords_40') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /.github/workflows/scaleway-down.yml: -------------------------------------------------------------------------------- 1 | name: Stop Scaleway 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | schedule: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule 7 | - cron: '49 21 * * *' 8 | 9 | jobs: 10 | down: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Use CLI 14 | uses: jawher/action-scw@v2.34.0 15 | env: 16 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 17 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 18 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 19 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 20 | with: 21 | args: container container list name=${{ secrets.CONTAINER_NAME }} --output json 22 | 23 | - name: Get CONTAINER_ID env var 24 | run: echo "CONTAINER_ID=$(cat "${GITHUB_WORKSPACE}/scw.output" | jq -r '.[0].id')" >> $GITHUB_ENV 25 | 26 | 27 | - name: 0 instances 28 | uses: jawher/action-scw@v2.34.0 29 | env: 30 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 31 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 32 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 33 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 34 | with: 35 | args: container container update min-scale=0 ${{ env.CONTAINER_ID }} 36 | -------------------------------------------------------------------------------- /.github/workflows/scaleway-up.yml: -------------------------------------------------------------------------------- 1 | name: Start Scaleway 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | schedule: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule 7 | - cron: '52 05 * * *' 8 | 9 | jobs: 10 | up: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Use CLI 14 | uses: jawher/action-scw@v2.34.0 15 | env: 16 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 17 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 18 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 19 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 20 | with: 21 | args: container container list name=${{ secrets.CONTAINER_NAME }} --output json 22 | 23 | - name: Get CONTAINER_ID env var 24 | run: echo "CONTAINER_ID=$(cat "${GITHUB_WORKSPACE}/scw.output" | jq -r '.[0].id')" >> $GITHUB_ENV 25 | 26 | - name: start 1 instances 27 | uses: jawher/action-scw@v2.34.0 28 | env: 29 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 30 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 31 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 32 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 33 | with: 34 | args: container container update min-scale=1 ${{ env.CONTAINER_ID }} 35 | -------------------------------------------------------------------------------- /alembic/versions/af956a85658f_add_new_column_number_of_keywords_.py: -------------------------------------------------------------------------------- 1 | """Add new column number_of_keywords climat/biod/r 2 | 3 | Revision ID: af956a85658f 4 | Revises: a5c39db3c8e9 5 | Create Date: 2024-09-12 14:15:12.049367 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 
10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'af956a85658f' 16 | down_revision: Union[str, None] = 'a5c39db3c8e9' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('number_of_keywords_climat', sa.Integer(), nullable=True)) 24 | op.add_column('keywords', sa.Column('number_of_keywords_biodiversite', sa.Integer(), nullable=True)) 25 | op.add_column('keywords', sa.Column('number_of_keywords_ressources', sa.Integer(), nullable=True)) 26 | # ### end Alembic commands ### 27 | 28 | 29 | def downgrade() -> None: 30 | # ### commands auto generated by Alembic - please adjust! ### 31 | op.drop_column('keywords', 'number_of_keywords_ressources') 32 | op.drop_column('keywords', 'number_of_keywords_biodiversite') 33 | op.drop_column('keywords', 'number_of_keywords_climat') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | RUN apt update && apt-get install -y git 23 | 24 | WORKDIR /app 25 | 26 | ENV VIRTUAL_ENV=/app/.venv 27 | ENV PATH="/app/.venv/bin:$PATH" 28 | ENV PATH="$PYENV_ROOT/bin:$PATH" 29 | ENV PYTHONPATH=/app 30 | ENV DBT_PROFILES_DIR=/app/my_dbt_project/dbt 31 | ENV DBT_PROJECT_DIR=/app/my_dbt_project 32 | 33 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 34 | 35 | # For streamlit only 36 | COPY pyproject.toml poetry.lock ./ 37 | RUN pip install poetry 38 | 39 | # App code is included with docker-compose as well 40 | 41 | COPY quotaclimat ./quotaclimat 42 | COPY postgres ./postgres 43 | COPY alembic/ ./alembic 44 | COPY transform_program.py ./transform_program.py 45 | COPY my_dbt_project/ ./my_dbt_project 46 | 47 | # Docker compose overwrites this config to have only one Dockerfile 48 | CMD ["ls"] 49 | -------------------------------------------------------------------------------- /Dockerfile_stop_word: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | WORKDIR /app 23 | 24 | ENV VIRTUAL_ENV=/app/.venv 25 | ENV 
PATH="/app/.venv/bin:$PATH" 26 | ENV PATH="$PYENV_ROOT/bin:$PATH" 27 | ENV PYTHONPATH=/app 28 | 29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 30 | 31 | # App code is include with docker-compose as well 32 | 33 | COPY quotaclimat ./quotaclimat 34 | COPY postgres ./postgres 35 | COPY pyproject.toml pyproject.toml 36 | COPY alembic/ ./alembic 37 | COPY alembic.ini ./alembic.ini 38 | COPY transform_program.py ./transform_program.py 39 | 40 | # healthcheck 41 | EXPOSE 5050 42 | 43 | # Use a separate script to handle migrations and start the application 44 | COPY docker-entrypoint_stop_word.sh ./docker-entrypoint_stop_word.sh 45 | RUN chmod +x ./docker-entrypoint_stop_word.sh 46 | 47 | ENTRYPOINT ["./docker-entrypoint_stop_word.sh"] -------------------------------------------------------------------------------- /quotaclimat/utils/healthcheck_config.py: -------------------------------------------------------------------------------- 1 | 2 | import http.server 3 | import socketserver 4 | import os 5 | import logging 6 | import asyncio 7 | import tomli 8 | 9 | def get_app_version(): 10 | # Open and read the pyproject.toml file 11 | with open('pyproject.toml', 'rb') as toml_file: 12 | pyproject_data = tomli.load(toml_file) 13 | 14 | # Access the version from the pyproject.toml file 15 | version = pyproject_data['project']['version'] 16 | return version 17 | 18 | version = get_app_version() 19 | 20 | class HealthCheckHandler(http.server.SimpleHTTPRequestHandler): 21 | def do_GET(self): 22 | self.send_response(200) 23 | self.end_headers() 24 | self.wfile.write((f"Healthy.\n\nApp version {version}").encode()) 25 | 26 | async def run_health_check_server(): 27 | PORT = int(os.environ.get("PORT_HS", 5050)) 28 | SERVER_ADDRESS = os.environ.get("HEALTHCHECK_SERVER", "") 29 | 30 | logging.info(f"App version {version}") 31 | logging.info(f"Healthcheck at '{SERVER_ADDRESS}' : port {PORT}") 32 | with socketserver.TCPServer((SERVER_ADDRESS, PORT), HealthCheckHandler) as httpd: 33 | try: 34 | await asyncio.to_thread(httpd.serve_forever) 35 | except asyncio.CancelledError: 36 | logging.info("health check cancel") 37 | httpd.shutdown() # to terminal infinite loop "serve_forever" 38 | return 39 | -------------------------------------------------------------------------------- /Dockerfile_api_to_s3: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | WORKDIR /app 23 | 24 | ENV VIRTUAL_ENV=/app/.venv 25 | ENV PATH="/app/.venv/bin:$PATH" 26 | ENV PATH="$PYENV_ROOT/bin:$PATH" 27 | ENV PYTHONPATH=/app 28 | 29 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 30 | 31 | # App code is include with docker-compose as well 32 | 33 | COPY quotaclimat ./quotaclimat 34 | COPY postgres ./postgres 35 | COPY pyproject.toml pyproject.toml 36 | COPY alembic/ ./alembic 37 | COPY alembic.ini ./alembic.ini 38 | COPY transform_program.py ./transform_program.py 39 | 40 | # healthcheck 41 | EXPOSE 5050 
42 | 43 | # Use a separate script to handle migrations and start the application 44 | COPY docker-entrypoint.sh ./docker-entrypoint.sh 45 | RUN chmod +x ./docker-entrypoint.sh 46 | 47 | 48 | ENTRYPOINT ["python", "quotaclimat/data_processing/mediatree/s3/api_to_s3.py"] 49 | -------------------------------------------------------------------------------- /Dockerfile_api_import: -------------------------------------------------------------------------------- 1 | #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0 2 | FROM python:3.12.10 as builder 3 | 4 | ENV VIRTUAL_ENV=/app/.venv 5 | 6 | ENV POETRY_NO_INTERACTION=1 \ 7 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 8 | POETRY_VIRTUALENVS_CREATE=1 \ 9 | POETRY_CACHE_DIR=/tmp/poetry_cache 10 | 11 | WORKDIR /app 12 | 13 | COPY pyproject.toml poetry.lock ./ 14 | 15 | RUN pip install poetry==2.1.3 16 | 17 | RUN poetry install --no-root 18 | 19 | # The runtime image, used to just run the code provided its virtual environment 20 | FROM python:3.12.10-slim as runtime 21 | 22 | RUN apt update && apt-get install -y git 23 | 24 | WORKDIR /app 25 | 26 | ENV VIRTUAL_ENV=/app/.venv 27 | ENV PATH="/app/.venv/bin:$PATH" 28 | ENV PATH="$PYENV_ROOT/bin:$PATH" 29 | ENV PYTHONPATH=/app 30 | 31 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 32 | 33 | # App code is include with docker-compose as well 34 | 35 | COPY quotaclimat ./quotaclimat 36 | COPY postgres ./postgres 37 | COPY pyproject.toml pyproject.toml 38 | COPY alembic/ ./alembic 39 | COPY alembic.ini ./alembic.ini 40 | COPY transform_program.py ./transform_program.py 41 | COPY my_dbt_project/ ./my_dbt_project 42 | COPY i8n/ ./i8n 43 | ENV DBT_PROFILES_DIR=/app/my_dbt_project/dbt 44 | ENV DBT_PROJECT_DIR=/app/my_dbt_project 45 | 46 | # healthcheck 47 | EXPOSE 5050 48 | 49 | # Use a separate script to handle migrations and start the application 50 | COPY docker-entrypoint.sh ./docker-entrypoint.sh 51 | RUN chmod +x ./docker-entrypoint.sh 52 | 53 | ENTRYPOINT ["./docker-entrypoint.sh"] 54 | -------------------------------------------------------------------------------- /my_dbt_project/seeds/time_monitored.csv: -------------------------------------------------------------------------------- 1 | id,channel_name,start,duration_minutes,country 2 | f48e555ced0b59dc6016b9ed62e4ca0b630ff98d48ac459c8f3ae0945d81a534,daserste,"February 01, 2025, 12:00 AM",258,germany 3 | 3a6fd867f15cafbddc489509576a495b1794633e895ff0f18a48250bb6f1cf25,zdf-neo,"February 01, 2025, 12:00 AM",352,germany 4 | 31a2db38f49bd7b3d1689369a409bca7f031f2cab2c2d2c8715d367560651277,rtl-television,"February 01, 2025, 12:00 AM",294,germany 5 | 37d6723cd58f3b137045298c8b3dded8563da30df84e979cf27441808c7381ec,sat1,"February 01, 2025, 12:00 AM",222,germany 6 | f015abc528de99458ea833d94cdea466ab0e9c4445727a2d005bca9b2ea4adff,prosieben,"February 01, 2025, 12:00 AM",156,germany 7 | 143cfbae72cbf7c634645fe8f0b3dce52c3e95c0d27d01af10210252ec3e67e8,kabel-eins,"February 01, 2025, 12:00 AM",36,germany 8 | cf6d8f980175b1335583bce4a40595eca5886fcaa9ebeaf7611557fc41b6cf21,tf1,"February 01, 2025, 12:00 AM",258,france 9 | 6b7e0d69c3111ceb6b9f176f5c3748b5c9d44a898f5c2d9ecc7e3f0a37cb5adf,france2,"February 01, 2025, 12:00 AM",334,france 10 | 3b046c77314301e63bef3a4142eb9ac62b48fe52b72602de1ab3d93eb1c5d24b,fr3-idf,"February 01, 2025, 12:00 AM",240,france 11 | b51fe8a6a65b06ead17099a2eac4312b526f76f9b1f256d8d3779c76533a3b6a,m6,"February 01, 2025, 12:00 AM",316,france 12 | 
9b1ebe8bc77b319560f91fc1c768079ff16e9f01f544b5aad25065d335c5f3f7,arte,"February 01, 2025, 12:00 AM",88,france
13 | 6aba0a0299934ed1a3411289a51ccbd11b6d9236ffef2adc8df0d76b003357f0,bfmtv,"February 01, 2025, 12:00 AM","1,030",france
14 | 0bb8064e6500c8bc63e9e30f42d21d9ad5322d508f04dd024c1b76956f0d40c4,franceinfotv,"February 01, 2025, 12:00 AM","1,030",france
--------------------------------------------------------------------------------
/test/sitemap/test_scrap_html.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | from quotaclimat.data_ingestion.scrap_html.scrap_description_article import get_meta_news, get_hat_20minutes, get_url_content
4 | from quotaclimat.data_ingestion.scrap_sitemap import get_description_article
5 | from bs4 import BeautifulSoup
6 | from test_utils import get_localhost, debug_df
7 |
8 | localhost = get_localhost()
9 |
10 | @pytest.mark.asyncio
11 | async def test_get_description_article():
12 |     url_to_parse = f"{localhost}/mediapart_website.html"
13 |     media = "Le Figaro"
14 |     df_articles = pd.DataFrame([{
15 |         "url": url_to_parse,
16 |         "news_title": media,
17 |     }])
18 |
19 |     expected_result = pd.DataFrame([{
20 |         "url": url_to_parse,
21 |         "news_title": media,
22 |         "news_description": "description could be parsed with success"
23 |     }])
24 |
25 |     df_articles["news_description"] = await get_description_article(media, df_articles)
26 |     debug_df(df_articles)
27 |     pd.testing.assert_frame_equal(df_articles.reset_index(drop=True), expected_result.reset_index(drop=True))
28 |
29 | @pytest.mark.asyncio
30 | async def test_get_meta_news():
31 |     url_to_parse = f"{localhost}/mediapart_website.html"
32 |
33 |     output = await get_meta_news(url_to_parse, "media")
34 |     assert output["description"] == "description could be parsed with success"
35 |
36 | @pytest.mark.asyncio
37 | async def test_get_hat_20minutes():
38 |     url_to_parse = f"{localhost}/20minutes_website.html"
39 |
40 |     response = await get_url_content(url_to_parse)
41 |     hat = get_hat_20minutes(BeautifulSoup(response, "html.parser"))
42 |     assert hat == "howdy there"
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "quotaclimat"
3 | version = "2.0.74"
4 | description = ""
5 | authors = [
6 |     {name = "Rambier Estelle", email = "estelle.rambier@hotmail.fr"},
7 |     {name = "Paul Leclercq", email = "paul@epauler.fr"}
8 | ]
9 | readme = "README.md"
10 |
11 | [tool.pytest.ini_options]
12 | log_cli = 1
13 | log_cli_level = "INFO"
14 | testpaths = [
15 |     "test"
16 | ]
17 |
18 | [tool.poetry.dependencies]
19 | s3fs = {extras = ["boto3"], version = ">=2023.12.0"}
20 | boto3 = "*"
21 | botocore = "*"
22 | python = ">=3.11,<=3.13"
23 | s3transfer = "0.10.4"
24 | pandas = "^2.2.3"
25 | advertools = "^0.14.1"
26 | xmltodict = "^0.13.0"
27 | sqlalchemy = "^2.0.35"
28 | psycopg2-binary = "^2.9.5"
29 | alembic = "^1.13.1"
30 | beautifulsoup4 = "^4.11.1"
31 | asyncio = "^3.4.3"
32 | tomli = "^2.0.1"
33 | aiohttp = "^3.10.8"
34 | pytest-asyncio = "^0.23.5"
35 | swifter = "^1.4.0"
36 | tenacity = "^8.2.3"
37 | sentry-sdk = ">=2.53.0"
38 | modin = {extras = ["ray"], version = "^0.32.0"}
39 | openpyxl = "^3.1.5"
40 | requests = "^2.32.3"
41 | thefuzz = "^0.22.1"
42 | dbt-core = "^1.9.2"
43 | dbt-postgres = "^1.9.0"
44 | ruff = "^0.13.3"
45 | graphviz = "^0.21"
46 | matplotlib = "^3.10.7"
47 | plotly = "^6.5.0"
48 |
nbformat = "^5.10.4" 49 | kaleido = "^1.2.0" 50 | [build-system] 51 | requires = ["poetry-core>=1.1"] 52 | build-backend = "poetry.core.masonry.api" 53 | 54 | 55 | 56 | [tool.poetry.group.dev.dependencies] 57 | coverage = "^7.5.4" 58 | pytest = "^8.1.1" 59 | pytest-cov = "^5.0.0" 60 | poetry-bumpversion = "^0.3.1" 61 | pre-commit = "^2.18.1" 62 | black = "^22.3.0" 63 | isort = "^5.10.1" 64 | flake8 = "^4.0.1" 65 | invoke = "^1.7.3" 66 | deptry = "^0.20.0" 67 | graphviz = "^0.21" 68 | ipykernel = "^7.0.1" 69 | -------------------------------------------------------------------------------- /my_dbt_project/models/analytics/environmental_shares_with_desinfo_counts.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental', 3 | unique_key=['start','channel_name','country'] 4 | ) 5 | }} 6 | 7 | with env_shares as ( 8 | with name_map as ( 9 | select 10 | channel_title, 11 | max(channel_name) channel_name 12 | from 13 | program_metadata pm 14 | where pm.country='france' 15 | group by 16 | channel_title 17 | ) 18 | select 19 | start, 20 | cqes."Program Metadata - Channel Name__channel_title" as "channel_title", 21 | name_map.channel_name, 22 | cqes.sum_duration_minutes, 23 | cqes."% climat" as weekly_perc_climat, 24 | 'france' as country 25 | from 26 | public.core_query_environmental_shares cqes 27 | left join 28 | name_map 29 | on 30 | name_map.channel_title=cqes."Program Metadata - Channel Name__channel_title" 31 | union all 32 | select 33 | cqesin."start", 34 | cqesin.channel_title, 35 | cqesin.channel_name, 36 | cqesin.sum_duration_minutes, 37 | cqesin."% climat" as weekly_perc_climat, 38 | country 39 | from 40 | public.core_query_environmental_shares_i8n cqesin 41 | where country!='france' 42 | ), 43 | weekly_desinfo as ( 44 | select 45 | date_trunc('week', tgc.data_item_start) week_start, 46 | tgc.data_item_channel_name, 47 | tgc.country, 48 | sum(case when tgc.mesinfo_correct is null then 0 else tgc.mesinfo_correct end) total_mesinfo 49 | from 50 | {{ ref("task_global_completion") }} tgc 51 | where tgc."Annotation Version"=1 52 | group by 53 | week_start, 54 | tgc.data_item_channel_name, 55 | tgc.country 56 | ) 57 | select 58 | env_shares.*, 59 | case when weekly_desinfo.total_mesinfo is null then 0 else weekly_desinfo.total_mesinfo end total_mesinfo 60 | from 61 | env_shares 62 | left join 63 | weekly_desinfo 64 | on 65 | env_shares.start=weekly_desinfo.week_start 66 | and env_shares.channel_name=weekly_desinfo.data_item_channel_name 67 | and env_shares.country=weekly_desinfo.country 68 | -------------------------------------------------------------------------------- /my_dbt_project/models/dashboards/core_query_causal_links.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental', 3 | incremental_strategy='append', 4 | on_schema_change='append_new_columns' 5 | ) 6 | }} 7 | 8 | {% set process_month = var("process_month", date_trunc('month', current_date)) %} 9 | 10 | SELECT 11 | public.keywords.id, 12 | public.keywords.channel_title, 13 | public.keywords.country, 14 | public.keywords.start, 15 | kw_consequence ->> 'keyword' AS keyword, 16 | CASE 17 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%climat%' THEN 'Crise climatique' 18 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité' 19 | WHEN LOWER(kw_consequence ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources' 20 | ELSE 'Autre' 21 | END AS 
crise,
22 |     (
23 |         SELECT COUNT(*)
24 |         FROM public.keywords k2
25 |         WHERE k2.channel_title = public.keywords.channel_title
26 |         AND k2.number_of_changement_climatique_constat_no_hrfp > 0
27 |         AND k2.start BETWEEN public.keywords.start - interval '4 minutes' AND public.keywords.start + interval '4 minutes'
28 |         and date_trunc('month', public.keywords.start) = cast('{{ var("process_month") }}' as date)
29 |     ) AS nb_constats_climat_neighbor,
30 |     (
31 |         SELECT COUNT(*)
32 |         FROM public.keywords k3
33 |         WHERE k3.channel_title = public.keywords.channel_title
34 |         AND k3.number_of_biodiversite_concepts_generaux_no_hrfp > 0
35 |         AND k3.start BETWEEN public.keywords.start - interval '4 minutes' AND public.keywords.start + interval '4 minutes'
36 |         and date_trunc('month', public.keywords.start) = cast('{{ var("process_month") }}' as date)
37 |     ) AS nb_constats_biodiversite_neighbor
38 | FROM public.keywords
39 | CROSS JOIN LATERAL json_array_elements(public.keywords.keywords_with_timestamp::json) kw_consequence
40 | WHERE LOWER(kw_consequence ->> 'theme') LIKE '%consequence%'
41 | and date_trunc('month', public.keywords.start) = cast('{{ var("process_month") }}' as date)
--------------------------------------------------------------------------------
/test/i8n/test_country.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from quotaclimat.data_processing.mediatree.i8n.country import *
4 |
5 | def test_validate_country_code_fra():
6 |     france_code = validate_country_code("fra")
7 |     assert france_code == FRANCE.code
8 |
9 | def test_validate_country_code_invalid():
10 |     with pytest.raises(ValueError, match="Invalid country code: nz"):
11 |         validate_country_code("nz")
12 |
13 | def test_get_country_from_code_fra():
14 |     france = get_country_from_code("fra")
15 |     assert france == FRANCE
16 |
17 | def test_get_channels_brazil():
18 |     os.environ['ENV'] = 'prod'
19 |     channels = get_channels(country_code=BRAZIL.code)
20 |     assert channels == BRAZIL.channels
21 |     os.environ['ENV'] = 'docker'
22 |
23 | def test_get_channels_default_docker():
24 |     os.environ['ENV'] = 'docker'
25 |     channels = get_channels()
26 |     assert channels == ["france2"]
27 |
28 |
29 | def test_get_channels_default_prod():
30 |     os.environ['ENV'] = 'prod'
31 |     channels = get_channels()
32 |     assert channels == FRANCE.channels
33 |     os.environ['ENV'] = 'docker'
34 |
35 | def test_get_channel_title_for_name():
36 |     assert get_channel_title_for_name("tf1") == "TF1"
37 |
38 | def test_get_channel_title_for_name_germany():
39 |     assert get_channel_title_for_name("rtl-television", GERMANY) == "RTL"
40 |
41 | def test_get_channels_poland():
42 |     os.environ['ENV'] = 'prod'
43 |     channels = get_channels(country_code=POLAND.code)
44 |     assert channels == POLAND.channels
45 |     os.environ['ENV'] = 'docker'
46 |
47 | def test_get_channel_title_for_name_poland():
48 |     assert get_channel_title_for_name("tvp", POLAND) == "TVP"
49 |
50 | def test_get_channels_spain():
51 |     os.environ['ENV'] = 'prod'
52 |     channels = get_channels(country_code=SPAIN.code)
53 |     assert channels == SPAIN.channels
54 |     os.environ['ENV'] = 'docker'
55 |
56 | def test_get_channel_title_for_name_spain():
57 |     assert get_channel_title_for_name("antenna-3", SPAIN) == "Antenna 3"
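58 |
59 | # For reference, these tests assume a country module shaped roughly like the
60 | # sketch below (illustrative only, not the real implementation):
61 | #   FRANCE = Country(code="fra", name="france", channels=[...])
62 | #   def get_channels(country_code: str = FRANCE.code) -> list[str]: ...
63 | #   def get_channel_title_for_name(name: str, country: Country = FRANCE) -> str: ...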
--------------------------------------------------------------------------------
/alembic/versions/356882459cec_remove_category_keywords_change_columns_.py:
--------------------------------------------------------------------------------
1 | """Remove: category keywords / change column names
2 |
3 | Revision ID: 356882459cec
4 | Revises: 2c48f626a749
5 | Create Date: 2024-04-29 10:14:27.240887
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '356882459cec'
16 | down_revision: Union[str, None] = '2c48f626a749'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('keywords', sa.Column('number_of_ressources', sa.Integer(), nullable=True))
24 |     op.add_column('keywords', sa.Column('number_of_ressources_solutions', sa.Integer(), nullable=True))
25 |     op.drop_column('keywords', 'number_of_ressources_naturelles_causes')
26 |     op.drop_column('keywords', 'number_of_ressources_naturelles_concepts_generaux')
27 |     op.drop_column('keywords', 'category')
28 |     op.drop_column('keywords', 'number_of_ressources_naturelles_solutions')
29 |     # ### end Alembic commands ###
30 |
31 |
32 | def downgrade() -> None:
33 |     # ### commands auto generated by Alembic - please adjust! ###
34 |     op.add_column('keywords', sa.Column('number_of_ressources_naturelles_solutions', sa.INTEGER(), autoincrement=False, nullable=True))
35 |     op.add_column('keywords', sa.Column('category', postgresql.JSON(astext_type=sa.Text()), autoincrement=False, nullable=True))
36 |     op.add_column('keywords', sa.Column('number_of_ressources_naturelles_concepts_generaux', sa.INTEGER(), autoincrement=False, nullable=True))
37 |     op.add_column('keywords', sa.Column('number_of_ressources_naturelles_causes', sa.INTEGER(), autoincrement=False, nullable=True))
38 |     op.drop_column('keywords', 'number_of_ressources_solutions')
39 |     op.drop_column('keywords', 'number_of_ressources')
40 |     # ### end Alembic commands ###
41 |
--------------------------------------------------------------------------------
/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Run migrations before starting the application
4 | echo "Running alembic migrations, if any"
5 | poetry run alembic upgrade head
6 |
7 |
8 | echo "update program metadata file"
9 | poetry run python3 transform_program.py
10 | if [[ $? -eq 0 ]]; then
11 |     echo "Command succeeded"
12 | else
13 |     echo "Command failed"
14 | fi
15 | if [[ "${REPARSE_CAUSAL_LINKS:-0}" -eq 1 ]]; then
16 |     echo "Reparsing core_query_causal_links"
17 |
18 |     for y in $(seq 2022 2025); do
19 |         for mm in $(seq -w 1 12); do
20 |             date="$y-$mm-01"
21 |             echo "Processing month: $date"
22 |             poetry run dbt run --select core_query_causal_links --vars "{\"process_month\": \"$date\"}"
23 |         done
24 |     done
25 | else
26 |     echo "starting mediatree import app"
27 |     python quotaclimat/data_processing/mediatree/api_import.py
28 |
29 |     echo "ingest labelstudio data into barometre database"
30 |     poetry run python -m quotaclimat.data_ingestion.labelstudio.ingest_labelstudio
31 |
32 |     echo "apply dbt models - except causal links and analytics tables"
33 |     poetry run dbt run --full-refresh \
34 |         --exclude core_query_causal_links \
35 |         --exclude task_global_completion \
36 |         --exclude environmental_shares_with_desinfo_counts
37 |
38 |     echo "apply dbt models to build analytics tables in 'analytics' schema."
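39 |     # The --target flag below assumes a second output named 'analytics' in
40 |     # my_dbt_project/dbt/profiles.yml, pointing the same Postgres database at a
41 |     # separate 'analytics' schema; a rough sketch of such an output (illustrative
42 |     # only, not the actual profile):
43 |     #   outputs:
44 |     #     analytics:
45 |     #       type: postgres
46 |     #       schema: analytics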
47 |     poetry run dbt run --full-refresh --target analytics \
48 |         --select task_global_completion \
49 |         --select environmental_shares_with_desinfo_counts
50 |
51 |     echo "Causal query case: Checking if today is the first of the month..."
52 |     day=$(date +%d)
53 |
54 |     if [ "$day" -eq 01 ]; then
55 |         echo "✅ It's the 1st — running DBT for the previous month"
56 |
57 |         # previous month (first day)
58 |         prev_month=$(date -d "$(date +%Y-%m-01) -1 month" +%Y-%m-01)
59 |
60 |         echo "Processing month: $prev_month"
61 |         poetry run dbt run --select core_query_causal_links --vars "{\"process_month\": \"$prev_month\"}"
62 |     else
63 |         echo "⏭️ Not the 1st — skipping DBT run"
64 |     fi
65 |
66 | fi
67 |
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/scrap_html/scrap_description_article.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import aiohttp
4 | from bs4 import BeautifulSoup
5 | import asyncio
6 | import re
7 |
8 | agent = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"}
9 | async def get_url_content(url_article: str):
10 |     async with aiohttp.ClientSession() as session:
11 |         async with session.get(url_article, headers=agent) as response:
12 |             return await response.text()
13 |
14 | def get_hat_20minutes(soup, url_article = ""):
15 |     hat = soup.select_one(".hat-summary")
16 |     if hat is not None:
17 |         return (hat.text).strip()
18 |     else:
19 |         logging.warning(f"could not get hat : {url_article}")
20 |         return ""
21 |
22 | # parse <meta> tags, see https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta
23 | async def get_meta_news(url_article, media):
24 |     result = {
25 |         "title": "",
26 |         "description": "",
27 |     }
28 |
29 |     if(media != "ouest-france"): # anti-robot protection
30 |         response = await get_url_content(str(url_article))
31 |     else:
32 |         return result
33 |
34 |     soup = BeautifulSoup(response, "html.parser")
35 |     soup_description = soup.find(name="meta", attrs={'name': 'description'})
36 |     if soup_description is not None:
37 |         description = soup_description.get("content").strip()
38 |         logging.debug(f"description for {url_article} is \n {description}")
39 |         result["description"] = description
40 |     elif media == "20_minutes": # does not have meta description
41 |         hat = get_hat_20minutes(soup, url_article)
42 |         logging.info(f"reading hat for {media} - {hat}")
43 |         result["description"] = hat
44 |     else:
45 |         logging.warning(f"could not find description for {url_article} - response \n {response}")
46 |
47 |     # TODO : use it someday to parse missing data
48 |     soup_title = soup.find(name="title")
49 |     if soup_title is not None:
50 |         result["title"] = (soup_title.string).strip()
51 |
52 |     return result
53 |
54 |
55 |
--------------------------------------------------------------------------------
/postgres/database_connection.py:
--------------------------------------------------------------------------------
1 | import os
2 | from sqlalchemy import create_engine, URL, Engine
3 | from sqlalchemy.orm import sessionmaker, Session
4 | import logging
5 |
6 | logging.basicConfig(level=logging.INFO)
7 |
8 |
9 | def connect_to_db(
10 |     database: str = os.environ.get("POSTGRES_DB", "barometre"),
11 |     user: str = os.environ.get("POSTGRES_USER", "user"),
12 |     host: str = os.environ.get("POSTGRES_HOST", "localhost"),
13 |     port: int = int(os.environ.get("POSTGRES_PORT", 5432)),
14 |     password: str = os.environ.get("POSTGRES_PASSWORD", "password"),
15 | ):
16 |     """
17 |     Connect to the PostgreSQL database using environment variables or provided parameters.
18 |
19 |     Parameters:
20 |     - database (str, optional): The name of the database. Defaults to 'barometre'.
21 |     - user (str, optional): The username for accessing the database. Defaults to 'user'.
22 |     - host (str, optional): The hostname of the database server. Defaults to 'localhost'.
23 |     - port (int, optional): The port number on which the database server is listening. Defaults to 5432.
24 |     - password (str, optional): The password for accessing the database. Defaults to 'password'.
25 |
26 |     Returns:
27 |     - Engine: The SQLAlchemy engine object representing the connection to the database.
28 |     """
29 |
30 |     logging.info("Connect to the host %s for DB %s" % (host, database))
31 |
32 |     url = URL.create(
33 |         drivername="postgresql",
34 |         username=user,
35 |         host=host,
36 |         database=database,
37 |         port=port,
38 |         password=password,
39 |     )
40 |
41 |     engine = create_engine(url)
42 |
43 |     return engine
44 |
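45 | # A minimal usage sketch for these helpers, assuming the POSTGRES_* variables
46 | # above are set (`text` comes from sqlalchemy):
47 | #
48 | #   from sqlalchemy import text
49 | #   engine = connect_to_db()
50 | #   with get_db_session(engine) as session:
51 | #       session.execute(text("SELECT 1"))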
"password"), 15 | ): 16 | """ 17 | Connect to the PostgreSQL database using environment variables or provided parameters. 18 | 19 | Parameters: 20 | - database (str, optional): The name of the database. Defaults to 'barometre'. 21 | - user (str, optional): The username for accessing the database. Defaults to 'user'. 22 | - localhost (str, optional): The hostname of the database server. Defaults to 'localhost'. 23 | - port (int, optional): The port number on which the database server is listening. Defaults to 5432. 24 | - password (str, optional): The password for accessing the database. Defaults to 'password'. 25 | 26 | Returns: 27 | - Engine: The SQLAlchemy engine object representing the connection to the database. 28 | """ 29 | 30 | logging.info("Connect to the host %s for DB %s" % (host, database)) 31 | 32 | url = URL.create( 33 | drivername="postgresql", 34 | username=user, 35 | host=host, 36 | database=database, 37 | port=port, 38 | password=password, 39 | ) 40 | 41 | engine = create_engine(url) 42 | 43 | return engine 44 | 45 | 46 | def get_db_session(engine: Engine = None) -> Session: 47 | """ 48 | Create a session for interacting with the database using the provided engine. 49 | 50 | Parameters: 51 | - engine (Engine, optional): The SQLAlchemy engine object. If not provided, it calls `connect_to_db()` to obtain one. 52 | 53 | Returns: 54 | - Session: A SQLAlchemy session bound to the provided engine or created by calling `connect_to_db()`. 55 | """ 56 | if engine is None: 57 | engine = connect_to_db() 58 | 59 | Session = sessionmaker(bind=engine) 60 | return Session() 61 | -------------------------------------------------------------------------------- /quotaclimat/utils/sentry.py: -------------------------------------------------------------------------------- 1 | 2 | import ray 3 | import os 4 | import logging 5 | from quotaclimat.utils.healthcheck_config import get_app_version 6 | import sentry_sdk 7 | from sentry_sdk.integrations.logging import LoggingIntegration 8 | 9 | # read SENTRY_DSN from env 10 | functions_to_trace = [ 11 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_cts_in_ms_for_keywords"}, 12 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_keyword_with_same_timestamp"}, 13 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_themes_keywords_duration"}, 14 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.count_keywords_duration_overlap"}, 15 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_and_tag_by_theme"}, 16 | {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.add_primary_key"}, 17 | {"qualified_name": "quotaclimat.data_processing.mediatree.api_import.extract_api_sub"}, 18 | {"qualified_name": "quotaclimat.data_processing.mediatree.api_import.parse_reponse_subtitle"}, 19 | ] 20 | 21 | def sentry_init(): 22 | if(os.environ.get("SENTRY_DSN", None) != None): 23 | logging.info("Sentry init") 24 | logging_kwargs = {} 25 | if os.getenv("SENTRY_LOGGING") == "true": 26 | logging_kwargs = dict( 27 | enable_logs=True, 28 | integrations=[ 29 | # Only send WARNING (and higher) logs to Sentry logs, 30 | # even if the logger is set to a lower level. 31 | LoggingIntegration(sentry_logs_level=logging.INFO), 32 | ] 33 | ) 34 | sentry_sdk.init( 35 | traces_sample_rate=0.3, 36 | # To set a uniform sample rate 37 | # Set profiles_sample_rate to 1.0 to profile 100% 38 | # of sampled transactions. 
39 | # We recommend adjusting this value in production, 40 | profiles_sample_rate=0.3, 41 | release=get_app_version(), 42 | # functions_to_trace=functions_to_trace, 43 | # integrations=[ # TODO : https://docs.sentry.io/platforms/python/integrations/ray/ 44 | # RayIntegration(), 45 | # ], 46 | **logging_kwargs 47 | ) 48 | else: 49 | logging.info("Sentry not init - SENTRY_DSN not found") -------------------------------------------------------------------------------- /.github/workflows/scaleway-start-import-job-update.yml: -------------------------------------------------------------------------------- 1 | name: Import job Scaleway 2 | 3 | on: 4 | workflow_dispatch: # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch 5 | 6 | 7 | jobs: 8 | start-job-image: 9 | strategy: 10 | matrix: 11 | dates: [ 12 | {start_date: "2023-04-01", end_date: "2023-05-01"} 13 | ,{start_date: "2023-05-01", end_date: "2023-06-01"} 14 | ,{start_date: "2023-06-01", end_date: "2023-07-01"} 15 | ,{start_date: "2023-07-01", end_date: "2023-08-01"} 16 | ,{start_date: "2023-08-01", end_date: "2023-09-01"} 17 | ,{start_date: "2023-09-01", end_date: "2023-10-01"} 18 | ,{start_date: "2023-10-01", end_date: "2023-11-01"} 19 | ,{start_date: "2023-11-01", end_date: "2023-12-01"} 20 | ,{start_date: "2023-12-01", end_date: "2024-01-01"} 21 | ,{start_date: "2024-01-01", end_date: "2024-02-01"} 22 | ,{start_date: "2024-02-01", end_date: "2024-03-01"} 23 | ,{start_date: "2024-03-01", end_date: "2024-04-01"} 24 | ,{start_date: "2024-04-01", end_date: "2024-05-01"} 25 | ,{start_date: "2024-05-01", end_date: "2024-06-01"} 26 | ,{start_date: "2024-06-01", end_date: "2024-07-01"} 27 | ,{start_date: "2024-07-01", end_date: "2024-08-01"} 28 | ,{start_date: "2024-08-01", end_date: "2024-09-01"} 29 | ,{start_date: "2024-09-01", end_date: "2024-10-01"} 30 | ,{start_date: "2024-10-01", end_date: "2024-11-01"} 31 | ,{start_date: "2024-11-01", end_date: "2024-12-01"} 32 | ,{start_date: "2024-12-01", end_date: "2025-01-01"} 33 | ,{start_date: "2025-01-01", end_date: "2025-02-01"} 34 | ] 35 | runs-on: ubuntu-latest 36 | steps: 37 | - name: start import job to reapply logic to all elements start_date matrix 38 | uses: jawher/action-scw@v2.34.0 39 | env: 40 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 41 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 42 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 43 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 44 | with: 45 | args: jobs definition start ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} environment-variables.UPDATE=true environment-variables.BIODIVERSITY_ONLY=true environment-variables.START_DATE_UPDATE=${{ matrix.dates.start_date }} environment-variables.END_DATE=${{ matrix.dates.end_date }} 46 | -------------------------------------------------------------------------------- /alembic/versions/4333bc46985d_keywords_program_id_foreign_key.py: -------------------------------------------------------------------------------- 1 | """keywords: program_id foreign key 2 | 3 | Revision ID: 4333bc46985d 4 | Revises: ac96222af6fe 5 | Create Date: 2025-03-21 14:25:06.180296 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | from sqlalchemy.dialects import postgresql 13 | 14 | # revision identifiers, used by Alembic. 
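15 | # Alembic orders migrations through this chain: each revision names its
16 | # parent in down_revision, so `alembic upgrade head` applies ac96222af6fe
17 | # before this one; `alembic history --verbose` prints the resolved chain.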
18 | revision: str = '4333bc46985d'
19 | down_revision: Union[str, None] = 'ac96222af6fe'
20 | branch_labels: Union[str, Sequence[str], None] = None
21 | depends_on: Union[str, Sequence[str], None] = None
22 |
23 |
24 | def upgrade() -> None:
25 |     # ### commands auto generated by Alembic - please adjust! ###
26 |     op.add_column('keywords', sa.Column('program_metadata_id', sa.Text(), nullable=True))
27 |     op.create_foreign_key('fk_keywords_program_metadata_id', 'keywords', 'program_metadata', ['program_metadata_id'], ['id'])
28 |     op.alter_column('sitemap_table', 'download_date',
29 |                existing_type=postgresql.TIMESTAMP(timezone=True),
30 |                type_=sa.DateTime(),
31 |                existing_nullable=True)
32 |     op.alter_column('sitemap_table', 'news_publication_date',
33 |                existing_type=postgresql.TIMESTAMP(timezone=True),
34 |                type_=sa.DateTime(),
35 |                existing_nullable=True)
36 |     op.alter_column('sitemap_table', 'updated_on',
37 |                existing_type=postgresql.TIMESTAMP(timezone=True),
38 |                type_=sa.DateTime(),
39 |                existing_nullable=True)
40 |     # ### end Alembic commands ###
41 |
42 |
43 | def downgrade() -> None:
44 |     # ### commands auto generated by Alembic - please adjust! ###
45 |     op.alter_column('sitemap_table', 'updated_on',
46 |                existing_type=sa.DateTime(),
47 |                type_=postgresql.TIMESTAMP(timezone=True),
48 |                existing_nullable=True)
49 |     op.alter_column('sitemap_table', 'news_publication_date',
50 |                existing_type=sa.DateTime(),
51 |                type_=postgresql.TIMESTAMP(timezone=True),
52 |                existing_nullable=True)
53 |     op.alter_column('sitemap_table', 'download_date',
54 |                existing_type=sa.DateTime(),
55 |                type_=postgresql.TIMESTAMP(timezone=True),
56 |                existing_nullable=True)
57 |     op.drop_constraint('fk_keywords_program_metadata_id', 'keywords', type_='foreignkey')
58 |     op.drop_column('keywords', 'program_metadata_id')
59 |     # ### end Alembic commands ###
60 |
--------------------------------------------------------------------------------
/alembic/versions/44f13b7eebd4_dictionary_category.py:
--------------------------------------------------------------------------------
1 | """dictionary category
2 |
3 | Revision ID: 44f13b7eebd4
4 | Revises: 827fb6dde3bb
5 | Create Date: 2025-05-23 12:54:53.323525
6 |
7 | """
8 | from typing import Sequence, Union
9 |
10 | from alembic import op
11 | import sqlalchemy as sa
12 | from sqlalchemy.dialects import postgresql
13 |
14 | # revision identifiers, used by Alembic.
15 | revision: str = '44f13b7eebd4'
16 | down_revision: Union[str, None] = '827fb6dde3bb'
17 | branch_labels: Union[str, Sequence[str], None] = None
18 | depends_on: Union[str, Sequence[str], None] = None
19 |
20 |
21 | def upgrade() -> None:
22 |     # ### commands auto generated by Alembic - please adjust! ###
23 |     op.add_column('dictionary', sa.Column('category', sa.String(), nullable=True))
24 |     op.add_column('dictionary', sa.Column('theme', sa.String(), nullable=True))
25 |
26 |     op.drop_column('dictionary', 'categories')
27 |     op.drop_column('dictionary', 'themes')
28 |
29 |     op.drop_column('dictionary', 'solution')
30 |     op.drop_column('dictionary', 'consequence')
31 |     op.drop_column('dictionary', 'cause')
32 |     op.drop_column('dictionary', 'general_concepts')
33 |     op.drop_column('dictionary', 'statement')
34 |
35 |     op.drop_column('dictionary', 'crisis_climate')
36 |     op.drop_column('dictionary', 'crisis_biodiversity')
37 |     op.drop_column('dictionary', 'crisis_resource')
38 |
39 |     # ### end Alembic commands ###
40 |
41 |
42 | def downgrade() -> None:
43 |     op.add_column('dictionary', sa.Column('categories', postgresql.ARRAY(sa.String()), nullable=True))
44 |     op.add_column('dictionary', sa.Column('themes', postgresql.ARRAY(sa.String()), nullable=True))
45 |     op.add_column('dictionary', sa.Column('solution', sa.Boolean(), nullable=True, server_default=sa.text('false')))
46 |     op.add_column('dictionary', sa.Column('consequence', sa.Boolean(), nullable=True, server_default=sa.text('false')))
47 |     op.add_column('dictionary', sa.Column('cause', sa.Boolean(), nullable=True, server_default=sa.text('false')))
48 |     op.add_column('dictionary', sa.Column('general_concepts', sa.Boolean(), nullable=True, server_default=sa.text('false')))
49 |     op.add_column('dictionary', sa.Column('statement', sa.Boolean(), nullable=True, server_default=sa.text('false')))
50 |
51 |     op.add_column('dictionary', sa.Column('crisis_climate', sa.Boolean(), nullable=True, server_default=sa.text('true')))
52 |     op.add_column('dictionary', sa.Column('crisis_biodiversity', sa.Boolean(), nullable=True, server_default=sa.text('true')))
53 |     op.add_column('dictionary', sa.Column('crisis_resource', sa.Boolean(), nullable=True, server_default=sa.text('true')))
54 |
55 |     op.drop_column('dictionary', 'category')
56 |     op.drop_column('dictionary', 'theme')
--------------------------------------------------------------------------------
/mockwebsite/cnews_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | https://www.cnews.fr/culture/2023-10-25/mort-de-richard-roundtree-samuel-l-jackson-gabrielle-union-carl-weathers-les
5 |
6 |
7 | CNEWS
8 | fr
9 |
10 | Mort de Richard Roundtree : Samuel L.
Jackson, Gabrielle Union, Carl Weathers… Les stars rendent hommage à l’acteur de «Shaft»
11 | 2023-10-25T08:51:25+00:00
12 | Cinéma, culture, Carnet noir, hommages, People
13 |
14 |
15 | https://static.cnews.fr/sites/default/files/richard_roundtree_hommages_6538c96cd0e46_0.jpg
16 |
17 |
18 |
19 |
20 | https://www.cnews.fr/france/2023-10-25/squat-de-saint-martin-du-touch-toulouse-pres-de-200-personnes-evacuees-1410951
21 |
22 |
23 | CNEWS
24 | fr
25 |
26 | Squat de Saint-Martin-du-Touch à Toulouse : près de 200 personnes évacuées
27 | 2023-10-25T08:47:27+00:00
28 | Squat, Toulouse, Squatteurs
29 |
30 |
31 | https://static.cnews.fr/sites/default/files/capture_decran_2023-10-25_a_10.10.05_6538ce23a0be6_0.png
32 |
33 |
34 |
35 |
36 | https://www.cnews.fr/videos/monde/2023-10-25/israel-hamas-des-que-jai-vu-lhorreur-je-suis-monte-dans-le-premier-avion
37 |
38 |
39 | CNEWS
40 | fr
41 |
42 | Israël-Hamas : «Dès que j'ai vu l'horreur, je suis monté dans le premier avion», explique un soldat de la réserve de Tsahal
43 | 2023-10-25T08:29:51+00:00
44 | Israël, Tsahal, Armée, Hamas
45 |
46 |
47 | https://static.cnews.fr/sites/default/files/Video/x8p2xa3_6538a94a625ad_0.jpg
48 | Témoignage d'un réserviste mobilisé en Israël
49 |
50 |
51 |
--------------------------------------------------------------------------------
/quotaclimat/data_ingestion/ingest_db/ingest_sitemap_in_db.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from argparse import ArgumentParser
3 | import sys,time
4 | import os
5 | from postgres.insert_data import insert_data_in_sitemap_table
6 | from postgres.insert_existing_data_example import \
7 |     transformation_from_dumps_to_table_entry
8 | from postgres.schemas.models import create_tables, connect_to_db, get_last_month_sitemap_id
9 | from quotaclimat.utils.healthcheck_config import run_health_check_server
10 | from quotaclimat.utils.logger import CustomFormatter
11 | import sentry_sdk
12 | from sentry_sdk.crons import monitor
13 | from quotaclimat.utils.sentry import sentry_init
14 | import asyncio
15 | from quotaclimat.data_ingestion.scrap_sitemap import \
16 |     query_one_sitemap_and_transform, get_sitemap_list
17 |
18 |
19 |
20 | async def batch_sitemap(exit_event):
21 |     create_tables()
22 |
23 |     conn = connect_to_db()
24 |     sitemap_list = get_sitemap_list().items()
25 |     logging.info("Going to parse %s" % (sitemap_list))
26 |     df_from_pg = get_last_month_sitemap_id(conn)
27 |     for media, sitemap_conf in sitemap_list:
28 |         try:
29 |             df = await query_one_sitemap_and_transform(media, sitemap_conf, df_from_pg)
30 |             df_to_insert = transformation_from_dumps_to_table_entry(df)
31 |             await asyncio.to_thread(insert_data_in_sitemap_table, df_to_insert, conn)
32 |         except TypeError as err:
33 |             logging.debug("Asyncio error %s" % (err))
34 |             continue
35 |         except Exception as err:
36 |             logging.error("Could not ingest data in db for media %s:(%s) %s" % (media,type(err).__name__, err))
37 |             continue
38 |
39 |     logging.info("finished")
40 |     conn.dispose()
41 |     exit_event.set()
42 |     return
43 |
44 | async def main():
45 |     with monitor(monitor_slug='sitemap'): #https://docs.sentry.io/platforms/python/crons/
46 |         event_finish = asyncio.Event()
47 |         # Start the health check server in the background
48 |         health_check_task = asyncio.create_task(run_health_check_server())
49 |
50 |         # Start batch job
51 |         asyncio.create_task(batch_sitemap(event_finish))
52 |
53 |         # Wait for both tasks to complete
54 |         await event_finish.wait()
55 |
56 |
res=health_check_task.cancel() 57 | logging.info("Exiting with success") 58 | sys.exit(0) 59 | 60 | if __name__ == "__main__": 61 | # create logger with 'spam_application' 62 | logger = logging.getLogger() 63 | logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper()) 64 | sentry_init() 65 | # create console handler with a higher log level 66 | if (logger.hasHandlers()): 67 | logger.handlers.clear() 68 | ch = logging.StreamHandler() 69 | ch.setFormatter(CustomFormatter()) 70 | logger.addHandler(ch) 71 | 72 | asyncio.run(main()) 73 | sys.exit(0) 74 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/api_import_utils/db.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import logging 3 | from typing import Tuple 4 | from quotaclimat.data_processing.mediatree.utils import * 5 | from quotaclimat.data_processing.mediatree.config import * 6 | from postgres.schemas.models import Keywords 7 | from sqlalchemy.orm import Session 8 | from sqlalchemy import Select, select, func, cast, Date, Integer, text, and_ 9 | from quotaclimat.data_processing.mediatree.i8n.country import * 10 | from typing import NamedTuple 11 | 12 | class KeywordLastStats(NamedTuple): 13 | last_day_saved: date 14 | number_of_previous_days_from_yesterday: int 15 | 16 | # Security nets to catch up delays from production servers errors 17 | 18 | def get_last_date_and_number_of_delay_saved_in_keywords(session: Session, days_filter: int = 30, country = FRANCE) -> KeywordLastStats: 19 | logging.debug(f"get_last_date_and_number_of_delay_saved_in_keywords") 20 | try: 21 | source_subquery = ( 22 | select( 23 | Keywords.start.label("start"), 24 | cast( 25 | func.extract( 26 | "day", 27 | func.date_trunc("day", (func.now() - text("INTERVAL '1 day'"))) - func.date_trunc("day", Keywords.start), 28 | ), 29 | Integer, 30 | ).label("previous_days"), 31 | ) 32 | .select_from(Keywords) 33 | .where( 34 | and_( 35 | Keywords.start >= func.now() - text(f"INTERVAL '{days_filter} days'"), 36 | Keywords.country == country.name 37 | ) 38 | ) 39 | .subquery("source") 40 | ) 41 | 42 | statement: Select[Tuple[date, int]] = ( 43 | select( 44 | func.max(cast(source_subquery.c.start, Date)).label("last_day_saved"), 45 | func.min(source_subquery.c.previous_days).label("number_of_previous_days_from_yesterday"), 46 | ) 47 | ) 48 | 49 | result = session.execute(statement).fetchone() 50 | return KeywordLastStats(result[0], result[1]) 51 | except Exception as err: 52 | logging.error("get_top_keywords_by_channel crash (%s) %s" % (type(err).__name__, err)) 53 | raise err 54 | 55 | def get_delay_date(lastSavedKeywordsDate: KeywordLastStats, normal_delay_in_days: int = 1): 56 | logging.warning(f"Delay detected : {lastSavedKeywordsDate.number_of_previous_days_from_yesterday } days, it should be {normal_delay_in_days} day") 57 | default_start_date = get_epoch_from_datetime(datetime(lastSavedKeywordsDate.last_day_saved.year,lastSavedKeywordsDate.last_day_saved.month,lastSavedKeywordsDate.last_day_saved.day)) 58 | default_number_of_previous_days = lastSavedKeywordsDate.number_of_previous_days_from_yesterday 59 | return default_start_date, default_number_of_previous_days -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | secrets/pwd_api.txt 2 | secrets/username_api.txt 3 | secrets/* 4 | s3/* 5 | 
i8n/mediatree_output/ 6 | i8n/csa-belge/ 7 | documents-experts/* 8 | i8n/mediatree_output 9 | i8n/csa-belge 10 | 11 | i8n/germany_big.parquet 12 | test/i8n 13 | llm/ 14 | cc-bio.json 15 | *.xlsx 16 | coverage_re 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | pip-wheel-metadata/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | data/* 147 | .vscode/settings.json 148 | notebooks/nlp/df_all.csv 149 | notebooks/nlp/df_X_tfidf.pkl 150 | .vscode/settings.json 151 | 152 | .DS_Store 153 | pgdata 154 | mb-data 155 | .idea 156 | pgdump/ -------------------------------------------------------------------------------- /alembic/versions/ac96222af6fe_hrfp_counters.py: -------------------------------------------------------------------------------- 1 | """hrfp counters 2 | 3 | Revision ID: ac96222af6fe 4 | Revises: 30abfd828007 5 | Create Date: 2024-12-02 14:36:21.970968 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = 'ac96222af6fe' 16 | down_revision: Union[str, None] = '30abfd828007' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.add_column('keywords', sa.Column('number_of_changement_climatique_constat_no_hrfp', sa.Integer(), nullable=True)) 24 | op.add_column('keywords', sa.Column('number_of_changement_climatique_causes_no_hrfp', sa.Integer(), nullable=True)) 25 | op.add_column('keywords', sa.Column('number_of_changement_climatique_consequences_no_hrfp', sa.Integer(), nullable=True)) 26 | op.add_column('keywords', sa.Column('number_of_attenuation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True)) 27 | op.add_column('keywords', sa.Column('number_of_adaptation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True)) 28 | op.add_column('keywords', sa.Column('number_of_ressources_no_hrfp', sa.Integer(), nullable=True)) 29 | op.add_column('keywords', sa.Column('number_of_ressources_solutions_no_hrfp', sa.Integer(), nullable=True)) 30 | op.add_column('keywords', sa.Column('number_of_biodiversite_concepts_generaux_no_hrfp', sa.Integer(), nullable=True)) 31 | op.add_column('keywords', sa.Column('number_of_biodiversite_causes_no_hrfp', sa.Integer(), nullable=True)) 32 | op.add_column('keywords', sa.Column('number_of_biodiversite_consequences_no_hrfp', sa.Integer(), nullable=True)) 33 | op.add_column('keywords', sa.Column('number_of_biodiversite_solutions_no_hrfp', sa.Integer(), nullable=True)) 34 | # ### end Alembic commands ### 35 | 36 | 37 | def downgrade() -> None: 38 | # ### commands auto generated by Alembic - please adjust! ### 39 | op.drop_column('keywords', 'number_of_biodiversite_solutions_no_hrfp') 40 | op.drop_column('keywords', 'number_of_biodiversite_consequences_no_hrfp') 41 | op.drop_column('keywords', 'number_of_biodiversite_causes_no_hrfp') 42 | op.drop_column('keywords', 'number_of_biodiversite_concepts_generaux_no_hrfp') 43 | op.drop_column('keywords', 'number_of_ressources_solutions_no_hrfp') 44 | op.drop_column('keywords', 'number_of_ressources_no_hrfp') 45 | op.drop_column('keywords', 'number_of_adaptation_climatique_solutions_no_hrfp') 46 | op.drop_column('keywords', 'number_of_attenuation_climatique_solutions_no_hrfp') 47 | op.drop_column('keywords', 'number_of_changement_climatique_consequences_no_hrfp') 48 | op.drop_column('keywords', 'number_of_changement_climatique_causes_no_hrfp') 49 | op.drop_column('keywords', 'number_of_changement_climatique_constat_no_hrfp') 50 | # ### end Alembic commands ### 51 | -------------------------------------------------------------------------------- /mockwebsite/lefigaro_localhost_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | http://localhost:8000/mediapart_website.html 5 | 2023-10-12T17:34:28+02:00 6 | 7 | 8 | Le Figaro 9 | fr 10 | 11 | 2023-10-12T06:13:00+02:00 12 | EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi 13 | Israël, Hamas, conflit israélo-palestinien, International, actualité internationale, affaires étrangères, ministère des affaires étrangères, politique étrangère 14 | Blog 15 | 16 | 17 | https://i.f1g.fr/media/cms/orig/2023/10/12/eccf7495cede8869a8a35d6fd70a1635759a12dbef68dd16e82e34162f69ec4f.jpg 18 | Explosion dans le centre de la ville de Gaza ce jeudi 12 octobre. 
19 | 20 | 21 | 22 | http://localhost:8000/20minutes_website.html 23 | 2023-10-12T17:34:21+02:00 24 | 25 | 26 | Le Figaro 27 | fr 28 | 29 | 2023-10-11T16:16:00+02:00 30 | Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir 31 | grève, salaires, social, RH, ressources humaines, primes, conjoncture, entreprise, œuvres sociales, trséorerie, finance, comoité d'entreprise, elections syndicales, gestion entreprise, TPE, PME, PMI, CAC 40, fiscalité des entreprises, actualités sociales 32 | 33 | 34 | https://i.f1g.fr/media/cms/orig/2023/10/09/8f1062e1948f5c0abb930b0665ec4958613a74853c8fba9dfb7f374b3ec82065.jpg 35 | Grève: à quoi faut-il s’attendre ce 13 octobre ? 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /mockwebsite/20minutes_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.20minutes.fr/justice/4059662-20231027-prisons-proces-rugby-re-passe-heure-voiture-eric-dupond-moretti20minutes.frfr2023-10-27T10:07:37+02:00Prisons, procès, rugby… On a (re) passé une heure en voiture avec Éric Dupond-Morettihttps://img.20mn.fr/DWn2CVxERkK9ZEKE_2ASMyk/1200x768_eric-dupond-moretti-au-centre-a-inaugure-mercrediEric Dupond-Moretti (au centre) a inauguré mercredi le centre pénitentiaire de Troyes-Lavau, dans l'Aube, aux côtés du maire de Troyes, François Baroin (à droiteà, et celui de Lavau, Jacques Gachowski (à gauche)https://www.20minutes.fr/guide-achat/guide-achat-bon-plan-cdiscount/4059580-20231026-top-5-meilleures-trottinettes-electriques-petit-prix-chez-cdiscount20minutes.frfr2023-10-27T10:05:36+02:00Top 5 des meilleures trottinettes électriques à petit prix chez Cdiscounthttps://img.20mn.fr/ilZnoCiMQsyvdlq67n7upyk/1200x768_top-5-des-meilleures-trottinettes-electriques-a-petit-prix-chez-cdiscountTop 5 des meilleures trottinettes électriques à petit prix chez Cdiscounthttps://www.20minutes.fr/monde/etats-unis/4059735-20231027-fusillades-etats-unis-direct-police-americaine-toujours-recherche-robert-card20minutes.frfr2023-10-27T10:04:16+02:00Fusillades aux Etats-Unis EN DIRECT : La police américaine toujours à la recherche de Robert Card…https://img.20mn.fr/OB_g4z-PQ6yJwXKhJBgf5yk/1200x768_oct-26-2023-bowdoin-maine-usa-law-enforcement-officers-search-the-area-of-bowdoin-maine-the-day-after-a-suspect-killed-at-least-18-people-during-multiple-shootings-in-the-lewiston-area-mandatory-credit-camille-fine-usa-today-sipa-usa-49221769-zd5-2310270429Des agents des forces de l'ordre fouillent la zone de Bowdoin, dans le Maine, au lendemain du jour où un suspect a tué au moins 18 personnes lors de multiples fusillades dans la région de Lewiston. 
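<!-- Each mock entry above follows the Google News sitemap shape, roughly:
     <url><loc>…</loc><news:news><news:publication><news:name>…</news:name>
     <news:language>fr</news:language></news:publication>
     <news:publication_date>…</news:publication_date><news:title>…</news:title>
     </news:news><image:image><image:loc>…</image:loc><image:caption>…</image:caption></image:image></url> -->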
--------------------------------------------------------------------------------
/mockwebsite/lefigaro_sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | http://nginxtest:80/mediapart_website.html
5 | 2023-10-12T17:34:28+02:00
6 |
7 |
8 | Le Figaro
9 | fr
10 |
11 | 2023-10-12T06:13:00+02:00
12 | EN DIRECT - Conflit Hamas-Israël : l’armée israélienne dit avoir frappé Gaza avec 4000 tonnes d’explosifs depuis samedi
13 | Israël, Hamas, conflit israélo-palestinien, International, actualité internationale, affaires étrangères, ministère des affaires étrangères, politique étrangère
14 | Blog
15 |
16 |
17 | https://i.f1g.fr/media/cms/orig/2023/10/12/eccf7495cede8869a8a35d6fd70a1635759a12dbef68dd16e82e34162f69ec4f.jpg
18 | Explosion dans le centre de la ville de Gaza ce jeudi 12 octobre.
19 |
20 |
21 |
22 | http://nginxtest:80/20minutes_website.html
23 | 2023-10-12T17:34:21+02:00
24 |
25 |
26 | Le Figaro
27 | fr
28 |
29 | 2023-10-11T16:16:00+02:00
30 |
31 | Grève du 13 octobre : SNCF, RATP, aérien, médecins… Retrouvez le détail des perturbations à prévoir
32 | grève, salaires, social, RH, ressources humaines, primes, conjoncture, entreprise, œuvres sociales, trséorerie, finance, comoité d'entreprise, elections syndicales, gestion entreprise, TPE, PME, PMI, CAC 40, fiscalité des entreprises, actualités sociales
33 |
34 |
35 | https://i.f1g.fr/media/cms/orig/2023/10/09/8f1062e1948f5c0abb930b0665ec4958613a74853c8fba9dfb7f374b3ec82065.jpg
36 | Grève: à quoi faut-il s’attendre ce 13 octobre ?
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/my_dbt_project/models/dashboards/core_query_thematics_keywords_i8n.sql:
--------------------------------------------------------------------------------
1 | {{ config(
2 |     materialized='incremental'
3 |     ,unique_key=['week','channel_title']
4 |     )
5 | }}
6 |
7 | WITH keyword_occurrences AS (
8 |     SELECT DISTINCT
9 |         COALESCE(pm.channel_title, k.channel_title) AS channel_title,
10 |         DATE_TRUNC('week', k.start)::date AS week,
11 |         k.start AS occurrence_time,
12 |         k.country AS country,
13 |         -- Semantic tags
14 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%solution%' THEN TRUE ELSE FALSE END AS is_solution,
15 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%consequence%' THEN TRUE ELSE FALSE END AS is_consequence,
16 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%cause%' THEN TRUE ELSE FALSE END AS is_cause,
17 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%concepts_generaux%' THEN TRUE ELSE FALSE END AS is_general_concepts,
18 |         CASE WHEN LOWER(kw ->> 'theme') LIKE '%constat%' THEN TRUE ELSE FALSE END AS is_statement,
19 |         -- Crisis type
20 |         CASE
21 |             WHEN LOWER(kw ->> 'theme') LIKE '%climat%' THEN 'Crise climatique'
22 |             WHEN LOWER(kw ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité'
23 |             WHEN LOWER(kw ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources'
24 |             ELSE 'Autre'
25 |         END AS crise_type,
26 |         kw ->> 'theme' AS theme,
27 |         kw ->> 'keyword' AS keyword
28 |     FROM public.keywords k
29 |     LEFT JOIN public.program_metadata pm
30 |         ON k.channel_program = pm.channel_program
31 |         AND k.channel_name = pm.channel_name
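32 |         -- The CASE below converts Postgres EXTRACT(DOW ...) (Sunday=0 .. Saturday=6)
33 |         -- to the Monday=1 .. Sunday=7 convention compared against pm.weekday:
34 |         -- adding 7 changes nothing modulo 7, and Sunday's 0 is remapped to 7.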
35 |         AND (
36 |             (
37 |                 CASE
38 |                     WHEN ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7) = 0 THEN 7
39 |                     ELSE ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7)
40 |                 END = pm.weekday
41 |             )
42 |         )
43 |         -- AND k.country = pm.country
44 |         AND CAST(k.start AS date) BETWEEN CAST(pm.program_grid_start AS date)
45 |             AND CAST(pm.program_grid_end AS date)
46 |     , json_array_elements(k.keywords_with_timestamp::json) AS kw
47 |     WHERE
48 |         LOWER(kw ->> 'theme') NOT LIKE '%indirect%'
49 | )
50 |
51 | SELECT
52 |     ko.channel_title,
53 |     ko.country,
54 |     ko.week,
55 |     COALESCE(NULLIF(d.category, ''), 'Transversal') AS category,
56 |     d.high_risk_of_false_positive,
57 |     ko.is_solution,
58 |     ko.is_consequence,
59 |     ko.is_cause,
60 |     ko.is_general_concepts,
61 |     ko.is_statement,
62 |     ko.crise_type,
63 |     ko.theme,
64 |     ko.keyword,
65 |     COUNT(*) AS count
66 | FROM keyword_occurrences ko
67 | LEFT JOIN public.dictionary d
68 |     ON d.keyword = ko.keyword AND d.theme LIKE ko.theme || '%' -- ensure match with indirect themes inside the dictionary table
69 | GROUP BY
70 |     ko.country,
71 |     ko.channel_title,
72 |     ko.week,
73 |     d.high_risk_of_false_positive,
74 |     COALESCE(NULLIF(d.category, ''), 'Transversal'),
75 |     ko.is_solution,
76 |     ko.is_consequence,
77 |     ko.is_cause,
78 |     ko.is_general_concepts,
79 |     ko.is_statement,
80 |     ko.crise_type,
81 |     ko.theme,
82 |     ko.keyword
83 | ORDER BY
84 |     ko.channel_title, ko.week, ko.crise_type
--------------------------------------------------------------------------------
/test/sitemap/test_mediatree_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 |
4 | from test_utils import get_localhost
5 | from quotaclimat.data_processing.mediatree.utils import *
6 |
7 | import logging
8 | from time import strftime,localtime
9 |
10 | localhost = get_localhost()
11 |
12 | def test_get_date_sql_query():
13 |     date = datetime(2024, 12, 12, 0, 0, 0)
14 |     expected = "'2024-12-12 00:00:00.000 +00:00'"
15 |
16 |     assert get_date_sql_query(date) == expected
17 |
18 | def test_get_yesterday():
19 |     yesterday = get_yesterday()
20 |     yesterday_string = strftime('%Y-%m-%d %H:%M:%S', localtime(yesterday))
21 |     logging.info(f"yesterday_string {yesterday_string}")
22 |     assert '00:00:00' in yesterday_string
23 |
24 | def test_is_it_tuesday():
25 |     date = pd.Timestamp("2024-02-13 15:34:28")
26 |     assert is_it_tuesday(date) == True
27 |
28 |     date = pd.Timestamp("2024-01-01 15:34:28")
29 |     assert is_it_tuesday(date) == False
30 |
31 | def test_get_end_of_month():
32 |     assert get_end_of_month("2024-04-01") == "2024-04-30"
33 |     assert get_end_of_month("2024-02-01") == "2024-02-29"
34 |     assert get_end_of_month("2024-02-15") == "2024-02-29"
35 |
36 | def test_get_first_of_month():
37 |     date = datetime(2024, 12, 12, 0, 0, 0)
38 |     assert get_first_of_month(date) == "2024-12-01"
39 |
40 | def test_get_date_now_minus_days():
41 |     date = datetime(2024, 12, 12, 0, 0, 0)
42 |     assert get_date_now_minus_days(start=date, minus_days=6) == "2024-12-06"
43 |     assert get_date_now_minus_days(start=date, minus_days=13) == "2024-11-29"
44 |
45 |
46 | def test_get_start_end_date_env_variable_with_default():
47 |     start_date = 0
48 |
49 |     assert get_start_end_date_env_variable_with_default(start_date, minus_days=1) == (get_yesterday(), None)
50 |
51 | def test_get_start_end_date_env_variable_with_start_date_value():
52 |     start_date = 1734508085
53 |     number_of_previous_days = 7
54 |     start_date_minus_days = start_date - (number_of_previous_days * 24 * 60 * 60)
55 |
56 |     assert get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days) == (int(start_date), start_date_minus_days)
57 |
58 | def test_get_start_end_date_with_get_date_range():
59 |     start_date =
1734508085 60 | number_of_previous_days = 7 61 | (start,end) = get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days) 62 | 63 | expected = pd.DatetimeIndex(['2024-12-11', '2024-12-12', '2024-12-13', '2024-12-14', '2024-12-15', '2024-12-16', '2024-12-17', '2024-12-18'], 64 | dtype='datetime64[ns]', freq='D') 65 | 66 | output = get_date_range(start,end) 67 | assert len(output) == number_of_previous_days + 1 68 | pd.testing.assert_index_equal(output, expected) 69 | 70 | def test_get_start_end_date_with_get_date_range_default(): 71 | start_date = 0 72 | number_of_previous_days = 7 73 | (start,end) = get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days) 74 | 75 | 76 | output = get_date_range(start,end, minus_days=number_of_previous_days) 77 | assert len(output) == number_of_previous_days -------------------------------------------------------------------------------- /test/s3/test_s3.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | from quotaclimat.data_processing.mediatree.s3.api_to_s3 import get_bucket_key, get_bucket_key_folder, get_partition_s3 4 | from quotaclimat.data_processing.mediatree.s3.s3_utils import read_folder_from_s3, transform_raw_keywords 5 | from quotaclimat.data_processing.mediatree.channel_program import * 6 | from quotaclimat.data_processing.mediatree.i8n.country import * 7 | 8 | def test_get_bucket_key_default(): 9 | friday_6h26 = 1726719981 10 | date = pd.to_datetime(friday_6h26, unit='s', utc=True) 11 | channel = "tf1" 12 | assert get_bucket_key(date, channel) == "year=2024/month=9/day=19/channel=tf1/*.parquet" 13 | 14 | def test_get_bucket_key_france(): 15 | friday_6h26 = 1726719981 16 | date = pd.to_datetime(friday_6h26, unit='s', utc=True) 17 | channel = "tf1" 18 | assert get_bucket_key(date, channel, country=FRANCE) == "year=2024/month=9/day=19/channel=tf1/*.parquet" 19 | 20 | def test_get_bucket_key_country(): 21 | friday_6h26 = 1726719981 22 | date = pd.to_datetime(friday_6h26, unit='s', utc=True) 23 | channel = "tf1" 24 | assert get_bucket_key(date, channel, country=GERMANY) == f"country={GERMANY.name}/year=2024/month=9/day=19/channel=tf1/*.parquet" 25 | 26 | def test_get_bucket_key_first_of_the_month(): 27 | first_december = 1733040125 28 | date = pd.to_datetime(first_december, unit='s', utc=True) 29 | channel = "tf1" 30 | assert get_bucket_key(date, channel) == "year=2024/month=12/day=1/channel=tf1/*.parquet" 31 | 32 | def test_get_bucket_key_first_of_the_month_default(): 33 | first_december = 1733040125 34 | date = pd.to_datetime(first_december, unit='s', utc=True) 35 | channel = "tf1" 36 | assert get_bucket_key_folder(date, channel) == "year=2024/month=12/day=1/channel=tf1/" 37 | 38 | def test_get_bucket_key_first_of_the_month_france(): 39 | first_december = 1733040125 40 | date = pd.to_datetime(first_december, unit='s', utc=True) 41 | channel = "tf1" 42 | key_folder = f"year=2024/month=12/day=1/channel=tf1/" 43 | assert get_bucket_key_folder(date, channel, country=FRANCE) == key_folder 44 | 45 | def test_get_bucket_key_first_of_the_month_brazil(): 46 | first_december = 1733040125 47 | date = pd.to_datetime(first_december, unit='s', utc=True) 48 | channel = "tf1" 49 | key_folder = f"country={BRAZIL.name}/year=2024/month=12/day=1/channel=tf1/" 50 | assert get_bucket_key_folder(date, channel, country=BRAZIL) == key_folder 51 | 52 | def test_get_partition_s3_france_legacy(): 53 | assert 
get_partition_s3(FRANCE) == ['year', 'month', 'day', 'channel'] 54 | 55 | def test_get_partition_s3_other_country_than_france(): 56 | assert get_partition_s3(GERMANY) == ['country','year', 'month', 'day', 'channel'] 57 | assert get_partition_s3(BRAZIL) == ['country','year', 'month', 'day', 'channel'] 58 | 59 | # TODO need to mock s3 reads 60 | # def test_read_folder_from_s3(): 61 | # first_december = 1733040125 62 | # date = pd.to_datetime(first_december, unit='s', utc=True) 63 | # read_folder_from_s3(date=date, channel="tf1", storage_options=None) 64 | 65 | # assert False == True 66 | 67 | def test_transform_raw_keywords(): 68 | df= pd.read_parquet(path="test/s3/one-day-one-channel.parquet") 69 | df_programs = get_programs() 70 | output = transform_raw_keywords(df, df_programs=df_programs) 71 | 72 | assert len(output) == 31 -------------------------------------------------------------------------------- /mockwebsite/lacroix_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.la-croix.com/Evasion-Reau-helicoptere-heure-verdict-Redoine-Faid-2023-10-24-13012881582023-10-25T09:49:48+01:00La Croixfr2023-10-24T23:56:04+01:00Evasion par hélicoptère de Rédoine Faïd: le verdict attendu en fin d'après-midiprocès, prison, prisonniers, évasion, assises, 75http://i.la-croix.com/x/2023/10/24/1301288158/Croquis-audience-Redoine-Faid-ouverture-proces-devant-assises-Paris-5-septembre-2023_0.jpgCroquis d'audience de Rédoine Faïd à l'ouverture de son procès devant la cour d'assises de Paris, le 5 septembre 2023 https://www.la-croix.com/international/guerre-israel-hamas-jour-19-attaque-bande-gaza-otages-liban-resume-2023-10-25-12012881672023-10-25T09:36:14+01:00La Croixfr2023-10-25T05:16:56+01:00Guerre Israël-Hamas : Macron à Amman puis au Caire, 80 morts à Gaza selon le Hamasconflit israélo-palestinien, Israël, Hamas, Moyen-Orienthttp://i.la-croix.com/x/2023/10/25/1201288167/camions-daide-humanitaire-attendent-pouvoir-franchir-passage-Rafah-permettant-dacceder-bande-Gaza-Egypte-24-octobre-2023_0.jpgDes camions d’aide humanitaire attendent de pouvoir franchir le passage de Rafah permettant d’accéder à la bande de Gaza, en Égypte, le 24 octobre 2023.https://www.la-croix.com/debat/Vie-destin-saint-Crepin-2023-10-25-12012881852023-10-25T09:29:28+01:00La Croixfr2023-10-25T09:29:28+01:00Vie et destin de saint CrépinAlain Rémond, Chroniqueshttp://i.la-croix.com/x/2023/10/25/1201288185/Alain-Remond_0.jpgAlain Rémond.https://www.la-croix.com/Boxe-Naoya-Inoue-defier-Marlon-Tapales-devenir-roi-inconteste-super-coqs-2023-10-25-13012881842023-10-25T09:26:10+01:00La Croixfr2023-10-25T09:26:10+01:00Boxe: Naoya Inoue va défier Marlon Tapales pour devenir le roi incontesté des super-coqsBox, JPN, Inoue, PHI, Tapaleshttp://i.la-croix.com/x/2023/10/25/1301288184/boxeur-japonais-Naoya-Inoue-25-octobre-2023-Yokohama_0.jpgLe boxeur japonais Naoya Inoue, le 25 octobre 2023 à Yokohama -------------------------------------------------------------------------------- /mockwebsite/midilibre_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.midilibre.fr/2023/10/24/emmanuel-macron-en-israel-le-president-annonce-que-les-sept-kidnappes-francais-sont-bien-vivantss-11539314.php 5 | 6 | 7 | Midi Libre 8 | fr 9 | 10 | 2023-10-24T10:01:57+02:00 11 | Les neuf "kidnappés Français" par le Hamas sont "bien vivants", annonce Emmanuel Macron en visite en Israël 12 | Attaque du Hamas contre Israël, Emmanuel Macron 13 | 
14 | 15 | https://images.midilibre.fr/api/v1/images/view/653760e38756005f7e7a81d9/hd/image.jpg?v=1 16 | Les neuf "kidnappés Français" par le Hamas sont "bien vivants", annonce Emmanuel Macron en visite en Israël 17 | 18 | 19 | 20 | https://www.midilibre.fr/2023/10/24/controle-technique-des-deux-roues-motos-scooters-comment-la-mesure-va-t-elle-etre-mise-en-place-a-partir-de-2024-11539363.php 21 | 22 | 23 | Midi Libre 24 | fr 25 | 26 | 2023-10-24T10:01:03+02:00 27 | Contrôle technique des deux roues : motos, scooters... comment la mesure va-t-elle être mise en place à partir de 2024 28 | Auto-moto 29 | 30 | 31 | https://images.midilibre.fr/api/v1/images/view/6537772054da116cc865b469/hd/image.jpg?v=1 32 | Contrôle technique des deux roues : motos, scooters... comment la mesure va-t-elle être mise en place à partir de 2024 33 | 34 | 35 | 36 | https://www.midilibre.fr/2023/10/24/podcast-comment-les-caves-cooperatives-viticoles-sont-nees-et-quel-avenir-pour-ces-structures-aujourdhui-11532063.php 37 | 38 | 39 | Midi Libre 40 | fr 41 | 42 | 2023-10-24T10:06:02+02:00 43 | PODCAST. Comment les caves coopératives viticoles sont nées et quel avenir pour ces structures aujourd'hui 44 | Podcasts, Viticulture, Aude 45 | 46 | 47 | https://images.midilibre.fr/api/v1/images/view/6530efb5eea84505924071ba/hd/image.jpg?v=1 48 | PODCAST. Comment les caves coopératives viticoles sont nées et quel avenir pour ces structures aujourd'hui 49 | 50 | 51 | -------------------------------------------------------------------------------- /postgres/insert_data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import pandas as pd 5 | from sqlalchemy import DateTime 6 | from sqlalchemy.dialects.postgresql import insert 7 | from sqlalchemy import JSON 8 | from postgres.schemas.models import sitemap_table, Keywords, Stop_Word, keywords_table 9 | from datetime import datetime 10 | 11 | def clean_data(df: pd.DataFrame): 12 | df = df.drop_duplicates(subset="id") 13 | return df.query("id != 'empty'") # TODO improve - should be a None ? 
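# Illustrative sketch (not part of the original module, names below are hypothetical):
# the UPSERT helpers that follow implement pandas' custom-insertion callback for
# DataFrame.to_sql(method=...). pandas invokes the callable once per chunk as
# callable(table, conn, keys, data_iter), where `table` is a pandas SQLTable wrapper
# (hence `table.table` for the underlying SQLAlchemy Table), `keys` is the list of
# column names, and `data_iter` yields row tuples. A minimal callback with the same
# signature could look like:
#
# def log_only(table, conn, keys, data_iter):
#     rows = [dict(zip(keys, row)) for row in data_iter]
#     logging.debug("would insert %s rows into %s", len(rows), table.table.name)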
14 | 15 | ## UPSERT 16 | def insert_or_update_on_conflict(table, conn, keys, data_iter): 17 | data = [dict(zip(keys, row)) for row in data_iter] 18 | insert_stmt = insert(table.table).values(data) 19 | # pk for tables 20 | if table.table.name == keywords_table: 21 | pk = ("id", "start") # pk of keywords 22 | else: 23 | pk = ("id",) 24 | 25 | upsert_stmt = insert_stmt.on_conflict_do_update( 26 | index_elements=list(pk), 27 | set_={k: insert_stmt.excluded[k] for k in keys if k not in pk} 28 | ) 29 | 30 | return conn.execute(upsert_stmt) 31 | 32 | # despite its name, this updates the row when the primary key already exists (upsert on the table's _pkey constraint) 33 | # from https://stackoverflow.com/a/69421596/3535853 34 | def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter): 35 | data = [dict(zip(keys, row)) for row in data_iter] 36 | 37 | insert_statement = insert(table.table).values(data) 38 | 39 | on_duplicate_key_stmt = insert_statement.on_conflict_do_update( 40 | constraint=f"{table.table.name}_pkey", 41 | set_={c.key: c for c in insert_statement.excluded}, 42 | ) 43 | 44 | return conn.execute(on_duplicate_key_stmt) 45 | 46 | def show_sitemaps_dataframe(df: pd.DataFrame): 47 | try: 48 | df_tmp = df.groupby(by="id").size().reset_index(name="count").nlargest(5, "count") 49 | df_final = df_tmp[df_tmp['count'] > 1] 50 | if df_final.empty: 51 | logging.debug("No duplicates detected") 52 | else: 53 | logging.warning("Duplicates to remove : %s out of %s" % (len(df_final), len(df))) 54 | except Exception as err: 55 | logging.warning("Could not show sitemap before saving : \n %s \n %s" % (err, df.head(1).to_string())) 56 | 57 | 58 | def save_to_pg(df, table, conn): 59 | number_of_elements = len(df) 60 | logging.info(f"Saving {number_of_elements} elements to PG table '{table}'") 61 | 62 | try: 63 | logging.debug("Schema before saving\n%s", df.dtypes) 64 | if table == keywords_table: 65 | df['updated_at'] = datetime.now() 66 | 67 | df.to_sql( 68 | table, 69 | index=False, 70 | con=conn, 71 | if_exists="append", 72 | chunksize=1000, 73 | method=insert_or_update_on_conflict, # TODO upsert 74 | dtype={"keywords_with_timestamp": JSON, "theme": JSON, "srt": JSON}, # only for keywords 75 | ) 76 | logging.info("Saved dataframe to PG") 77 | return len(df) 78 | except Exception as err: 79 | logging.error("Could not save : \n %s" % (err)) 80 | raise err 81 | 82 | def insert_data_in_sitemap_table(df: pd.DataFrame, conn): 83 | number_of_rows = len(df) 84 | if number_of_rows == 0: 85 | logging.warning("0 elements to parse") 86 | else: 87 | logging.info("Received %s elements", number_of_rows) 88 | 89 | show_sitemaps_dataframe(df) 90 | 91 | df = clean_data(df) 92 | save_to_pg(df, sitemap_table, conn) 93 | 94 | -------------------------------------------------------------------------------- /test/mediatree/test_mediatree_queries.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from sqlalchemy import Engine 4 | 5 | from quotaclimat.data_processing.mediatree.stop_word.main import * 6 | from postgres.schemas.models import get_db_session, connect_to_db, drop_tables 7 | from quotaclimat.data_processing.mediatree.api_import_utils.db import * 8 | from postgres.insert_data import save_to_pg 9 | from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db, drop_tables, empty_tables,keywords_table 10 | from datetime import date 11 | from quotaclimat.data_processing.mediatree.update_pg_keywords import * 12 | 13 | conn = connect_to_db() 14 | session =
get_db_session(conn) 15 | 16 | 17 | 18 | def test_mediatree_get_last_date_and_number_of_delay_saved_in_keywords(): 19 | conn: Engine = connect_to_db() 20 | create_tables(conn) 21 | session = get_db_session(conn) 22 | start = pd.to_datetime("2025-01-26 12:18:54", utc=True).tz_convert('Europe/Paris') 23 | wrong_value = 1 24 | pk = "delete_me" 25 | df = pd.DataFrame([{ 26 | "id" : pk, 27 | "start": start, 28 | "plaintext": "test", 29 | "channel_name": "test", 30 | "channel_radio": False, 31 | "theme":[], 32 | "keywords_with_timestamp": [], 33 | "srt": [], 34 | "number_of_keywords": wrong_value, # wrong data to reapply our custom logic for "new_value" 35 | "number_of_changement_climatique_constat": wrong_value, 36 | "number_of_changement_climatique_causes_directes": wrong_value, 37 | "number_of_changement_climatique_consequences": wrong_value, 38 | "number_of_attenuation_climatique_solutions_directes": wrong_value, 39 | "number_of_adaptation_climatique_solutions_directes": wrong_value, 40 | "number_of_ressources": wrong_value, 41 | "number_of_ressources_solutions": wrong_value, 42 | "number_of_biodiversite_concepts_generaux": wrong_value, 43 | "number_of_biodiversite_causes_directes": wrong_value, 44 | "number_of_biodiversite_consequences": wrong_value, 45 | "number_of_biodiversite_solutions_directes" : wrong_value, 46 | "channel_program_type": "to change", 47 | "channel_program":"to change" 48 | ,"program_metadata_id":"336643dc7fa09ac7335a4ceba43270ed3f553be3383a9b3b6e3cced101f2a87a" 49 | ,"channel_title":"channel_title" 50 | ,"number_of_keywords_climat": wrong_value 51 | ,"number_of_keywords_biodiversite": wrong_value 52 | ,"number_of_keywords_ressources": wrong_value 53 | ,"country" :"france" 54 | }]) 55 | 56 | save_to_pg(df, keywords_table, conn) 57 | 58 | keywordStats = get_last_date_and_number_of_delay_saved_in_keywords(session, days_filter=3000) 59 | expected_max_date = KeywordLastStats(date(2025, 1, 26), 2) 60 | 61 | assert expected_max_date.last_day_saved == keywordStats.last_day_saved 62 | assert keywordStats.number_of_previous_days_from_yesterday > 1 63 | delete_keywords_id(session, pk) 64 | session.commit() 65 | session.close() 66 | 67 | 68 | def test_get_delay_date(): 69 | unixtimestamp_2025_01_26 = 1737849600 70 | expected_max_date = KeywordLastStats(date(2025, 1, 26), 2) 71 | default_start_date, default_number_of_previous_days = get_delay_date(expected_max_date, normal_delay_in_days=1) 72 | 73 | assert default_start_date == unixtimestamp_2025_01_26 74 | assert default_number_of_previous_days == 2 -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from sqlalchemy import create_engine 4 | from postgres.schemas.base import Base 5 | from quotaclimat.data_ingestion.labelstudio.models import TargetBase 6 | from alembic import context 7 | 8 | import re 9 | import os 10 | 11 | # this is the Alembic Config object, which provides 12 | # access to the values within the .ini file in use. 13 | config = context.config 14 | 15 | # Interpret the config file for Python logging. 16 | # This line sets up loggers basically. 
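# Illustrative sketch (not executed by this env.py, key name below is hypothetical):
# besides sqlalchemy.url, arbitrary keys can be read from the [alembic] section of
# alembic.ini through the same Config object:
#
# my_schema = config.get_main_option("my_schema", "public")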
17 | if config.config_file_name is not None: 18 | fileConfig(config.config_file_name) 19 | 20 | # add your model's MetaData object here 21 | # for 'autogenerate' support 22 | # from myapp import mymodel 23 | # target_metadata = mymodel.Base.metadata 24 | target_metadata = [Base.metadata, TargetBase.metadata] 25 | 26 | # from https://stackoverflow.com/a/63672522/3535853 27 | # https://alembic.sqlalchemy.org/en/latest/cookbook.html#don-t-generate-any-drop-table-directives-with-autogenerate 28 | def include_object(object, name, type_, reflected, compare_to): 29 | if type_ == "table" and reflected and compare_to is None: 30 | return False 31 | else: 32 | return True 33 | 34 | # other values from the config, defined by the needs of env.py, 35 | # can be acquired: 36 | # my_important_option = config.get_main_option("my_important_option") 37 | # ... etc. 38 | 39 | 40 | def run_migrations_offline() -> None: 41 | """Run migrations in 'offline' mode. 42 | 43 | This configures the context with just a URL 44 | and not an Engine, though an Engine is acceptable 45 | here as well. By skipping the Engine creation 46 | we don't even need a DBAPI to be available. 47 | 48 | Calls to context.execute() here emit the given string to the 49 | script output. 50 | 51 | """ 52 | url = config.get_main_option("sqlalchemy.url") 53 | context.configure( 54 | url=url, 55 | target_metadata=target_metadata, 56 | literal_binds=True, 57 | dialect_opts={"paramstyle": "named"}, 58 | include_object=include_object 59 | ) 60 | 61 | with context.begin_transaction(): 62 | context.run_migrations() 63 | 64 | 65 | def run_migrations_online() -> None: 66 | """Run migrations in 'online' mode. 67 | 68 | In this scenario we need to create an Engine 69 | and associate a connection with the context. 
70 | 71 | """ 72 | url_tokens = { 73 | "POSTGRES_USER": os.getenv("POSTGRES_USER",""), 74 | "POSTGRES_DB": os.getenv("POSTGRES_DB",""), 75 | "POSTGRES_PASSWORD": os.getenv("POSTGRES_PASSWORD",""), 76 | "POSTGRES_HOST": os.getenv("POSTGRES_HOST",""), 77 | "POSTGRES_PORT": os.getenv("POSTGRES_PORT","") 78 | } 79 | 80 | url = config.get_main_option("sqlalchemy.url") 81 | 82 | url = re.sub(r"\${(.+?)}", lambda m: url_tokens[m.group(1)], url) 83 | 84 | connectable = create_engine(url) 85 | 86 | with connectable.connect() as connection: 87 | context.configure( 88 | connection=connection, 89 | target_metadata=target_metadata, 90 | compare_type=True, 91 | compare_server_default=True, 92 | include_object=include_object 93 | ) 94 | 95 | with context.begin_transaction(): 96 | context.run_migrations() 97 | 98 | if context.is_offline_mode(): 99 | run_migrations_offline() 100 | else: 101 | run_migrations_online() 102 | -------------------------------------------------------------------------------- /analyse/mediatree/test_program_durations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "fa23a75a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "ce7a2095", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "df = pd.read_csv(\"data/mediatree_channel_coverages_2025-12-15\")\n", 22 | "df.date = pd.to_datetime(df.date)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "44b06fa4", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "df.date.max().strftime(\"%d %b\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "a36a6874", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "e638c622", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import plotly.express as px\n", 51 | "import os\n", 52 | "\n", 53 | "for country, group in df.groupby(\"country\"):\n", 54 | " start_date = group.date.min().strftime(\"%d %B\")\n", 55 | " end_date = group.date.max().strftime(\"%d %B\")\n", 56 | " fig = px.line(group, x=\"date\", y=\"coverage\", color='channel_name', title=f\"{country.title()}: {start_date} - {end_date}\")\n", 57 | " os.makedirs(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}\", exist_ok=True)\n", 58 | " fig.write_image(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}/coverage_{country}_chains_{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}.png\")\n", 59 | " fig.show()\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "45d55028", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "for country, group in df.groupby(\"country\"):\n", 70 | " start_date = group.date.min().strftime(\"%d %B\")\n", 71 | " end_date = group.date.max().strftime(\"%d %B\")\n", 72 | " df_mean = group.groupby(\"date\").agg({\"coverage\": \"mean\"})\n", 73 | " fig = px.line(df_mean, y=\"coverage\", title=f\"Mean Coverage {country.title()}: {start_date} - {end_date}\")\n", 74 | " 
fig.write_image(f\"images/{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}/coverage_{country}_mean_{group.date.min().strftime('%d%b')}_{group.date.max().strftime('%d%b')}.png\")\n", 75 | " fig.show()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "727893ea", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": ".venv", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.11.6" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 5 108 | } 109 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/time_monitored/models.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column, DateTime, String, Text, Boolean, ARRAY, JSON, Integer, Table, MetaData, ForeignKey, PrimaryKeyConstraint 5 | from sqlalchemy.orm import declarative_base, sessionmaker, relationship 6 | from sqlalchemy.exc import SQLAlchemyError 7 | from sqlalchemy.dialects.postgresql import insert 8 | import pandas as pd 9 | from sqlalchemy import text 10 | from postgres.database_connection import connect_to_db, get_db_session 11 | from postgres.schemas.base import Base 12 | from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS 13 | from quotaclimat.data_processing.mediatree.i8n.country import FRANCE 14 | from quotaclimat.data_ingestion.scrap_sitemap import get_consistent_hash 15 | import os 16 | import json 17 | from json import JSONDecodeError 18 | 19 | 20 | import traceback 21 | 22 | # The duration in minutes of media monitoring based on number of chunks of 2 minutes saved in S3 23 | class Time_Monitored(Base): 24 | __tablename__ = "time_monitored" 25 | id = Column(Text, primary_key=True) 26 | channel_name = Column(String, nullable=False) 27 | start = Column(DateTime(), nullable=False) 28 | duration_minutes= Column(Integer) 29 | country = Column(String, nullable=False) 30 | 31 | def get_time_monitored(id: str): 32 | session = get_db_session() 33 | return session.get(Time_Monitored, id) 34 | 35 | # count how many rows are in the dataframe and save it to postgresql inside a new table called time_monitor 36 | def save_time_monitored(number_of_rows : int, day: datetime, channel :str, country : str,session=None): 37 | """ 38 | Save the number of rows (chunk) to the time_monitor table in PostgreSQL. 39 | 40 | Args: 41 | number_of_rows (int): The number of rows (2 minute chunk) to save. 42 | day (datetime): The date of the monitoring. 43 | channel (str): The name of the channel. 44 | country (str): The country name. 45 | """ 46 | try: 47 | duration_minutes = number_of_rows * 2 # 2 minutes per chunk 48 | logging.info(f"Saving time monitored of {duration_minutes} minutes ({number_of_rows} chunks of 2 minutes) for {day} - {channel} - {country}") 49 | max_hours = 23 50 | if duration_minutes / 60 > max_hours: 51 | logging.error(f"Duration of {duration_minutes / 60} hours is above {max_hours} hours. 
Please check the data.") 52 | 53 | if session is None: 54 | session = get_db_session() 55 | 56 | stmt = insert(Time_Monitored).values( 57 | id=get_consistent_hash(f"{channel}_{day}_{country}"), 58 | channel_name=channel, 59 | start=day, 60 | duration_minutes=duration_minutes, 61 | country=country 62 | ) 63 | # upsert 64 | stmt = stmt.on_conflict_do_update( 65 | index_elements=['id'], # Use the 'id' column as the conflict target 66 | set_={ 67 | 'channel_name': stmt.excluded.channel_name, 68 | 'start': stmt.excluded.start, 69 | 'duration_minutes': stmt.excluded.duration_minutes, 70 | 'country': stmt.excluded.country 71 | } 72 | ) 73 | 74 | # Execute the statement 75 | session.execute(stmt) 76 | 77 | session.commit() 78 | logging.info("Saved time monitored") 79 | except SQLAlchemyError as e: 80 | logging.error(f"Error saving time monitored data: {e}") 81 | logging.error(traceback.format_exc()) 82 | finally: 83 | session.close() -------------------------------------------------------------------------------- /test/sitemap/test_keywords.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pandas as pd 4 | from quotaclimat.data_processing.mediatree.utils import * 5 | from quotaclimat.data_processing.mediatree.detect_keywords import * 6 | from quotaclimat.data_processing.mediatree.keyword.stop_words import STOP_WORDS 7 | 8 | def test_get_remove_stopwords_recycler(): 9 | stop_words_list = [ 10 | "recycler" 11 | ] 12 | ad = "nous les recycler pour en faire de nouvelles en fabriquant nous-mêmes du plastique recyclé pour cela nous avons créé trois usines exclusivement dédié au recyclage dès cette année cristallines est capable de recycler autant de bouteilles" 13 | 14 | assert remove_stopwords(ad, stop_words_list) == "nous les pour en faire de nouvelles en fabriquant nous-mêmes du plastique recyclé pour cela nous avons créé trois usines exclusivement dédié au recyclage dès cette année cristallines est capable de autant de bouteilles" 15 | 16 | def test_get_remove_stopwords_no_modification(): 17 | stop_words_list = [ 18 | "recycler" 19 | ] 20 | ad = "no keywords" 21 | 22 | assert remove_stopwords(ad, stop_words_list) == ad 23 | 24 | def test_remove_stopwords_huile(): 25 | stop_words_list = [ 26 | "recycler", 27 | "huile de coude était aussi une énergie renouvelable", 28 | "est à fond sur le tri sélectif" 29 | ] 30 | assert remove_stopwords("l' huile de coude était aussi une énergie renouvelable stéphane est à fond sur le tri sélectif",stop_words_list) \ 31 | == "l' stéphane " 32 | 33 | 34 | def test_remove_stopwords_energie(): 35 | plaintext = "quand le prix de l' énergie augmente il y a ceux qui se couvre plus ceux qui sortent moins et il y a ceux qui choisissent d' optimiser leurs énergies panneaux solaires isolations thermique pompes à chaleur chaque jour fleuron industrie parcourt la france pour vous aider à optimiser votre énergie florent industries point com en ce moment la centrale photovoltaïque de trois kilowatts et deux mille cinq cents euros et oui deux deux mille cinq cents euros cents dépêchez euros vous dépêchez vous de réserver votre kit sur fleuron industries point com la rénovation énergétique avec ici pour changer de maison sans changer de maison isolation chauffage solaire plus de confort et d' économie avec ici pas à mal casser pas mal vous avez fait une toute la pâte à modeler la je fais comment une tartine de pâte à modeler sans pâte à modeler c' est pas interdit ça s' appelle dupin juste merci pour le 
partage le jour où vous aimerez la pâte" 36 | output = remove_stopwords(plaintext,STOP_WORDS) 37 | # the ad blocks are in STOP_WORDS, so these terms are stripped from the output 38 | assert "photovoltaïque" not in output 39 | assert "rénovation énergétique" not in output 40 | assert "chauffage" not in output 41 | 42 | def test_remove_stopwords_fleuron(): 43 | plaintext = "chaque jour fleuron industrie parcourt" 44 | output = remove_stopwords(plaintext,STOP_WORDS) 45 | # the whole plaintext is a stop-word ad, so nothing is left 46 | assert output == "" 47 | 48 | def test_remove_stopwords_photovoltaique(): 49 | plaintext = "point com en ce moment la centrale photovoltaïque de trois kilowatt et à deux m" 50 | output = remove_stopwords(plaintext,STOP_WORDS) 51 | # the whole ad is removed, including photovoltaïque 52 | assert "photovoltaïque" not in output 53 | assert len(output) == 0 54 | 55 | 56 | def test_replace_word_with_context_unk(): 57 | plaintext = " quand le prix de l' énergie augmente il y a ceux qui se couvren" 58 | output = replace_word_with_context(text=plaintext, word=" ", length_to_remove=0) 59 | assert output == "quand le prix de l' énergie augmente il y a ceux qui se couvren" 60 | -------------------------------------------------------------------------------- /mockwebsite/franceinter_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.radiofrance.fr/franceinter/attentat-de-l-opera-en-2018-le-meilleur-ami-du-terroriste-dans-le-box-des-accuses-7790800France Interfr2023-10-25T04:22:39+00:00Attentat de l'Opéra en 2018 : le meilleur ami du terroriste dans le box des accusésJustice, Attentats en France, Djihadisme, Terrorisme, Société, https://www.radiofrance.fr/s3/cruiser-production/2023/10/609eccc2-ca90-4694-a42e-9175f318a68a/1200x680_sc_maxnewsworldfour522282.jpgUne personne est décédée, quatre autres ont été blessées, lors de l'attaque dans le quartier Opéra de Paris en mai 2018. - Nicolas Jouberthttps://www.radiofrance.fr/franceinter/bronchiolite-par-manque-de-traitements-des-maternites-obligees-de-trier-les-bebes-eligibles-au-beyfortus-6279386France Interfr2023-10-25T04:16:27+00:00Bronchiolite : par manque de traitements, des maternités obligées de trier les bébés éligibles au BeyfortusSanté, Maternité, Enfance, Sociétéhttps://www.radiofrance.fr/s3/cruiser-production/2023/10/06db6c5f-d163-4f1d-af2a-f44283e87ff4/1200x680_sc_080-hl-amorcillo-2084694.jpgLes bébés peuvent bénéficier d'un traitement, le Beyfortus, permettant d'éviter les formes graves de la bronchiolite.
- Aline Morcillohttps://www.radiofrance.fr/franceinter/sur-tik-tok-des-influenceurs-soutirent-des-milliers-d-euros-a-leurs-abonnes-pour-des-cadeaux-virtuels-9624456France Interfr2023-10-25T04:12:31+00:00Sur TikTok, des influenceurs soutirent des milliers d'euros à leurs abonnés pour des cadeaux virtuelsTech – Web, Applications mobiles, Société, https://www.radiofrance.fr/s3/cruiser-production/2023/10/3f085f10-3a39-43cd-82ad-617fc92b5e3c/1200x680_sc_illustration-tiktok.jpgCapture d"écran d'un "live" TikTok, au cours duquel sont proposés des cadeaux virtuels - Xavier Demagnyhttps://www.radiofrance.fr/franceinter/feminisation-attractivite-et-creativite-six-choses-a-savoir-sur-l-industrie-francaise-du-jeu-video-7091011France Interfr2023-10-24T15:50:56+00:00Féminisation, attractivité et créativité : six choses à savoir sur l'industrie française du jeu vidéo en 2023Entreprises – Marchés, Jeux vidéo, Économie, Arts et Divertissementhttps://www.radiofrance.fr/s3/cruiser-production/2023/10/9c284c6e-8797-47c7-b69a-2cab32e9917a/1200x680_sc_maxnewsfrfive059827.jpgStand d'Ubisoft, un des poids lourds du jeu vidéo français, lors de la Paris Games Week 2022 - Bruno Levesque / IP3 -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/spain/channel_program.py: -------------------------------------------------------------------------------- 1 | channels_programs_spain = [ 2 | {"channel_name": "antenna-3", "start": "06:15", "end": "08:50", "weekday": "weekday", "program_name": "Noticia de la manana", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 3 | {"channel_name": "antenna-3", "start": "15:00", "end": "15:30", "weekday": "weekday", "program_name": "Noticias 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 4 | {"channel_name": "antenna-3", "start": "21:00", "end": "21:30", "weekday": "weekday", "program_name": "Noticias", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 5 | 6 | {"channel_name": "rtve-la-1", "start": "06:00", "end": "06:30", "weekday": "weekday", "program_name": "Telediaro 06:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 7 | {"channel_name": "rtve-la-1", "start": "15:00", "end": "15:40", "weekday": "weekday", "program_name": "Telediaro 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 8 | {"channel_name": "rtve-la-1", "start": "21:00", "end": "21:30", "weekday": "weekday", "program_name": "Telediaro", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 9 | {"channel_name": "rtve-la-1", "start": "15:00", "end": "15:40", "weekday": "weekend", "program_name": "Telediaro fin de semana", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 10 | 11 | {"channel_name": "rtve-24h", "start": "14:00", "end": "14:45", "weekday": "*", "program_name": "Information 24 horas 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 12 | {"channel_name": "rtve-24h", "start": "20:00", "end": "20:45", "weekday": "*", "program_name": "Information 24 horas", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 13 | 14 | {"channel_name": "lasexta-news", "start": "11:00", "end": "15:00", "weekday": "weekday", "program_name": "Al Rojo Vivo", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 15 | {"channel_name": 
"lasexta-news", "start": "14:00", "end": "14:45", "weekday": "*", "program_name": "La Sexta Noticias 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 16 | {"channel_name": "lasexta-news", "start": "20:00", "end": "20:45", "weekday": "*", "program_name": "La Sexta Noticias", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 17 | 18 | {"channel_name": "telecinco-news", "start": "07:00", "end": "09:00", "weekday": "weekday", "program_name": "El Matinal 07:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 19 | {"channel_name": "telecinco-news", "start": "15:00", "end": "15:30", "weekday": "weekday", "program_name": "El Matinal 15:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 20 | {"channel_name": "telecinco-news", "start": "21:00", "end": "21:40", "weekday": "weekday", "program_name": "El Matinal", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 21 | 22 | {"channel_name": "cuatro-news", "start": "14:00", "end": "14:55", "weekday": "weekday", "program_name": "Noticias Cuatro 14:00", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 23 | {"channel_name": "cuatro-news", "start": "20:00", "end": "20:40", "weekday": "weekday", "program_name": "Noticias Cuatro", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 24 | {"channel_name": "cuatro-news", "start": "10:30", "end": "14:00", "weekday": "weekday", "program_name": "En Boca de Todos", "program_type": "","program_grid_start": "2023-04-01", "program_grid_end": "",}, 25 | 26 | ] 27 | -------------------------------------------------------------------------------- /alembic/versions/a578d21d7aee_add_tables_labelstudio.py: -------------------------------------------------------------------------------- 1 | """Add tables labelstudio 2 | 3 | Revision ID: a578d21d7aee 4 | Revises: 44f13b7eebd4 5 | Create Date: 2025-10-09 14:18:14.410103 6 | 7 | """ 8 | from typing import Sequence, Union 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | 13 | 14 | # revision identifiers, used by Alembic. 15 | revision: str = 'a578d21d7aee' 16 | down_revision: Union[str, None] = '44f13b7eebd4' 17 | branch_labels: Union[str, Sequence[str], None] = None 18 | depends_on: Union[str, Sequence[str], None] = None 19 | 20 | 21 | def upgrade() -> None: 22 | # ### commands auto generated by Alembic - please adjust! 
### 23 | op.create_table('labelstudio_task_aggregate', 24 | sa.Column('task_aggregate_id', sa.String(), nullable=False), 25 | sa.Column('id', sa.Integer(), nullable=False), 26 | sa.Column('data', sa.JSON(), nullable=False), 27 | sa.Column('created_at', sa.DateTime(), nullable=False), 28 | sa.Column('updated_at', sa.DateTime(), nullable=False), 29 | sa.Column('is_labeled', sa.Boolean(), nullable=False), 30 | sa.Column('project_id', sa.Integer(), nullable=True), 31 | sa.Column('meta', sa.JSON(), nullable=True), 32 | sa.Column('overlap', sa.Integer(), nullable=False), 33 | sa.Column('file_upload_id', sa.Integer(), nullable=True), 34 | sa.Column('updated_by_id', sa.Integer(), nullable=True), 35 | sa.Column('inner_id', sa.BigInteger(), nullable=True), 36 | sa.Column('total_annotations', sa.Integer(), nullable=False), 37 | sa.Column('cancelled_annotations', sa.Integer(), nullable=False), 38 | sa.Column('total_predictions', sa.Integer(), nullable=False), 39 | sa.Column('comment_count', sa.Integer(), nullable=False), 40 | sa.Column('last_comment_updated_at', sa.DateTime(), nullable=True), 41 | sa.Column('unresolved_comment_count', sa.Integer(), nullable=False), 42 | sa.Column('country', sa.String(), nullable=False), 43 | sa.PrimaryKeyConstraint('task_aggregate_id') 44 | ) 45 | op.create_table('labelstudio_task_completion_aggregate', 46 | sa.Column('task_completion_aggregate_id', sa.String(), nullable=False), 47 | sa.Column('task_aggregate_id', sa.String(), nullable=False), 48 | sa.Column('id', sa.Integer(), nullable=False), 49 | sa.Column('result', sa.JSON(), nullable=True), 50 | sa.Column('was_cancelled', sa.Boolean(), nullable=False), 51 | sa.Column('ground_truth', sa.Boolean(), nullable=False), 52 | sa.Column('created_at', sa.DateTime(), nullable=False), 53 | sa.Column('updated_at', sa.DateTime(), nullable=False), 54 | sa.Column('task_id', sa.Integer(), nullable=True), 55 | sa.Column('prediction', sa.JSON(), nullable=True), 56 | sa.Column('lead_time', sa.Double(), nullable=True), 57 | sa.Column('result_count', sa.Integer(), nullable=False), 58 | sa.Column('completed_by_id', sa.Integer(), nullable=True), 59 | sa.Column('parent_prediction_id', sa.Integer(), nullable=True), 60 | sa.Column('parent_annotation_id', sa.Integer(), nullable=True), 61 | sa.Column('last_action', sa.Text(), nullable=True), 62 | sa.Column('last_created_by_id', sa.Integer(), nullable=True), 63 | sa.Column('project_id', sa.Integer(), nullable=True), 64 | sa.Column('updated_by_id', sa.Integer(), nullable=True), 65 | sa.Column('unique_id', sa.Uuid(), nullable=True), 66 | sa.Column('draft_created_at', sa.DateTime(), nullable=True), 67 | sa.Column('import_id', sa.BigInteger(), nullable=True), 68 | sa.Column('bulk_created', sa.Boolean(), nullable=True), 69 | sa.Column('country', sa.String(), nullable=False), 70 | sa.ForeignKeyConstraint(['task_aggregate_id'], ['labelstudio_task_aggregate.task_aggregate_id'], ), 71 | sa.PrimaryKeyConstraint('task_completion_aggregate_id') 72 | ) 73 | # ### end Alembic commands ### 74 | 75 | 76 | def downgrade() -> None: 77 | # ### commands auto generated by Alembic - please adjust! 
### 78 | op.drop_table('labelstudio_task_completion_aggregate') 79 | op.drop_table('labelstudio_task_aggregate') 80 | # ### end Alembic commands ### 81 | -------------------------------------------------------------------------------- /my_dbt_project/models/dashboards/thematic_query_ocean.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental' 3 | ,unique_key=['id'] 4 | ) 5 | }} 6 | 7 | with clean_keywords AS ( 8 | SELECT 9 | "public"."keywords"."id" AS "id", 10 | json_array_elements( 11 | "public"."keywords"."keywords_with_timestamp" :: json 12 | ) AS kw 13 | FROM 14 | "public"."keywords" 15 | WHERE 16 | "public"."keywords"."start" >= '2025-01-01' 17 | AND "public"."keywords"."number_of_keywords" > 0 18 | AND "public"."keywords"."country" = 'france' 19 | AND "public"."keywords"."channel_title" <> 'C8' 20 | ), 21 | 22 | filtered_keywords AS ( 23 | SELECT 24 | * 25 | FROM clean_keywords 26 | INNER JOIN "public"."dictionary" 27 | ON "public"."dictionary"."keyword" = clean_keywords.kw ->> 'keyword' 28 | AND "public"."dictionary"."theme" LIKE clean_keywords.kw ->> 'theme' || '%' -- ensure a match with the indirect theme inside the dictionary table 29 | WHERE 30 | "public"."dictionary"."keyword" IN ( 31 | 'acidification des océans', 32 | 'acidification des oceans', 33 | 'algues vertes', 34 | 'aménagement résilient', 35 | 'chalut', 36 | 'chalutage', 37 | 'chalutier', 38 | 'conservation marine', 39 | 'deep sea mining', 40 | 'dessalement de l’eau de mer', 41 | 'élévation du niveau de la mer', 42 | 'élévation du niveau des océans', 43 | 'érosion des côtes', 44 | 'érosion du littoral', 45 | 'exploitation fonds marins', 46 | 'exploitation gazière', 47 | 'exploitation pétrolière', 48 | 'filets de pêche', 49 | 'filets maillants', 50 | 'gestion du littoral', 51 | 'halieutique', 52 | 'hausse du niveau de la mer', 53 | 'hausse du niveau des océans', 54 | 'industrie de la pêche', 55 | 'journée mondiale des océans', 56 | 'limiter l’érosion des côtes', 57 | 'littoral', 58 | 'macro déchet plastique', 59 | 'mer', 60 | 'micro déchet plastique', 61 | 'montée du niveau de la mer', 62 | 'montée du niveau des océans', 63 | 'nano plastique', 64 | 'océan', 65 | 'océanographe', 66 | 'palangre', 67 | 'parc naturel marin', 68 | 'pêche artisanale', 69 | 'pêche au large', 70 | 'pêche côtière', 71 | 'pêche durable', 72 | 'pêche industrielle', 73 | 'pêche professionnelle', 74 | 'pêche responsable', 75 | 'pêcheur', 76 | 'petite pêche', 77 | 'plan de prévention des risques littoraux', 78 | 'pollution de la mer', 79 | 'protection des côtes', 80 | 'protection des océans', 81 | 'quota de pêche', 82 | 'réchauffement des océans', 83 | 'recul du trait de côte', 84 | 'septième continent', 85 | 'stress thermique', 86 | 'système de drainage', 87 | 'surpêche', 88 | 'the metals company', 89 | 'zone marine protégée', 90 | 'zone maritime' 91 | ) 92 | ), 93 | 94 | distinct_kw AS ( 95 | SELECT 96 | DISTINCT(id) AS "distinct_id" 97 | FROM 98 | filtered_keywords 99 | ) 100 | 101 | SELECT 102 | "public"."keywords"."id", 103 | "public"."keywords"."start", 104 | "public"."keywords"."channel_title", 105 | "public"."keywords"."plaintext", 106 | "public"."keywords"."number_of_keywords", 107 | "public"."keywords"."keywords_with_timestamp", 108 | "public"."keywords"."country", 109 | "public"."keywords"."channel_name" 110 | FROM 111 | "public"."keywords" 112 | INNER JOIN distinct_kw ON distinct_kw.distinct_id = "public"."keywords".id 113 | WHERE 114 | "public"."keywords"."start" >=
'2025-01-01' 115 | AND "public"."keywords"."number_of_keywords" > 0 116 | AND "public"."keywords"."country" = 'france' 117 | AND "public"."keywords"."channel_title" <> 'C8' 118 | AND "public"."keywords"."channel_title" IS NOT NULL 119 | AND "public"."keywords"."channel_title" <> '' -------------------------------------------------------------------------------- /mockwebsite/republiquepyrenees_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.larepubliquedespyrenees.fr/pyrenees-atlantiques/pontiacq-viellepinte/pontiacq-lamayou-c-est-parti-pour-le-36e-tournoi-de-pala-17196604.php2023-10-25T11:00:49+02:00https://images.larepubliquedespyrenees.fr/17196604/1200x-1/morlaasvic-bilh-0e06cc7df3274c2190c3d751fbe2f787-151648-ph0.jpgLe premier match du tournoi a vu la victoire de Sébastien Pina et Fabrice Lajus contre Romain Tillet et Maxime Delas.La République des Pyrénéesfr2023-10-25T11:00:49+02:00Pontiacq-Lamayou : c’est parti pour le 36e tournoi de pala !https://www.larepubliquedespyrenees.fr/pyrenees-atlantiques/vallee-d-aspe/vallee-d-aspe-des-changements-au-1er-novembre-pour-le-transport-a-la-demande-17196907.php2023-10-25T10:55:33+02:00https://images.larepubliquedespyrenees.fr/17196907/1200x-1/oloronvalleesbearnaises-6b1cd659e6db43dda3cc25d5e4b7efaa-154147-ph0.jpgLes panneaux signalétiques jaune et blanc ont fleuri dans chaque commune.La République des Pyrénéesfr2023-10-25T10:55:33+02:00Vallée d’Aspe : des changements au 1er novembre pour le transport à la demandehttps://www.larepubliquedespyrenees.fr/sport/equitation/le-concours-5-etoiles-de-pau-devient-un-evenement-familial-17147228.php2023-10-25T10:55:01+02:00https://images.larepubliquedespyrenees.fr/17147228/1200x-1/rep-10211-hd141476.jpgL’an dernier, 40 000 personnes ont assisté au concours.La République des Pyrénéesfr2023-10-25T10:55:01+02:00Le concours 5 étoiles de Pau devient un événement familialhttps://www.larepubliquedespyrenees.fr/societe/afp/evasion-par-helicoptere-de-redoine-faid-le-verdict-attendu-en-fin-d-apres-midi-17205823.php2023-10-25T10:49:48+02:00https://images.larepubliquedespyrenees.fr/17205823/1200x-1/pp-6538d860a43f5e284d9c2bef-ph0.jpgCroquis d'audience de Rédoine Faïd à l'ouverture de son procès devant la cour d'assises de Paris, le 5 septembre 2023La République des Pyrénéesfr2023-10-25T10:49:48+02:00Evasion par hélicoptère de Rédoine Faïd: le verdict attendu en fin d'après-midihttps://www.larepubliquedespyrenees.fr/culture-et-loisirs/pyrenees-gaming-notre-jeu-du-mois-assassin-s-creed-mirage-un-retour-aux-sources-de-la-saga-17170841.php2023-10-25T10:49:27+02:00https://images.larepubliquedespyrenees.fr/17170841/1200x-1/lcl3ybzh.jpg« Assassin’s Creed Mirage » a été développé par Ubisoft Bordeaux.La République des Pyrénéesfr2023-10-25T10:49:27+02:00▶️ Pyrénées Gaming. 
Notre jeu du mois : « Assassin’s Creed Mirage », « un retour aux sources, de la saga » -------------------------------------------------------------------------------- /mockwebsite/liberation_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.liberation.fr/international/moyen-orient/en-direct-guerre-hamas-israel-otages-liberees-macron-a-tel-aviv-bombardements-sur-gaza-crainte-dun-embrasement-regional-aide-humanitaire-retrouvez-toutes-les-informations-de-ce-mardi-24-octobre-20231024_6DU6EBVRLZAELLAYU47IAHF6Z4/2023-10-24T08:27:52.306Zalways0.5Libérationfr2023-10-24T08:27:52.306Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/SKQApBHpBaSJVcpqIDj1h4O-sfU=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/5RDM4TAUGFEZPIHYWG3CVDUR7Y.jpghttps://www.liberation.fr/politique/elections/le-gros-bobard-de-jean-philippe-tanguy-sur-le-gud-ennemi-historique-du-rassemblement-national-20231023_EWA5NEN4QFEUXHLIN74PSPEDJ4/2023-10-23T15:15:23.928Zalways0.5Libérationfr2023-10-23T15:15:23.928Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/3hZXTi8Ccr2O3s6zyYqk8-Us3Qw=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/AREAUQIPLZCWFKB6HUTFE7VQ24.jpghttps://www.liberation.fr/societe/police-justice/chateau-de-versailles-un-lanceur-de-fausse-alerte-condamne-a-huit-mois-de-prison-avec-sursis-20231023_F2KK3TWLVVGSDAJOPW4KM6OCZQ/2023-10-23T17:16:09.315Zalways0.5Libérationfr2023-10-23T17:16:09.315Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/cJtbpHCkwdNZbFOVGCSkmRz9FUs=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/DNYUPQGQ2JE2NEWLG4UQLCIYAY.jpghttps://www.liberation.fr/international/europe/plusieurs-disparus-apres-une-collision-entre-deux-cargos-en-mer-du-nord-20231024_325S36NYBRGRLJ7LUAYQ2K5TKQ/2023-10-24T07:57:03.897Zalways0.5Libérationfr2023-10-24T07:57:03.897Zhttps://liberation-liberation-prod.cdn.arcpublishing.com/resizer/nIoB0Sv-h1lexX5KgABQaf4px5Y=/cloudfront-eu-central-1.images.arcpublishing.com/liberation/KIBODBZQNREILHF6YWE7KFF4Z4.jpg -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python>=3.9 or backports.zoneinfo library. 
20 | # Any required deps can installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to ZoneInfo() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # set to 'true' to search source files recursively 55 | # in each "version_locations" directory 56 | # new in Alembic version 1.10 57 | # recursive_version_locations = false 58 | 59 | # the output encoding used when revision files 60 | # are written from script.py.mako 61 | # output_encoding = utf-8 62 | sqlalchemy.url = postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB} 63 | 64 | [post_write_hooks] 65 | # post_write_hooks defines scripts or Python functions that are run 66 | # on newly generated revision scripts. 
See the documentation for further 67 | # detail and examples 68 | 69 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 70 | # hooks = black 71 | # black.type = console_scripts 72 | # black.entrypoint = black 73 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 74 | 75 | # lint with attempts to fix using "ruff" - use the exec runner, execute a binary 76 | # hooks = ruff 77 | # ruff.type = exec 78 | # ruff.executable = %(here)s/.venv/bin/ruff 79 | # ruff.options = --fix REVISION_SCRIPT_FILENAME 80 | 81 | # Logging configuration 82 | [loggers] 83 | keys = root,sqlalchemy,alembic 84 | 85 | [handlers] 86 | keys = console 87 | 88 | [formatters] 89 | keys = generic 90 | 91 | [logger_root] 92 | level = WARN 93 | handlers = console 94 | qualname = 95 | 96 | [logger_sqlalchemy] 97 | level = WARN 98 | handlers = 99 | qualname = sqlalchemy.engine 100 | 101 | [logger_alembic] 102 | level = INFO 103 | handlers = 104 | qualname = alembic 105 | 106 | [handler_console] 107 | class = StreamHandler 108 | args = (sys.stderr,) 109 | level = NOTSET 110 | formatter = generic 111 | 112 | [formatter_generic] 113 | format = %(levelname)-5.5s [%(name)s] %(message)s 114 | datefmt = %H:%M:%S 115 | -------------------------------------------------------------------------------- /quotaclimat/data_processing/mediatree/i8n/germany/channel_program.py: -------------------------------------------------------------------------------- 1 | channels_programs_germany = [ 2 | {"channel_name": "daserste", "start": "05:30", "end": "09:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "ZDF-Morgenmagazin", "program_type": "Information - Magazine"}, 3 | {"channel_name": "daserste", "start": "12:00", "end": "14:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Mittagsmagazin", "program_type": "Information - Magazine"}, 4 | {"channel_name": "daserste", "start": "17:00", "end": "18:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Tagesschau", "program_type": "Information - Journal"}, 5 | {"channel_name": "daserste", "start": "19:30", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Prime Time", "program_type": "Entertainment - Various"}, 6 | {"channel_name": "daserste", "start": "21:45", "end": "00:00", "weekday": "6", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Sunday Special", "program_type": "Information - Magazine"}, 7 | 8 | {"channel_name": "zdf-neo", "start": "00:00", "end": "01:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Nighttime Programming", "program_type": "Entertainment - Talk Show"}, 9 | {"channel_name": "zdf-neo", "start": "05:30", "end": "11:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "ZDF-Morgenmagazin", "program_type": "Information - Journal"}, 10 | {"channel_name": "zdf-neo", "start": "12:00", "end": "14:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Mittagsmagazin", "program_type": "Information - Magazine"}, 11 | {"channel_name": "zdf-neo", "start": "21:30", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Late Evening Show", "program_type": "Entertainment - Various"}, 12 | 13 | 
{"channel_name": "rtl-television", "start": "00:00", "end": "01:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "RTL Late Night", "program_type": "Entertainment - Talk Show"}, 14 | {"channel_name": "rtl-television", "start": "06:00", "end": "09:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Guten Morgen Deutschland ", "program_type": "Information - Magazine"}, 15 | {"channel_name": "rtl-television", "start": "12:00", "end": "15:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Midday Show", "program_type": "Entertainment - Various"}, 16 | {"channel_name": "rtl-television", "start": "18:30", "end": "20:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "RTL Aktuell", "program_type": "Information - Journal"}, 17 | {"channel_name": "rtl-television", "start": "22:00", "end": "00:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Primetime Shows", "program_type": "Entertainment - Various"}, 18 | 19 | {"channel_name": "sat1", "start": "05:30", "end": "10:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Sat.1 Frühstücksfernsehen", "program_type": "Information - Magazine"}, 20 | {"channel_name": "sat1", "start": "19:30", "end": "20:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Prime Time Show", "program_type": "Entertainment - Various"}, 21 | 22 | {"channel_name": "prosieben", "start": "17:00", "end": "20:30", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "taff & Galileo", "program_type": "Information - Magazine"}, 23 | 24 | {"channel_name": "kabel-eins", "start": "16:30", "end": "18:00", "weekday": "weekday", "program_grid_start": "2023-04-01", "program_grid_end": "", "program_name": "Abenteuer Leben täglich ", "program_type": "Information - Magazine"}, 25 | ] 26 | -------------------------------------------------------------------------------- /mockwebsite/nicematin_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.nicematin.com/education/cest-un-scandale-une-mere-de-famille-en-colere-apres-avoir-mis-de-longs-mois-a-trouver-un-mode-de-garde-pour-sa-fille-sur-la-cote-dazur-881497 5 | 6 | Nice-Matin 7 | fr 8 | 2023-10-25T10:55:00+02:00"C'est un scandale": une mère de famille en colère après avoir mis de longs mois à trouver un mode de garde pour sa fille sur la Côte-d'Azurhttps://fyooyzbm.filerobot.com/v7/nounou2-C8iVj9UI.jpg?vh=bb8c9a&ci_seal=1795970eb9&w=750&h=375&gravity=auto&func=cropAprès de long mois, Emilie a fini par trouver une solution de garde pour sa fille Mélina. Non sans répercussions sur sa vie professionnelle. 9 | https://www.nicematin.com/faits-divers/a-nice-la-replique-dun-gilet-tactique-abandonne-avec-une-grenade-provoque-lintervention-de-la-police-881520 10 | 11 | Nice-Matin 12 | fr 13 | 2023-10-25T10:49:00+02:00À Nice, la réplique d'un gilet tactique abandonné avec une grenade provoque l'intervention de la policehttps://fyooyzbm.filerobot.com/v7/maxmatinarch530448-Zr26gJZK.jpg?vh=9bf068&ci_seal=812f0dc672&w=750&h=375&gravity=auto&func=cropL'intervention a eu lieu rue Delille à Nice. 
14 | https://www.nicematin.com/temoignage/rien-que-par-le-bouche-a-oreille-dans-ma-residence-jai-deja-des-appels-a-51-ans-elle-plaque-tout-pour-devenir-assistante-maternelle-a-nice-881495 15 | 16 | Nice-Matin 17 | fr 18 | 2023-10-25T10:35:00+02:00"Rien que par le bouche-à-oreille dans ma résidence, j’ai déjà des appels": à 51 ans, elle plaque tout pour devenir assistante maternelle à Nicehttps://fyooyzbm.filerobot.com/v7/assistantenounou+%281%29-cHTI0xtv.webp?ci_seal=30e64b9995&tl_px=6,9&br_px=1270,735&w=750&h=375&gravity=auto&func=cropDans les Alpes-Maritimes, près de 500 assistants maternels ont quitté leur fonction depuis quatre ans. 19 | https://www.nicematin.com/environnement/totalenergies-accuse-par-greenpeace-detre-implique-dans-33-projets-fossiles-catastrophiques-pour-le-climat-881516 20 | 21 | Nice-Matin 22 | fr 23 | 2023-10-25T10:35:00+02:00TotalEnergies accusé par Greenpeace d'être impliqué dans 33 projets fossiles "catastrophiques pour le climat"https://fyooyzbm.filerobot.com/v7/000_33A94W3-g3Zrfh2z.jpg?vh=7d1a0a&ci_seal=df517fd0af&w=750&h=375&gravity=auto&func=cropTotalEnergies participe à 33 projets de gaz et de pétrole "super-émetteurs" en gaz à effet de serre, accuse mercredi l'ONG Greenpeace dans une étude visant à démontrer une "logique d'expansion fossile" en contradiction avec les objectifs climatiques. 24 | -------------------------------------------------------------------------------- /mockwebsite/letelegramme_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://www.letelegramme.fr/monde/coups-de-feu-a-bruxelles-deux-morts-le-suspect-en-fuite-6450252.php 5 | 6 | https://media.letelegramme.fr/api/v1/images/view/652d8f34710625629665f40a/web_golden_xxl/652d8f34710625629665f40a.1 7 | Un périmètre de sécurité a été installé autour de la place Sainctelette. (Hatim Kaghat/AFP) 8 | Un périmètre de sécurité a été installé autour de la place Sainctelette. 9 | 10 | 11 | 12 | Le Télégramme 13 | fr 14 | 15 | 2023-10-16T19:29:56+00:00 16 | Coups de feu à Bruxelles : deux morts, le suspect en fuite, la piste terroriste évoquée 17 | 18 | 19 | 20 | https://www.letelegramme.fr/monde/mali-la-mission-de-lonu-engage-sous-tension-une-nouvelle-phase-de-son-retrait-6450249.php 21 | 22 | 23 | Le Télégramme 24 | fr 25 | 26 | 2023-10-16T19:18:00+00:00 27 | Mali : la mission de l’Onu engage sous tension une nouvelle phase de son retrait 28 | 29 | 30 | 31 | https://www.letelegramme.fr/morbihan/vannes-56000/circulation-et-stationnement-a-la-gare-de-vannes-ca-va-etre-tres-complique-pendant-deux-ans-6450250.php 32 | 33 | https://media.letelegramme.fr/api/v1/images/view/652d8d905a16a826de416f33/web_golden_xxl/652d8d905a16a826de416f33.1 34 | Le côté sud de la gare vu d’en haut. L’avenue Favrel et Lincy deviendra une voie de bus dans le sens descendant, une voie pour les voitures et bus dans le sens montant. Le parvis de la gare sera élargi et végétalisé. Les vélos y trouveront leur place. (Image : Villes et paysages) 35 | Le côté sud de la gare vu d’en haut. L’avenue Favrel et Lincy deviendra une voie de bus dans le sens descendant, une voie pour les voitures et bus dans le sens montant. Le parvis de la gare sera élargi et végétalisé. Les vélos y trouveront leur place. 
36 | 37 | 38 | 39 | Le Télégramme 40 | fr 41 | 42 | 2023-10-16T19:13:00+00:00 43 | Circulation et stationnement à la gare de Vannes : « Ça va être très compliqué pendant deux ans » 44 | Futur quartier de la gare de Vannes,Gare 45 | 46 | 47 | 48 | https://www.letelegramme.fr/finistere/ergue-gaberic-29500/cinq-blesses-dans-un-accident-de-circulation-a-ergue-gaberic-6450248.php 49 | 50 | https://media.letelegramme.fr/api/v1/images/view/652d8cd651450e731a713e6a/web_golden_xxl/652d8cd651450e731a713e6a.1 51 | Un homme a été transporté dans un état critique à l’hôpital de Quimper. (Photo d’illustration Lionel Le Saux/Le Télégramme) 52 | Un homme a été transporté dans un état critique à l’hôpital de Quimper. 53 | 54 | 55 | 56 | Le Télégramme 57 | fr 58 | 59 | 2023-10-16T19:08:00+00:00 60 | Cinq blessés dans un accident de la circulation à Ergué-Gabéric 61 | Faits divers,Accident 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /my_dbt_project/pytest_tests/test_dbt_model_analytics.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | import subprocess 5 | from decimal import * 6 | 7 | import psycopg2 8 | import pytest 9 | 10 | from my_dbt_project.pytest_tests.test_dbt_model_homepage import run_dbt_command 11 | 12 | 13 | @pytest.fixture(scope="module") 14 | def db_connection(): 15 | conn = psycopg2.connect( 16 | dbname=os.getenv("POSTGRES_DB", ""), 17 | user=os.getenv("POSTGRES_USER", ""), 18 | password=os.getenv("POSTGRES_PASSWORD", ""), 19 | host=os.getenv("POSTGRES_HOST", ""), 20 | port=os.getenv("POSTGRES_PORT", ""), 21 | ) 22 | yield conn 23 | conn.close() 24 | 25 | 26 | def seed_dbt_labelstudio(): 27 | """Run dbt seed once before any test.""" 28 | commands = [ 29 | "seed", 30 | "--select", 31 | "labelstudio_task_aggregate", 32 | "--select", 33 | "labelstudio_task_completion_aggregate", 34 | "--full-refresh", 35 | ] 36 | logging.info(f"pytest running dbt seed : {commands}") 37 | run_dbt_command(commands) 38 | # seed and dbt run upstream tables 39 | commands = [ 40 | "seed", 41 | "--select", 42 | "program_metadata", 43 | "--select", 44 | "time_monitored", 45 | "--select", 46 | "keywords", 47 | "--select", 48 | "dictionary", 49 | "--select", 50 | "keyword_macro_category", 51 | "--full-refresh", 52 | ] 53 | run_dbt_command(commands) 54 | 55 | seed_dbt_labelstudio() 56 | 57 | @pytest.fixture(scope="module", autouse=True) 58 | def run_analytics(): 59 | logging.info("Run dbt for the thematics model once before related tests.") 60 | run_dbt_command( 61 | [ 62 | "run", 63 | "--exclude", 64 | "core_query_causal_links", 65 | "--exclude", 66 | "task_global_completion", 67 | "--exclude", 68 | "environmental_shares_with_desinfo_counts", 69 | "--full-refresh", 70 | ] 71 | ) 72 | logging.info("pytest running dbt task_global_completion") 73 | run_dbt_command( 74 | [ 75 | "run", 76 | "--select", 77 | "task_global_completion", 78 | "--select", 79 | "environmental_shares_with_desinfo_counts", 80 | "--target", 81 | "analytics", 82 | "--full-refresh", 83 | ] 84 | ) 85 | 86 | 87 | def test_task_global_completion(db_connection): 88 | with db_connection.cursor() as cur: 89 | cur.execute(""" 90 | SELECT 91 | "analytics"."task_global_completion"."task_completion_aggregate_id", 92 | "analytics"."task_global_completion"."country", 93 | "analytics"."task_global_completion"."data_item_channel_name", 94 | "analytics"."task_global_completion"."mesinfo_choice", 95 | 
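-- note: sum_duration_minutes may legitimately be NULL for this model (the expected tuple below ends with None)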
"analytics"."task_global_completion"."sum_duration_minutes" 96 | FROM analytics.task_global_completion 97 | ORDER BY analytics.task_global_completion.task_completion_aggregate_id 98 | LIMIT 1 99 | """) 100 | row = cur.fetchone() 101 | 102 | expected = ( 103 | "0e7ee7f70a223e21b10c0dad27464bebb8cc6a7f4bd5f5b7746c661a44ec7b45", 104 | "france", 105 | "europe1", 106 | "Correct", 107 | None, 108 | ) 109 | 110 | assert row == expected, f"Unexpected values: {row}" 111 | 112 | def test_environmental_shares_desinfo(db_connection): 113 | with db_connection.cursor() as cur: 114 | cur.execute(""" 115 | SELECT 116 | "analytics"."environmental_shares_with_desinfo_counts"."start", 117 | "analytics"."environmental_shares_with_desinfo_counts"."channel_name", 118 | "analytics"."environmental_shares_with_desinfo_counts"."sum_duration_minutes", 119 | "analytics"."environmental_shares_with_desinfo_counts"."weekly_perc_climat", 120 | "analytics"."environmental_shares_with_desinfo_counts"."total_mesinfo" 121 | FROM analytics.environmental_shares_with_desinfo_counts 122 | ORDER BY analytics.environmental_shares_with_desinfo_counts.start 123 | LIMIT 1 124 | """) 125 | row = cur.fetchone() 126 | expected = ( 127 | datetime.datetime(2025, 1, 27, 0, 0), 128 | "arte", 129 | 65, 130 | 0.13846153846153847, 131 | 0, 132 | ) 133 | assert row == expected -------------------------------------------------------------------------------- /my_dbt_project/models/dashboards/core_query_thematics_keywords.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='incremental', 3 | unique_key=['week','channel_title'], 4 | on_schema_change='append_new_columns' 5 | ) 6 | }} 7 | 8 | -- Core Query Thematics Keywords makes only sense when looking for keywords,theme, and category together (otherwise duplicates 9 | -- as a keyword inside keyword_with_timestamp is present 4 times, if the keyword has 4 themes) 10 | 11 | WITH program_durations AS ( 12 | SELECT 13 | pm.channel_title, 14 | pm.channel_program, 15 | pm.weekday, 16 | CAST(pm.program_grid_start AS date) AS program_start, 17 | CAST(pm.program_grid_end AS date) AS program_end, 18 | pm.duration_minutes 19 | FROM public.program_metadata pm 20 | WHERE pm.country = 'france' 21 | ), 22 | program_weeks AS ( 23 | SELECT 24 | pd.channel_title, 25 | pd.channel_program, 26 | pd.duration_minutes, 27 | pd.weekday, 28 | generate_series( 29 | date_trunc('week', pd.program_start), 30 | date_trunc('week', pd.program_end), 31 | interval '1 week' 32 | )::date AS week_start 33 | FROM program_durations pd 34 | ), 35 | program_airings AS ( 36 | SELECT 37 | channel_title, 38 | channel_program, 39 | duration_minutes, 40 | -- calculate actual airing date per week + weekday offset 41 | (week_start + (weekday - 1) * INTERVAL '1 day')::date AS airing_date, 42 | week_start 43 | FROM program_weeks 44 | ), 45 | weekly_program_durations AS ( 46 | SELECT 47 | channel_title, 48 | week_start AS week, 49 | SUM(duration_minutes) AS weekly_duration_minutes 50 | FROM program_airings 51 | GROUP BY channel_title, week_start 52 | ), 53 | keyword_occurrences AS ( 54 | SELECT DISTINCT 55 | COALESCE(pm.channel_title, k.channel_title) AS channel_title, 56 | DATE_TRUNC('week', k.start)::date AS week, 57 | k.start AS occurrence_time, 58 | -- Semantic tags 59 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%solution%' THEN TRUE ELSE FALSE END AS is_solution, 60 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%consequence%' THEN TRUE ELSE FALSE END AS is_consequence, 61 | CASE WHEN 
LOWER(kw ->> 'theme') LIKE '%cause%' THEN TRUE ELSE FALSE END AS is_cause, 62 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%concepts_generaux%' THEN TRUE ELSE FALSE END AS is_general_concepts, 63 | CASE WHEN LOWER(kw ->> 'theme') LIKE '%constat%' THEN TRUE ELSE FALSE END AS is_statement, 64 | -- Crisis type 65 | CASE 66 | WHEN LOWER(kw ->> 'theme') LIKE '%climat%' THEN 'Crise climatique' 67 | WHEN LOWER(kw ->> 'theme') LIKE '%biodiversite%' THEN 'Crise de la biodiversité' 68 | WHEN LOWER(kw ->> 'theme') LIKE '%ressource%' THEN 'Crise des ressources' 69 | ELSE 'Autre' 70 | END AS crise_type, 71 | kw ->> 'theme' AS theme, 72 | kw ->> 'keyword' AS keyword 73 | FROM public.keywords k 74 | LEFT JOIN public.program_metadata pm 75 | ON k.channel_program = pm.channel_program 76 | AND k.channel_name = pm.channel_name 77 | AND ( 78 | ( 79 | CASE 80 | WHEN ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7) = 0 THEN 7 81 | ELSE ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7) 82 | END = pm.weekday 83 | ) 84 | ) 85 | AND CAST(k.start AS date) BETWEEN CAST(pm.program_grid_start AS date) 86 | AND CAST(pm.program_grid_end AS date) 87 | , json_array_elements(k.keywords_with_timestamp::json) AS kw 88 | WHERE 89 | LOWER(kw ->> 'theme') NOT LIKE '%indirect%' 90 | AND k.country = 'france' 91 | ) 92 | SELECT 93 | ko.channel_title, 94 | ko.week, 95 | COALESCE(NULLIF(d.category, ''), 'Transversal') AS category, 96 | d.high_risk_of_false_positive, 97 | ko.is_solution, 98 | ko.is_consequence, 99 | ko.is_cause, 100 | ko.is_general_concepts, 101 | ko.is_statement, 102 | ko.crise_type, 103 | ko.theme, 104 | ko.keyword, 105 | kmc.general, 106 | kmc.agriculture, 107 | kmc.transport, 108 | kmc.batiments, 109 | kmc.energie, 110 | kmc.industrie, 111 | kmc.eau, 112 | kmc.ecosysteme, 113 | kmc.economie_ressources, 114 | COUNT(*) AS count, 115 | COALESCE(wpd.weekly_duration_minutes, 0) AS sum_duration_minutes 116 | FROM keyword_occurrences ko 117 | LEFT JOIN public.dictionary d 118 | ON d.keyword = ko.keyword AND d.theme LIKE ko.theme || '%' -- ensure matc with indirect theme inside the dictionary table 119 | LEFT JOIN weekly_program_durations wpd 120 | ON wpd.channel_title = ko.channel_title AND wpd.week = ko.week 121 | LEFT JOIN public.keyword_macro_category kmc 122 | ON kmc.keyword = ko.keyword 123 | GROUP BY 124 | ko.channel_title, 125 | ko.week, 126 | d.high_risk_of_false_positive, 127 | COALESCE(NULLIF(d.category, ''), 'Transversal'), 128 | ko.is_solution, 129 | ko.is_consequence, 130 | ko.is_cause, 131 | ko.is_general_concepts, 132 | ko.is_statement, 133 | ko.crise_type, 134 | ko.theme, 135 | ko.keyword, 136 | kmc.general, 137 | kmc.agriculture, 138 | kmc.transport, 139 | kmc.batiments, 140 | kmc.energie, 141 | kmc.industrie, 142 | kmc.eau, 143 | kmc.ecosysteme, 144 | kmc.economie_ressources, 145 | wpd.weekly_duration_minutes 146 | ORDER BY 147 | ko.channel_title, ko.week, ko.crise_type -------------------------------------------------------------------------------- /test/sitemap/test_main_import_api.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from modin.pandas.dataframe import DataFrame 4 | 5 | from quotaclimat.data_processing.mediatree.update_pg_keywords import * 6 | 7 | from postgres.insert_data import (clean_data, 8 | insert_data_in_sitemap_table) 9 | 10 | from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db, drop_tables, empty_tables 11 | from postgres.insert_data import save_to_pg 12 | from 
quotaclimat.data_processing.mediatree.detect_keywords import * 13 | from quotaclimat.data_processing.mediatree.api_import import * 14 | from quotaclimat.data_processing.mediatree.keyword.stop_words import STOP_WORDS 15 | from quotaclimat.data_processing.mediatree.stop_word.main import save_append_stop_word 16 | from quotaclimat.data_processing.mediatree.s3.api_to_s3 import parse_reponse_subtitle 17 | from test_utils import get_localhost, debug_df, compare_unordered_lists_of_dicts 18 | 19 | import time as t 20 | 21 | 22 | def insert_mediatree_json(conn, json_file_path='test/sitemap/mediatree.json'): 23 | create_tables(conn) 24 | empty_tables(get_db_session(conn), stop_word=False) 25 | logging.info(f"reading {json_file_path}") 26 | with open(json_file_path, 'r') as file: 27 | json_response = json.load(file) 28 | start_time = t.time() 29 | df = parse_reponse_subtitle(json_response) 30 | df = filter_and_tag_by_theme(df) 31 | df["id"] = df.apply(lambda x: add_primary_key(x), axis=1) 32 | end_time = t.time() 33 | logging.info(f"Elapsed time for api import {end_time - start_time}") 34 | 35 | # must df._to_pandas() because to_sql does not handle modin dataframe 36 | save_to_pg(df._to_pandas(), keywords_table, conn) 37 | 38 | return len(df) 39 | 40 | def insert_stop_word(conn): 41 | logging.info("test saving stop words") 42 | to_save = [] 43 | for stop in STOP_WORDS: 44 | stop_word = dict() 45 | stop_word['id'] = stop 46 | stop_word['context'] = stop 47 | to_save.append(stop_word) 48 | 49 | save_append_stop_word(conn, to_save) 50 | 51 | def test_main_api_import(): 52 | conn = connect_to_db() 53 | drop_tables(conn) 54 | create_tables(conn) 55 | insert_stop_word(conn) 56 | len_df = insert_mediatree_json(conn, json_file_path="test/sitemap/light.json") 57 | 58 | session = get_db_session(conn) 59 | saved_keywords = get_keywords_columns(session, start_date="2024-02-01", end_date="2024-02-29") 60 | assert len(saved_keywords) != 0 61 | assert len(saved_keywords) == len_df 62 | 63 | def test_first_row_api_import(): 64 | primary_key = "29d2b1f8267b206cb62e475b960de3247e835273f396af012f5ce21bf3056472" 65 | 66 | specific_keyword = get_keyword(primary_key) 67 | logging.info(f"Getting {primary_key} :\n {specific_keyword}") 68 | assert set(specific_keyword.theme) == set([ 69 | 'biodiversite_concepts_generaux_indirectes', 70 | 'changement_climatique_consequences_indirectes', 71 | 'changement_climatique_constat_indirectes' 72 | ]) 73 | 74 | assert specific_keyword.number_of_keywords == 0 75 | 76 | def test_second_row_api_import(): 77 | 78 | primary_key = "9f0fb1987371c1dc0b4a165a11feb7ca7ed9b6f9f40d3d6b4fc0748e2ca59c3f" 79 | specific_keyword = get_keyword(primary_key) 80 | assert len(set(specific_keyword.theme)) > 0 81 | assert specific_keyword.number_of_keywords > 0 82 | 83 | 84 | def test_third_row_api_import(): 85 | primary_key = "32cb864fe56a4436151bcf78c385a7cc4226316e0563a298ac6988d1b8ee955b" 86 | 87 | specific_keyword = get_keyword(primary_key) 88 | assert len(set(specific_keyword.theme)) > 0 89 | 90 | assert specific_keyword.number_of_keywords == 1 91 | 92 | def test_get_api_stop(): 93 | conn = connect_to_db() 94 | session = get_db_session(conn) 95 | stopwords = get_stop_words(session, country=None) 96 | assert type(stopwords[0]) == str 97 | 98 | def test_transform_raw_keywords_srt_to_mediatree(): 99 | conn = connect_to_db() 100 | 101 | channel = "LAUNE" 102 | primary_key = "df0d86983f0c4ed074800f5cdabbd577671b90845fb6208a5de1ae3802fb10e0" 103 | df: DataFrame= 
pd.read_parquet(path=f"i8n/mediatree_output/year=2024/month=10/day=1/channel={channel}") 104 | df_programs = get_programs() 105 | output = transform_raw_keywords(df, df_programs=df_programs,country=BELGIUM) 106 | 107 | output_dict = output.to_dict(orient='records') 108 | filtered = output[output["id"] == primary_key] 109 | row_dict = filtered.iloc[0].to_dict() 110 | assert row_dict["country"] == "belgium" 111 | assert row_dict["channel_name"] == channel 112 | 113 | assert len(output) == 29 114 | save_to_pg(df=output,conn=conn, table=keywords_table) 115 | specific_keyword = get_keyword(primary_key) 116 | assert set(specific_keyword.theme) == set([ 117 | 'changement_climatique_causes_indirectes', 118 | ]) 119 | 120 | assert specific_keyword.number_of_keywords == 0 -------------------------------------------------------------------------------- /mockwebsite/leparisien_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.leparisien.fr/sports/cyclisme/tour-de-france/direct-tour-de-france-lannonce-des-parcours-2024-a-suivre-en-live-25-10-2023-SGPV57QEYVAOJKR2VTRVETMVSY.php2023-10-25T08:53:25.512ZLe Parisienfr2023-10-25T08:53:26.556Zhttps://www.leparisien.fr/resizer/pZgWLK34dnSm3PnePH4YT7PDeLI=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/RBJCS5CIAVHK3IG3DKVEMSO56Y.jpghttps://www.leparisien.fr/sports/football/sadio-mane-arrive-aux-commandes-de-bourges-foot-18-club-de-national-2-25-10-2023-Z7GNAIUG65ECXC33R7XZPG6V2E.php2023-10-25T08:52:34.982ZLe Parisienfr2023-10-25T08:52:35.420Zhttps://www.leparisien.fr/resizer/oYbqphCAWq15Lf1aZAo4uO651ZI=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/XC6BTYGH3VENLEONZWY7LRTYOY.jpghttps://www.leparisien.fr/faits-divers/le-pilote-americain-qui-a-tente-de-couper-les-moteurs-dun-avion-avait-consomme-des-hallucinogenes-25-10-2023-OBK4GDNF4NFN7MXLD4NY4FVUEU.php2023-10-25T08:50:28.302ZLe Parisienfr2023-10-25T08:50:28.762Zhttps://www.leparisien.fr/resizer/dpRGItWIAA5vHv2D6cmBGmOff7U=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/TAPHNZFSIRBAVHYEOIIN3U7AP4.jpghttps://www.leparisien.fr/futurs/punaises-de-lit-comment-sen-debarrasser-les-reconnaitre-dou-viennent-elles-posez-nous-vos-questions-25-10-2023-A5ZSPB6LSBBVLHCFIX4OIAZFWA.php2023-10-25T08:49:39.415ZLe Parisienfr2023-10-25T08:49:40.613Zhttps://www.leparisien.fr/resizer/02sdSrjueqCNoNETKkV7cTpDO_0=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/4QGY2FZT2JF4XFUFQGSNWNZ5WQ.jpghttps://www.leparisien.fr/culture-loisirs/cinema/le-syndrome-des-amours-passees-mais-pourquoi-couchent-ils-avec-leurs-ex-25-10-2023-SOCRYLNKVBHH5N7RZONQFRQLII.php2023-10-25T08:49:30.367ZLe Parisienfr2023-10-25T08:49:30.814Zhttps://www.leparisien.fr/resizer/yk9qwslNqiBUhMh5EhTUDc5JoRc=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/7W2I45MNRNHLRHKSYZK2BZ6IMI.jpg -------------------------------------------------------------------------------- /.github/workflows/deploy-main.yml: -------------------------------------------------------------------------------- 1 | name: Build & Deploy to Scaleway 2 | 3 | on: 4 | push: 5 | # Sequence of patterns matched against refs/heads 6 | branches: 7 | - main 8 | 9 | # to be able to force deploy 10 | workflow_dispatch: 11 | 12 | 13 | env: 14 | PYTHON_VERSION: '3.12' 15 | POETRY_VERSION: '2.1.3' 16 | 17 | jobs: 18 | build: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | - uses: actions/setup-python@v5 23 | with: 24 | 
python-version: ${{ env.PYTHON_VERSION }} 25 | 26 | - name: Install Poetry 27 | uses: snok/install-poetry@v1 28 | with: 29 | version: ${{ env.POETRY_VERSION }} 30 | virtualenvs-create: true 31 | virtualenvs-in-project: true 32 | installer-parallel: true 33 | - name: Poetry install & bump version 34 | run: | 35 | poetry install --only dev 36 | poetry version patch 37 | PROJECT_VERSION=$(poetry version --short) 38 | echo "PROJECT_VERSION=$PROJECT_VERSION" >> $GITHUB_ENV 39 | git config user.name barometre-github-actions 40 | git config user.email barometre-github-actions@github.com 41 | git add pyproject.toml 42 | git commit -m "[no ci]: $PROJECT_VERSION bumping version" 43 | git push origin main 44 | - name: Login to Scaleway Container Registry 45 | uses: docker/login-action@v3 46 | with: 47 | username: nologin 48 | password: ${{ secrets.SCALEWAY_API_KEY }} 49 | registry: ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }} 50 | 51 | - name: Build mediatree_import image 52 | run: docker build -f Dockerfile_api_import . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} 53 | - name: Tag mediatree_import latest image 54 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:latest 55 | - name: Push mediatree_import Image 56 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import 57 | 58 | - name: update scaleway job definition with version mediatree_import 59 | uses: jawher/action-scw@v2.34.0 60 | env: 61 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 62 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 63 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 64 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 65 | with: 66 | args: jobs definition update ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} 67 | 68 | - name: Build s3 image 69 | run: docker build -f Dockerfile_api_to_s3 . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }} 70 | - name: Tag s3 latest image 71 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:latest 72 | - name: Push s3 Image 73 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3 74 | 75 | - name: update scaleway job definition with version s3 76 | uses: jawher/action-scw@v2.34.0 77 | env: 78 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 79 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 80 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 81 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 82 | with: 83 | args: jobs definition update ${{ secrets.SCALEWAY_JOB_S3_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/s3:${{ env.PROJECT_VERSION }} 84 | 85 | - name: Build stop_word image 86 | run: docker build -f Dockerfile_stop_word . 
-t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }} 87 | - name: Tag stop_word latest image 88 | run: docker tag ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }} ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:latest 89 | - name: Push stop_word Image 90 | run: docker push --all-tags ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word 91 | 92 | - name: update scaleway job definition with version stopwords 93 | uses: jawher/action-scw@v2.34.0 94 | env: 95 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 96 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 97 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 98 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 99 | with: 100 | args: jobs definition update ${{ secrets.SCALEWAY_STOP_WORDS_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/stop_word:${{ env.PROJECT_VERSION }} 101 | 102 | - name: update scaleway job update job 103 | uses: jawher/action-scw@v2.34.0 104 | env: 105 | SCW_ACCESS_KEY: ${{ secrets.SCW_ACCESS_KEY }} 106 | SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }} 107 | SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }} 108 | SCW_ZONE: ${{ secrets.SCW_ZONE }} 109 | with: 110 | args: jobs definition update ${{ secrets.SCALEWAY_UPDATE_JOB_ID }} image-uri=${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/mediatree_import:${{ env.PROJECT_VERSION }} 111 | -------------------------------------------------------------------------------- /mockwebsite/lexpress_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.lexpress.fr/societe/evasion-de-reau-par-helicoptere-lheure-du-verdict-pour-redoine-faid-SYPRU6BXSRB27DFSOLH23QSCRY/2023-10-25T10:49:48.000+02:00always0.5L'Expressfr2023-10-25T08:49:48Zhttps://www.lexpress.fr/resizer/rBes-Zxn7XqcPvpVdnoTR_0vEIM=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/4WE5RWALHVECPOS3OAQ4U6MGUI.jpghttps://www.lexpress.fr/monde/europe/le-ministre-russe-de-la-defense-sur-la-zone-de-loperation-militaire-en-ukraine-JBK5YZUYZZFNLIDRKF54LHHEAE/2023-10-25T10:31:43.539+02:00always0.5L'Expressfr2023-10-25T08:31:43.539Zhttps://www.lexpress.fr/resizer/5OIiTmRnwqg0l6dHTKEoovcmCCM=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/UAC35INRQZFKFF6GIWQRWM6RJU.jpghttps://www.lexpress.fr/monde/proche-moyen-orient/guerre-hamas-israel-macron-va-rencontrer-le-roi-de-jordanie-a-amman-ZR4BAAKC45FRRE7O4JV454AUKY/2023-10-25T10:17:31.494+02:00always0.5L'Expressfr2023-10-25T08:17:31.494Zhttps://www.lexpress.fr/resizer/cqcHx_xhgHOc6D2tPodgor6yp5M=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/CHZX6RK67VB5TGGYQJDI4FD5K4.jpghttps://www.lexpress.fr/monde/japon-decision-de-justice-tres-attendue-sur-le-changement-detat-civil-des-personnes-transgenres-TDU6FGHBANHVPNU5ZM5FI75KIU/2023-10-25T10:02:23.000+02:00always0.5L'Expressfr2023-10-25T08:02:23Zhttps://www.lexpress.fr/resizer/tDdXsSjhswEleE1mcoIdChtNwTw=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/VCTHD6G5MJAA3NYI3BDYZ6X2VI.jpghttps://www.lexpress.fr/societe/deserts-medicaux-le-senat-retoque-la-repartition-des-medecins-VEF4G4QZDZFFRLBAAWP4UP57JU/2023-10-25T09:59:37.725+02:00always0.5L'Expressfr2023-10-25T07:59:37.725Zhttps://www.lexpress.fr/resizer/NSdrjfJ2Na62498cuhWsqyqjRuk=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/XSQJAM2USRBMZFKDB2AFJ3MMAM.jpghttps://www.lexpress.fr/monde/laide-de-lonu-a-gaza-menacee-de-paralysie-discussions-autour-dune-pause-hum
anitaire-XV7BLYTMLVGJRI2XPLSREFVC2Q/2023-10-25T09:20:53.000+02:00always0.5L'Expressfr2023-10-25T07:20:53Zhttps://www.lexpress.fr/resizer/l-bVyVd-EWXxoZLn5QoX6xp5cAI=/1200x630/cloudfront-eu-central-1.images.arcpublishing.com/lexpress/SG5MCUKVT5EDHN4HJYAONCMGDM.jpg -------------------------------------------------------------------------------- /mockwebsite/francebleu_sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | https://www.francebleu.fr/infos/faits-divers-justice/accident-villognon-le-maitre-d-hotel-blesse-au-pied-droit-sera-indemnise-1322246France Bleufr2023-10-26T09:57:37+00:00Accident Villognon : le maître d'hôtel blessé au pied droit sera indemniséFaits divers - Justice, François Hollande, justice, Infoshttps://www.francebleu.fr/infos/culture-loisirs/en-route-pour-une-nouvelle-semaine-de-cadeaux-avec-france-bleu-vaucluse-9545500France Bleufr2023-10-26T09:55:40+00:00En route pour une nouvelle semaine de cadeaux avec France Bleu Vaucluse ! Culture - Loisirs, Infoshttps://www.francebleu.fr/infos/economie-social/dans-les-deux-charentes-payez-vos-factures-d-electricite-moins-cheres-grace-a-l-achat-groupe-d-energie-9597185France Bleufr2023-10-26T09:51:10+00:00Dans les deux Charentes : payez vos factures d'électricité moins chères grâce à l'achat groupé d'énergieÉconomie - Social, Énergie, UFC Que Choisir, Électricité, Inflation, Économies d'énergie – Éco-gestes, Infoshttps://www.francebleu.fr/infos/faits-divers-justice/cette-histoire-m-a-traumatise-francis-nachbar-ancien-magistrat-publie-un-livre-sur-les-affaires-fourniret-6174798France Bleufr2023-10-26T09:47:09+00:00 "Cette histoire m'a traumatisé", Francis Nachbar, ancien magistrat publie un livre sur les affaires FourniretFaits divers - Justice, Les affaires Fourniret, justice, Monique Olivier, Enquêtes – Investigation, Infoshttps://www.francebleu.fr/sports/football/liverpool-tfc-gagner-a-anfield-c-est-si-rare-pour-un-club-francais-5163393France Bleufr2023-10-26T09:46:03+00:00Liverpool-TFC : gagner à Anfield, c'est si rare pour un club françaisFootball, TFC - Toulouse Football Club, Europa League, Toulouse, Sportshttps://www.francebleu.fr/infos/faits-divers-justice/caen-coups-de-marteau-menaces-de-mort-et-videos-humiliantes-un-jeune-homme-condamne-a-2-ans-de-prison-ferme-4264720France Bleufr2023-10-26T09:43:03+00:00Caen: coups de marteau, menaces de mort et vidéos humiliantes: un jeune homme condamné à 2 ans de prison fermeFaits divers - Justice, Infoshttps://www.francebleu.fr/infos/faits-divers-justice/seine-maritime-un-jeune-homme-de-21-ans-tue-par-balles-a-maromme-l-auteur-en-fuite-5680400France Bleufr2023-10-26T09:40:36+00:00Seine-Maritime : un jeune homme de 21 ans tué par balles à Maromme, l'auteur en fuiteFaits divers - Justice, Armes à feu, Agression, Police nationale, Enquêtes – Investigation, Infoshttps://www.francebleu.fr/infos/societe/ehpad-une-enquete-de-60-millions-de-consommateurs-pointe-du-doigt-la-qualite-des-repas-servis-6540286France Bleufr2023-10-26T09:39:07+00:00Ehpad : une enquête de 60 millions de consommateurs pointe du doigt la qualité des repas servisSociété, Maisons de retraite – Ehpad, Alimentation, Infoshttps://www.francebleu.fr/infos/societe/poule-cherche-nouvelle-maison-a-pia-un-eleveur-met-a-la-vente-ses-pondeuses-avec-poule-pour-tous-4774679France Bleufr2023-10-26T09:31:46+00:00Poule cherche nouvelle maison, à Pia un éleveur met à la vente ses pondeuses avec Poule pour tous Société, poulet, animaux, Élevage, 
Infoshttps://www.francebleu.fr/infos/faits-divers-justice/cannabis-cocaine-ecstasy-1-homme-et-1-femme-arretes-a-bagnols-sur-ceze-5907058France Bleufr2023-10-26T09:27:41+00:00Cannabis, cocaïne, ecstasy : un homme et une femme arrêtés à Bagnols-sur-CèzeFaits divers - Justice, Gard, Drogues, Police nationale, Infoshttps://www.francebleu.fr/infos/societe/une-cinquantaine-d-habitants-de-mourenx-denoncent-les-odeurs-intenables-du-methaniseur-biobearn-2192014France Bleufr2023-10-26T09:12:24+00:00Une cinquantaine d'habitants de Mourenx dénoncent les odeurs "intenables" du méthaniseur BiobéarnSociété, Entreprises, Infos --------------------------------------------------------------------------------
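
A note on the weekday matching used in my_dbt_project/models/dashboards/core_query_thematics_keywords.sql above: the expression ((EXTRACT(DOW FROM k.start)::int + 1 + 6) % 7), with 0 then remapped to 7, converts PostgreSQL's day-of-week numbering (Sunday = 0 .. Saturday = 6) into the Monday = 1 .. Sunday = 7 numbering that pm.weekday is compared against. Below is a minimal Python sketch of the same mapping, assuming ISO-style weekday semantics on the program_metadata side (the helper name is illustrative and not part of the repo):

from datetime import date, timedelta

def postgres_dow_to_iso(dow: int) -> int:
    """Mirror the SQL expression ((dow + 1 + 6) % 7), with 0 remapped to 7."""
    shifted = (dow + 1 + 6) % 7  # adding 7 is a no-op modulo 7, so this equals dow for 0..6
    return 7 if shifted == 0 else shifted

# Sanity check against Python's ISO weekday (Monday = 1 .. Sunday = 7):
for offset in range(7):
    d = date(2024, 10, 6) + timedelta(days=offset)  # 2024-10-06 is a Sunday
    postgres_dow = (d.weekday() + 1) % 7  # PostgreSQL DOW: Sunday = 0 .. Saturday = 6
    assert postgres_dow_to_iso(postgres_dow) == d.isoweekday()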