├── .env ├── .envrc ├── .github ├── actions │ ├── python_deps │ │ └── action.yaml │ ├── run_task │ │ ├── action.yaml │ │ └── run_task.sh │ └── tools │ │ └── action.yaml ├── dependabot.yaml ├── pull_request_template.md └── workflows │ ├── ad_hoc_deploy_run.yml │ ├── asana-pr-merged.yml │ ├── asana-pr-opened.yml │ ├── change_task_count.yaml │ ├── ci-yaml.yaml │ ├── ci_python.yaml │ ├── deploy-base.yaml │ ├── deploy-prod.yaml │ ├── deploy-staging.yaml │ ├── manual-deploy.yaml │ └── run-task.yaml ├── .gitignore ├── .tool-versions ├── .yamllint.yml ├── Data_Dictionary.md ├── Dockerfile ├── LICENSE ├── README.md ├── alembic.ini ├── analysis ├── check_bus.py ├── check_bus_tableau.py ├── check_data_all_days_in_LAMP_ALL_RT_fields.py ├── prism.py └── sample_data.py ├── architecture.jpg ├── docker-compose.yml ├── poetry.lock ├── pyproject.toml ├── runners ├── run_glides_parquet_converter.py ├── run_gtfs_rt_parquet_converter.py ├── run_query_s3_with_date_range.py └── run_static_trips_subquery.py ├── src └── lamp_py │ ├── __version__.py │ ├── ad_hoc │ ├── __init__.py │ ├── pipeline.py │ └── runner_001.py │ ├── aws │ ├── __init__.py │ ├── ecs.py │ ├── kinesis.py │ └── s3.py │ ├── bus_performance_manager │ ├── README.md │ ├── __init__.py │ ├── event_files.py │ ├── events_gtfs_rt.py │ ├── events_gtfs_schedule.py │ ├── events_joined.py │ ├── events_metrics.py │ ├── events_tm.py │ ├── pipeline.py │ └── write_events.py │ ├── ingestion │ ├── README.md │ ├── __init__.py │ ├── compress_gtfs │ │ ├── __init__.py │ │ ├── gtfs_schema_map.py │ │ ├── gtfs_to_parquet.py │ │ ├── pipe.py │ │ ├── pq_to_sqlite.py │ │ └── schedule_details.py │ ├── config_busloc_trip.py │ ├── config_busloc_vehicle.py │ ├── config_rt_alerts.py │ ├── config_rt_trip.py │ ├── config_rt_vehicle.py │ ├── convert_gtfs.py │ ├── convert_gtfs_rt.py │ ├── converter.py │ ├── glides.py │ ├── gtfs_rt_detail.py │ ├── gtfs_rt_structs.py │ ├── ingest_gtfs.py │ ├── light_rail_gps.py │ ├── pipeline.py │ └── utils.py │ ├── ingestion_tm │ ├── ingest.py │ ├── jobs │ │ ├── parition_table.py │ │ └── whole_table.py │ ├── pipeline.py │ └── tm_export.py │ ├── migrations │ ├── README │ ├── __init__.py │ ├── env.py │ ├── migration_template_generator.py │ ├── script.py.mako │ └── versions │ │ ├── metadata_dev │ │ ├── 001_07903947aabe_initial_changes.py │ │ └── 002_26db393ea854_update_glides_location_column_names.py │ │ ├── metadata_prod │ │ ├── 001_07903947aabe_initial_changes.py │ │ ├── 002_cce8dfee767a_re_run_input_files_from_2024_04_03.py │ │ ├── 003_26db393ea854_update_glides_location_column_names.py │ │ └── 004_a08c5fd37dbd_reprocess_422_423.py │ │ ├── metadata_staging │ │ ├── 001_07903947aabe_initial_changes.py │ │ ├── 002_26db393ea854_update_glides_location_column_names.py │ │ └── 003_a08c5fd37dbd_reprocess_422_423.py │ │ ├── performance_manager_dev │ │ ├── 001_5d9a7ee21ae5_initial_prod_schema.py │ │ ├── 002_1b53fd278b10_fix_trip_id_length.py │ │ ├── 003_ae6c6e4b2df5_extend_service_id_view.py │ │ ├── 004_45dedc21086e_canon_stop_seq.py │ │ ├── 005_96187da84955_remove_metadata.py │ │ ├── 006_2dfbde5ec151_sync_stop_trunk.py │ │ ├── 007_896dedd8a4db_dwell_time_update.py │ │ ├── 008_32ba735d080c_add_revenue_columns.py │ │ └── 009_36e7a7aee148_upgrade_sequence.py │ │ ├── performance_manager_prod │ │ ├── 001_5d9a7ee21ae5_initial_prod_schema.py │ │ ├── 002_f09e853d5672_update_prod_stop_sync.py │ │ ├── 003_2dfbde5ec151_sync_stop_trunk.py │ │ ├── 004_896dedd8a4db_dwell_time_update.py │ │ ├── 005_32ba735d080c_add_revenue_columns.py │ │ ├── 
006_36e7a7aee148_upgrade_sequence.py │ │ ├── 007_da8f80a3dd90_upgrade_sequence.py │ │ ├── 008_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py │ │ └── sql_strings │ │ │ └── strings_001.py │ │ └── performance_manager_staging │ │ ├── 001_5d9a7ee21ae5_initial_prod_schema.py │ │ ├── 002_1b53fd278b10_fix_trip_id_length.py │ │ ├── 003_ae6c6e4b2df5_extend_service_id_view.py │ │ ├── 004_45dedc21086e_canon_stop_seq.py │ │ ├── 005_96187da84955_remove_metadata.py │ │ ├── 006_e20a4f3f8c03_fix_null_vehicle_consist.py │ │ ├── 007_2dfbde5ec151_sync_stop_trunk.py │ │ ├── 008_896dedd8a4db_dwell_time_update.py │ │ ├── 009_32ba735d080c_add_revenue_columns.py │ │ ├── 010_36e7a7aee148_upgrade_sequence.py │ │ ├── 011_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py │ │ ├── 012_9b461d7aa53a_backfill_rt_rail_2025_04_04_to_2025_04_22.py │ │ └── sql_strings │ │ ├── strings_001.py │ │ └── strings_003.py │ ├── mssql │ ├── __init__.py │ ├── mssql_utils.py │ └── test_connect.py │ ├── performance_manager │ ├── README.md │ ├── __init__.py │ ├── alerts.py │ ├── flat_file.py │ ├── gtfs_utils.py │ ├── l0_gtfs_rt_events.py │ ├── l0_gtfs_static_load.py │ ├── l0_gtfs_static_mod.py │ ├── l0_rt_trip_updates.py │ ├── l0_rt_vehicle_positions.py │ ├── l1_cte_statements.py │ ├── l1_rt_metrics.py │ ├── l1_rt_trips.py │ └── pipeline.py │ ├── postgres │ ├── __init__.py │ ├── metadata_schema.py │ ├── postgres_utils.py │ ├── rail_performance_manager_schema.py │ └── seed_metadata.py │ ├── publishing │ ├── __init__.py │ ├── index.html │ └── performancedata.py │ ├── runtime_utils │ ├── __init__.py │ ├── alembic_migration.py │ ├── env_validation.py │ ├── infinite_wait.py │ ├── lamp_exception.py │ ├── process_logger.py │ └── remote_files.py │ ├── tableau │ ├── README.md │ ├── __init__.py │ ├── conversions │ │ ├── convert_bus_performance_data.py │ │ ├── convert_gtfs_rt_trip_updates.py │ │ └── convert_gtfs_rt_vehicle_position.py │ ├── hyper.py │ ├── jobs │ │ ├── bus_performance.py │ │ ├── filtered_hyper.py │ │ ├── glides.py │ │ ├── gtfs_rail.py │ │ ├── rt_alerts.py │ │ └── rt_rail.py │ ├── pipeline.py │ └── server.py │ └── utils │ ├── __init__.py │ ├── clear_folder.py │ ├── date_range_builder.py │ ├── filter_bank.py │ └── gtfs_utils.py └── tests ├── __init__.py ├── aws ├── __init__.py └── test_s3_utils.py ├── bus_performance_manager ├── __init__.py ├── bus_test_gtfs.csv ├── test_bus_convert_for_tableau.py ├── test_gtfs.py ├── test_gtfs_rt_ingestion.py └── test_tm_ingestion.py ├── conftest.py ├── ingestion ├── __init__.py ├── test_configuration.py ├── test_gtfs_compress.py ├── test_gtfs_converter.py ├── test_gtfs_rt_converter.py ├── test_ingest.py └── test_light_rail_gps.py ├── ingestion_tm ├── __init__.py └── test_ingest.py ├── performance_manager ├── __init__.py ├── test_alerts.py ├── test_backup_trips_match.py ├── test_l0_gtfs_rt_events.py ├── test_performance_manager.py └── test_static_trips_subquery.py ├── test_files ├── INCOMING │ ├── 2019-12-12T00_00_10_https___mbta_gtfs_s3_dev.s3.amazonaws.com_concentrate_VehiclePositions_enhanced.json │ ├── 2019-12-12T00_00_57_https___mbta_gtfs_s3_dev.s3.amazonaws.com_concentrate_TripUpdates_enhanced.json │ ├── 2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz │ ├── 2022-05-04T15:59:48Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz │ ├── 2022-05-05T16_00_15Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_VehiclePositions_enhanced.json.gz │ ├── 2022-05-08T06:04:57Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz │ ├── 
2022-06-28T10_03_18Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_TripUpdates_enhanced.json.gz │ ├── 2022-07-05T12:35:16Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz │ ├── 2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz │ ├── MBTA_GTFS.zip │ ├── empty.json.gz │ ├── large_page_obj_response.json │ └── one_blank_record.json.gz ├── PUBLIC_ARCHIVE │ └── lamp │ │ ├── bus_vehicle_events │ │ └── test_events.parquet │ │ └── gtfs_archive │ │ ├── 2023 │ │ └── routes.parquet │ │ └── 2024 │ │ └── routes.parquet ├── SPRINGBOARD │ ├── CALENDAR │ │ └── timestamp=1682375024 │ │ │ └── f18c9f5747194660a793cf0cd6f9df90-0.parquet │ ├── CALENDAR_DATES │ │ └── timestamp=1682375024 │ │ │ └── 7c0b0da47e284237a7b50df57e3ef33c-0.parquet │ ├── DIRECTIONS │ │ └── timestamp=1682375024 │ │ │ └── 562949d9931149f8a5d8f0cb2eb52c80-0.parquet │ ├── FEED_INFO │ │ └── timestamp=1682375024 │ │ │ └── e84307ae774a4d8c8968c5e38e7affdc-0.parquet │ ├── ROUTES │ │ └── timestamp=1682375024 │ │ │ └── b4e038eb63da41fcb66eed81548f664a-0.parquet │ ├── ROUTE_PATTERNS │ │ └── timestamp=1682375024 │ │ │ └── 57233d3677484fe1bd0373749c34cc63-0.parquet │ ├── RT_ALERTS │ │ └── year=2020 │ │ │ └── month=2 │ │ │ └── day=9 │ │ │ └── hour=1 │ │ │ └── 6ef6922c20064cb9a8f09a3b3b1d2783-0.parquet │ ├── RT_TRIP_UPDATES │ │ └── year=2023 │ │ │ └── month=5 │ │ │ └── day=8 │ │ │ ├── hour=12 │ │ │ └── 8e2c182968e24ecea3d37f03d6bae84d-0.parquet │ │ │ └── hour=13 │ │ │ └── eaeee968b94b4a74b166df4b8ffd9f29-0.parquet │ ├── RT_VEHICLE_POSITIONS │ │ ├── year=2023 │ │ │ └── month=5 │ │ │ │ └── day=8 │ │ │ │ ├── hour=12 │ │ │ │ └── 1613b49e4fa1459eabe9c83553ef1045-0.parquet │ │ │ │ └── hour=13 │ │ │ │ └── 9a1bb1c5269042a284b2ed57b4dfebb9-0.parquet │ │ └── year=2024 │ │ │ └── month=6 │ │ │ └── day=1 │ │ │ ├── hour=12 │ │ │ └── fcf91fbba92d418aa136d928c6243121-0.parquet │ │ │ └── hour=13 │ │ │ └── 47ffb78637a5400aabdfd7c9c7142757-0.parquet │ ├── STOPS │ │ └── timestamp=1682375024 │ │ │ └── 920a42ad1b5e4ef0942c7a1bc2ef2fea-0.parquet │ ├── STOP_TIMES │ │ └── timestamp=1682375024 │ │ │ └── 88c016320de440789357f14df6399d4c-0.parquet │ ├── TM │ │ ├── STOP_CROSSING │ │ │ ├── 120240601.parquet │ │ │ └── 120240811.parquet │ │ ├── TMMAIN_GEO_NODE.parquet │ │ ├── TMMAIN_ROUTE.parquet │ │ ├── TMMAIN_TRIP.parquet │ │ └── TMMAIN_VEHICLE.parquet │ └── TRIPS │ │ └── timestamp=1682375024 │ │ └── cdca1ec8575c4705bb93bc76244c1a86-0.parquet ├── april_2023_filepaths.json ├── before_times_filepaths.json ├── ingestion_BUSLOC_TU.parquet ├── ingestion_BUSLOC_VP.parquet ├── ingestion_GTFS-RT_ALERT.parquet ├── ingestion_GTFS-RT_TU.parquet ├── ingestion_GTFS-RT_TU_OLD.parquet ├── ingestion_GTFS-RT_VP.parquet ├── ingestion_GTFS-RT_VP_OLD.parquet ├── july_17_filepaths.json ├── may_8.json ├── pipeline_flat_out.csv ├── process_vp_files_flat_out.csv ├── replace_perf_mgr_query_test_data │ ├── 20250415_rt_trips_for_backup_match_subquery.csv │ ├── 20250415_static_trips_subquery.csv │ ├── staging_test_summary_sub.csv │ └── summary_sub.sql ├── short_list.json ├── staging_dec_10.json ├── tu_missing_start_date.parquet ├── vehicle_positions_flat_input.csv ├── vp_missing_start_date.csv └── vp_missing_start_time.csv ├── test_resources.py └── utils ├── test_date_range_builder.py ├── test_filter_bank.py ├── test_gtfs_utils.py └── timezones.py /.env: -------------------------------------------------------------------------------- 1 | # helper to know if env is already loaded 2 | BOOTSTRAPPED=1 3 | 4 | # metadata database 5 | MD_DB_HOST=local_md_rds 6 | MD_DB_PORT=5433 7 
| MD_DB_NAME=metadata 8 | MD_DB_USER=postgres 9 | MD_DB_PASSWORD=postgres 10 | ALEMBIC_MD_DB_NAME=metadata_prod 11 | 12 | # performance manager database 13 | RPM_DB_HOST=local_rpm_rds 14 | RPM_DB_PORT=5434 15 | RPM_DB_NAME=performance_manager 16 | RPM_DB_USER=postgres 17 | RPM_DB_PASSWORD=postgres 18 | ALEMBIC_RPM_DB_NAME=performance_manager_prod 19 | 20 | # MSSQL TransitMaster database 21 | TM_DB_HOST=do_update 22 | TM_DB_NAME=do_update 23 | TM_DB_USER=do_update 24 | TM_DB_PASSWORD=do_update 25 | 26 | # s3 locations 27 | SPRINGBOARD_BUCKET=mbta-ctd-dataplatform-dev-springboard 28 | ARCHIVE_BUCKET=mbta-ctd-dataplatform-dev-archive 29 | ERROR_BUCKET=mbta-ctd-dataplatform-dev-error 30 | INCOMING_BUCKET=mbta-ctd-dataplatform-dev-incoming 31 | 32 | # mbta-performance with personal access 33 | PUBLIC_ARCHIVE_BUCKET=mbta-ctd-dataplatform-dev-archive 34 | 35 | # Tableau 36 | TABLEAU_USER=DOUPDATE 37 | TABLEAU_PASSWORD=DOUPDATE 38 | TABLEAU_SERVER=http://awtabDEV02.mbta.com 39 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | use asdf 2 | 3 | dotenv 4 | -------------------------------------------------------------------------------- /.github/actions/python_deps/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup Python Dependencies 2 | description: Loads python dependencies for a CI/CD job, install them if not cached 3 | 4 | runs: 5 | using: composite 6 | steps: 7 | - name: ASDF Tools Install 8 | uses: ./.github/actions/tools 9 | 10 | - name: Python Deps Cache 11 | uses: actions/cache@v3 12 | id: python-cache 13 | with: 14 | path: | 15 | ~/.cache/pypoetry 16 | **/.venv 17 | key: ${{ runner.os }}-poetry-${{ hashFiles('./poetry.lock') }} 18 | 19 | - name: Install Python Deps 20 | working-directory: . 
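# note: the python-cache key above hashes poetry.lock, so the cached virtualenv is reused until dependencies change and the install step below is skipped on a cache hit (see the `if` condition)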
21 | # env use python3.10 to force usage of python3.10 installed by asdf over system python version 22 | run: | 23 | poetry env use python3.10 24 | poetry install -v 25 | shell: bash 26 | if: "!steps.python-cache.outputs.cache-hit" 27 | -------------------------------------------------------------------------------- /.github/actions/run_task/action.yaml: -------------------------------------------------------------------------------- 1 | name: Manually Run ECS Task 2 | description: Run an existing task in an existing AWS Service and Cluster 3 | 4 | inputs: 5 | role-to-assume: 6 | description: IAM role 7 | required: true 8 | aws-region: 9 | description: AWS region to use 10 | required: true 11 | default: us-east-1 12 | cluster: 13 | description: ECS Cluster for Service 14 | required: true 15 | service: 16 | description: ECS Service for task to run 17 | required: true 18 | 19 | runs: 20 | using: composite 21 | steps: 22 | - name: Setup AWS Credentials 23 | uses: aws-actions/configure-aws-credentials@v4 24 | with: 25 | role-to-assume: ${{ inputs.role-to-assume }} 26 | aws-region: ${{ inputs.aws-region }} 27 | mask-aws-account-id: true 28 | - name: Start ECS Task 29 | run: ${{ github.action_path }}/run_task.sh 30 | shell: bash 31 | env: 32 | AWS_REGION: ${{ inputs.aws-region }} 33 | CLUSTER: ${{ inputs.cluster }} 34 | SERVICE: ${{ inputs.service }} 35 | -------------------------------------------------------------------------------- /.github/actions/run_task/run_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u 3 | 4 | # uncomment to debug 5 | # set -x 6 | 7 | # Run an ECS Task from the provided CLUSTER and SERVICE 8 | 9 | # required environment varialbes 10 | # - CLUSTER 11 | # - SERVICE 12 | 13 | # Get the Security Groups that can run the task. 14 | echo "Retrieving SecurityGroups for SERVICE:${SERVICE} in CLUSTER:${CLUSTER}" 15 | SECURITY_GROUPS=$(aws ecs describe-services \ 16 | --services $SERVICE \ 17 | --cluster $CLUSTER \ 18 | --query services[0].networkConfiguration.awsvpcConfiguration.securityGroups \ 19 | --output text \ 20 | | sed 's/\t/,/g') 21 | echo "SECURITY GROUPS: ${SECURITY_GROUPS}" 22 | 23 | # Get the Subnets that the task runs on. 
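# (These security groups, together with the subnets fetched below, feed the awsvpc network configuration passed to `aws ecs run-task`; Fargate tasks cannot be launched without them.)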
24 | echo "Retrieving subnets for SERVICE:${SERVICE} in CLUSTER:${CLUSTER}" 25 | SUBNETS=$(aws ecs describe-services \ 26 | --services $SERVICE \ 27 | --cluster $CLUSTER \ 28 | --query services[0].networkConfiguration.awsvpcConfiguration.subnets \ 29 | --output text \ 30 | | sed 's/\t/,/g') 31 | echo "SUBNETS: ${SUBNETS}" 32 | 33 | # Run the ECS task 34 | aws ecs run-task \ 35 | --cluster $CLUSTER \ 36 | --task-definition $SERVICE \ 37 | --launch-type FARGATE \ 38 | --count 1 \ 39 | --network-configuration "awsvpcConfiguration={subnets=[$SUBNETS],securityGroups=[$SECURITY_GROUPS],assignPublicIp=DISABLED}" 40 | -------------------------------------------------------------------------------- /.github/actions/tools/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup ASDF Tools 2 | description: Loads ASDF tools for for a CI/CD job, installing them if not cached 3 | outputs: 4 | cache-hit: 5 | description: "Whether the ASDF cache was hit" 6 | value: ${{ steps.asdf-cache.outputs-cache-hit }} 7 | runs: 8 | using: composite 9 | steps: 10 | # cache the ASDF directory, using values from .tool-versions 11 | - name: ASDF Tools Cache 12 | uses: actions/cache@v3 13 | id: asdf-cache 14 | with: 15 | path: ~/.asdf 16 | # runner.os vs CACHE_UUID secret 17 | key: ${{ runner.os}}-asdf-${{ hashFiles('**/.tool-versions') }} 18 | 19 | - name: Install ASDF Tools 20 | uses: asdf-vm/actions/install@v2 21 | if: steps.asdf-cache.outputs.cache-hit != 'true' 22 | 23 | - name: Re-shim ASDF Install 24 | uses: mbta/actions/reshim-asdf@v1 25 | -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: pip 5 | directory: "." 6 | schedule: 7 | interval: weekly 8 | time: "08:00" 9 | timezone: "America/New_York" 10 | open-pull-requests-limit: 5 11 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Asana Task: 2 | -------------------------------------------------------------------------------- /.github/workflows/ad_hoc_deploy_run.yml: -------------------------------------------------------------------------------- 1 | name: Ad-Hoc Deploy & Run 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | description: Environment 8 | type: choice 9 | options: 10 | - dev 11 | - staging 12 | - prod 13 | secrets: 14 | AWS_ROLE_ARN: 15 | description: AWS_ROLE_ARN 16 | required: true 17 | 18 | jobs: 19 | deploy: 20 | uses: ./.github/workflows/deploy-base.yaml 21 | with: 22 | # pass the inputs from the workflow dispatch through to the deploy base. 
the booleans are 23 | # converted to strings, so flip them back using fromJson function 24 | environment: ${{ github.event.inputs.environment }} 25 | deploy-ad-hoc: true 26 | secrets: inherit 27 | run_ad_hoc_task: 28 | needs: deploy 29 | runs-on: ubuntu-latest 30 | permissions: 31 | id-token: write 32 | contents: read 33 | steps: 34 | - name: Checkout Branch 35 | uses: actions/checkout@v3 36 | - name: Run Ad-Hoc Task 37 | uses: ./.github/actions/run_task 38 | with: 39 | role-to-assume: ${{ secrets.AWS_ROLE_ARN }} 40 | cluster: 'lamp' 41 | service: lamp-ad-hoc-${{ inputs.environment }} 42 | -------------------------------------------------------------------------------- /.github/workflows/asana-pr-merged.yml: -------------------------------------------------------------------------------- 1 | name: Move Asana Ticket after PR Merged 2 | on: 3 | pull_request: 4 | types: [closed] 5 | 6 | jobs: 7 | move-asana-ticket-to_done_job: 8 | runs-on: ubuntu-latest 9 | if: github.event.pull_request.merged == true 10 | steps: 11 | - name: Github-Asana Move Ticket Action 12 | uses: mbta/github-asana-action@v4.3.0 13 | with: 14 | asana-pat: ${{ secrets.ASANA_SECRET_FOR_MOVE_ACTION }} 15 | trigger-phrase: "Asana Task:" 16 | target-section: "Done" 17 | mark-complete: true 18 | 19 | move-asana-ticket-to_todo_job: 20 | runs-on: ubuntu-latest 21 | if: github.event.pull_request.merged == false 22 | steps: 23 | - name: Github-Asana Move Ticket Action 24 | uses: mbta/github-asana-action@v4.3.0 25 | with: 26 | asana-pat: ${{ secrets.ASANA_SECRET_FOR_MOVE_ACTION }} 27 | trigger-phrase: "Asana Task:" 28 | target-section: "To Do" 29 | -------------------------------------------------------------------------------- /.github/workflows/asana-pr-opened.yml: -------------------------------------------------------------------------------- 1 | name: Move Asana Ticket after PR Opened 2 | on: 3 | pull_request: 4 | types: [opened, reopened] 5 | 6 | jobs: 7 | move-asana-ticket-job: 8 | runs-on: ubuntu-latest 9 | if: ${{ !github.event.pull_request.head.repo.fork }} 10 | steps: 11 | - name: Github-Asana Move Ticket Action 12 | uses: mbta/github-asana-action@v4.3.0 13 | with: 14 | asana-pat: ${{ secrets.ASANA_SECRET_FOR_MOVE_ACTION }} 15 | trigger-phrase: "Asana Task:" 16 | target-section: "In Review" 17 | task-comment: "View Pull Request Here: " 18 | -------------------------------------------------------------------------------- /.github/workflows/change_task_count.yaml: -------------------------------------------------------------------------------- 1 | name: Change Task Count 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | type: choice 8 | description: What environment to change the task count for 9 | options: 10 | - dev 11 | - staging 12 | - prod 13 | new_count: 14 | description: 1 to turn on 0 to turn off 15 | required: true 16 | application_name: 17 | type: choice 18 | description: What application to adjust the task count for 19 | options: 20 | - ingestion 21 | - rail-performance-manager 22 | - bus-performance-manager 23 | 24 | jobs: 25 | set_count: 26 | if: | 27 | ( github.event.inputs.new_count == 0 || github.event.inputs.new_count == 1) 28 | 29 | runs-on: ubuntu-latest 30 | permissions: 31 | id-token: write 32 | contents: read 33 | 34 | steps: 35 | - name: Configure AWS Credentials 36 | uses: aws-actions/configure-aws-credentials@v4 37 | with: 38 | role-to-assume: ${{ secrets.AWS_ROLE_ARN }} 39 | aws-region: us-east-1 40 | - name: Run ECS Update Service Command 41 | # yamllint disable rule:line-length 
42 | run: > 43 | aws ecs update-service 44 | --cluster lamp 45 | --service lamp-${{ github.event.inputs.application_name }}-${{ github.event.inputs.environment }} 46 | --desired-count ${{ github.event.inputs.new_count }} 47 | # yamllint enable 48 | -------------------------------------------------------------------------------- /.github/workflows/ci-yaml.yaml: -------------------------------------------------------------------------------- 1 | name: Validate YAML 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "**.yaml" 9 | pull_request: 10 | paths: 11 | - "**.yaml" 12 | 13 | jobs: 14 | build: 15 | name: Validate YAML actions 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | - run: yamllint . -f parsable --strict 21 | -------------------------------------------------------------------------------- /.github/workflows/ci_python.yaml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration (Python) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - 'src/**' 9 | - 'tests/**' 10 | - 'pyproject.toml' 11 | - 'poetry.lock' 12 | - '.github/workflows/ci_python.yaml' 13 | - '.github/python_deps/action.yaml' 14 | pull_request: 15 | paths: 16 | - 'src/**' 17 | - 'tests/**' 18 | - 'pyproject.toml' 19 | - 'poetry.lock' 20 | - '.github/workflows/ci_python.yaml' 21 | - '.github/python_deps/action.yaml' 22 | 23 | defaults: 24 | run: 25 | shell: bash 26 | working-directory: . 27 | 28 | concurrency: 29 | group: python-ci-${{ github.ref }} 30 | cancel-in-progress: true 31 | 32 | jobs: 33 | setup: 34 | name: Python Setup 35 | runs-on: ubuntu-22.04 36 | steps: 37 | - uses: actions/checkout@v3 38 | - uses: ./.github/actions/python_deps 39 | 40 | format: 41 | name: Format 42 | runs-on: ubuntu-22.04 43 | needs: setup 44 | steps: 45 | - uses: actions/checkout@v3 46 | - uses: ./.github/actions/python_deps 47 | 48 | - run: poetry run black . --check 49 | 50 | typing: 51 | name: Type Check 52 | runs-on: ubuntu-22.04 53 | needs: setup 54 | steps: 55 | - uses: actions/checkout@v3 56 | - uses: ./.github/actions/python_deps 57 | 58 | - run: poetry run mypy . 
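# the format/typing/lint/test jobs below each restore the poetry virtualenv cached by the setup job (via the python_deps composite action), so dependencies are only installed once per poetry.lock change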
59 | 60 | lint: 61 | name: Lint 62 | runs-on: ubuntu-22.04 63 | needs: setup 64 | steps: 65 | - uses: actions/checkout@v3 66 | - uses: ./.github/actions/python_deps 67 | 68 | - run: poetry run pylint src tests --rcfile pyproject.toml 69 | 70 | test: 71 | name: Test 72 | runs-on: ubuntu-22.04 73 | needs: setup 74 | env: 75 | BOOTSTRAPPED: 1 76 | MD_DB_HOST: local_rds 77 | MD_DB_PORT: 5433 78 | MD_DB_NAME: metadata 79 | MD_DB_USER: postgres 80 | MD_DB_PASSWORD: postgres 81 | ALEMBIC_MD_DB_NAME: metadata_prod 82 | RPM_DB_HOST: local_rds 83 | RPM_DB_PORT: 5434 84 | RPM_DB_NAME: performance_manager 85 | RPM_DB_USER: postgres 86 | RPM_DB_PASSWORD: postgres 87 | ALEMBIC_RPM_DB_NAME: performance_manager_prod 88 | services: 89 | rpm_postgres: 90 | image: postgres:14.4 91 | ports: 92 | - 5434:5432 93 | env: 94 | POSTGRES_PASSWORD: ${{env.RPM_DB_PASSWORD}} 95 | POSTGRES_USER: ${{env.RPM_DB_USER}} 96 | POSTGRES_DB: ${{env.RPM_DB_NAME}} 97 | options: 98 | --health-cmd pg_isready 99 | --health-interval 10s 100 | --health-timeout 5s 101 | --health-retries 5 102 | md_postgres: 103 | image: postgres:14.4 104 | ports: 105 | - 5433:5432 106 | env: 107 | POSTGRES_PASSWORD: ${{env.MD_DB_PASSWORD}} 108 | POSTGRES_USER: ${{env.MD_DB_USER}} 109 | POSTGRES_DB: ${{env.MD_DB_NAME}} 110 | options: 111 | --health-cmd pg_isready 112 | --health-interval 10s 113 | --health-timeout 5s 114 | --health-retries 5 115 | steps: 116 | - uses: actions/checkout@v3 117 | - uses: ./.github/actions/python_deps 118 | 119 | # Execute tests and generate coverage report 120 | - name: Run pytest With Coverage 121 | run: | 122 | poetry run pytest \ 123 | --cov-report lcov:coverage.info \ 124 | --cov-report term-missing \ 125 | --cov-branch \ 126 | --cov=lamp_py 127 | 128 | # Upload Coverage as an Artifact for Subsequent Jobs 129 | - name: Setup LCOV 130 | uses: hrishikesh-kadam/setup-lcov@v1 131 | - name: Report code coverage 132 | uses: mbta/github-actions-report-lcov@v4 133 | with: 134 | coverage-files: coverage.info 135 | artifact-name: python-code-coverage 136 | github-token: ${{ secrets.GITHUB_TOKEN }} 137 | -------------------------------------------------------------------------------- /.github/workflows/deploy-prod.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to Production 2 | 3 | on: 4 | # deploy when version tags are published 5 | push: 6 | tags: 7 | - v[0-9]+.[0-9]+.[0-9]+ 8 | 9 | jobs: 10 | deploy: 11 | name: Deploy to Production 12 | concurrency: 13 | group: prod 14 | uses: ./.github/workflows/deploy-base.yaml 15 | with: 16 | environment: prod 17 | deploy-ingestion: true 18 | deploy-rail-pm: true 19 | deploy-bus-pm: true 20 | deploy-tm-ingestion: true 21 | deploy-tableau-publisher: true 22 | secrets: inherit 23 | -------------------------------------------------------------------------------- /.github/workflows/deploy-staging.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to Staging 2 | 3 | on: 4 | # deploy when ci has been completed on main (should occur after new commits are added to main 5 | # directly or via pull request) 6 | workflow_run: 7 | workflows: ["Continuous Integration (Python)"] 8 | types: [completed] 9 | branches: 10 | - main 11 | 12 | jobs: 13 | deploy: 14 | name: Deploy to Staging 15 | concurrency: 16 | group: staging 17 | uses: ./.github/workflows/deploy-base.yaml 18 | with: 19 | environment: staging 20 | deploy-ingestion: true 21 | deploy-rail-pm: true 22 | deploy-bus-pm: true 23 | 
deploy-tm-ingestion: true 24 | deploy-tableau-publisher: true 25 | secrets: inherit 26 | -------------------------------------------------------------------------------- /.github/workflows/manual-deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Manual Deploy 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | description: Environment 8 | type: choice 9 | options: 10 | - dev 11 | - staging 12 | - prod 13 | deploy-ingestion: 14 | description: Deploy Ingestion 15 | default: false 16 | type: boolean 17 | deploy-rail-pm: 18 | description: Deploy Rail Performance Manager 19 | default: false 20 | type: boolean 21 | deploy-bus-pm: 22 | description: Deploy Bus Performance Manager 23 | default: false 24 | type: boolean 25 | deploy-tm-ingestion: 26 | description: Deploy TransitMaster Ingestion (not run on Dev) 27 | default: false 28 | type: boolean 29 | deploy-tableau-publisher: 30 | description: Deploy Tableau Publisher (not run on Dev) 31 | default: false 32 | type: boolean 33 | 34 | jobs: 35 | deploy: 36 | concurrency: 37 | group: github.event.inputs.environment 38 | uses: ./.github/workflows/deploy-base.yaml 39 | with: 40 | # pass the inputs from the workflow dispatch through to the deploy base. the booleans are 41 | # converted to strings, so flip them back using fromJson function 42 | environment: ${{ github.event.inputs.environment }} 43 | deploy-ingestion: ${{ fromJson(github.event.inputs.deploy-ingestion) }} 44 | deploy-rail-pm: ${{ fromJson(github.event.inputs.deploy-rail-pm) }} 45 | deploy-bus-pm: ${{ fromJson(github.event.inputs.deploy-bus-pm) }} 46 | deploy-tm-ingestion: ${{ fromJson(github.event.inputs.deploy-tm-ingestion) }} 47 | deploy-tableau-publisher: ${{ fromJson(github.event.inputs.deploy-tableau-publisher) }} 48 | secrets: inherit 49 | -------------------------------------------------------------------------------- /.github/workflows/run-task.yaml: -------------------------------------------------------------------------------- 1 | name: Run Task 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | description: Environment 8 | type: choice 9 | options: 10 | - staging 11 | - prod 12 | task: 13 | description: Task 14 | type: choice 15 | options: 16 | - Tableau Publisher 17 | - Transit Master Ingestion 18 | 19 | jobs: 20 | run_task: 21 | runs-on: ubuntu-latest 22 | permissions: 23 | id-token: write 24 | contents: read 25 | steps: 26 | - name: Checkout Branch 27 | uses: actions/checkout@v3 28 | - name: Generate Task Name 29 | run: | 30 | if [ "${{ inputs.task }}" == "Tableau Publisher" ]; then 31 | echo "task_name=tableau-publisher" >> $GITHUB_ENV 32 | elif [ "${{ inputs.task }}" == "Transit Master Ingestion" ]; then 33 | echo "task_name=tm-ingestion" >> $GITHUB_ENV 34 | fi 35 | - name: Run Task Action 36 | uses: ./.github/actions/run_task 37 | with: 38 | role-to-assume: ${{ secrets.AWS_ROLE_ARN }} 39 | cluster: 'lamp' 40 | service: lamp-${{ env.task_name }}-${{ inputs.environment }} 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /cover 3 | /deps 4 | /doc 5 | /.fetch 6 | erl_crash.dump 7 | *.ez 8 | *.beam 9 | /config/*.secret.exs 10 | .elixir_ls/ 11 | /performance_manager/test.db 12 | /notebook 13 | /investigation 14 | 15 | 16 | __pycache__ 17 | venv 18 | *.ipynb 19 | .coverage 20 | htmlcov 21 | .DS_Store 22 | package 23 | dist 24 | *.sh 25 | 
.devcontainer/* 26 | .vscode/* 27 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | poetry 1.7.1 2 | python 3.10.13 3 | direnv 2.32.2 4 | -------------------------------------------------------------------------------- /.yamllint.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | ignore-from-file: .gitignore 4 | 5 | rules: 6 | document-start: disable 7 | line-length: 8 | max: 100 9 | truthy: 10 | check-keys: false 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-bookworm 2 | 3 | # Keeps Python from generating .pyc files in the container 4 | ENV PYTHONDONTWRITEBYTECODE 1 5 | # Turns off buffering for easier container logging 6 | ENV PYTHONUNBUFFERED 1 7 | 8 | # Install non python dependencies 9 | RUN apt-get update 10 | RUN apt-get install -y libpq-dev gcc curl gpg 11 | 12 | # Fetch Amazon RDS certificate chain 13 | RUN curl https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -o /usr/local/share/amazon-certs.pem 14 | RUN chmod a=r /usr/local/share/amazon-certs.pem 15 | 16 | # Install MSSQL ODBC 18 Driver 17 | # for TransitMaster DB connection and ingestion 18 | RUN mkdir -m 0755 -p /etc/apt/keyrings/ \ 19 | && curl -fsSL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /etc/apt/keyrings/microsoft.gpg \ 20 | && echo "deb [signed-by=/etc/apt/keyrings/microsoft.gpg] https://packages.microsoft.com/debian/12/prod bookworm main" | tee /etc/apt/sources.list.d/mssql-release.list \ 21 | && apt-get update \ 22 | && ACCEPT_EULA=Y apt-get install -y msodbcsql18 23 | 24 | # modify openssl config to allow TLSv1 connection 25 | # moves [openssl_init] section and creates [ssl_default_sect] section to allow TLSv1 26 | # for TransitMaster DB connection and ingestion 27 | RUN sed -i 's/\[openssl_init\]/# [openssl_init]/' /etc/ssl/openssl.cnf \ 28 | && echo '\n\n[openssl_init]\nssl_conf = ssl_sect\n\n[ssl_sect]\nsystem_default = ssl_default_sect\n\n[ssl_default_sect]\nMinProtocol = TLSv1\nCipherString = DEFAULT@SECLEVEL=0\n' >> /etc/ssl/openssl.cnf 29 | 30 | # Create tmp directory that will mount to the ephemeral storage on ECS 31 | # Implemented to solve this problem: https://github.com/aws/amazon-ecs-agent/issues/3594 32 | # Where the reported memory usage reported up to ECS far exceeds the actual memory usage 33 | # when many reads/writes occur on a temp directory. 34 | # Related terraform changes are here: https://github.com/mbta/devops/pull/2727 35 | RUN mkdir -m 1777 -p /tmp 36 | VOLUME ["/tmp"] 37 | 38 | # Install poetry 39 | RUN pip install -U pip 40 | RUN pip install "poetry==1.7.1" 41 | 42 | # copy poetry and pyproject files and install dependencies 43 | WORKDIR /lamp/ 44 | COPY poetry.lock poetry.lock 45 | COPY pyproject.toml pyproject.toml 46 | 47 | # Tableau dependencies for arm64 cannot be resolved (since salesforce doesn't 48 | # support them yet). 
For that buildplatform build without those dependencies 49 | ARG TARGETARCH BUILDPLATFORM TARGETPLATFORM 50 | RUN echo "Installing python dependencies for build: ${BUILDPLATFORM} target: ${TARGETPLATFORM}" 51 | RUN if [ "$TARGETARCH" = "arm64" ]; then \ 52 | poetry install --without tableau --no-interaction --no-ansi -v ;\ 53 | else poetry install --no-interaction --no-ansi -v ;\ 54 | fi 55 | 56 | # Copy src directory to run against and build lamp py 57 | COPY src src 58 | COPY alembic.ini alembic.ini 59 | 60 | # Add Version information as an argument, it is provided by GHA and left to the 61 | # default for local development. 62 | ARG VERSION="v0.0.0-unknown" 63 | RUN echo "VERSION = '${VERSION}'" > src/lamp_py/__version__.py 64 | 65 | RUN poetry install --only main --no-interaction --no-ansi -v 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Massachusetts Bay Transportation Authority 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /analysis/check_data_all_days_in_LAMP_ALL_RT_fields.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import datetime 3 | import polars as pl 4 | from pyarrow import dataset as pd 5 | 6 | # ds = pd.dataset("s3://mbta-ctd-dataplatform-staging-archive/lamp/tableau/rail/LAMP_ALL_RT_fields.parquet") 7 | ds = pd.dataset("https://performancedata.mbta.com/lamp/tableau/rail/LAMP_ALL_RT_fields.parquet") 8 | dates = [] 9 | # todo - 30 days? 31 days? 
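# April 2025 has 30 days, so range(1, 31) below iterates days 1 through 30 and covers the full month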
10 | for day in range(1, 31): 11 | date = datetime.datetime(2025, 4, day) 12 | for bat in ds.to_batches( 13 | batch_size=500_000, batch_readahead=5, fragment_readahead=0, columns=["service_date", "route_id"] 14 | ): 15 | # breakpoint() 16 | pls = pl.from_arrow(bat) 17 | res = pls.filter(pl.col("service_date") == date) 18 | # breakpoint() 19 | if res.height > 0: 20 | dates.append(date) 21 | print(f"ok: {date}, {res.height}") 22 | # print(".")÷ 23 | 24 | assert all([(dates[i + 1] - dates[i]).days < 2 for i in range(len(dates) - 1)]) 25 | -------------------------------------------------------------------------------- /analysis/prism.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from lamp_py.aws.s3 import file_list_from_s3_with_details 4 | from lamp_py.runtime_utils.remote_files import S3_SPRINGBOARD 5 | 6 | # Problem: Looking into data quality issues is a very adhoc process right now. Expertise/knowledge 7 | # not centralized in code that is easily runnable (it's mostly in the app itself) 8 | 9 | # Solution: # Prism.py (working name...) "See the Rainbow" - WIP entry point to analysis suite to 10 | # organize tools for looking at LAMP data products inputs and outputs 11 | 12 | files = file_list_from_s3_with_details(bucket_name="mbta-ctd-dataplatform-staging-archive", file_prefix="lamp/tableau/") 13 | 14 | print(files) 15 | breakpoint() 16 | 17 | for f in files: 18 | print(f"{os.path.basename(f['s3_obj_path'])}: sz: {f['size_bytes']} last mod: {f['last_modified']}") 19 | 20 | # detect data source from what 21 | # returned object contains methods that are available given the input data 22 | 23 | # ideas... 24 | # e.g. prism(some_data_from_springboard) 25 | # - detect that it is Vehicle Positions file from path 26 | # - load it up 27 | # - implementations of various analysis chosen for VP 28 | 29 | # https://docs.python.org/3/library/functools.html#functools.singledispatch 30 | -------------------------------------------------------------------------------- /architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/architecture.jpg -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | rail_pm_rds: 6 | container_name: ${RPM_DB_HOST} 7 | image: postgres:14.4 8 | env_file: .env 9 | shm_size: '2gb' 10 | environment: 11 | POSTGRES_DB: ${RPM_DB_NAME} 12 | POSTGRES_PASSWORD: ${RPM_DB_PASSWORD} 13 | ports: 14 | - "${RPM_DB_PORT}:5432" 15 | command: ["postgres", "-c", "log_statement=all"] 16 | 17 | metadata_rds: 18 | container_name: ${MD_DB_HOST} 19 | image: postgres:15 20 | env_file: .env 21 | shm_size: '2gb' 22 | environment: 23 | POSTGRES_DB: ${MD_DB_NAME} 24 | POSTGRES_PASSWORD: ${MD_DB_PASSWORD} 25 | ports: 26 | - "${MD_DB_PORT}:5432" 27 | command: ["postgres", "-c", "log_statement=all"] 28 | 29 | performance_manager: 30 | container_name: performance_manager 31 | env_file: .env 32 | build: . 33 | depends_on: 34 | - rail_pm_rds 35 | - metadata_rds 36 | working_dir: /lamp 37 | volumes: 38 | - ~/.aws:/root/.aws:ro # map credentials to be used by boto3, read-only 39 | command: ["poetry", "run", "performance_manager"] 40 | 41 | bus_performance_manager: 42 | container_name: bus_performance_manager 43 | env_file: .env 44 | build: . 
45 | depends_on: 46 | - metadata_rds 47 | working_dir: /lamp 48 | volumes: 49 | - ~/.aws:/root/.aws:ro # map credentials to be used by boto3, read-only 50 | command: ["poetry", "run", "bus_performance_manager"] 51 | 52 | seed_metadata: 53 | container_name: seed_metadata 54 | env_file: .env 55 | build: . 56 | depends_on: 57 | - rail_pm_rds 58 | - metadata_rds 59 | working_dir: /lamp 60 | volumes: 61 | # map credentials to be used by boto3, read-only 62 | - ~/.aws:/root/.aws:ro 63 | # add in filepath json that will be the default seed file path 64 | - ./tests/test_files/staging_dec_10.json:/seed_paths.json 65 | # entrypoint passes in seed file thats added as a volume. if you want to use a different 66 | # filepath run 67 | # docker-compose run -v /path/to/files.json:/seed.json seed_metadata --seed-file /seed.json 68 | entrypoint: 69 | [ 70 | "poetry", 71 | "run", 72 | "seed_metadata", 73 | "--clear-static", 74 | "--clear-rt", 75 | "--seed-file", 76 | "/seed_paths.json" 77 | ] 78 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "lamp_py" 3 | version = "0.1.0" 4 | description = "Lightweight Application for Monitoring Performance" 5 | authors = [ 6 | "MBTA CTD ", 7 | "Ryan Rymarczyk ", 8 | "Mike Zappitello ", 9 | "Henry Huang ", 10 | ] 11 | 12 | [tool.poetry.scripts] 13 | ingestion = 'lamp_py.ingestion.pipeline:start' 14 | performance_manager = 'lamp_py.performance_manager.pipeline:start' 15 | bus_performance_manager = 'lamp_py.bus_performance_manager.pipeline:start' 16 | seed_metadata = 'lamp_py.postgres.seed_metadata:run' 17 | hyper_update = 'lamp_py.tableau.pipeline:start_hyper_updates' 18 | transit_master_ingestion = 'lamp_py.ingestion_tm.pipeline:start' 19 | ad_hoc = 'lamp_py.ad_hoc.pipeline:start' 20 | 21 | [tool.poetry.dependencies] 22 | python = "^3.10" 23 | SQLAlchemy = "^2.0.30" 24 | pyarrow = "^19.0.1" 25 | boto3 = "^1.35.2" 26 | pandas = "^2.2.1" 27 | numpy = "^1.26.4" 28 | psycopg2 = "^2.9.3" 29 | psutil = "^5.9.8" 30 | schedule = "^1.1.0" 31 | alembic = "^1.10.2" 32 | types-pytz = "^2024.1.0.20240203" 33 | pyodbc = "^5.1.0" 34 | polars = "^1.3.0" 35 | 36 | [tool.poetry.group.tableau] 37 | optional = false 38 | 39 | [tool.poetry.group.tableau.dependencies] 40 | tableauhyperapi = "^0.0.21408" 41 | tableauserverclient = "0.30" 42 | 43 | [tool.poetry.group.investigation] 44 | optional = true 45 | 46 | [tool.poetry.group.investigation.dependencies] 47 | ipykernel = "^6.29.4" 48 | matplotlib = "^3.9.0" 49 | seaborn = "^0.13.2" 50 | tabulate = "^0.9.0" 51 | 52 | [tool.poetry.group.dev.dependencies] 53 | black = "^24.3.0" 54 | mypy = "^1.1.1" 55 | pylint = "^3.2.6" 56 | pytest = "^8.3.2" 57 | pytest-cov = "^5.0.0" 58 | types-python-dateutil = "^2.9.0.20240316" 59 | pytest-env = "^1.1.3" 60 | 61 | [build-system] 62 | requires = ["poetry-core>=1.0.0"] 63 | build-backend = "poetry.core.masonry.api" 64 | 65 | [tool.black] 66 | line-length = 120 67 | target-version = ['py310'] 68 | 69 | [tool.mypy] 70 | disallow_untyped_defs = true 71 | ignore_missing_imports = true 72 | plugins = ["sqlalchemy.ext.mypy.plugin"] 73 | pretty = true 74 | python_version = "3.10" 75 | warn_unreachable = true 76 | warn_unused_ignores = true 77 | exclude = ["investigation/", "runners"] 78 | 79 | [tool.pytest] 80 | log_cli = true 81 | log_cli_level = "DEBUG" 82 | verbose = true 83 | 84 | [tool.pytest.ini_options] 85 | env = [ 86 | 
"SPRINGBOARD_BUCKET=SPRINGBOARD", 87 | "PUBLIC_ARCHIVE_BUCKET=PUBLIC_ARCHIVE", 88 | "INCOMING_BUCKET=INCOMING", 89 | ] 90 | 91 | [tool.pylint] 92 | disable = [ 93 | # disable doc string requirements 94 | "missing-module-docstring", 95 | # allow catching a generic exception 96 | "broad-except", 97 | # caught by black 98 | "line-too-long", 99 | # we're logging everything so its "ok" 100 | "lost-exception", 101 | # for some reason Iterable[type] is triggering this error on github 102 | "unsubscriptable-object", 103 | # Converter abstract base class only has one common function 104 | "too-few-public-methods", 105 | # l1_rt_trips.py over 1000 lines 106 | "too-many-lines", 107 | ] 108 | good-names = ["e", "i", "s"] 109 | max-line-length = 120 110 | min-similarity-lines = 10 111 | # ignore session maker as it gives pylint fits 112 | # https://github.com/PyCQA/pylint/issues/7090 113 | ignored-classes = ['sqlalchemy.orm.session.sessionmaker', 'pyarrow.compute'] 114 | # ignore the migrations directory. its going to have duplication and _that is ok_. 115 | ignore-paths = ["^src/lamp_py/migrations/.*$"] 116 | -------------------------------------------------------------------------------- /runners/run_glides_parquet_converter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from lamp_py.tableau.hyper import HyperJob 4 | from lamp_py.tableau.jobs.glides import HyperGlidesOperatorSignIns, HyperGlidesTripUpdates 5 | 6 | 7 | # don't run this in pytest - environment variables in pyproject.toml point to local SPRINGBOARD/ARCHIVE 8 | # need the .env values to run 9 | def start_glides_parquet_updates() -> None: 10 | """Run all Glides Parquet Update jobs""" 11 | 12 | parquet_update_jobs: List[HyperJob] = [ 13 | HyperGlidesTripUpdates(), 14 | HyperGlidesOperatorSignIns(), 15 | ] 16 | 17 | for job in parquet_update_jobs: 18 | breakpoint() 19 | job.run_parquet(None) 20 | outs = job.create_local_hyper() 21 | print(outs) 22 | 23 | 24 | if __name__ == "__main__": 25 | start_glides_parquet_updates() 26 | -------------------------------------------------------------------------------- /runners/run_query_s3_with_date_range.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from lamp_py.aws.s3 import file_list_from_s3_date_range 4 | from lamp_py.runtime_utils.remote_files import LAMP, S3_SPRINGBOARD 5 | from lamp_py.runtime_utils.remote_files import springboard_rt_vehicle_positions 6 | 7 | template = "year={yy}/month={mm}/day={dd}/" 8 | end_date = datetime.now() 9 | end_date = datetime(year=2025, month=3, day=30) 10 | start_date = end_date - timedelta(days=15) # type: ignore 11 | 12 | breakpoint() 13 | s3_uris = file_list_from_s3_date_range( 14 | bucket_name=S3_SPRINGBOARD, 15 | file_prefix=springboard_rt_vehicle_positions.prefix, 16 | path_template=template, 17 | end_date=end_date, 18 | start_date=start_date, 19 | ) 20 | 21 | print(s3_uris) 22 | -------------------------------------------------------------------------------- /runners/run_static_trips_subquery.py: -------------------------------------------------------------------------------- 1 | from lamp_py.performance_manager.l1_cte_statements import static_trips_subquery_pl 2 | 3 | 4 | static_trips_sub_res = static_trips_subquery_pl(20250415).sort(by="static_trip_id") 5 | 6 | static_trips_sub_res.write_csv("20250415_static_trips_subquery.csv") 7 | 
-------------------------------------------------------------------------------- /src/lamp_py/__version__.py: -------------------------------------------------------------------------------- 1 | # this is just a stub needed for imports to work correctly. 2 | # 3 | # this file will be overwritten in a docker image 4 | VERSION = "v0.0.0-unknown" 5 | -------------------------------------------------------------------------------- /src/lamp_py/ad_hoc/__init__.py: -------------------------------------------------------------------------------- 1 | """location for all ad-hoc process runner scripts""" 2 | -------------------------------------------------------------------------------- /src/lamp_py/ad_hoc/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import os 5 | 6 | from lamp_py.aws.ecs import check_for_parallel_tasks 7 | from lamp_py.runtime_utils.env_validation import validate_environment 8 | 9 | from lamp_py.ad_hoc.runner_001 import runner 10 | 11 | logging.getLogger().setLevel("INFO") 12 | DESCRIPTION = """Entry Point For Ad-Hoc Runner""" 13 | 14 | 15 | def start() -> None: 16 | """configure and start the ad-hoc runner""" 17 | # configure the environment 18 | os.environ["SERVICE_NAME"] = "ad_hoc" 19 | 20 | validate_environment( 21 | required_variables=[ 22 | "ARCHIVE_BUCKET", 23 | "ERROR_BUCKET", 24 | "INCOMING_BUCKET", 25 | "PUBLIC_ARCHIVE_BUCKET", 26 | "SPRINGBOARD_BUCKET", 27 | ], 28 | db_prefixes=["MD", "RPM"], 29 | ) 30 | 31 | check_for_parallel_tasks() 32 | 33 | # run the main method 34 | runner() 35 | 36 | 37 | if __name__ == "__main__": 38 | start() 39 | -------------------------------------------------------------------------------- /src/lamp_py/aws/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities for dealing with AWS infrastructure """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/aws/ecs.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import sys 4 | from multiprocessing import Process 5 | from queue import Queue 6 | from typing import Any, Optional 7 | 8 | import boto3 9 | 10 | from lamp_py.runtime_utils.process_logger import ProcessLogger 11 | 12 | 13 | def handle_ecs_sigterm(_: int, __: Any) -> None: 14 | """ 15 | handler function for when the ECS task receives SIGTERM 16 | """ 17 | process_logger = ProcessLogger("sigterm_received") 18 | process_logger.log_start() 19 | os.environ["GOT_SIGTERM"] = "TRUE" 20 | process_logger.log_complete() 21 | 22 | 23 | def check_for_sigterm( 24 | metadata_queue: Optional[Queue[Optional[str]]] = None, 25 | rds_process: Optional[Process] = None, 26 | ) -> None: 27 | """ 28 | check if a SIGTERM was received from ECS. If found, terminate process.
29 | """ 30 | if os.environ.get("GOT_SIGTERM") is not None: 31 | process_logger = ProcessLogger("stopping_ecs") 32 | process_logger.log_start() 33 | 34 | # send signal to stop rds writer process and wait for exit 35 | if metadata_queue is not None: 36 | metadata_queue.put(None) 37 | if rds_process is not None: 38 | rds_process.join() 39 | 40 | process_logger.log_complete() 41 | 42 | # delay for log statements to write before ecs death 43 | time.sleep(5) 44 | 45 | sys.exit() 46 | 47 | 48 | def running_in_aws() -> bool: 49 | """ 50 | return True if running on aws, else False 51 | """ 52 | return bool(os.getenv("AWS_DEFAULT_REGION")) 53 | 54 | 55 | def check_for_parallel_tasks() -> None: 56 | """ 57 | Check that that this task is not already running on ECS 58 | """ 59 | if not running_in_aws(): 60 | return 61 | 62 | process_logger = ProcessLogger("check_for_tasks") 63 | process_logger.log_start() 64 | 65 | client = boto3.client("ecs") 66 | ecs_cluster = os.environ["ECS_CLUSTER"] 67 | ecs_task_group = os.environ["ECS_TASK_GROUP"] 68 | 69 | try: 70 | # get all of the tasks running on the cluster 71 | task_arns = client.list_tasks(cluster=ecs_cluster)["taskArns"] 72 | 73 | # if tasks are running on the cluster, get their descriptions and check to 74 | # count matches the ecs task group. 75 | match_count = 0 76 | if task_arns: 77 | running_tasks = client.describe_tasks(cluster=ecs_cluster, tasks=task_arns)["tasks"] 78 | 79 | for task in running_tasks: 80 | if ecs_task_group == task["group"]: 81 | match_count += 1 82 | 83 | # if the group matches, raise an exception that will terminate the process 84 | if match_count > 1: 85 | raise SystemError(f"Multiple {ecs_task_group} ECS Tasks Running in {ecs_cluster}") 86 | 87 | except Exception as exception: 88 | process_logger.log_failure(exception) 89 | raise exception 90 | 91 | process_logger.log_complete() 92 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/README.md: -------------------------------------------------------------------------------- 1 | # Bus Performance Manager 2 | 3 | The Bus Performance Manager is an application to measure bus performance on the MBTA transit system. 
4 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for creating bus performance manager metrics 3 | """ 4 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sched 7 | import signal 8 | import sys 9 | import time 10 | from typing import List 11 | 12 | from lamp_py.aws.ecs import handle_ecs_sigterm, check_for_sigterm 13 | from lamp_py.runtime_utils.env_validation import validate_environment 14 | from lamp_py.runtime_utils.process_logger import ProcessLogger 15 | from lamp_py.bus_performance_manager.write_events import write_bus_metrics 16 | from lamp_py.tableau.pipeline import start_bus_parquet_updates 17 | 18 | logging.getLogger().setLevel("INFO") 19 | 20 | DESCRIPTION = """Entry Point to Bus Performance Manager""" 21 | 22 | 23 | def parse_args(args: List[str]) -> argparse.Namespace: 24 | """parse args for running this entrypoint script""" 25 | parser = argparse.ArgumentParser(description=DESCRIPTION) 26 | parser.add_argument( 27 | "--interval", 28 | default=300, 29 | dest="interval", 30 | help="interval to run event loop on", 31 | ) 32 | 33 | return parser.parse_args(args) 34 | 35 | 36 | def main(args: argparse.Namespace) -> None: 37 | """entrypoint into performance manager event loop""" 38 | main_process_logger = ProcessLogger("main", **vars(args)) 39 | main_process_logger.log_start() 40 | 41 | # schedule object that will control the "event loop" 42 | scheduler = sched.scheduler(time.time, time.sleep) 43 | 44 | # function to call each time on the event loop, rescheduling the loop at the 45 | # end of each iteration 46 | def iteration() -> None: 47 | """function to invoke on a scheduled routine""" 48 | check_for_sigterm() 49 | process_logger = ProcessLogger("event_loop") 50 | process_logger.log_start() 51 | try: 52 | write_bus_metrics() 53 | start_bus_parquet_updates() 54 | process_logger.log_complete() 55 | except Exception as exception: 56 | process_logger.log_failure(exception) 57 | finally: 58 | scheduler.enter(int(args.interval), 1, iteration) 59 | 60 | # schedule the initial loop and start the scheduler 61 | scheduler.enter(0, 1, iteration) 62 | scheduler.run() 63 | main_process_logger.log_complete() 64 | 65 | 66 | def start() -> None: 67 | """configure and start the bus performance manager process""" 68 | # parse arguments from the command line 69 | parsed_args = parse_args(sys.argv[1:]) 70 | 71 | # setup handling shutdown commands 72 | signal.signal(signal.SIGTERM, handle_ecs_sigterm) 73 | 74 | # configure the environment 75 | os.environ["SERVICE_NAME"] = "bus_performance_manager" 76 | validate_environment( 77 | required_variables=[ 78 | "SPRINGBOARD_BUCKET", 79 | "PUBLIC_ARCHIVE_BUCKET", 80 | "SERVICE_NAME", 81 | ], 82 | ) 83 | 84 | # run main method 85 | main(parsed_args) 86 | 87 | 88 | if __name__ == "__main__": 89 | start() 90 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/write_events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from lamp_py.bus_performance_manager.event_files import 
event_files_to_load 5 | from lamp_py.bus_performance_manager.events_metrics import bus_performance_metrics 6 | from lamp_py.runtime_utils.lamp_exception import LampExpectedNotFoundError, LampInvalidProcessingError 7 | from lamp_py.runtime_utils.remote_files import bus_events 8 | from lamp_py.runtime_utils.remote_files import VERSION_KEY 9 | from lamp_py.runtime_utils.process_logger import ProcessLogger 10 | from lamp_py.aws.s3 import upload_file 11 | 12 | 13 | def write_bus_metrics() -> None: 14 | """ 15 | Write bus-performance parquet files to S3 for service dates needing to be processed 16 | """ 17 | logger = ProcessLogger("write_bus_metrics") 18 | logger.log_start() 19 | 20 | event_files = event_files_to_load() 21 | logger.add_metadata(service_date_count=len(event_files)) 22 | 23 | for service_date in event_files.keys(): 24 | gtfs_files = event_files[service_date]["gtfs_rt"] 25 | tm_files = event_files[service_date]["transit_master"] 26 | 27 | day_logger = ProcessLogger( 28 | "write_bus_metrics_day", 29 | service_date=service_date, 30 | gtfs_file_count=len(gtfs_files), 31 | tm_file_count=len(tm_files), 32 | ) 33 | day_logger.log_start() 34 | 35 | # need gtfs_rt files to run process 36 | if len(gtfs_files) == 0: 37 | day_logger.log_failure(FileNotFoundError(f"No RT_VEHICLE_POSITION files found for {service_date}")) 38 | continue 39 | 40 | try: 41 | events_df = bus_performance_metrics(service_date, gtfs_files, tm_files) 42 | day_logger.add_metadata(bus_performance_rows=events_df.shape[0]) 43 | 44 | with tempfile.TemporaryDirectory() as tempdir: 45 | write_file = f"{service_date.strftime('%Y%m%d')}.parquet" 46 | events_df.write_parquet(os.path.join(tempdir, write_file), use_pyarrow=True) 47 | 48 | upload_file( 49 | file_name=os.path.join(tempdir, write_file), 50 | object_path=os.path.join(bus_events.s3_uri, write_file), 51 | extra_args={"Metadata": {VERSION_KEY: bus_events.version}}, 52 | ) 53 | 54 | except LampExpectedNotFoundError as exception: 55 | # service_date not found = ExpectedNotFound 56 | day_logger.add_metadata(skipped_day=exception) 57 | continue 58 | except LampInvalidProcessingError as exception: 59 | # num service date > 1 = InvalidProcessing (this should never happen) 60 | day_logger.log_failure(exception) 61 | except Exception as exception: 62 | day_logger.log_failure(exception) 63 | 64 | day_logger.log_complete() 65 | 66 | logger.log_complete() 67 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/README.md: -------------------------------------------------------------------------------- 1 | # Ingestion 2 | 3 | Ingestion is an application to transform and aggregate GTFS-RT and GTFS Static files into parquet files for storage in AWS S3 buckets. 4 | 5 | ## Application Operation 6 | 7 | Ingestion operates on a chronological event loop with a 5 minute delay between each iteration. 8 | 9 | Ingestion connects to the [Performance Manager](../performance_manager/README.md) application via the `metadata_log` table of the Metadata RDS. When Ingestion creates a new parquet file, the S3 path of that file is written to the `metadata_log` table for Performance Manager to process. 10 | 11 | On each event loop iteration, GTFS Static files are processed prior to any GTFS-RT files, when available. 12 | 13 | ## Event Loop Summary 14 | 15 | 1. List all files from `incoming` S3 bucket 16 | 2. Bucket files into applicable `Converter` class 17 | 3. Start `converter` loop of each `Converter` class, creating parquet files 18 | 4. 
Write parquet file to S3 Bucket 19 | 5. Write S3 path of parquet file to `metadata_log` table for Performance Manager 20 | 6. Move successfully processed `incoming` files to `archive` bucket 21 | 7. Move un-successfully processed `incoming` files to `error` bucket 22 | 23 | # GTFS Static 24 | 25 | [GTFS Static](https://www.mbta.com/developers/gtfs) Zip files are generated by MBTA for internal and external distribution. 26 | 27 | This application converts GTFS Zip files to partitioned parquet files that are exported to an S3 bucket. This is done with the [GTFS Converter Class](./convert_gtfs.py). 28 | 29 | GTFS Static parquet files are written to S3 with the following partitioning: 30 | 31 | * [GTFS File Type](https://github.com/mbta/gtfs-documentation/blob/master/reference/gtfs.md#gtfs-files) 32 | * timestamp = datetime extracted from `feed_version` column of [feed_info.txt](https://github.com/mbta/gtfs-documentation/blob/master/reference/gtfs.md#feed_infotxt), converted to UNIX timestamp 33 | 34 | # GTFS-RT Data 35 | 36 | [GTFS-realtime](https://www.mbta.com/developers/gtfs-realtime) (GTFS-RT) is provided by MBTA as an industry standard for distributing realtime transit data. 37 | 38 | The CTD [Delta](https://github.com/mbta/delta) application is responsible for reading GTFS-RT updates from the MBTA [V3 API](https://www.mbta.com/developers/v3-api) and saving them to an AWS S3 Bucket, as gzipped JSON files, for use by LAMP. 39 | 40 | This application aggregates gzipped GTFS-RT update files, saved on S3 by Delta, into partitioned parquet files that are exported to an S3 bucket. The parquet files are partitioned daily, by GTFS-RT feed type. This is done with the [GTFS-RT Converter Class](./convert_gtfs_rt.py) 41 | 42 | GTFS-RT parquet files are transformed and partitioned based on their `Converter Class` configuration: 43 | 44 | * [Busloc Trip Updates](./config_busloc_trip.py) 45 | * [Busloc Vehicle Positions](./config_busloc_vehicle.py) 46 | * [Realtime Vehicle Positions](./config_rt_vehicle.py) 47 | * [Realtime Trip Updates](./config_rt_trip.py) 48 | * [Sevice Alerts](./config_rt_alerts.py) 49 | 50 | # Compressed GTFS Archive Files 51 | 52 | GTFS Zip files are converted to yearly partitioned parquet files, using a differential compression process, and exported to AWS S3 for publishing/storage. 53 | 54 | For more Information about these files, please see: https://performancedata.mbta.com/ 55 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for processing ingesting GTFS static schedule files and GTFS real time 3 | files from an s3 bucket. The realtime files are collapsed into parquet files 4 | for long term storage. 
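A minimal sketch of reading that partitioned output back with pyarrow; the bucket name, prefix, and layout below are assumptions for illustration only, not the real S3 structure:

    import pyarrow.dataset as pds

    # Hypothetical prefix; the real buckets are configured through environment
    # variables such as SPRINGBOARD_BUCKET.
    ds = pds.dataset(
        "s3://example-bucket/lamp/RT_VEHICLE_POSITIONS/",
        format="parquet",
    )
    table = ds.to_table()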
5 | """ 6 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/compress_gtfs/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools to compress GTFS schedules into parquet files""" 2 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/compress_gtfs/pipe.py: -------------------------------------------------------------------------------- 1 | from lamp_py.ingestion.compress_gtfs.gtfs_to_parquet import gtfs_to_parquet 2 | 3 | if __name__ == "__main__": 4 | gtfs_to_parquet() 5 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/compress_gtfs/pq_to_sqlite.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | 4 | import pyarrow 5 | import pyarrow.dataset as pd 6 | 7 | from lamp_py.runtime_utils.process_logger import ProcessLogger 8 | from lamp_py.ingestion.utils import gzip_file 9 | 10 | 11 | def sqlite_type(pq_type: str) -> str: 12 | """ 13 | return SQLITE type from pyarrow Field type 14 | """ 15 | if "int" in pq_type: 16 | return "INTEGER" 17 | if "bool" in pq_type: 18 | return "INTEGER" 19 | if "float" in pq_type: 20 | return "REAL" 21 | if "double" in pq_type: 22 | return "REAL" 23 | return "TEXT" 24 | 25 | 26 | def sqlite_table_query(table_name: str, schema: pyarrow.Schema) -> str: 27 | """ 28 | return CREATE TABLE query for sqlite table from pyarrow schema 29 | """ 30 | logger = ProcessLogger("sqlite_create_table") 31 | logger.log_start() 32 | field_list = [f"{field.name} {sqlite_type(str(field.type))}" for field in schema] 33 | query = f""" 34 | CREATE TABLE 35 | IF NOT EXISTS 36 | {table_name} 37 | ( 38 | {','.join(field_list)} 39 | ); 40 | """ 41 | logger.log_complete() 42 | return query 43 | 44 | 45 | def pq_folder_to_sqlite(year_path: str) -> None: 46 | """ 47 | load all files from year_path folder into SQLITE3 db file 48 | """ 49 | logger = ProcessLogger("pq_to_sqlite", year_path=year_path) 50 | logger.log_start() 51 | 52 | db_path = os.path.join(year_path, "GTFS_ARCHIVE.db") 53 | if os.path.exists(db_path): 54 | os.remove(db_path) 55 | try: 56 | for file in os.listdir(year_path): 57 | if ".parquet" not in file: 58 | continue 59 | logger.add_metadata(current_file=file) 60 | 61 | ds = pd.dataset(os.path.join(year_path, file)) 62 | 63 | table = file.replace(".parquet", "") 64 | columns = [f":{col}" for col in ds.schema.names] 65 | insert_query = f"INSERT INTO {table} VALUES({','.join(columns)});" 66 | 67 | conn = sqlite3.connect(db_path) 68 | with conn: 69 | conn.execute(sqlite_table_query(table, ds.schema)) 70 | with conn: 71 | for batch in ds.to_batches(batch_size=250_000): 72 | conn.executemany(insert_query, batch.to_pylist()) 73 | conn.close() 74 | 75 | gzip_file(db_path) 76 | 77 | logger.log_complete() 78 | except Exception as exception: 79 | logger.log_failure(exception) 80 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_busloc_trip.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from lamp_py.ingestion.gtfs_rt_detail import GTFSRTDetail 5 | from lamp_py.ingestion.gtfs_rt_structs import ( 6 | trip_descriptor, 7 | vehicle_descriptor, 8 | stop_time_event, 9 | ) 10 | from lamp_py.ingestion.utils import explode_table_column, flatten_schema 11 | 12 
| 13 | class RtBusTripDetail(GTFSRTDetail): 14 | """ 15 | Detail for how to convert RT GTFS Trip Updates from json entries into 16 | parquet tables. 17 | """ 18 | 19 | def transform_for_write(self, table: pyarrow.table) -> pyarrow.table: 20 | """modify table schema before write to parquet""" 21 | return flatten_schema(explode_table_column(flatten_schema(table), "trip_update.stop_time_update")) 22 | 23 | @property 24 | def partition_column(self) -> str: 25 | return "trip_update.trip.route_id" 26 | 27 | @property 28 | def import_schema(self) -> pyarrow.schema: 29 | return pyarrow.schema( 30 | [ 31 | ("id", pyarrow.string()), 32 | ( 33 | "trip_update", 34 | pyarrow.struct( 35 | [ 36 | ( 37 | "timestamp", 38 | pyarrow.uint64(), 39 | ), # Not currently provided by Busloc 40 | ( 41 | "delay", 42 | pyarrow.int32(), 43 | ), # Not currently provided by Busloc 44 | ( 45 | "trip", 46 | trip_descriptor, 47 | ), # Busloc currently only provides trip_id, route_id and schedule_relationship 48 | ( 49 | "vehicle", 50 | vehicle_descriptor, 51 | ), # Busloc currently only provides id and label 52 | ( 53 | "stop_time_update", 54 | pyarrow.list_( 55 | pyarrow.struct( 56 | [ 57 | ("stop_sequence", pyarrow.uint32()), 58 | ("stop_id", pyarrow.string()), 59 | ("arrival", stop_time_event), 60 | ("departure", stop_time_event), 61 | ( 62 | "schedule_relationship", 63 | pyarrow.string(), 64 | ), 65 | ("cause_id", pyarrow.uint16()), 66 | ( 67 | "cause_description", 68 | pyarrow.string(), 69 | ), 70 | ("remark", pyarrow.string()), 71 | ] 72 | ) 73 | ), 74 | ), 75 | ] 76 | ), 77 | ), 78 | ] 79 | ) 80 | 81 | @property 82 | def table_sort_order(self) -> List[Tuple[str, str]]: 83 | return [ 84 | ("trip_update.trip.route_pattern_id", "ascending"), 85 | ("trip_update.trip.direction_id", "ascending"), 86 | ("trip_update.vehicle.id", "ascending"), 87 | ] 88 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_busloc_vehicle.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from .gtfs_rt_detail import GTFSRTDetail 5 | from .gtfs_rt_structs import ( 6 | position, 7 | vehicle_descriptor, 8 | trip_descriptor, 9 | ) 10 | 11 | 12 | class RtBusVehicleDetail(GTFSRTDetail): 13 | """ 14 | Detail for how to convert RT GTFS Bus Vehicle Positions from json 15 | entries into parquet tables. 
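The conversion itself is driven elsewhere (see convert_gtfs_rt.py). The stand-alone sketch below only illustrates, with made-up values and a hive-style layout assumed for the example, how a partition column such as vehicle.trip.route_id turns into a partitioned parquet write:

    import pyarrow as pa
    import pyarrow.dataset as pds

    # Toy stand-in for a flattened vehicle-positions table.
    table = pa.table(
        {
            "vehicle.trip.route_id": ["1", "1", "66"],
            "vehicle.vehicle.id": ["y1234", "y1235", "y2001"],
        }
    )

    # One directory is produced per route value, e.g.
    # /tmp/busloc_vehicle_demo/vehicle.trip.route_id=66/part-0.parquet
    pds.write_dataset(
        table,
        base_dir="/tmp/busloc_vehicle_demo",
        format="parquet",
        partitioning=pds.partitioning(
            pa.schema([("vehicle.trip.route_id", pa.string())]), flavor="hive"
        ),
    )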
16 | """ 17 | 18 | @property 19 | def partition_column(self) -> str: 20 | return "vehicle.trip.route_id" 21 | 22 | @property 23 | def import_schema(self) -> pyarrow.schema: 24 | return pyarrow.schema( 25 | [ 26 | ("id", pyarrow.string()), 27 | ("is_deleted", pyarrow.bool_()), 28 | ( 29 | "vehicle", 30 | pyarrow.struct( 31 | [ 32 | ("position", position), 33 | ("location_source", pyarrow.string()), 34 | ("timestamp", pyarrow.uint64()), 35 | ("trip", trip_descriptor), 36 | ("vehicle", vehicle_descriptor), 37 | ( 38 | "operator", 39 | pyarrow.struct( 40 | [ 41 | ("id", pyarrow.string()), 42 | ("first_name", pyarrow.string()), 43 | ("last_name", pyarrow.string()), 44 | ("name", pyarrow.string()), 45 | ("logon_time", pyarrow.uint64()), 46 | ] 47 | ), 48 | ), 49 | ("block_id", pyarrow.string()), 50 | ("run_id", pyarrow.string()), 51 | ("stop_id", pyarrow.string()), 52 | ("current_stop_sequence", pyarrow.uint32()), 53 | ("revenue", pyarrow.bool_()), 54 | ("current_status", pyarrow.string()), 55 | ("load", pyarrow.uint16()), 56 | ("capacity", pyarrow.uint16()), 57 | ("occupancy_percentage", pyarrow.uint16()), 58 | ("occupancy_status", pyarrow.string()), 59 | ] 60 | ), 61 | ), 62 | ] 63 | ) 64 | 65 | @property 66 | def table_sort_order(self) -> List[Tuple[str, str]]: 67 | return [ 68 | ("vehicle.block_id", "ascending"), 69 | ("vehicle.vehicle.id", "ascending"), 70 | ("feed_timestamp", "ascending"), 71 | ] 72 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_rt_trip.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from lamp_py.ingestion.gtfs_rt_detail import GTFSRTDetail 5 | from lamp_py.ingestion.gtfs_rt_structs import ( 6 | trip_descriptor, 7 | vehicle_descriptor, 8 | stop_time_event, 9 | ) 10 | from lamp_py.ingestion.utils import explode_table_column, flatten_schema 11 | 12 | 13 | class RtTripDetail(GTFSRTDetail): 14 | """ 15 | Detail for how to convert RT GTFS Trip Updates from json entries into 16 | parquet tables. 
17 | """ 18 | 19 | def transform_for_write(self, table: pyarrow.table) -> pyarrow.table: 20 | """modify table schema before write to parquet""" 21 | return flatten_schema(explode_table_column(flatten_schema(table), "trip_update.stop_time_update")) 22 | 23 | @property 24 | def partition_column(self) -> str: 25 | return "trip_update.trip.route_id" 26 | 27 | @property 28 | def import_schema(self) -> pyarrow.schema: 29 | return pyarrow.schema( 30 | [ 31 | ("id", pyarrow.string()), 32 | ( 33 | "trip_update", 34 | pyarrow.struct( 35 | [ 36 | ("trip", trip_descriptor), 37 | ("vehicle", vehicle_descriptor), 38 | ( 39 | "stop_time_update", 40 | pyarrow.list_( 41 | pyarrow.struct( 42 | [ 43 | ("stop_sequence", pyarrow.uint32()), 44 | ("stop_id", pyarrow.string()), 45 | ("arrival", stop_time_event), 46 | ("departure", stop_time_event), 47 | ( 48 | "schedule_relationship", 49 | pyarrow.string(), 50 | ), 51 | ( 52 | "boarding_status", 53 | pyarrow.string(), 54 | ), # MBTA Enhanced Field 55 | ] 56 | ) 57 | ), 58 | ), 59 | ("timestamp", pyarrow.uint64()), 60 | ("delay", pyarrow.int32()), 61 | ] 62 | ), 63 | ), 64 | ] 65 | ) 66 | 67 | # pylint: disable=R0801 68 | # Similar lines in 2 files 69 | @property 70 | def table_sort_order(self) -> List[Tuple[str, str]]: 71 | return [ 72 | ("trip_update.trip.route_pattern_id", "ascending"), 73 | ("trip_update.trip.direction_id", "ascending"), 74 | ("trip_update.vehicle.id", "ascending"), 75 | ] 76 | 77 | # pylint: enable=R0801 78 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_rt_vehicle.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from .gtfs_rt_detail import GTFSRTDetail 5 | from .gtfs_rt_structs import position, trip_descriptor, vehicle_descriptor 6 | 7 | 8 | class RtVehicleDetail(GTFSRTDetail): 9 | """ 10 | Detail for how to convert RT GTFS Vehicle Positions from json entries into 11 | parquet tables. 
12 | """ 13 | 14 | @property 15 | def partition_column(self) -> str: 16 | return "vehicle.trip.route_id" 17 | 18 | @property 19 | def import_schema(self) -> pyarrow.schema: 20 | return pyarrow.schema( 21 | [ 22 | ("id", pyarrow.string()), 23 | ( 24 | "vehicle", 25 | pyarrow.struct( 26 | [ 27 | ("trip", trip_descriptor), 28 | ("vehicle", vehicle_descriptor), 29 | ("position", position), 30 | ("current_stop_sequence", pyarrow.uint32()), 31 | ("stop_id", pyarrow.string()), 32 | ("current_status", pyarrow.string()), 33 | ("timestamp", pyarrow.uint64()), 34 | ("congestion_level", pyarrow.string()), 35 | ("occupancy_status", pyarrow.string()), 36 | ("occupancy_percentage", pyarrow.uint32()), 37 | ( 38 | "multi_carriage_details", 39 | pyarrow.list_( 40 | pyarrow.struct( 41 | [ 42 | ("id", pyarrow.string()), 43 | ("label", pyarrow.string()), 44 | ( 45 | "occupancy_status", 46 | pyarrow.string(), 47 | ), 48 | ( 49 | "occupancy_percentage", 50 | pyarrow.int32(), 51 | ), 52 | ( 53 | "carriage_sequence", 54 | pyarrow.uint32(), 55 | ), 56 | ] 57 | ) 58 | ), 59 | ), 60 | ] 61 | ), 62 | ), 63 | ] 64 | ) 65 | 66 | @property 67 | def table_sort_order(self) -> List[Tuple[str, str]]: 68 | return [ 69 | ("vehicle.vehicle.id", "ascending"), 70 | ("vehicle.trip.direction_id", "ascending"), 71 | ] 72 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/gtfs_rt_detail.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | from typing import Optional, List, Tuple 4 | 5 | import pyarrow 6 | 7 | from lamp_py.ingestion.utils import flatten_schema 8 | 9 | 10 | class GTFSRTDetail(ABC): 11 | """ 12 | Abstract Base Class for all GTFSRTDetail implementations. 13 | 14 | GTFSRTDetail classes must implement all methods and properties that are 15 | defined. 
16 | """ 17 | 18 | def transform_for_write(self, table: pyarrow.table) -> pyarrow.table: 19 | """modify table schema before write to parquet""" 20 | return flatten_schema(table) 21 | 22 | @property 23 | @abstractmethod 24 | def partition_column(self) -> str: 25 | """Column used to partition parquet files for this config""" 26 | 27 | @property 28 | @abstractmethod 29 | def import_schema(self) -> pyarrow.schema: 30 | """Get the import schema for the parquet table generated by this config""" 31 | 32 | @property 33 | def table_sort_order(self) -> Optional[List[Tuple[str, str]]]: 34 | """ 35 | Provide list of fields to sort pyarrow table before writing to parquet 36 | 37 | table_sort_order should be configured to optimize parquet file size 38 | when writing to disk 39 | 40 | Currently specified sort orders were determined by a small amount of experimentation 41 | 42 | TODO: perform additional experiments to optimize sort order of all parquet file types # pylint: disable=fixme 43 | """ 44 | return None 45 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/gtfs_rt_structs.py: -------------------------------------------------------------------------------- 1 | import pyarrow 2 | 3 | position = pyarrow.struct( 4 | [ 5 | ("bearing", pyarrow.uint16()), 6 | ("latitude", pyarrow.float64()), 7 | ("longitude", pyarrow.float64()), 8 | ("speed", pyarrow.float64()), 9 | ("odometer", pyarrow.float64()), 10 | ] 11 | ) 12 | 13 | trip_descriptor = pyarrow.struct( 14 | [ 15 | ("trip_id", pyarrow.string()), 16 | ("route_id", pyarrow.string()), 17 | ("direction_id", pyarrow.uint8()), 18 | ("start_time", pyarrow.string()), 19 | ("start_date", pyarrow.string()), 20 | ("schedule_relationship", pyarrow.string()), 21 | ("route_pattern_id", pyarrow.string()), # MBTA Enhanced Field 22 | ("tm_trip_id", pyarrow.string()), # Only used by Busloc 23 | ("overload_id", pyarrow.int64()), # Only used by Busloc 24 | ("overload_offset", pyarrow.int64()), # Only used by Busloc 25 | ("revenue", pyarrow.bool_()), # MBTA Enhanced Field 26 | ("last_trip", pyarrow.bool_()), # MBTA Enhanced Field 27 | ] 28 | ) 29 | 30 | vehicle_descriptor = pyarrow.struct( 31 | [ 32 | ("id", pyarrow.string()), 33 | ("label", pyarrow.string()), 34 | ("license_plate", pyarrow.string()), 35 | ( 36 | "consist", 37 | pyarrow.list_( 38 | pyarrow.struct( 39 | [ 40 | ("label", pyarrow.string()), 41 | ] 42 | ), 43 | ), 44 | ), # MBTA Enhanced Field 45 | ("assignment_status", pyarrow.string()), # Only used by Busloc 46 | ] 47 | ) 48 | 49 | translated_string = pyarrow.struct( 50 | [ 51 | ( 52 | "translation", 53 | pyarrow.list_( 54 | pyarrow.struct( 55 | [ 56 | ("text", pyarrow.string()), 57 | ("language", pyarrow.string()), 58 | ] 59 | ) 60 | ), 61 | ) 62 | ] 63 | ) 64 | 65 | stop_time_event = pyarrow.struct( 66 | [ 67 | ("delay", pyarrow.int32()), 68 | ("time", pyarrow.int64()), 69 | ("uncertainty", pyarrow.int32()), 70 | ] 71 | ) 72 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | import logging 6 | import signal 7 | 8 | from lamp_py.aws.ecs import handle_ecs_sigterm, check_for_sigterm 9 | from lamp_py.aws.kinesis import KinesisReader 10 | from lamp_py.postgres.postgres_utils import start_rds_writer_process 11 | from lamp_py.runtime_utils.alembic_migration import alembic_upgrade_to_head 12 | from 
lamp_py.runtime_utils.env_validation import validate_environment 13 | from lamp_py.runtime_utils.process_logger import ProcessLogger 14 | 15 | from lamp_py.ingestion.ingest_gtfs import ingest_gtfs 16 | from lamp_py.ingestion.glides import ingest_glides_events 17 | 18 | # from lamp_py.ingestion.light_rail_gps import ingest_light_rail_gps 19 | from lamp_py.runtime_utils.remote_files import LAMP 20 | from lamp_py.utils.clear_folder import clear_folder 21 | 22 | logging.getLogger().setLevel("INFO") 23 | DESCRIPTION = """Entry Point For GTFS Ingestion Scripts""" 24 | 25 | 26 | def main() -> None: 27 | """ 28 | run the ingestion pipeline 29 | 30 | * setup metadata queue metadata writer process 31 | * setup a glides kinesis reader 32 | * on a loop 33 | * check to see if the pipeline should be terminated 34 | * ingest files from incoming s3 bucket 35 | * ingest glides events from kinesis 36 | """ 37 | # start rds writer process 38 | # this will create only one rds engine while app is running 39 | metadata_queue, rds_process = start_rds_writer_process() 40 | 41 | # connect to the glides kinesis stream 42 | glides_reader = KinesisReader(stream_name="ctd-glides-prod") 43 | 44 | # run the event loop every 30 seconds 45 | while True: 46 | process_logger = ProcessLogger(process_name="main") 47 | process_logger.log_start() 48 | bucket_filter = LAMP 49 | check_for_sigterm(metadata_queue, rds_process) 50 | # ingest_light_rail_gps(bucket_filter=bucket_filter) 51 | ingest_gtfs(metadata_queue, bucket_filter=bucket_filter) 52 | ingest_glides_events(glides_reader, metadata_queue) 53 | check_for_sigterm(metadata_queue, rds_process) 54 | 55 | process_logger.log_complete() 56 | 57 | time.sleep(30) 58 | 59 | 60 | def start() -> None: 61 | """configure and start the ingestion process""" 62 | clear_folder("/tmp") 63 | # setup handling shutdown commands 64 | signal.signal(signal.SIGTERM, handle_ecs_sigterm) 65 | 66 | # configure the environment 67 | os.environ["SERVICE_NAME"] = "ingestion" 68 | 69 | validate_environment( 70 | required_variables=[ 71 | "ARCHIVE_BUCKET", 72 | "ERROR_BUCKET", 73 | "INCOMING_BUCKET", 74 | "PUBLIC_ARCHIVE_BUCKET", 75 | "SPRINGBOARD_BUCKET", 76 | "ALEMBIC_MD_DB_NAME", 77 | ], 78 | db_prefixes=["MD", "RPM"], 79 | ) 80 | 81 | # run metadata rds migrations 82 | alembic_upgrade_to_head(db_name=os.environ["ALEMBIC_MD_DB_NAME"]) 83 | 84 | # run the main method 85 | main() 86 | 87 | 88 | if __name__ == "__main__": 89 | start() 90 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion_tm/ingest.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from lamp_py.mssql.mssql_utils import MSSQLManager 4 | from lamp_py.ingestion_tm.tm_export import TMExport 5 | from lamp_py.ingestion_tm.jobs.whole_table import ( 6 | TMMainGeoNode, 7 | TMMainRoute, 8 | TMMainTrip, 9 | TMMainVehicle, 10 | TMMainBlock, 11 | TMMainOperator, 12 | TMMainRun, 13 | TMMainWorkPiece, 14 | TMDailyLogDailySchedAdhereWaiver, 15 | ) 16 | from lamp_py.ingestion_tm.jobs.parition_table import ( 17 | TMDailyLogStopCrossing, 18 | TMDailyLogDailyWorkPiece, 19 | ) 20 | 21 | 22 | def get_ingestion_jobs() -> List[TMExport]: 23 | """ 24 | get a list of all ingestion jobs that 25 | """ 26 | return [ 27 | TMMainGeoNode(), 28 | TMMainRoute(), 29 | TMMainTrip(), 30 | TMMainVehicle(), 31 | TMMainBlock(), 32 | TMMainOperator(), 33 | TMMainRun(), 34 | TMMainWorkPiece(), 35 | TMDailyLogStopCrossing(), 36 | TMDailyLogDailyWorkPiece(), 
37 | TMDailyLogDailySchedAdhereWaiver(), 38 | ] 39 | 40 | 41 | def ingest_tables() -> None: 42 | """ 43 | ingest tables from transmaster database 44 | """ 45 | tm_db = MSSQLManager(verbose=True) 46 | jobs: List[TMExport] = get_ingestion_jobs() 47 | 48 | for job in jobs: 49 | job.run_export(tm_db) 50 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion_tm/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import os 5 | 6 | from lamp_py.aws.ecs import check_for_parallel_tasks 7 | from lamp_py.runtime_utils.env_validation import validate_environment 8 | 9 | from lamp_py.ingestion_tm.ingest import ingest_tables 10 | 11 | logging.getLogger().setLevel("INFO") 12 | DESCRIPTION = """Entry Point For TM Ingestion Scripts""" 13 | 14 | 15 | def start() -> None: 16 | """configure and start the transitmaster ingestion process""" 17 | # configure the environment 18 | os.environ["SERVICE_NAME"] = "ingestion_tm" 19 | 20 | validate_environment( 21 | required_variables=[ 22 | "SPRINGBOARD_BUCKET", 23 | "TM_DB_HOST", 24 | "TM_DB_NAME", 25 | "TM_DB_USER", 26 | "TM_DB_PASSWORD", 27 | "TM_DB_PORT", 28 | "ECS_CLUSTER", 29 | "ECS_TASK_GROUP", 30 | ], 31 | private_variables=[ 32 | "TM_DB_PASSWORD", 33 | ], 34 | ) 35 | 36 | check_for_parallel_tasks() 37 | 38 | # run the main method 39 | ingest_tables() 40 | 41 | 42 | if __name__ == "__main__": 43 | start() 44 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion_tm/tm_export.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | 4 | import pyarrow 5 | 6 | from lamp_py.mssql.mssql_utils import MSSQLManager 7 | 8 | 9 | class TMExport(ABC): 10 | """ 11 | Abstract Base Class for TM Export jobs 12 | """ 13 | 14 | @property 15 | @abstractmethod 16 | def export_schema(self) -> pyarrow.schema: 17 | """Schema for export""" 18 | 19 | @abstractmethod 20 | def run_export(self, tm_db: MSSQLManager) -> None: 21 | """ 22 | Business logic to create new exprot parquet file 23 | """ 24 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /src/lamp_py/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/src/lamp_py/migrations/__init__.py -------------------------------------------------------------------------------- /src/lamp_py/migrations/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from alembic import context 4 | 5 | from lamp_py.postgres.postgres_utils import DatabaseIndex 6 | 7 | # this is the Alembic Config object, which provides 8 | # access to the values within the .ini file in use. 
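# For example (section names are illustrative; they must exist in alembic.ini
# and resolve to an entry in db_details below):
#
#   alembic -n performance_manager_staging upgrade head
#   alembic -n metadata_prod history
#
# Plain `alembic upgrade head` falls back to the default "alembic" section and
# is rejected by the gate below.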
9 | config = context.config 10 | 11 | # gate to make sure alembic is run using -n flag 12 | if config.config_ini_section == "alembic": 13 | raise SyntaxError("Run alembic with -n flag to specifiy Database name.") 14 | 15 | # get database name from -n flag when alembic is run from cmd line 16 | db_name_env = config.config_ini_section 17 | 18 | # Interpret the config file for Python logging. 19 | # This line sets up loggers basically. 20 | if config.config_file_name is not None: 21 | fileConfig(config.config_file_name) 22 | 23 | # add your model's MetaData object here 24 | # for 'autogenerate' support 25 | # from myapp import mymodel 26 | # target_metadata = mymodel.Base.metadata 27 | from lamp_py.postgres.rail_performance_manager_schema import RpmSqlBase 28 | from lamp_py.postgres.metadata_schema import MetadataSqlBase 29 | 30 | # using dictionary for engine and target_metadata to support migrating multiple dbs 31 | # each dictionary name should have a section defined in alembic.ini that 32 | # matches the key used in the db_details dictionary 33 | rpm_psql_args = DatabaseIndex.RAIL_PERFORMANCE_MANAGER.get_args_from_env() 34 | md_psql_args = DatabaseIndex.METADATA.get_args_from_env() 35 | db_details = { 36 | "performance_manager": { 37 | "engine": rpm_psql_args.get_local_engine(), 38 | "target_metadata": RpmSqlBase.metadata, 39 | }, 40 | "metadata": { 41 | "engine": md_psql_args.get_local_engine(), 42 | "target_metadata": MetadataSqlBase.metadata, 43 | }, 44 | } 45 | 46 | # other values from the config, defined by the needs of env.py, 47 | # can be acquired: 48 | # my_important_option = config.get_main_option("my_important_option") 49 | # ... etc. 50 | 51 | 52 | # def run_migrations_offline() -> None: 53 | # """Run migrations in 'offline' mode. 54 | 55 | # This configures the context with just a URL 56 | # and not an Engine, though an Engine is acceptable 57 | # here as well. By skipping the Engine creation 58 | # we don't even need a DBAPI to be available. 59 | 60 | # Calls to context.execute() here emit the given string to the 61 | # script output. 62 | 63 | # """ 64 | # url = config.get_main_option("sqlalchemy.url") 65 | # context.configure( 66 | # url=url, 67 | # target_metadata=target_metadata, 68 | # literal_binds=True, 69 | # dialect_opts={"paramstyle": "named"}, 70 | # ) 71 | 72 | # with context.begin_transaction(): 73 | # context.run_migrations() 74 | 75 | 76 | def run_migrations_online() -> None: 77 | """Run migrations in 'online' mode. 78 | 79 | In this scenario we need to create an Engine 80 | and associate a connection with the context. 81 | 82 | """ 83 | # strip off the environment name at the end of the db_name_env. 
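# for instance, "performance_manager_staging" maps to the "performance_manager"
# entry in db_details, and "metadata_prod" maps to "metadata".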
84 | # expected format is "_" 85 | db_name = db_name_env.rsplit("_", 1)[0] 86 | connectable = db_details[db_name]["engine"] 87 | 88 | with connectable.connect() as connection: 89 | context.configure( 90 | connection=connection, 91 | target_metadata=db_details[db_name]["target_metadata"], 92 | ) 93 | 94 | with context.begin_transaction(): 95 | context.run_migrations() 96 | 97 | 98 | if context.is_offline_mode(): 99 | raise NotImplementedError("Alembic offline migration not implemented.") 100 | # run_migrations_offline() 101 | else: 102 | run_migrations_online() 103 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/migration_template_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | 5 | def pick_n_directories(options: list[str]) -> list[str]: 6 | """ 7 | Given a list of directories, show them to user to select the set of directories to return as a list 8 | """ 9 | 10 | print("Select options (enter numbers separated by spaces):") 11 | for i, option in enumerate(options): 12 | print(f"{i + 1}. {option}") 13 | 14 | while True: 15 | try: 16 | choices = input("> ") 17 | selected_indices = [int(c) - 1 for c in choices.split()] 18 | if not all(0 <= i < len(options) for i in selected_indices): 19 | raise ValueError 20 | return [options[i] for i in selected_indices] 21 | except ValueError: 22 | print("Invalid input. Please enter numbers separated by spaces, corresponding to the options.") 23 | 24 | 25 | def migration_template( 26 | current_id: str, 27 | previous_id: str, 28 | date_string: str, 29 | alembic_string: str, 30 | detail_desc: str, 31 | upgrade_desc: str, 32 | downgrade_desc: str, 33 | ) -> str: 34 | """ 35 | Fillable template for a generic migration. This gets populated and 36 | filled out with directory dependent curr/prev id and data. WIP 37 | """ 38 | return f'''"""{alembic_string} 39 | 40 | Revision ID: {current_id} 41 | Revises: {previous_id} 42 | Create Date: {date_string} 43 | 44 | Details: {detail_desc} 45 | 46 | * upgrade -> {upgrade_desc} 47 | * downgrade -> {downgrade_desc} 48 | """ 49 | 50 | import logging 51 | import os 52 | import tempfile 53 | import polars as pl 54 | import pyarrow as pa 55 | import pyarrow.parquet as pq 56 | from typing import List 57 | 58 | from alembic import op 59 | import sqlalchemy as sa 60 | from sqlalchemy.exc import ProgrammingError 61 | 62 | from lamp_py.aws.s3 import download_file, upload_file 63 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 64 | 65 | # revision identifiers, used by Alembic. 
66 | revision = "{current_id}" 67 | down_revision = "{previous_id}" 68 | branch_labels = None # tbd 69 | depends_on = None #tbd 70 | 71 | 72 | def upgrade() -> None: 73 | pass 74 | 75 | def downgrade() -> None: 76 | pass 77 | ''' 78 | 79 | 80 | if __name__ == "__main__": 81 | import uuid 82 | 83 | short_desc = "reprocess_422_423" 84 | uuid_new = uuid.uuid4().hex[-12:] 85 | 86 | versions_dir = "/Users/hhuang/lamp/lamp/src/lamp_py/migrations/versions" 87 | 88 | # List directories in the versions directory 89 | if os.path.exists(versions_dir): 90 | directories = sorted([d for d in os.listdir(versions_dir) if os.path.isdir(os.path.join(versions_dir, d))]) 91 | print("Directories in 'versions':", directories) 92 | else: 93 | print(f"The directory '{versions_dir}' does not exist.") 94 | 95 | options = pick_n_directories(directories) 96 | 97 | print(options) 98 | for o in options: 99 | latest_migration = sorted([d for d in os.listdir(os.path.join(versions_dir, o)) if not d.startswith("sql")])[-1] 100 | parts = os.path.basename(latest_migration).split("_") 101 | breakpoint() 102 | increment_migration_count = str(int(parts[0]) + 1).zfill(3) 103 | uuid_prev = parts[1] 104 | 105 | with open(f"{versions_dir}/{o}/{increment_migration_count}_{uuid_new}_{short_desc}.py", "w") as f: 106 | f.write( 107 | migration_template( 108 | current_id=uuid_new, 109 | previous_id=uuid_prev, 110 | alembic_string=short_desc, 111 | date_string=str(datetime.datetime.now()), 112 | detail_desc="FILL ME IN", 113 | upgrade_desc="test upgrade", 114 | downgrade_desc="None", 115 | ) 116 | ) 117 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_dev/001_07903947aabe_initial_changes.py: -------------------------------------------------------------------------------- 1 | """initial changes 2 | 3 | Revision ID: 07903947aabe 4 | Revises: 5 | Create Date: 2023-12-11 15:12:47.261091 6 | 7 | """ 8 | 9 | from alembic import op 10 | from sqlalchemy.exc import ProgrammingError 11 | from sqlalchemy.sql import text 12 | import logging 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 16 | from lamp_py.postgres.metadata_schema import MetadataLog 17 | 18 | # revision identifiers, used by Alembic. 19 | revision = "07903947aabe" 20 | down_revision = None 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! 
### 27 | op.create_table( 28 | "metadata_log", 29 | sa.Column("pk_id", sa.Integer(), nullable=False), 30 | sa.Column("rail_pm_processed", sa.Boolean(), nullable=True), 31 | sa.Column("rail_pm_process_fail", sa.Boolean(), nullable=True), 32 | sa.Column("path", sa.String(length=256), nullable=False), 33 | sa.Column( 34 | "created_on", 35 | sa.DateTime(timezone=True), 36 | server_default=sa.text("now()"), 37 | nullable=True, 38 | ), 39 | sa.PrimaryKeyConstraint("pk_id"), 40 | sa.UniqueConstraint("path"), 41 | ) 42 | op.create_index( 43 | "ix_metadata_log_not_processed", 44 | "metadata_log", 45 | ["path"], 46 | unique=False, 47 | postgresql_where=sa.text("rail_pm_processed = false"), 48 | ) 49 | 50 | # ### end Alembic commands ### 51 | 52 | 53 | def downgrade() -> None: 54 | # ### commands auto generated by Alembic - please adjust! ### 55 | op.drop_index( 56 | "ix_metadata_log_not_processed", 57 | table_name="metadata_log", 58 | ) 59 | op.drop_table("metadata_log") 60 | # ### end Alembic commands ### 61 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_dev/002_26db393ea854_update_glides_location_column_names.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 26db393ea854 4 | Revises: 07903947aabe 5 | Create Date: 2024-07-09 12:12:04.325358 6 | 7 | Details 8 | * upgrade -> for each glides parquet file: 9 | * rename columns to match api. replace gtfsID with gtfsId and todsID with 10 | todsId 11 | * unique each dataset based on the 'id' uuid field. 12 | 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import os 17 | import tempfile 18 | import polars as pl 19 | import pyarrow as pa 20 | import pyarrow.parquet as pq 21 | from typing import List 22 | 23 | from lamp_py.aws.s3 import download_file, upload_file 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "26db393ea854" 27 | down_revision = "07903947aabe" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | def update_glides_archive(temp_dir: str, base_filename: str) -> None: 34 | """ 35 | * download the remote file to a local temp dir 36 | * rename columns with "gtfsID" or "todsID" in them to use "Id" 37 | * unique columns 38 | * sort the dataset based on 'time' column 39 | """ 40 | remote_path = f"s3://{os.environ['SPRINGBOARD_BUCKET']}/lamp/GLIDES/{base_filename}" 41 | old_local_path = os.path.join(temp_dir, f"old_{base_filename}") 42 | new_local_path = os.path.join(temp_dir, f"new_{base_filename}") 43 | 44 | file_exists = download_file(remote_path, old_local_path) 45 | if not file_exists: 46 | return 47 | 48 | old_table = pq.read_table(old_local_path) 49 | 50 | # build the new schema by converting names and keeping types 51 | fields: List[pa.Field] = [] 52 | for column in old_table.schema: 53 | if "gtfsID" in column.name: 54 | new_name = column.name.replace("gtfsID", "gtfsId") 55 | new_field = pa.field(new_name, column.type) 56 | fields.append(new_field) 57 | elif "todsID" in column.name: 58 | new_name = column.name.replace("todsID", "todsId") 59 | new_field = pa.field(new_name, column.type) 60 | fields.append(new_field) 61 | else: 62 | fields.append(column) 63 | 64 | schema = pa.schema(fields) 65 | 66 | # rename columns to match new schema 67 | # unique the records 68 | # cast to new schema (polars converts things) 69 | new_table = ( 70 | pl.DataFrame(old_table.rename_columns(schema.names)).unique().sort(by=["time"]).to_arrow().cast(schema) 71 | ) 72 | 73 | pq.write_table(new_table, new_local_path) 74 | upload_file(new_local_path, remote_path) 75 | 76 | files_to_update = [ 77 | "editor_changes.parquet", 78 | "operator_sign_ins.parquet", 79 | "trip_updates.parquet", 80 | ] 81 | 82 | with tempfile.TemporaryDirectory() as temp_dir: 83 | for filename in files_to_update: 84 | update_glides_archive(temp_dir, filename) 85 | 86 | 87 | def downgrade() -> None: 88 | pass 89 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/001_07903947aabe_initial_changes.py: -------------------------------------------------------------------------------- 1 | """initial changes 2 | 3 | Revision ID: 07903947aabe 4 | Revises: 5 | Create Date: 2023-12-11 15:12:47.261091 6 | 7 | """ 8 | 9 | from alembic import op 10 | from sqlalchemy.exc import ProgrammingError 11 | from sqlalchemy.sql import text 12 | import logging 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 16 | from lamp_py.postgres.metadata_schema import MetadataLog 17 | 18 | # revision identifiers, used by Alembic. 19 | revision = "07903947aabe" 20 | down_revision = None 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! 
### 27 | op.create_table( 28 | "metadata_log", 29 | sa.Column("pk_id", sa.Integer(), nullable=False), 30 | sa.Column("rail_pm_processed", sa.Boolean(), nullable=True), 31 | sa.Column("rail_pm_process_fail", sa.Boolean(), nullable=True), 32 | sa.Column("path", sa.String(length=256), nullable=False), 33 | sa.Column( 34 | "created_on", 35 | sa.DateTime(timezone=True), 36 | server_default=sa.text("now()"), 37 | nullable=True, 38 | ), 39 | sa.PrimaryKeyConstraint("pk_id"), 40 | sa.UniqueConstraint("path"), 41 | ) 42 | op.create_index( 43 | "ix_metadata_log_not_processed", 44 | "metadata_log", 45 | ["path"], 46 | unique=False, 47 | postgresql_where=sa.text("rail_pm_processed = false"), 48 | ) 49 | 50 | # pull metadata from the rail performance manager database into the 51 | # metadata database. the table may or may not exist, so wrap this in a try 52 | # except 53 | try: 54 | rpm_db_manager = DatabaseManager(db_index=DatabaseIndex.RAIL_PERFORMANCE_MANAGER) 55 | 56 | insert_data = [] 57 | # pull metadata from the rail performance manager database via direct 58 | # sql query. the metadata_log table may or may not exist. 59 | with rpm_db_manager.session.begin() as session: 60 | result = session.execute(text("SELECT path, processed, process_fail FROM metadata_log")) 61 | for row in result: 62 | (path, processed, process_fail) = row 63 | insert_data.append( 64 | { 65 | "path": path, 66 | "rail_pm_processed": processed, 67 | "rail_pm_process_fail": process_fail, 68 | } 69 | ) 70 | 71 | except ProgrammingError as error: 72 | # Error 42P01 is an 'Undefined Table' error. This occurs when there is 73 | # no metadata_log table in the rail performance manager database 74 | # 75 | # Raise all other sql errors 76 | insert_data = [] 77 | original_error = error.orig 78 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 79 | logging.info("No Metadata Table in Rail Performance Manager") 80 | else: 81 | raise 82 | 83 | # insert data into the metadata database 84 | if insert_data: 85 | op.bulk_insert(MetadataLog.__table__, insert_data) 86 | 87 | # ### end Alembic commands ### 88 | 89 | 90 | def downgrade() -> None: 91 | # ### commands auto generated by Alembic - please adjust! ### 92 | op.drop_index( 93 | "ix_metadata_log_not_processed", 94 | table_name="metadata_log", 95 | ) 96 | op.drop_table("metadata_log") 97 | # ### end Alembic commands ### 98 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/002_cce8dfee767a_re_run_input_files_from_2024_04_03.py: -------------------------------------------------------------------------------- 1 | """re-run input files from 2024-04-03 2 | 3 | Revision ID: cce8dfee767a 4 | Revises: 07903947aabe 5 | Create Date: 2024-04-04 11:50:55.161259 6 | 7 | Details 8 | * upgrade -> update metdata table to re-process failed parquet files from April 3, 2024 9 | 10 | * downgrade -> Nothing 11 | 12 | """ 13 | 14 | from alembic import op 15 | import sqlalchemy as sa 16 | 17 | 18 | # revision identifiers, used by Alembic. 
19 | revision = "cce8dfee767a" 20 | down_revision = "07903947aabe" 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | update_query = """ 27 | UPDATE 28 | public.metadata_log 29 | SET 30 | rail_pm_process_fail = false 31 | , rail_pm_processed = false 32 | WHERE 33 | created_on > '2024-04-03 09:00:00' 34 | and created_on < '2024-04-03 15:00:00' 35 | and ( 36 | path LIKE '%RT_TRIP_UPDATES%' 37 | or path LIKE '%RT_VEHICLE_POSITION%' 38 | ) 39 | ; 40 | """ 41 | op.execute(update_query) 42 | 43 | 44 | def downgrade() -> None: 45 | pass 46 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/003_26db393ea854_update_glides_location_column_names.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 26db393ea854 4 | Revises: cce8dfee767a 5 | Create Date: 2024-07-09 12:12:04.325358 6 | 7 | Details 8 | * upgrade -> for each glides parquet file: 9 | * rename columns to match api. replace gtfsID with gtfsId and todsID with 10 | todsId 11 | * unique each dataset based on the 'id' uuid field. 12 | 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import os 17 | import tempfile 18 | import polars as pl 19 | import pyarrow as pa 20 | import pyarrow.parquet as pq 21 | from typing import List 22 | 23 | from lamp_py.aws.s3 import download_file, upload_file 24 | 25 | # revision identifiers, used by Alembic. 26 | revision = "26db393ea854" 27 | down_revision = "cce8dfee767a" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | def update_glides_archive(temp_dir: str, base_filename: str) -> None: 34 | """ 35 | * download the remote file to a local temp dir 36 | * rename columns with "gtfsID" or "todsID" in them to use "Id" 37 | * unique columns 38 | * sort the dataset based on 'time' column 39 | """ 40 | remote_path = f"s3://{os.environ['SPRINGBOARD_BUCKET']}/lamp/GLIDES/{base_filename}" 41 | old_local_path = os.path.join(temp_dir, f"old_{base_filename}") 42 | new_local_path = os.path.join(temp_dir, f"new_{base_filename}") 43 | 44 | file_exists = download_file(remote_path, old_local_path) 45 | if not file_exists: 46 | return 47 | 48 | old_table = pq.read_table(old_local_path) 49 | 50 | # build the new schema by converting names and keeping types 51 | fields: List[pa.Field] = [] 52 | for column in old_table.schema: 53 | if "gtfsID" in column.name: 54 | new_name = column.name.replace("gtfsID", "gtfsId") 55 | new_field = pa.field(new_name, column.type) 56 | fields.append(new_field) 57 | elif "todsID" in column.name: 58 | new_name = column.name.replace("todsID", "todsId") 59 | new_field = pa.field(new_name, column.type) 60 | fields.append(new_field) 61 | else: 62 | fields.append(column) 63 | 64 | schema = pa.schema(fields) 65 | 66 | # rename columns to match new schema 67 | # unique the records 68 | # cast to new schema (polars converts things) 69 | new_table = ( 70 | pl.DataFrame(old_table.rename_columns(schema.names)).unique().sort(by=["time"]).to_arrow().cast(schema) 71 | ) 72 | 73 | pq.write_table(new_table, new_local_path) 74 | upload_file(new_local_path, remote_path) 75 | 76 | files_to_update = [ 77 | "editor_changes.parquet", 78 | "operator_sign_ins.parquet", 79 | "trip_updates.parquet", 80 | ] 81 | 82 | with tempfile.TemporaryDirectory() as temp_dir: 83 | for filename in files_to_update: 84 | update_glides_archive(temp_dir, filename) 85 | 86 | 87 | def downgrade() 
-> None: 88 | pass 89 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/004_a08c5fd37dbd_reprocess_422_423.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: a08c5fd37dbd 4 | Revises: 26db393ea854 5 | Create Date: 2025-05-01 00:00:00 6 | 7 | Details: Reprocess 4/22 because it is missing. Include 4/22 and 4/23 because of UTC vs EST 8 | 9 | * upgrade -> reset processed flags in metadata for 4/22 and 4/23 10 | * downgrade -> None 11 | """ 12 | 13 | import logging 14 | import os 15 | import tempfile 16 | import polars as pl 17 | import pyarrow as pa 18 | import pyarrow.parquet as pq 19 | from typing import List 20 | 21 | from alembic import op 22 | import sqlalchemy as sa 23 | from sqlalchemy.exc import ProgrammingError 24 | 25 | from lamp_py.aws.s3 import download_file, upload_file 26 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 27 | 28 | # revision identifiers, used by Alembic. 29 | revision = "a08c5fd37dbd" 30 | down_revision = "26db393ea854" 31 | branch_labels = None 32 | depends_on = None 33 | 34 | 35 | def upgrade() -> None: 36 | pass 37 | 38 | # lamp_metadata=> SELECT path, created_on, rail_pm_processed, rail_pm_process_fail 39 | # FROM public.metadata_log 40 | # WHERE substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 41 | # and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 42 | # and ( 43 | # path LIKE '%/RT_TRIP_UPDATES/%' 44 | # or path LIKE '%/RT_VEHICLE_POSITIONS/%' 45 | # ) 46 | # ORDER BY created_on; 47 | 48 | update_md_query = """ 49 | UPDATE 50 | metadata_log 51 | SET 52 | rail_pm_process_fail = false 53 | , rail_pm_processed = false 54 | WHERE 55 | substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 56 | and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 57 | and ( 58 | path LIKE '%/RT_TRIP_UPDATES/%' 59 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 60 | ) 61 | ; 62 | """ 63 | op.execute(update_md_query) 64 | 65 | 66 | def downgrade() -> None: 67 | pass 68 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_staging/001_07903947aabe_initial_changes.py: -------------------------------------------------------------------------------- 1 | """initial changes 2 | 3 | Revision ID: 07903947aabe 4 | Revises: 5 | Create Date: 2023-12-11 15:12:47.261091 6 | 7 | """ 8 | 9 | from alembic import op 10 | from sqlalchemy.exc import ProgrammingError 11 | from sqlalchemy.sql import text 12 | import logging 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 16 | from lamp_py.postgres.metadata_schema import MetadataLog 17 | 18 | # revision identifiers, used by Alembic. 19 | revision = "07903947aabe" 20 | down_revision = None 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! 
### 27 | op.create_table( 28 | "metadata_log", 29 | sa.Column("pk_id", sa.Integer(), nullable=False), 30 | sa.Column("rail_pm_processed", sa.Boolean(), nullable=True), 31 | sa.Column("rail_pm_process_fail", sa.Boolean(), nullable=True), 32 | sa.Column("path", sa.String(length=256), nullable=False), 33 | sa.Column( 34 | "created_on", 35 | sa.DateTime(timezone=True), 36 | server_default=sa.text("now()"), 37 | nullable=True, 38 | ), 39 | sa.PrimaryKeyConstraint("pk_id"), 40 | sa.UniqueConstraint("path"), 41 | ) 42 | op.create_index( 43 | "ix_metadata_log_not_processed", 44 | "metadata_log", 45 | ["path"], 46 | unique=False, 47 | postgresql_where=sa.text("rail_pm_processed = false"), 48 | ) 49 | 50 | # pull metadata from the rail performance manager database into the 51 | # metadata database. the table may or may not exist, so wrap this in a try 52 | # except 53 | try: 54 | rpm_db_manager = DatabaseManager(db_index=DatabaseIndex.RAIL_PERFORMANCE_MANAGER) 55 | 56 | insert_data = [] 57 | # pull metadata from the rail performance manager database via direct 58 | # sql query. the metadata_log table may or may not exist. 59 | with rpm_db_manager.session.begin() as session: 60 | result = session.execute(text("SELECT path, processed, process_fail FROM metadata_log")) 61 | for row in result: 62 | (path, processed, process_fail) = row 63 | insert_data.append( 64 | { 65 | "path": path, 66 | "rail_pm_processed": processed, 67 | "rail_pm_process_fail": process_fail, 68 | } 69 | ) 70 | 71 | except ProgrammingError as error: 72 | # Error 42P01 is an 'Undefined Table' error. This occurs when there is 73 | # no metadata_log table in the rail performance manager database 74 | # 75 | # Raise all other sql errors 76 | insert_data = [] 77 | original_error = error.orig 78 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 79 | logging.info("No Metadata Table in Rail Performance Manager") 80 | else: 81 | raise 82 | 83 | # insert data into the metadata database 84 | if insert_data: 85 | op.bulk_insert(MetadataLog.__table__, insert_data) 86 | 87 | # ### end Alembic commands ### 88 | 89 | 90 | def downgrade() -> None: 91 | # ### commands auto generated by Alembic - please adjust! ### 92 | op.drop_index( 93 | "ix_metadata_log_not_processed", 94 | table_name="metadata_log", 95 | ) 96 | op.drop_table("metadata_log") 97 | # ### end Alembic commands ### 98 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_staging/002_26db393ea854_update_glides_location_column_names.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 26db393ea854 4 | Revises: 07903947aabe 5 | Create Date: 2024-07-09 12:12:04.325358 6 | 7 | Details 8 | * upgrade -> for each glides parquet file: 9 | * rename columns to match api. replace gtfsID with gtfsId and todsID with 10 | todsId 11 | * unique each dataset based on the 'id' uuid field. 12 | 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import os 17 | import tempfile 18 | import polars as pl 19 | import pyarrow as pa 20 | import pyarrow.parquet as pq 21 | from typing import List 22 | 23 | from lamp_py.aws.s3 import download_file, upload_file 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "26db393ea854" 27 | down_revision = "07903947aabe" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | def update_glides_archive(temp_dir: str, base_filename: str) -> None: 34 | """ 35 | * download the remote file to a local temp dir 36 | * rename columns with "gtfsID" or "todsID" in them to use "Id" 37 | * unique columns 38 | * sort the dataset based on 'time' column 39 | """ 40 | remote_path = f"s3://{os.environ['SPRINGBOARD_BUCKET']}/lamp/GLIDES/{base_filename}" 41 | old_local_path = os.path.join(temp_dir, f"old_{base_filename}") 42 | new_local_path = os.path.join(temp_dir, f"new_{base_filename}") 43 | 44 | file_exists = download_file(remote_path, old_local_path) 45 | if not file_exists: 46 | return 47 | 48 | old_table = pq.read_table(old_local_path) 49 | 50 | # build the new schema by converting names and keeping types 51 | fields: List[pa.Field] = [] 52 | for column in old_table.schema: 53 | if "gtfsID" in column.name: 54 | new_name = column.name.replace("gtfsID", "gtfsId") 55 | new_field = pa.field(new_name, column.type) 56 | fields.append(new_field) 57 | elif "todsID" in column.name: 58 | new_name = column.name.replace("todsID", "todsId") 59 | new_field = pa.field(new_name, column.type) 60 | fields.append(new_field) 61 | else: 62 | fields.append(column) 63 | 64 | schema = pa.schema(fields) 65 | 66 | # rename columns to match new schema 67 | # unique the records 68 | # cast to new schema (polars converts things) 69 | new_table = ( 70 | pl.DataFrame(old_table.rename_columns(schema.names)).unique().sort(by=["time"]).to_arrow().cast(schema) 71 | ) 72 | 73 | pq.write_table(new_table, new_local_path) 74 | upload_file(new_local_path, remote_path) 75 | 76 | files_to_update = [ 77 | "editor_changes.parquet", 78 | "operator_sign_ins.parquet", 79 | "trip_updates.parquet", 80 | ] 81 | 82 | with tempfile.TemporaryDirectory() as temp_dir: 83 | for filename in files_to_update: 84 | update_glides_archive(temp_dir, filename) 85 | 86 | 87 | def downgrade() -> None: 88 | pass 89 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_staging/003_a08c5fd37dbd_reprocess_422_423.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: a08c5fd37dbd 4 | Revises: 26db393ea854 5 | Create Date: 2025-05-01 00:00:00 6 | 7 | Details: Reprocess 4/22 because it is missing. Include 4/22 and 4/23 because of UTC vs EST 8 | 9 | * upgrade -> reset processed flags in metadata for 4/22 and 4/23 10 | * downgrade -> None 11 | """ 12 | 13 | import logging 14 | import os 15 | import tempfile 16 | import polars as pl 17 | import pyarrow as pa 18 | import pyarrow.parquet as pq 19 | from typing import List 20 | 21 | from alembic import op 22 | import sqlalchemy as sa 23 | from sqlalchemy.exc import ProgrammingError 24 | 25 | from lamp_py.aws.s3 import download_file, upload_file 26 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 27 | 28 | # revision identifiers, used by Alembic. 
29 | revision = "a08c5fd37dbd" 30 | down_revision = "26db393ea854" 31 | branch_labels = None 32 | depends_on = None 33 | 34 | 35 | def upgrade() -> None: 36 | pass 37 | 38 | # lamp_metadata=> SELECT path, created_on, rail_pm_processed, rail_pm_process_fail 39 | # FROM public.metadata_log 40 | # WHERE substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 41 | # and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 42 | # and ( 43 | # path LIKE '%/RT_TRIP_UPDATES/%' 44 | # or path LIKE '%/RT_VEHICLE_POSITIONS/%' 45 | # ) 46 | # ORDER BY created_on; 47 | 48 | update_md_query = """ 49 | UPDATE 50 | metadata_log 51 | SET 52 | rail_pm_process_fail = false 53 | , rail_pm_processed = false 54 | WHERE 55 | substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 56 | and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 57 | and ( 58 | path LIKE '%/RT_TRIP_UPDATES/%' 59 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 60 | ) 61 | ; 62 | """ 63 | op.execute(update_md_query) 64 | 65 | 66 | def downgrade() -> None: 67 | pass 68 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/002_1b53fd278b10_fix_trip_id_length.py: -------------------------------------------------------------------------------- 1 | """fix trip id length 2 | 3 | Revision ID: 1b53fd278b10 4 | Revises: 5d9a7ee21ae5 5 | Create Date: 2023-11-27 16:25:42.657967 6 | 7 | Details 8 | * upgrade -> change "trip_id" field from length 128 to 512 9 | * downgrade -> change "trip_id" field from length 512 to 128 10 | """ 11 | 12 | from alembic import op 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_001 import ( 16 | view_opmi_all_rt_fields_joined, 17 | ) 18 | 19 | # revision identifiers, used by Alembic. 20 | revision = "1b53fd278b10" 21 | down_revision = "5d9a7ee21ae5" 22 | branch_labels = None 23 | depends_on = None 24 | 25 | 26 | def upgrade() -> None: 27 | # ### commands auto generated by Alembic - please adjust! ### 28 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 29 | op.alter_column( 30 | "static_route_patterns", 31 | "representative_trip_id", 32 | existing_type=sa.VARCHAR(length=128), 33 | type_=sa.String(length=512), 34 | existing_nullable=False, 35 | ) 36 | op.alter_column( 37 | "static_stop_times", 38 | "trip_id", 39 | existing_type=sa.VARCHAR(length=128), 40 | type_=sa.String(length=512), 41 | existing_nullable=False, 42 | ) 43 | op.alter_column( 44 | "static_trips", 45 | "trip_id", 46 | existing_type=sa.VARCHAR(length=128), 47 | type_=sa.String(length=512), 48 | existing_nullable=False, 49 | ) 50 | op.alter_column( 51 | "temp_event_compare", 52 | "trip_id", 53 | existing_type=sa.VARCHAR(length=128), 54 | type_=sa.String(length=512), 55 | existing_nullable=False, 56 | ) 57 | op.alter_column( 58 | "vehicle_trips", 59 | "trip_id", 60 | existing_type=sa.VARCHAR(length=128), 61 | type_=sa.String(length=512), 62 | existing_nullable=False, 63 | ) 64 | op.alter_column( 65 | "vehicle_trips", 66 | "static_trip_id_guess", 67 | existing_type=sa.VARCHAR(length=128), 68 | type_=sa.String(length=512), 69 | existing_nullable=True, 70 | ) 71 | op.execute(view_opmi_all_rt_fields_joined) 72 | # ### end Alembic commands ### 73 | 74 | 75 | def downgrade() -> None: 76 | # ### commands auto generated by Alembic - please adjust! 
### 77 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 78 | op.alter_column( 79 | "vehicle_trips", 80 | "static_trip_id_guess", 81 | existing_type=sa.String(length=512), 82 | type_=sa.VARCHAR(length=128), 83 | existing_nullable=True, 84 | ) 85 | op.alter_column( 86 | "vehicle_trips", 87 | "trip_id", 88 | existing_type=sa.String(length=512), 89 | type_=sa.VARCHAR(length=128), 90 | existing_nullable=False, 91 | ) 92 | op.alter_column( 93 | "temp_event_compare", 94 | "trip_id", 95 | existing_type=sa.String(length=512), 96 | type_=sa.VARCHAR(length=128), 97 | existing_nullable=False, 98 | ) 99 | op.alter_column( 100 | "static_trips", 101 | "trip_id", 102 | existing_type=sa.String(length=512), 103 | type_=sa.VARCHAR(length=128), 104 | existing_nullable=False, 105 | ) 106 | op.alter_column( 107 | "static_stop_times", 108 | "trip_id", 109 | existing_type=sa.String(length=512), 110 | type_=sa.VARCHAR(length=128), 111 | existing_nullable=False, 112 | ) 113 | op.alter_column( 114 | "static_route_patterns", 115 | "representative_trip_id", 116 | existing_type=sa.String(length=512), 117 | type_=sa.VARCHAR(length=128), 118 | existing_nullable=False, 119 | ) 120 | op.execute(view_opmi_all_rt_fields_joined) 121 | # ### end Alembic commands ### 122 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/003_ae6c6e4b2df5_extend_service_id_view.py: -------------------------------------------------------------------------------- 1 | """extend service_id_by_date_and_route 2 | 3 | Revision ID: ae6c6e4b2df5 4 | Revises: 1b53fd278b10 5 | Create Date: 2023-12-17 06:56:17.330783 6 | 7 | Details 8 | * upgrade -> extend service_id_by_date_and_route VIEW to generate values past current date 9 | * upgrade -> update canonical_stop_sequence to use row_number function instead of direct from static schedule 10 | 11 | * downgrade -> Nothing 12 | """ 13 | 14 | from alembic import op 15 | 16 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_003 import ( 17 | view_service_id_by_date_and_route, 18 | ) 19 | 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "ae6c6e4b2df5" 23 | down_revision = "1b53fd278b10" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | op.execute("DROP VIEW IF EXISTS service_id_by_date_and_route;") 30 | op.execute(view_service_id_by_date_and_route) 31 | 32 | op.create_index( 33 | "ix_static_trips_composite_4", 34 | "static_trips", 35 | ["static_version_key", "service_id"], 36 | unique=False, 37 | ) 38 | 39 | update_stop_sequences = ( 40 | "UPDATE vehicle_events " 41 | "SET canonical_stop_sequence = static_canon.stop_sequence " 42 | "FROM vehicle_events AS ve " 43 | "JOIN vehicle_trips AS vt " 44 | "ON ve.pm_trip_id = vt.pm_trip_id " 45 | "JOIN " 46 | "(" 47 | " select " 48 | " srp.direction_id " 49 | " , coalesce(st.branch_route_id, st.trunk_route_id) AS route_id " 50 | " , ROW_NUMBER () OVER (PARTITION BY srp.static_version_key, srp.direction_id, coalesce(st.branch_route_id, st.trunk_route_id) ORDER BY sst.stop_sequence) AS stop_sequence" 51 | " , ss.parent_station " 52 | " , srp.static_version_key " 53 | " from static_route_patterns srp " 54 | " JOIN static_trips st " 55 | " ON srp.representative_trip_id = st.trip_id " 56 | " AND srp.static_version_key = st.static_version_key " 57 | " JOIN static_stop_times sst " 58 | " ON srp.representative_trip_id = sst.trip_id " 59 | " AND srp.static_version_key = sst.static_version_key " 60 | " JOIN static_stops ss " 61 | " ON sst.stop_id = ss.stop_id " 62 | " AND sst.static_version_key = ss.static_version_key " 63 | " WHERE " 64 | " srp.route_pattern_typicality = 1" 65 | ") AS static_canon " 66 | "ON ve.parent_station = static_canon.parent_station " 67 | "AND vt.static_version_key = static_canon.static_version_key " 68 | "AND vt.direction_id = static_canon.direction_id " 69 | "AND coalesce(vt.branch_route_id, vt.trunk_route_id) = static_canon.route_id " 70 | "WHERE vehicle_events.pm_trip_id = ve.pm_trip_id " 71 | "AND vehicle_events.parent_station = static_canon.parent_station " 72 | ";" 73 | ) 74 | op.execute(update_stop_sequences) 75 | 76 | 77 | def downgrade() -> None: 78 | op.drop_index("ix_static_trips_composite_4", table_name="static_trips") 79 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/005_96187da84955_remove_metadata.py: -------------------------------------------------------------------------------- 1 | """remove_metadata 2 | 3 | Revision ID: 96187da84955 4 | Revises: 45dedc21086e 5 | Create Date: 2023-12-28 12:18:25.412282 6 | 7 | check that all information in the metadata table has been copied to the 8 | metadata database before dropping the table and its indexes entirely. 9 | """ 10 | 11 | import time 12 | 13 | from alembic import op 14 | from sqlalchemy.dialects import postgresql 15 | from sqlalchemy.exc import ProgrammingError 16 | from sqlalchemy.sql import text 17 | import logging 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 21 | from lamp_py.postgres.metadata_schema import MetadataLog 22 | 23 | # revision identifiers, used by Alembic. 24 | revision = "96187da84955" 25 | down_revision = "45dedc21086e" 26 | branch_labels = None 27 | depends_on = None 28 | 29 | 30 | def upgrade() -> None: 31 | # ### commands auto generated by Alembic - please adjust! 
### 32 | op.drop_index("ix_metadata_log_not_processed", table_name="metadata_log") 33 | op.drop_table("metadata_log") 34 | # ### end Alembic commands ### 35 | 36 | 37 | def downgrade() -> None: 38 | # ### commands auto generated by Alembic - please adjust! ### 39 | op.create_table( 40 | "metadata_log", 41 | sa.Column("pk_id", sa.INTEGER(), autoincrement=True, nullable=False), 42 | sa.Column("processed", sa.BOOLEAN(), autoincrement=False, nullable=True), 43 | sa.Column("process_fail", sa.BOOLEAN(), autoincrement=False, nullable=True), 44 | sa.Column("path", sa.VARCHAR(length=256), autoincrement=False, nullable=False), 45 | sa.Column( 46 | "created_on", 47 | postgresql.TIMESTAMP(timezone=True), 48 | server_default=sa.text("now()"), 49 | autoincrement=False, 50 | nullable=True, 51 | ), 52 | sa.PrimaryKeyConstraint("pk_id", name="metadata_log_pkey"), 53 | sa.UniqueConstraint("path", name="metadata_log_path_key"), 54 | ) 55 | op.create_index( 56 | "ix_metadata_log_not_processed", 57 | "metadata_log", 58 | ["path"], 59 | unique=False, 60 | postgresql_where="(processed = false)", 61 | ) 62 | # ### end Alembic commands ### 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/008_32ba735d080c_add_revenue_columns.py: -------------------------------------------------------------------------------- 1 | """add revenue columns 2 | 3 | Revision ID: 32ba735d080c 4 | Revises: 896dedd8a4db 5 | Create Date: 2024-09-20 08:47:52.784591 6 | 7 | This change adds a boolean revenue column to the vehcile_trips table. 8 | Initially this will be filled with True and back-filled by a seperate operation 9 | 10 | Details 11 | * upgrade -> drop triggers and indexes from table and add revenue column 12 | 13 | * downgrade -> drop revenue column 14 | 15 | """ 16 | 17 | from alembic import op 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.rail_performance_manager_schema import ( 21 | TempEventCompare, 22 | VehicleTrips, 23 | ) 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "32ba735d080c" 27 | down_revision = "896dedd8a4db" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER rt_trips_update_branch_trunk;") 34 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER update_vehicle_trips_modified;") 35 | op.drop_index("ix_vehicle_trips_composite_1", table_name="vehicle_trips") 36 | op.drop_constraint("vehicle_trips_unique_trip", table_name="vehicle_trips") 37 | 38 | op.add_column("temp_event_compare", sa.Column("revenue", sa.Boolean(), nullable=True)) 39 | op.add_column("vehicle_trips", sa.Column("revenue", sa.Boolean(), nullable=True)) 40 | op.execute(sa.update(TempEventCompare).values(revenue=True)) 41 | op.execute(sa.update(VehicleTrips).values(revenue=True)) 42 | op.alter_column("temp_event_compare", "revenue", nullable=False) 43 | op.alter_column("vehicle_trips", "revenue", nullable=False) 44 | 45 | op.create_unique_constraint( 46 | "vehicle_trips_unique_trip", 47 | "vehicle_trips", 48 | ["service_date", "route_id", "trip_id"], 49 | ) 50 | op.create_index( 51 | "ix_vehicle_trips_composite_1", 52 | "vehicle_trips", 53 | ["route_id", "direction_id", "vehicle_id"], 54 | unique=False, 55 | ) 56 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER rt_trips_update_branch_trunk;") 57 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER update_vehicle_trips_modified;") 58 | 59 | 60 | def downgrade() -> None: 61 | op.drop_column("vehicle_trips", "revenue") 62 | op.drop_column("temp_event_compare", "revenue") 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/009_36e7a7aee148_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: 36e7a7aee148 4 | Revises: 32ba735d080c 5 | Create Date: 2025-01-07 13:57:50.433896 6 | 7 | This change upgrades the pm_event_id sequence type to bigint to avoid running out of keys 8 | 9 | Details 10 | * upgrade -> drop opmi view, upgrade sequence, update sequence storage columns 11 | 12 | * downgrade -> not possible, can't go from bigint to int 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.migrations.versions.performance_manager_prod.sql_strings.strings_001 import view_opmi_all_rt_fields_joined 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "36e7a7aee148" 23 | down_revision = "32ba735d080c" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | # Upgrade sequence to BIGINT 30 | op.execute("ALTER SEQUENCE vehicle_events_pm_event_id_seq as bigint MAXVALUE 9223372036854775807;") 31 | # DROP VIEW before upgrading columns 32 | drop_opmi_all_rt_fields_joined = "DROP VIEW IF EXISTS opmi_all_rt_fields_joined;" 33 | op.execute(drop_opmi_all_rt_fields_joined) 34 | # Upgrade event_id columns to BIGINT 35 | op.alter_column( 36 | "vehicle_events", 37 | "pm_event_id", 38 | existing_type=sa.INTEGER(), 39 | type_=sa.BigInteger(), 40 | existing_nullable=False, 41 | autoincrement=True, 42 | ) 43 | op.alter_column( 44 | "vehicle_events", 45 | "previous_trip_stop_pm_event_id", 46 | existing_type=sa.INTEGER(), 47 | type_=sa.BigInteger(), 48 | existing_nullable=True, 49 | ) 50 | op.alter_column( 51 | "vehicle_events", 52 | "next_trip_stop_pm_event_id", 53 | existing_type=sa.INTEGER(), 54 | type_=sa.BigInteger(), 55 | existing_nullable=True, 56 | ) 57 | op.execute(view_opmi_all_rt_fields_joined) 58 | 59 | 60 | def downgrade() -> None: 61 | # Can not migrate from INT to BIGINT without losing data. 62 | pass 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/005_32ba735d080c_add_revenue_columns.py: -------------------------------------------------------------------------------- 1 | """add revenue columns 2 | 3 | Revision ID: 32ba735d080c 4 | Revises: 896dedd8a4db 5 | Create Date: 2024-09-20 08:47:52.784591 6 | 7 | This change adds a boolean revenue column to the vehcile_trips table. 8 | Initially this will be filled with True and back-filled by a seperate operation 9 | 10 | Details 11 | * upgrade -> drop triggers and indexes from table and add revenue column 12 | 13 | * downgrade -> drop revenue column 14 | 15 | """ 16 | 17 | from alembic import op 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.rail_performance_manager_schema import ( 21 | TempEventCompare, 22 | VehicleTrips, 23 | ) 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "32ba735d080c" 27 | down_revision = "896dedd8a4db" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER rt_trips_update_branch_trunk;") 34 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER update_vehicle_trips_modified;") 35 | op.drop_index("ix_vehicle_trips_composite_1", table_name="vehicle_trips") 36 | op.drop_constraint("vehicle_trips_unique_trip", table_name="vehicle_trips") 37 | 38 | op.add_column("temp_event_compare", sa.Column("revenue", sa.Boolean(), nullable=True)) 39 | op.add_column("vehicle_trips", sa.Column("revenue", sa.Boolean(), nullable=True)) 40 | op.execute(sa.update(TempEventCompare).values(revenue=True)) 41 | op.execute(sa.update(VehicleTrips).values(revenue=True)) 42 | op.alter_column("temp_event_compare", "revenue", nullable=False) 43 | op.alter_column("vehicle_trips", "revenue", nullable=False) 44 | 45 | op.create_unique_constraint( 46 | "vehicle_trips_unique_trip", 47 | "vehicle_trips", 48 | ["service_date", "route_id", "trip_id"], 49 | ) 50 | op.create_index( 51 | "ix_vehicle_trips_composite_1", 52 | "vehicle_trips", 53 | ["route_id", "direction_id", "vehicle_id"], 54 | unique=False, 55 | ) 56 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER rt_trips_update_branch_trunk;") 57 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER update_vehicle_trips_modified;") 58 | 59 | 60 | def downgrade() -> None: 61 | op.drop_column("vehicle_trips", "revenue") 62 | op.drop_column("temp_event_compare", "revenue") 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/006_36e7a7aee148_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: 36e7a7aee148 4 | Revises: 32ba735d080c 5 | Create Date: 2025-01-07 13:57:50.433896 6 | 7 | This change upgrades the pm_event_id sequence type to bigint to avoid running out of keys 8 | 9 | Details 10 | * upgrade -> drop opmi view, upgrade sequence, update sequence storage columns 11 | 12 | * downgrade -> not possible, can't go from bigint to int 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.migrations.versions.performance_manager_prod.sql_strings.strings_001 import view_opmi_all_rt_fields_joined 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "36e7a7aee148" 23 | down_revision = "32ba735d080c" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | # Upgrade sequence to BIGINT 30 | op.execute("ALTER SEQUENCE vehicle_events_pm_event_id_seq as bigint MAXVALUE 9223372036854775807;") 31 | # DROP VIEW before upgrading columns 32 | drop_opmi_all_rt_fields_joined = "DROP VIEW IF EXISTS opmi_all_rt_fields_joined;" 33 | op.execute(drop_opmi_all_rt_fields_joined) 34 | # Upgrade event_id columns to BIGINT 35 | op.alter_column( 36 | "vehicle_events", 37 | "pm_event_id", 38 | existing_type=sa.INTEGER(), 39 | type_=sa.BigInteger(), 40 | existing_nullable=False, 41 | autoincrement=True, 42 | ) 43 | op.alter_column( 44 | "vehicle_events", 45 | "previous_trip_stop_pm_event_id", 46 | existing_type=sa.INTEGER(), 47 | type_=sa.BigInteger(), 48 | existing_nullable=True, 49 | ) 50 | op.alter_column( 51 | "vehicle_events", 52 | "next_trip_stop_pm_event_id", 53 | existing_type=sa.INTEGER(), 54 | type_=sa.BigInteger(), 55 | existing_nullable=True, 56 | ) 57 | op.execute(view_opmi_all_rt_fields_joined) 58 | 59 | 60 | def downgrade() -> None: 61 | # Can not migrate from INT to BIGINT without losing data. 62 | pass 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/007_da8f80a3dd90_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: da8f80a3dd90 4 | Revises: 36e7a7aee148 5 | Create Date: 2025-04-11 09:43:50.433896 6 | 7 | This change re-indexes all PROD table indexes in an attempt to resolve DB query degradation. 8 | 9 | Details 10 | * upgrade -> REINDEX all indexes on PRDO 11 | 12 | * downgrade -> None 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.runtime_utils.process_logger import ProcessLogger 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "da8f80a3dd90" 23 | down_revision = "36e7a7aee148" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | # REINDEX all tables 30 | tables = [ 31 | "vehicle_events", 32 | "vehicle_trips", 33 | "static_feed_info", 34 | "static_trips", 35 | "static_routes", 36 | "static_stops", 37 | "static_stop_times", 38 | "static_calendar", 39 | "static_calendar_dates", 40 | "static_directions", 41 | "static_route_patterns", 42 | ] 43 | for table in tables: 44 | try: 45 | log = ProcessLogger(f"reindex_{table}") 46 | log.log_start() 47 | op.execute(sa.text(f"REINDEX TABLE {table};")) 48 | log.log_complete() 49 | except Exception as e: 50 | log.log_failure(e) 51 | 52 | 53 | def downgrade() -> None: 54 | # No downgrade 55 | pass 56 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/008_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: 5e3066f113ff 4 | Revises: da8f80a3dd90 5 | Create Date: Wed Apr 23 11:16:12 EDT 2025 6 | 7 | Details 8 | This will clean up missing data from RDS performance issues/outage from 4/14-4/17 9 | This will also clean up duplication of data in prod from 4/17-4/22 10 | 11 | This is the same as staging/012_9b461d7aa53a_backfill_rt_rail_2025_04_04_to_2025_04_22.py 12 | 13 | * upgrade -> Delete all records from 4/4 to 4/23 in vehicle events and vehicle_trips 14 | -> Set all flags to "unprocessed" in metadata log from 4/4 to 4/22 15 | * downgrade -> Nothing 16 | """ 17 | 18 | import os 19 | import tempfile 20 | import logging 21 | 22 | import polars as pl 23 | import pyarrow as pa 24 | import pyarrow.parquet as pq 25 | from typing import List 26 | 27 | from alembic import op 28 | import sqlalchemy as sa 29 | from sqlalchemy.exc import ProgrammingError 30 | 31 | from lamp_py.aws.s3 import download_file, upload_file 32 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 33 | 34 | # revision identifiers, used by Alembic. 
35 | revision = "5e3066f113ff" 36 | down_revision = "da8f80a3dd90" 37 | branch_labels = None 38 | depends_on = None 39 | 40 | 41 | def upgrade() -> None: 42 | 43 | # SELECT FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250423;" 44 | 45 | clear_events = "DELETE FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250422;" 46 | op.execute(clear_events) 47 | 48 | clear_trips = "DELETE FROM vehicle_trips WHERE service_date >= 20250404 AND service_date <= 20250422;" 49 | op.execute(clear_trips) 50 | 51 | # Query to Check 52 | # SELECT created_on, rail_pm_processed, rail_pm_process_fail 53 | # FROM public.metadata_log 54 | # WHERE created_on > '2025-04-04' and created_on < '2025-04-22 23:59:59' 55 | # AND (path LIKE '%/RT_TRIP_UPDATES/%' or path LIKE '%/RT_VEHICLE_POSITIONS/%') 56 | # ORDER BY created_on; 57 | 58 | try: 59 | update_md_query = """ 60 | UPDATE 61 | metadata_log 62 | SET 63 | rail_pm_process_fail = false 64 | , rail_pm_processed = false 65 | WHERE 66 | created_on > '2025-04-04 00:00:00' 67 | and created_on < '2025-04-22 23:59:59' 68 | and ( 69 | path LIKE '%/RT_TRIP_UPDATES/%' 70 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 71 | ) 72 | ; 73 | """ 74 | md_manager = DatabaseManager(DatabaseIndex.METADATA) 75 | md_manager.execute(sa.text(update_md_query)) 76 | 77 | except ProgrammingError as error: 78 | # Error 42P01 is an 'Undefined Table' error. This occurs when there is 79 | # no metadata_log table in the rail performance manager database 80 | # 81 | # Raise all other sql errors 82 | original_error = error.orig 83 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 84 | logging.info("No Metadata Table in Rail Performance Manager") 85 | else: 86 | raise 87 | 88 | 89 | def downgrade() -> None: 90 | pass 91 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/002_1b53fd278b10_fix_trip_id_length.py: -------------------------------------------------------------------------------- 1 | """fix trip id length 2 | 3 | Revision ID: 1b53fd278b10 4 | Revises: 5d9a7ee21ae5 5 | Create Date: 2023-11-27 16:25:42.657967 6 | 7 | Details 8 | * upgrade -> change "trip_id" field from length 128 to 512 9 | * downgrade -> change "trip_id" field from length 512 to 128 10 | """ 11 | 12 | from alembic import op 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_001 import ( 16 | view_opmi_all_rt_fields_joined, 17 | ) 18 | 19 | # revision identifiers, used by Alembic. 20 | revision = "1b53fd278b10" 21 | down_revision = "5d9a7ee21ae5" 22 | branch_labels = None 23 | depends_on = None 24 | 25 | 26 | def upgrade() -> None: 27 | # ### commands auto generated by Alembic - please adjust! 
### 28 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 29 | op.alter_column( 30 | "static_route_patterns", 31 | "representative_trip_id", 32 | existing_type=sa.VARCHAR(length=128), 33 | type_=sa.String(length=512), 34 | existing_nullable=False, 35 | ) 36 | op.alter_column( 37 | "static_stop_times", 38 | "trip_id", 39 | existing_type=sa.VARCHAR(length=128), 40 | type_=sa.String(length=512), 41 | existing_nullable=False, 42 | ) 43 | op.alter_column( 44 | "static_trips", 45 | "trip_id", 46 | existing_type=sa.VARCHAR(length=128), 47 | type_=sa.String(length=512), 48 | existing_nullable=False, 49 | ) 50 | op.alter_column( 51 | "temp_event_compare", 52 | "trip_id", 53 | existing_type=sa.VARCHAR(length=128), 54 | type_=sa.String(length=512), 55 | existing_nullable=False, 56 | ) 57 | op.alter_column( 58 | "vehicle_trips", 59 | "trip_id", 60 | existing_type=sa.VARCHAR(length=128), 61 | type_=sa.String(length=512), 62 | existing_nullable=False, 63 | ) 64 | op.alter_column( 65 | "vehicle_trips", 66 | "static_trip_id_guess", 67 | existing_type=sa.VARCHAR(length=128), 68 | type_=sa.String(length=512), 69 | existing_nullable=True, 70 | ) 71 | op.execute(view_opmi_all_rt_fields_joined) 72 | # ### end Alembic commands ### 73 | 74 | 75 | def downgrade() -> None: 76 | # ### commands auto generated by Alembic - please adjust! ### 77 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 78 | op.alter_column( 79 | "vehicle_trips", 80 | "static_trip_id_guess", 81 | existing_type=sa.String(length=512), 82 | type_=sa.VARCHAR(length=128), 83 | existing_nullable=True, 84 | ) 85 | op.alter_column( 86 | "vehicle_trips", 87 | "trip_id", 88 | existing_type=sa.String(length=512), 89 | type_=sa.VARCHAR(length=128), 90 | existing_nullable=False, 91 | ) 92 | op.alter_column( 93 | "temp_event_compare", 94 | "trip_id", 95 | existing_type=sa.String(length=512), 96 | type_=sa.VARCHAR(length=128), 97 | existing_nullable=False, 98 | ) 99 | op.alter_column( 100 | "static_trips", 101 | "trip_id", 102 | existing_type=sa.String(length=512), 103 | type_=sa.VARCHAR(length=128), 104 | existing_nullable=False, 105 | ) 106 | op.alter_column( 107 | "static_stop_times", 108 | "trip_id", 109 | existing_type=sa.String(length=512), 110 | type_=sa.VARCHAR(length=128), 111 | existing_nullable=False, 112 | ) 113 | op.alter_column( 114 | "static_route_patterns", 115 | "representative_trip_id", 116 | existing_type=sa.String(length=512), 117 | type_=sa.VARCHAR(length=128), 118 | existing_nullable=False, 119 | ) 120 | op.execute(view_opmi_all_rt_fields_joined) 121 | # ### end Alembic commands ### 122 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/003_ae6c6e4b2df5_extend_service_id_view.py: -------------------------------------------------------------------------------- 1 | """extend service_id_by_date_and_route 2 | 3 | Revision ID: ae6c6e4b2df5 4 | Revises: 1b53fd278b10 5 | Create Date: 2023-12-17 06:56:17.330783 6 | 7 | Details 8 | * upgrade -> extend service_id_by_date_and_route VIEW to generate values past current date 9 | * upgrade -> update canonical_stop_sequence to use row_number function instead of direct from static schedule 10 | 11 | * downgrade -> Nothing 12 | """ 13 | 14 | from alembic import op 15 | 16 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_003 import ( 17 | view_service_id_by_date_and_route, 18 | ) 19 | 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "ae6c6e4b2df5" 23 | down_revision = "1b53fd278b10" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | op.execute("DROP VIEW IF EXISTS service_id_by_date_and_route;") 30 | op.execute(view_service_id_by_date_and_route) 31 | 32 | op.create_index( 33 | "ix_static_trips_composite_4", 34 | "static_trips", 35 | ["static_version_key", "service_id"], 36 | unique=False, 37 | ) 38 | 39 | update_stop_sequences = ( 40 | "UPDATE vehicle_events " 41 | "SET canonical_stop_sequence = static_canon.stop_sequence " 42 | "FROM vehicle_events AS ve " 43 | "JOIN vehicle_trips AS vt " 44 | "ON ve.pm_trip_id = vt.pm_trip_id " 45 | "JOIN " 46 | "(" 47 | " select " 48 | " srp.direction_id " 49 | " , coalesce(st.branch_route_id, st.trunk_route_id) AS route_id " 50 | " , ROW_NUMBER () OVER (PARTITION BY srp.static_version_key, srp.direction_id, coalesce(st.branch_route_id, st.trunk_route_id) ORDER BY sst.stop_sequence) AS stop_sequence" 51 | " , ss.parent_station " 52 | " , srp.static_version_key " 53 | " from static_route_patterns srp " 54 | " JOIN static_trips st " 55 | " ON srp.representative_trip_id = st.trip_id " 56 | " AND srp.static_version_key = st.static_version_key " 57 | " JOIN static_stop_times sst " 58 | " ON srp.representative_trip_id = sst.trip_id " 59 | " AND srp.static_version_key = sst.static_version_key " 60 | " JOIN static_stops ss " 61 | " ON sst.stop_id = ss.stop_id " 62 | " AND sst.static_version_key = ss.static_version_key " 63 | " WHERE " 64 | " srp.route_pattern_typicality = 1" 65 | ") AS static_canon " 66 | "ON ve.parent_station = static_canon.parent_station " 67 | "AND vt.static_version_key = static_canon.static_version_key " 68 | "AND vt.direction_id = static_canon.direction_id " 69 | "AND coalesce(vt.branch_route_id, vt.trunk_route_id) = static_canon.route_id " 70 | "WHERE vehicle_events.pm_trip_id = ve.pm_trip_id " 71 | "AND vehicle_events.parent_station = static_canon.parent_station " 72 | ";" 73 | ) 74 | op.execute(update_stop_sequences) 75 | 76 | 77 | def downgrade() -> None: 78 | op.drop_index("ix_static_trips_composite_4", table_name="static_trips") 79 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/006_e20a4f3f8c03_fix_null_vehicle_consist.py: -------------------------------------------------------------------------------- 1 | """fix null vehicle consist 2 | 3 | Revision ID: e20a4f3f8c03 4 | Revises: 96187da84955 5 | Create Date: 2024-03-07 15:44:22.989929 6 | 7 | On March 5th 2024, the vehicle consist field was removed from the VehiclePositions GTFS-RT feed 8 | this broke our data pipeline requiring a switch to the multi_carriage_details field 9 | this migration should re-process our realtime data from March 5th to present to fix missing 10 | vehicle consist values 11 | """ 12 | 13 | from alembic import op 14 | import sqlalchemy as sa 15 | 16 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 17 | 18 | # revision identifiers, used by Alembic. 
19 | revision = "e20a4f3f8c03" 20 | down_revision = "96187da84955" 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | clear_events = "DELETE FROM vehicle_events WHERE service_date >= 20240305;" 27 | op.execute(clear_events) 28 | 29 | clear_trips = "DELETE FROM vehicle_trips WHERE service_date >= 20240305;" 30 | op.execute(clear_trips) 31 | 32 | update_md_query = """ 33 | UPDATE 34 | metadata_log 35 | SET rail_pm_processed = false 36 | WHERE 37 | ( 38 | "path" like '%RT_VEHICLE_POSITIONS%' 39 | OR "path" like '%RT_TRIP_UPDATES%' 40 | ) 41 | AND 42 | (substring("path", 'year=(\d+)') || '-' || substring("path", 'month=(\d+)') || '-' || substring("path", 'day=(\d+)'))::date >= '2024-3-5'::date 43 | ; 44 | """ 45 | md_manager = DatabaseManager(DatabaseIndex.METADATA) 46 | md_manager.execute(sa.text(update_md_query)) 47 | 48 | 49 | def downgrade() -> None: 50 | pass 51 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/009_32ba735d080c_add_revenue_columns.py: -------------------------------------------------------------------------------- 1 | """add revenue columns 2 | 3 | Revision ID: 32ba735d080c 4 | Revises: 896dedd8a4db 5 | Create Date: 2024-09-20 08:47:52.784591 6 | 7 | This change adds a boolean revenue column to the vehcile_trips table. 8 | Initially this will be filled with True and back-filled by a seperate operation 9 | 10 | Details 11 | * upgrade -> drop triggers and indexes from table and add revenue column 12 | 13 | * downgrade -> drop revenue column 14 | 15 | """ 16 | 17 | from alembic import op 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.rail_performance_manager_schema import ( 21 | TempEventCompare, 22 | VehicleTrips, 23 | ) 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "32ba735d080c" 27 | down_revision = "896dedd8a4db" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER rt_trips_update_branch_trunk;") 34 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER update_vehicle_trips_modified;") 35 | op.drop_index("ix_vehicle_trips_composite_1", table_name="vehicle_trips") 36 | op.drop_constraint("vehicle_trips_unique_trip", table_name="vehicle_trips") 37 | 38 | op.add_column("temp_event_compare", sa.Column("revenue", sa.Boolean(), nullable=True)) 39 | op.add_column("vehicle_trips", sa.Column("revenue", sa.Boolean(), nullable=True)) 40 | op.execute(sa.update(TempEventCompare).values(revenue=True)) 41 | op.execute(sa.update(VehicleTrips).values(revenue=True)) 42 | op.alter_column("temp_event_compare", "revenue", nullable=False) 43 | op.alter_column("vehicle_trips", "revenue", nullable=False) 44 | 45 | op.create_unique_constraint( 46 | "vehicle_trips_unique_trip", 47 | "vehicle_trips", 48 | ["service_date", "route_id", "trip_id"], 49 | ) 50 | op.create_index( 51 | "ix_vehicle_trips_composite_1", 52 | "vehicle_trips", 53 | ["route_id", "direction_id", "vehicle_id"], 54 | unique=False, 55 | ) 56 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER rt_trips_update_branch_trunk;") 57 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER update_vehicle_trips_modified;") 58 | 59 | 60 | def downgrade() -> None: 61 | op.drop_column("vehicle_trips", "revenue") 62 | op.drop_column("temp_event_compare", "revenue") 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/010_36e7a7aee148_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: 36e7a7aee148 4 | Revises: 32ba735d080c 5 | Create Date: 2025-01-07 13:57:50.433896 6 | 7 | This change upgrades the pm_event_id sequence type to bigint to avoid running out of keys 8 | 9 | Details 10 | * upgrade -> drop opmi view, upgrade sequence, update sequence storage columns 11 | 12 | * downgrade -> not possible, can't go from bigint to int 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_001 import ( 20 | view_opmi_all_rt_fields_joined, 21 | ) 22 | 23 | # revision identifiers, used by Alembic. 
24 | revision = "36e7a7aee148" 25 | down_revision = "32ba735d080c" 26 | branch_labels = None 27 | depends_on = None 28 | 29 | 30 | def upgrade() -> None: 31 | # Upgrade sequence to BIGINT 32 | op.execute("ALTER SEQUENCE vehicle_events_pm_event_id_seq as bigint MAXVALUE 9223372036854775807;") 33 | # DROP VIEW before upgrading columns 34 | drop_opmi_all_rt_fields_joined = "DROP VIEW IF EXISTS opmi_all_rt_fields_joined;" 35 | op.execute(drop_opmi_all_rt_fields_joined) 36 | # Upgrade event_id columns to BIGINT 37 | op.alter_column( 38 | "vehicle_events", 39 | "pm_event_id", 40 | existing_type=sa.INTEGER(), 41 | type_=sa.BigInteger(), 42 | existing_nullable=False, 43 | autoincrement=True, 44 | ) 45 | op.alter_column( 46 | "vehicle_events", 47 | "previous_trip_stop_pm_event_id", 48 | existing_type=sa.INTEGER(), 49 | type_=sa.BigInteger(), 50 | existing_nullable=True, 51 | ) 52 | op.alter_column( 53 | "vehicle_events", 54 | "next_trip_stop_pm_event_id", 55 | existing_type=sa.INTEGER(), 56 | type_=sa.BigInteger(), 57 | existing_nullable=True, 58 | ) 59 | op.execute(view_opmi_all_rt_fields_joined) 60 | 61 | 62 | def downgrade() -> None: 63 | # Can not migrate from INT to BIGINT without losing data. 64 | pass 65 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/011_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 5e3066f113ff 4 | Revises: 36e7a7aee148 5 | Create Date: Wed Apr 23 11:16:12 EDT 2025 6 | 7 | Details 8 | This will clean up missing data from RDS performance issues/outage from 4/14-4/17 9 | This will also clean up duplication of data in prod from 4/17-4/22 10 | 11 | * upgrade -> Delete all records from 4/4 to 4/23 in vehicle events and vehicle_trips 12 | -> Set all flags to "unprocessed" in metadata log from 4/4 to 4/22 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import logging 17 | import os 18 | import tempfile 19 | import polars as pl 20 | import pyarrow as pa 21 | import pyarrow.parquet as pq 22 | from typing import List 23 | 24 | from alembic import op 25 | import sqlalchemy as sa 26 | from sqlalchemy.exc import ProgrammingError 27 | 28 | from lamp_py.aws.s3 import download_file, upload_file 29 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 30 | 31 | # revision identifiers, used by Alembic. 
32 | revision = "5e3066f113ff" 33 | down_revision = "36e7a7aee148" 34 | branch_labels = None 35 | depends_on = None 36 | 37 | 38 | def upgrade() -> None: 39 | # this migration partially failed due to a typo in the date range - 40 | # deleting the contents to make clear this was NOT successfully run 41 | # this job was rerun in the subsequent migration 42 | pass 43 | 44 | 45 | def downgrade() -> None: 46 | pass 47 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/012_9b461d7aa53a_backfill_rt_rail_2025_04_04_to_2025_04_22.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: 9b461d7aa53a 4 | Revises: 5e3066f113ff 5 | Create Date: Wed Apr 23 11:16:12 EDT 2025 6 | 7 | Details 8 | This will clean up missing data from RDS performance issues/outage from 4/14-4/17 9 | This will also clean up duplication of data in prod from 4/17-4/22 10 | 11 | This is a rerun due to incorrectly specified query in 5e3066f113ff for the metadata query. 12 | We are correcting that error and rerunning the whole migration again. 13 | 14 | * upgrade -> Delete all records from 4/4 to 4/23 in vehicle events and vehicle_trips 15 | -> Set all flags to "unprocessed" in metadata log from 4/4 to 4/22 16 | * downgrade -> Nothing 17 | """ 18 | 19 | import logging 20 | import os 21 | import tempfile 22 | import polars as pl 23 | import pyarrow as pa 24 | import pyarrow.parquet as pq 25 | from typing import List 26 | 27 | from alembic import op 28 | import sqlalchemy as sa 29 | from sqlalchemy.exc import ProgrammingError 30 | 31 | from lamp_py.aws.s3 import download_file, upload_file 32 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 33 | 34 | # revision identifiers, used by Alembic. 35 | revision = "9b461d7aa53a" 36 | down_revision = "5e3066f113ff" 37 | branch_labels = None 38 | depends_on = None 39 | 40 | 41 | def upgrade() -> None: 42 | 43 | # SELECT FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250422;" 44 | # ~ (974142 rows) 45 | clear_events = "DELETE FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250422;" 46 | op.execute(clear_events) 47 | 48 | # ~ (75788 rows) 49 | clear_trips = "DELETE FROM vehicle_trips WHERE service_date >= 20250404 AND service_date <= 20250422;" 50 | op.execute(clear_trips) 51 | 52 | # Query to Check 53 | # SELECT 54 | # created_on, 55 | # rail_pm_process_fail, 56 | # rail_pm_processed 57 | # FROM public.metadata_log 58 | # WHERE 59 | # created_on > '2025-04-04 00:00:00' 60 | # and created_on < '2025-04-22 23:59:59' 61 | # and ( 62 | # path LIKE '%/RT_TRIP_UPDATES/%' 63 | # or path LIKE '%/RT_VEHICLE_POSITIONS/%' 64 | # ) 65 | # ; 66 | 67 | try: 68 | update_md_query = """ 69 | UPDATE 70 | metadata_log 71 | SET 72 | rail_pm_process_fail = false 73 | , rail_pm_processed = false 74 | WHERE 75 | created_on > '2025-04-04 00:00:00' 76 | and created_on < '2025-04-22 23:59:59' 77 | and ( 78 | path LIKE '%/RT_TRIP_UPDATES/%' 79 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 80 | ) 81 | ; 82 | """ 83 | md_manager = DatabaseManager(DatabaseIndex.METADATA) 84 | md_manager.execute(sa.text(update_md_query)) 85 | 86 | except ProgrammingError as error: 87 | # Error 42P01 is an 'Undefined Table' error. 
This occurs when there is 88 | # no metadata_log table in the rail performance manager database 89 | # 90 | # Raise all other sql errors 91 | original_error = error.orig 92 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 93 | logging.info("No Metadata Table in Rail Performance Manager") 94 | else: 95 | raise 96 | 97 | 98 | def downgrade() -> None: 99 | pass 100 | -------------------------------------------------------------------------------- /src/lamp_py/mssql/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities for interacting with microsoft sql database """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/mssql/test_connect.py: -------------------------------------------------------------------------------- 1 | import sqlalchemy as sa 2 | from lamp_py.mssql.mssql_utils import MSSQLManager 3 | 4 | 5 | def start() -> None: 6 | """ 7 | Test MSSQL DB Connection 8 | """ 9 | db = MSSQLManager(verbose=True) 10 | select_query = sa.text("SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE';") 11 | for record in db.select_as_list(select_query): 12 | print(record) 13 | 14 | 15 | if __name__ == "__main__": 16 | start() 17 | -------------------------------------------------------------------------------- /src/lamp_py/performance_manager/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for consuming GTFS realtime parquet files and converting them into 3 | trip summaries that are compared to trips that are planned in the GTFS static 4 | schedule 5 | """ 6 | -------------------------------------------------------------------------------- /src/lamp_py/postgres/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities for interacting with postgres database """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/postgres/metadata_schema.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import sqlalchemy as sa 4 | from sqlalchemy.orm import declarative_base 5 | from sqlalchemy.sql.functions import now 6 | 7 | MetadataSqlBase: Any = declarative_base(name="Metadata") 8 | 9 | 10 | class MetadataLog(MetadataSqlBase): # pylint: disable=too-few-public-methods 11 | """Table for keeping track of parquet files in S3""" 12 | 13 | __tablename__ = "metadata_log" 14 | 15 | pk_id = sa.Column(sa.Integer, primary_key=True) 16 | rail_pm_processed = sa.Column(sa.Boolean, default=sa.false()) 17 | rail_pm_process_fail = sa.Column(sa.Boolean, default=sa.false()) 18 | path = sa.Column(sa.String(256), nullable=False, unique=True) 19 | created_on = sa.Column(sa.DateTime(timezone=True), server_default=now()) 20 | 21 | 22 | sa.Index( 23 | "ix_metadata_log_not_processed", 24 | MetadataLog.path, 25 | postgresql_where=(MetadataLog.rail_pm_processed == sa.false()), 26 | ) 27 | -------------------------------------------------------------------------------- /src/lamp_py/publishing/__init__.py: -------------------------------------------------------------------------------- 1 | """ Anything and Everything related to publicly publishing LAMP data """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/publishing/performancedata.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lamp_py.aws.s3 import upload_file 4 | from lamp_py.runtime_utils.remote_files import S3_PUBLIC 5 | 6 | 7 | def publish_performance_index() -> None: 8 | """ 9 | Upload index.html to https://performancedata.mbta.com bucket 10 | """ 11 | here = os.path.dirname(os.path.abspath(__file__)) 12 | index_file = "index.html" 13 | 14 | if "unset" in S3_PUBLIC: 15 | return 16 | 17 | local_index_path = os.path.join(here, index_file) 18 | upload_index_path = os.path.join(S3_PUBLIC, index_file) 19 | 20 | extra_args = { 21 | "ContentType": "text/html", 22 | } 23 | 24 | upload_file( 25 | file_name=local_index_path, 26 | object_path=upload_index_path, 27 | extra_args=extra_args, 28 | ) 29 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities used when running data pipelines """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/alembic_migration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | from alembic.config import Config 5 | from alembic import command 6 | 7 | 8 | def get_alembic_config(db_name: str) -> Config: 9 | """ 10 | get alembic configuration for specified db_name 11 | 12 | will raise NotImplementedError if db_name is not supported 13 | """ 14 | here = os.path.dirname(os.path.abspath(__file__)) 15 | alembic_cfg_file = os.path.join(here, "..", "..", "..", "alembic.ini") 16 | alembic_cfg_file = os.path.abspath(alembic_cfg_file) 17 | logging.info("getting alembic config for %s from %s", db_name, alembic_cfg_file) 18 | 19 | db_names = ( 20 | "performance_manager_dev", 21 | "performance_manager_staging", 22 | "performance_manager_prod", 23 | "metadata_dev", 24 | "metadata_staging", 25 | "metadata_prod", 26 | ) 27 | 28 | if db_name not in db_names: 29 | raise NotImplementedError(f"Migration for {db_name} not implemented.") 30 | 31 | return Config(alembic_cfg_file, ini_section=db_name) 32 | 33 | 34 | def alembic_upgrade_to_head(db_name: str) -> None: 35 | """ 36 | upgrade db_name to head revision 37 | """ 38 | # load alembic configuation for db_name 39 | alembic_cfg = get_alembic_config(db_name) 40 | 41 | command.upgrade(alembic_cfg, revision="head") 42 | 43 | 44 | def alembic_downgrade_to_base(db_name: str) -> None: 45 | """ 46 | downgrade db_name to base revision 47 | """ 48 | # load alembic configuation for db_name 49 | alembic_cfg = get_alembic_config(db_name) 50 | 51 | command.downgrade(alembic_cfg, revision="base") 52 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/env_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Iterable, List, Optional 3 | 4 | from lamp_py.runtime_utils.process_logger import ProcessLogger 5 | from lamp_py.__version__ import VERSION 6 | 7 | 8 | def validate_environment( 9 | required_variables: List[str], 10 | private_variables: Optional[List[str]] = None, 11 | optional_variables: Optional[List[str]] = None, 12 | db_prefixes: Iterable[str] = (), 13 | ) -> None: 14 | """ 15 | ensure that the environment has all the variables its required to have 16 | before starting triggering main, making certain errors easier to 
debug. 17 | """ 18 | process_logger = ProcessLogger("validate_env") 19 | process_logger.log_start() 20 | 21 | if private_variables is None: 22 | private_variables = [] 23 | 24 | metadata = {"lamp_version": VERSION} 25 | 26 | # every pipeline needs a service name for logging 27 | required_variables.append("SERVICE_NAME") 28 | 29 | # add required database variables 30 | for prefix in db_prefixes: 31 | required_variables += [ 32 | f"{prefix}_DB_HOST", 33 | f"{prefix}_DB_NAME", 34 | f"{prefix}_DB_PORT", 35 | f"{prefix}_DB_USER", 36 | ] 37 | # if db password is missing, db region is required to generate a 38 | # token to use as the password to the cloud database 39 | if os.environ.get(f"{prefix}_DB_PASSWORD", None) is None: 40 | required_variables.append("DB_REGION") 41 | 42 | # check for missing variables. add found variables to our logs. 43 | missing_required = [] 44 | for key in required_variables: 45 | value = os.environ.get(key, None) 46 | if value is None: 47 | missing_required.append(key) 48 | 49 | # do not log private variables 50 | if key in private_variables: 51 | value = "**********" 52 | metadata[key] = value 53 | 54 | # for optional variables, access ones that exist and add them to logs. 55 | if optional_variables: 56 | for key in optional_variables: 57 | value = os.environ.get(key, None) 58 | if value is not None: 59 | # do not log private variables 60 | if key in private_variables: 61 | value = "**********" 62 | metadata[key] = value 63 | 64 | process_logger.add_metadata(**metadata) 65 | 66 | # if required variables are missing, log a failure and throw. 67 | if missing_required: 68 | exception = EnvironmentError(f"Missing required environment variables {missing_required}") 69 | process_logger.log_failure(exception) 70 | raise exception 71 | 72 | process_logger.log_complete() 73 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/infinite_wait.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from lamp_py.aws.ecs import check_for_sigterm 5 | 6 | 7 | def infinite_wait(reason: str) -> None: 8 | """ 9 | When running on ECS, propagating an exception up the call stack and killing 10 | the processes will result in the process being restarted, to keep the task 11 | count at one. This method should be called instead when we want to pause 12 | the process for intervention before restarting. 
13 | """ 14 | # amount of time to sleep between logging statements 15 | sleep_time = 60 16 | count = 0 17 | 18 | while True: 19 | check_for_sigterm() 20 | 21 | # log every ten minutes 22 | if count == 10: 23 | logging.error("Pausing for %s", reason) 24 | count = 0 25 | 26 | # sleep 27 | time.sleep(sleep_time) 28 | count += 1 29 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/lamp_exception.py: -------------------------------------------------------------------------------- 1 | class GTFSIngestException(Exception): 2 | """ 3 | Generic exception for the py gtfs_rt_ingestion library 4 | """ 5 | 6 | 7 | class ConfigTypeFromFilenameException(GTFSIngestException): 8 | """ 9 | Unable to derive config type from a filename 10 | """ 11 | 12 | def __init__(self, filename: str): 13 | message = f"Unable to deduce Configuration Type from {filename}" 14 | super().__init__(message) 15 | self.filename = filename 16 | 17 | 18 | class ArgumentException(GTFSIngestException): 19 | """ 20 | General Error to throw when incoming events are malformed 21 | """ 22 | 23 | 24 | class NoImplException(GTFSIngestException): 25 | """ 26 | General Error for things LAMP hasn't implemented yet 27 | """ 28 | 29 | 30 | class IgnoreIngestion(GTFSIngestException): 31 | """ 32 | General Error for files GTFS Ingestion should ignore 33 | """ 34 | 35 | 36 | class AWSException(GTFSIngestException): 37 | """ 38 | General Error for raising with any AWS errors encountered. 39 | """ 40 | 41 | 42 | class LampExpectedNotFoundError(Exception): 43 | """ 44 | Exception raised when expected inputs are not available 45 | """ 46 | 47 | 48 | class LampInvalidProcessingError(Exception): 49 | """ 50 | Exception raised when invalid processing state is reached with inputs 51 | """ 52 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/README.md: -------------------------------------------------------------------------------- 1 | # Tableau Publisher 2 | 3 | The Tableau Publisher is an application that takes data created by the Rail Performance Manager application as parquet files and publishes them to the ITD Managed Tableau Instance as hyper files. 4 | 5 | ## Application Operation 6 | 7 | The application itself is run via a CloudWatch event that is set to trigger on a cron-like schedule. 8 | 9 | On each run, it iterates through a list of jobs that generate hyper files and upload them to the ITD Tableau server, where they can be used to generate dashboards and reports for external users. To generate a hyper file, the job reads a parquet file that has been created by upstream LAMP applications and converts it using the [Tableau Hyper API](https://www.tableau.com/developer/tools/hyper-api). The file is generated on local storage, and then uploaded to the ITD Managed Tableau server using the [Tableau Server Client](https://tableau.github.io/server-client-python/), a Python library wrapping the [Tableau REST API](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api.htm). 10 | 11 | ### Upstream Applications 12 | 13 | To simplify the conversion from parquet to hyper, the schemas for both are defined within this module. We also store the hardcoded S3 filepaths. Because of this, components of this library are used by other applications when writing the parquet files. 14 | 15 | ## Developer Note 16 | 17 | The Tableau Hyper API is not currently supported on Apple Silicon.
This means that local execution on Mac OSX with arm64 processors will not work without emulation. In light of that, imports from this directory will trigger `ModuleNotFound` exceptions if running on the wrong system. To avoid that, the `__init__.py` file includes a wrapper around components that are consumed by other applications. These functions will log an error when run without the desired dependencies. 18 | 19 | ### Installation without Tableau dependencies 20 | 21 | In `pyproject.toml`, there is an additional dependency group that contains the tableau dependencies. It is not marked optional, so these modules will be installed with `poetry install`. If you are on an arm64 architecture, you can avoid installing the tableau dependencies with `poetry install --without tableau`. This behavior is encoded in the `.envrc`, `docker-compose.yml`, and `Dockerfile` files in this repository, so you should get the desired behavior without additional arguments. 22 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities for Interacting with Tableau and Hyper files""" 2 | 3 | import logging 4 | from types import ModuleType 5 | from typing import Optional 6 | 7 | from lamp_py.postgres.postgres_utils import DatabaseManager 8 | 9 | # pylint: disable=C0103 (invalid-name) 10 | # pylint wants pipeline to conform to an UPPER_CASE constant naming style. its 11 | # a module though, so disabling to allow it to use normal import rules. 12 | pipeline: Optional[ModuleType] 13 | 14 | try: 15 | from . import pipeline 16 | except ModuleNotFoundError: 17 | pipeline = None 18 | 19 | # pylint: enable=C0103 (invalid-name) 20 | 21 | 22 | def start_parquet_updates(db_manager: DatabaseManager) -> None: 23 | """ 24 | wrapper around pipeline.start_parquet_updates function. if a module not 25 | found error occurs (which happens when using osx arm64 dependencies), log 26 | an error and do nothing. else, run the function. 27 | """ 28 | if pipeline is None: 29 | logging.error("Unable to run parquet files on this machine due to Module Not Found error") 30 | else: 31 | pipeline.start_parquet_updates(db_manager=db_manager) 32 | 33 | 34 | def clean_parquet_paths() -> None: 35 | """ 36 | wrapper around pipeline.clean_parquet_paths function. if a module not 37 | found error occurs (which happens when using osx arm64 dependencies), log 38 | an error and do nothing. else, run the function.
39 | """ 40 | if pipeline is None: 41 | logging.error("Unable to run parquet files on this machine due to Module Not Found error") 42 | else: 43 | pipeline.clean_parquet_paths() 44 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/conversions/convert_bus_performance_data.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from pyarrow import Table 3 | 4 | 5 | def apply_bus_analysis_conversions(polars_df: pl.DataFrame) -> Table: 6 | """ 7 | Function to apply final conversions to lamp data before outputting for tableau consumption 8 | """ 9 | # Convert datetime to Eastern Time 10 | polars_df = polars_df.with_columns( 11 | pl.col("stop_arrival_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 12 | pl.col("stop_departure_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 13 | pl.col("gtfs_travel_to_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 14 | pl.col("tm_scheduled_time_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 15 | pl.col("tm_actual_arrival_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 16 | pl.col("tm_actual_departure_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 17 | pl.col("gtfs_sort_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 18 | pl.col("gtfs_departure_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 19 | pl.col("gtfs_arrival_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 20 | ) 21 | 22 | # Convert seconds columns to be aligned with Eastern Time 23 | polars_df = polars_df.with_columns( 24 | (pl.col("gtfs_travel_to_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d")) 25 | .dt.total_seconds() 26 | .alias("gtfs_travel_to_seconds"), 27 | (pl.col("stop_arrival_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d")) 28 | .dt.total_seconds() 29 | .alias("stop_arrival_seconds"), 30 | (pl.col("stop_departure_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d")) 31 | .dt.total_seconds() 32 | .alias("stop_departure_seconds"), 33 | ) 34 | 35 | polars_df = polars_df.with_columns(pl.col("service_date").str.strptime(pl.Date, "%Y%m%d", strict=False)) 36 | 37 | return polars_df.to_arrow() 38 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/jobs/rt_alerts.py: -------------------------------------------------------------------------------- 1 | import pyarrow 2 | 3 | from lamp_py.aws.s3 import download_file 4 | from lamp_py.performance_manager.alerts import AlertsS3Info 5 | from lamp_py.postgres.postgres_utils import DatabaseManager 6 | from lamp_py.tableau.hyper import HyperJob 7 | 8 | 9 | class HyperRtAlerts(HyperJob): 10 | """HyperJob for LAMP Alerts dataset""" 11 | 12 | def __init__(self) -> None: 13 | HyperJob.__init__( 14 | self, 15 | hyper_file_name="LAMP_ALERTS.hyper", 16 | remote_parquet_path=AlertsS3Info.s3_path, 17 | lamp_version=AlertsS3Info.file_version, 18 | ) 19 | 20 | @property 21 | def parquet_schema(self) -> pyarrow.schema: 22 | return AlertsS3Info.parquet_schema 23 | 24 | def create_parquet(self, _: DatabaseManager) -> None: 25 | raise NotImplementedError("Alerts Hyper Job does not create parquet file") 26 | 27 | def update_parquet(self, _: DatabaseManager) -> 
bool: 28 | download_file( 29 | object_path=self.remote_parquet_path, 30 | file_name=self.local_parquet_path, 31 | ) 32 | return False 33 | -------------------------------------------------------------------------------- /src/lamp_py/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/src/lamp_py/utils/__init__.py -------------------------------------------------------------------------------- /src/lamp_py/utils/clear_folder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def clear_folder(folder: str) -> None: 6 | """ 7 | Delete contents of entire folder. 8 | """ 9 | for filename in os.listdir(folder): 10 | file_path = os.path.join(folder, filename) 11 | try: 12 | if os.path.isfile(file_path) or os.path.islink(file_path): 13 | os.unlink(file_path) 14 | elif os.path.isdir(file_path): 15 | shutil.rmtree(file_path) 16 | except Exception as _: # best-effort cleanup: ignore paths that cannot be removed 17 | pass 18 | -------------------------------------------------------------------------------- /src/lamp_py/utils/date_range_builder.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | 4 | # Build a list of dated file paths from a path template 5 | def build_data_range_paths(template_string: str, start_date: datetime, end_date: datetime) -> list[str]: 6 | """ 7 | Given a format-string template, fill in the {yy}/{mm}/{dd} placeholders with all the days between 8 | start_date and end_date (inclusive) and return the result as a list of strings 9 | """ 10 | 11 | # add 1 for inclusive 12 | date_diff_days = (start_date - end_date).days * -1 + 1 13 | 14 | date_paths = [] 15 | # walk one day at a time from start_date through end_date 16 | for i in range(0, date_diff_days): 17 | tmp = start_date + timedelta(days=i) 18 | 19 | # wrong format - good for delta though 20 | # prefix_date_part = f"{yy}/{mm:02d}/{dd:02d}" 21 | 22 | # prefix_date_part = f"year={yy}/month={mm}/day={dd}/" 23 | # prefix_whole_path = f"year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 24 | 25 | formatted = template_string.format(yy=tmp.year, mm=tmp.month, dd=tmp.day) 26 | date_paths.append(formatted) 27 | return date_paths 28 | -------------------------------------------------------------------------------- /src/lamp_py/utils/gtfs_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from typing import List 3 | import polars as pl 4 | 5 | from lamp_py.aws.s3 import object_exists 6 | from lamp_py.runtime_utils.process_logger import ProcessLogger 7 | from lamp_py.runtime_utils.remote_files import compressed_gtfs 8 | 9 | 10 | def gtfs_from_parquet(file: str, service_date: date) -> pl.DataFrame: 11 | """ 12 | Get GTFS data from specified file and service date 13 | 14 | This will read from s3_uri of file 15 | 16 | :param file: gtfs file to access (i.e. 
"feed_info") 17 | :param service_date: service date of requested GTFS data 18 | 19 | :return dataframe: 20 | data columns of parquet file for service_date 21 | """ 22 | logger = ProcessLogger("gtfs_from_parquet", file=file, service_date=service_date) 23 | logger.log_start() 24 | 25 | gtfs_year = service_date.year 26 | service_date_int = int(service_date.strftime("%Y%m%d")) 27 | 28 | gtfs_file = compressed_gtfs.parquet_path(gtfs_year, file).s3_uri 29 | 30 | if not object_exists(gtfs_file): 31 | gtfs_file = compressed_gtfs.parquet_path(gtfs_year - 1, file).s3_uri 32 | if not object_exists(gtfs_file): 33 | exception = FileNotFoundError(f"No GTFS archive files available for {service_date}") 34 | logger.log_failure(exception) 35 | raise exception 36 | 37 | logger.add_metadata(gtfs_file=gtfs_file) 38 | 39 | gtfs_df = ( 40 | pl.read_parquet(gtfs_file) 41 | .filter( 42 | (pl.col("gtfs_active_date") <= service_date_int), 43 | (pl.col("gtfs_end_date") >= service_date_int), 44 | ) 45 | .drop(["gtfs_active_date", "gtfs_end_date"]) 46 | ) 47 | logger.add_metadata(gtfs_row_count=gtfs_df.shape[0]) 48 | logger.log_complete() 49 | return gtfs_df 50 | 51 | 52 | def bus_route_ids_for_service_date(service_date: date) -> List[str]: 53 | """get a list of bus route ids for a given service date""" 54 | bus_routes = ( 55 | gtfs_from_parquet("routes", service_date).filter((pl.col("route_type") == 3)).get_column("route_id").unique() 56 | ) 57 | 58 | return bus_routes.to_list() 59 | 60 | 61 | def routes_for_service_date(service_date: date) -> pl.DataFrame: 62 | """get a list of all routes for a given service date""" 63 | routes = gtfs_from_parquet("routes", service_date) 64 | 65 | return routes 66 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/__init__.py -------------------------------------------------------------------------------- /tests/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/aws/__init__.py -------------------------------------------------------------------------------- /tests/bus_performance_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/bus_performance_manager/__init__.py -------------------------------------------------------------------------------- /tests/bus_performance_manager/test_bus_convert_for_tableau.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import polars as pl 4 | import pytest 5 | 6 | from lamp_py.tableau.conversions.convert_bus_performance_data import apply_bus_analysis_conversions 7 | 8 | 9 | # poetry run pytest -s tests/bus_performance_manager/test_bus_convert_for_tableau.py 10 | @pytest.mark.skip("temp skip - re-enable asap - need new data - Jun 2025") 11 | def test_apply_bus_analysis_conversions() -> None: 12 | """ 13 | Test extracted conversions for tableau user view 14 | """ 15 | df = pl.read_parquet("tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet") 16 | table = apply_bus_analysis_conversions(polars_df=df) 17 | print(df) 18 | print(table) 19 | 
-------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | this file contains fixtures that are intended to be used across multiple test 3 | files 4 | """ 5 | 6 | from typing import ( 7 | Iterator, 8 | List, 9 | Optional, 10 | Union, 11 | ) 12 | 13 | import pytest 14 | from _pytest.monkeypatch import MonkeyPatch 15 | from pyarrow import fs 16 | import pyarrow.dataset as pd 17 | 18 | from .test_resources import LocalS3Location 19 | 20 | 21 | @pytest.fixture(autouse=True, name="get_pyarrow_dataset_patch") 22 | def fixture_get_pyarrow_dataset_patch( 23 | monkeypatch: MonkeyPatch, 24 | ) -> Iterator[None]: 25 | """ 26 | the aws.s3 `_get_pyarrow_dataset` function reads parquet files from 27 | s3 and returns a pyarrow dataset. when testing on our github machines, we 28 | don't have access to s3, so all tests must be run against local files. 29 | monkeypatch the function to read from a local filepath. 30 | """ 31 | 32 | def mock__get_pyarrow_dataset( 33 | filename: Union[str, List[str]], 34 | filters: Optional[pd.Expression] = None, 35 | ) -> pd.Dataset: 36 | active_fs = fs.LocalFileSystem() 37 | 38 | if isinstance(filename, list): 39 | to_load = filename 40 | else: 41 | to_load = [filename] 42 | 43 | if len(to_load) == 0: 44 | return pd.dataset([]) 45 | 46 | ds = pd.dataset(to_load, filesystem=active_fs, partitioning="hive") 47 | if filters is not None: 48 | ds = ds.filter(filters) 49 | 50 | return ds 51 | 52 | monkeypatch.setattr("lamp_py.aws.s3._get_pyarrow_dataset", mock__get_pyarrow_dataset) 53 | 54 | yield 55 | 56 | 57 | @pytest.fixture(autouse=True, name="remote_file_locations_patch") 58 | def fixture_remote_file_locations_patch( 59 | monkeypatch: MonkeyPatch, 60 | ) -> Iterator[None]: 61 | """ 62 | We define S3 Filepaths in the S3Location class in remote_files.py 63 | that can be used in our different applications. When testing on github, we 64 | don't have access to s3, so tests need to be run against local files. Use 65 | monkeypatch to redefine how these utilities work. 
66 | """ 67 | monkeypatch.setattr("lamp_py.runtime_utils.remote_files.S3Location", LocalS3Location) 68 | 69 | yield 70 | -------------------------------------------------------------------------------- /tests/ingestion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/ingestion/__init__.py -------------------------------------------------------------------------------- /tests/ingestion/test_configuration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lamp_py.ingestion.converter import ConfigType 4 | from lamp_py.runtime_utils.lamp_exception import ConfigTypeFromFilenameException 5 | 6 | UPDATE_FILENAME = "2022-01-01T00:00:02Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz" 7 | 8 | VEHICLE_POSITIONS_FILENAME = "2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz" 9 | 10 | ALERTS_FILENAME = "2022-01-01T00:00:38Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz" 11 | 12 | 13 | def test_filname_parsing() -> None: 14 | """ 15 | Check that we are able to get the correct Configuration type for multiple 16 | filenames 17 | """ 18 | trip_updates_type = ConfigType.from_filename(UPDATE_FILENAME) 19 | assert trip_updates_type == ConfigType.RT_TRIP_UPDATES 20 | 21 | vehicle_positions_type = ConfigType.from_filename(VEHICLE_POSITIONS_FILENAME) 22 | assert vehicle_positions_type == ConfigType.RT_VEHICLE_POSITIONS 23 | 24 | alerts_type = ConfigType.from_filename(ALERTS_FILENAME) 25 | assert alerts_type == ConfigType.RT_ALERTS 26 | 27 | with pytest.raises(ConfigTypeFromFilenameException): 28 | ConfigType.from_filename("this.is.a.bad.filename.json.gz") 29 | -------------------------------------------------------------------------------- /tests/ingestion/test_ingest.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=[W0621, W0611] 2 | # disable these warnings that are triggered by pylint not understanding how test 3 | # fixtures work. https://stackoverflow.com/q/59664605 4 | 5 | import os 6 | from queue import Queue 7 | import pytest 8 | 9 | from lamp_py.ingestion.converter import ConfigType 10 | from lamp_py.runtime_utils.lamp_exception import NoImplException 11 | from lamp_py.runtime_utils.lamp_exception import IgnoreIngestion 12 | from lamp_py.ingestion.convert_gtfs_rt import GtfsRtConverter 13 | 14 | 15 | TEST_FILE_DIR = os.path.join(os.path.dirname(__file__), "test_files") 16 | 17 | 18 | def test_each_config_type() -> None: 19 | """ 20 | Test that each config type maps to a converter instance and that they map 21 | correctly. 
22 | """ 23 | config_type_map = { 24 | ConfigType.RT_ALERTS: GtfsRtConverter, 25 | ConfigType.RT_TRIP_UPDATES: GtfsRtConverter, 26 | ConfigType.RT_VEHICLE_POSITIONS: GtfsRtConverter, 27 | ConfigType.BUS_TRIP_UPDATES: GtfsRtConverter, 28 | ConfigType.BUS_VEHICLE_POSITIONS: GtfsRtConverter, 29 | } 30 | for config_type, converter_type in config_type_map.items(): 31 | converter = GtfsRtConverter(config_type, Queue()) 32 | assert isinstance(converter, converter_type) 33 | 34 | bad_config_types = [ 35 | ConfigType.VEHICLE_COUNT, 36 | ConfigType.ERROR, 37 | ConfigType.SCHEDULE, 38 | ] 39 | 40 | for config_type in bad_config_types: 41 | with pytest.raises(NoImplException): 42 | converter = GtfsRtConverter(config_type, Queue()) 43 | 44 | # with pytest.raises(IgnoreIngestion): 45 | # converter = GtfsRtConverter(ConfigType.LIGHT_RAIL, Queue()) 46 | -------------------------------------------------------------------------------- /tests/ingestion/test_light_rail_gps.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lamp_py.ingestion.light_rail_gps import raw_gps_schema 4 | from lamp_py.ingestion.light_rail_gps import dataframe_from_gz 5 | 6 | from ..test_resources import test_files_dir 7 | 8 | mock_file_list = [ 9 | os.path.join( 10 | test_files_dir, 11 | "INCOMING/2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz", 12 | ) 13 | ] 14 | 15 | 16 | def test_light_rail_gps() -> None: 17 | """ 18 | test gtfs_events_for_date pipeline 19 | """ 20 | dataframe, archive_files, error_files = dataframe_from_gz(mock_file_list) 21 | 22 | assert len(archive_files) == 1 23 | 24 | assert len(error_files) == 0 25 | 26 | assert dataframe.schema == raw_gps_schema 27 | 28 | assert dataframe.shape[0] == 190 29 | -------------------------------------------------------------------------------- /tests/ingestion_tm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/ingestion_tm/__init__.py -------------------------------------------------------------------------------- /tests/ingestion_tm/test_ingest.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Set, List 3 | 4 | from lamp_py.ingestion_tm.tm_export import TMExport 5 | from lamp_py.ingestion_tm.ingest import get_ingestion_jobs 6 | 7 | 8 | def get_tm_export_subclasses( 9 | cls: type[TMExport] = TMExport, 10 | ) -> Set[type[TMExport]]: 11 | """ 12 | recursively get all of the concrete TMExport child classes 13 | """ 14 | subclasses: List[type[TMExport]] = [] 15 | for subclass in cls.__subclasses__(): 16 | if inspect.isabstract(subclass): 17 | subclasses += get_tm_export_subclasses(subclass) 18 | else: 19 | subclasses.append(subclass) 20 | 21 | return set(subclasses) 22 | 23 | 24 | def test_ingestion_job_count() -> None: 25 | """ 26 | test that the ingestion pipeline is aware of each tm export class 27 | """ 28 | # get all of the jobs run in ingestion, assert its not empty 29 | ingestion_jobs = get_ingestion_jobs() 30 | job_types = {type(job) for job in ingestion_jobs} 31 | assert job_types 32 | 33 | # get all potential jobs based on subclasses. 
assert its not empty 34 | all_job_types = get_tm_export_subclasses() 35 | assert all_job_types 36 | 37 | # ensure all job types are accounted for in ingestion 38 | assert all_job_types == job_types, f"Missing instances for subclasses: {all_job_types - job_types}" 39 | -------------------------------------------------------------------------------- /tests/performance_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/performance_manager/__init__.py -------------------------------------------------------------------------------- /tests/performance_manager/test_backup_trips_match.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | import polars as pl 3 | from lamp_py.performance_manager.l1_cte_statements import static_trips_subquery_pl 4 | from lamp_py.performance_manager.l1_rt_trips import backup_trips_match_pl 5 | 6 | 7 | @patch( 8 | "lamp_py.performance_manager.l1_cte_statements.GTFS_ARCHIVE", "https://performancedata.mbta.com/lamp/gtfs_archive" 9 | ) 10 | def test_backup_trips_match() -> None: 11 | """ 12 | test backup_trips_match 13 | """ 14 | # ┌─────────────────────────┬──────────────┬───────────────────┬───────────────────┬────────────────┐ 15 | # │ static_trip_id ┆ direction_id ┆ static_stop_count ┆ static_start_time ┆ route_id │ 16 | # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ 17 | # │ str ┆ i64 ┆ u32 ┆ str ┆ str │ 18 | # ╞═════════════════════════╪══════════════╪═══════════════════╪═══════════════════╪════════════════╡ 19 | rt_trips_raw = pl.read_csv( 20 | "tests/test_files/replace_perf_mgr_query_test_data/20250415_rt_trips_for_backup_match_subquery.csv", 21 | infer_schema=False, 22 | ) 23 | rt_trips = rt_trips_raw.with_columns( 24 | pl.when(pl.col("direction_id") == "f").then(pl.lit(False)).otherwise(pl.lit(True)).alias("direction_id"), 25 | pl.col("start_time").cast(pl.Int32).alias("start_time"), 26 | ) 27 | 28 | static_trips = static_trips_subquery_pl(20250415) 29 | backup_matched_trips = backup_trips_match_pl(rt_trips, static_trips) 30 | 31 | assert backup_matched_trips.height == 1299 32 | -------------------------------------------------------------------------------- /tests/performance_manager/test_l0_gtfs_rt_events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | from lamp_py.performance_manager.l0_rt_vehicle_positions import ( 5 | get_vp_dataframe, 6 | transform_vp_datatypes, 7 | ) 8 | from lamp_py.performance_manager.l0_rt_trip_updates import ( 9 | get_and_unwrap_tu_dataframe, 10 | ) 11 | from lamp_py.performance_manager.gtfs_utils import ( 12 | add_missing_service_dates, 13 | service_date_from_timestamp, 14 | ) 15 | 16 | from ..test_resources import test_files_dir, csv_to_vp_parquet 17 | 18 | 19 | def test_service_date_from_timestamp() -> None: 20 | """ 21 | test that the service date from timestamp function correctly handles 22 | timestamps around the threshold when the service date switches over. 
23 | """ 24 | dst_expected = { 25 | # dst started on 8 march 2020, the clock goes from 1:59 -> 3:00 26 | 20200307: [ 27 | 1583650200, # 1:50 am 28 | 1583650740, # 1:59 am 29 | 1583650799, # 1:59:59 am 30 | ], 31 | 20200308: [ 32 | 1583650800, # 3:00 am 33 | 1583651400, # 3:10 am 34 | ], 35 | # dst ended on 1 nov 2020, the clock goes from 2:00 -> 1:00 36 | 20201031: [ 37 | 1604209800, # 1:50 am 38 | 1604210340, # 1:59 am 39 | 1604210399, # 1:59:59 am 40 | 1604210400, # 1:00 am (second time) 41 | 1604214000, # 2:00 am 42 | 1604214000, # 2:00 am 43 | 1604217000, # 2:50 am 44 | 1604217540, # 2:59 am 45 | 1604217599, # 2:59:59 am 46 | ], 47 | 20201101: [ 48 | 1604217600, # 3:00 am 49 | 1604218200, # 3:10 am 50 | ], 51 | } 52 | 53 | for service_date, timestamps in dst_expected.items(): 54 | for timestamp in timestamps: 55 | assert service_date == service_date_from_timestamp(timestamp) 56 | 57 | 58 | def test_vp_missing_service_date(tmp_path: pathlib.Path) -> None: 59 | """ 60 | test that missing service dates in gtfs-rt vehicle position files can be 61 | correctly backfilled. 62 | """ 63 | csv_file = os.path.join(test_files_dir, "vp_missing_start_date.csv") 64 | 65 | parquet_folder = tmp_path.joinpath("RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=11") 66 | parquet_folder.mkdir(parents=True) 67 | parquet_file = str(parquet_folder.joinpath("flat_file.parquet")) 68 | 69 | csv_to_vp_parquet(csv_file, parquet_file) 70 | 71 | events = get_vp_dataframe(to_load=[parquet_file], route_ids=["Blue"]) 72 | events = transform_vp_datatypes(events) 73 | 74 | # ensure that there are NaN service dates 75 | assert events["service_date"].hasnans 76 | 77 | # add the service dates that are missing 78 | events = add_missing_service_dates(events, timestamp_key="vehicle_timestamp") 79 | 80 | # check that new service dates match existing and are numbers 81 | assert len(events["service_date"].unique()) == 1 82 | assert not events["service_date"].hasnans 83 | 84 | 85 | def test_tu_missing_service_date() -> None: 86 | """ 87 | test that trip update gtfs data with missing service dates can be processed 88 | correctly. 
89 | """ 90 | parquet_file = os.path.join(test_files_dir, "tu_missing_start_date.parquet") 91 | events = get_and_unwrap_tu_dataframe([parquet_file], route_ids=["Blue"]) 92 | 93 | # check that NaN service dates exist from reading the file 94 | assert events["service_date"].hasnans 95 | 96 | events = add_missing_service_dates(events_dataframe=events, timestamp_key="timestamp") 97 | 98 | # check that all service dates exist and are the same 99 | assert not events["service_date"].hasnans 100 | assert len(events["service_date"].unique()) == 1 101 | -------------------------------------------------------------------------------- /tests/performance_manager/test_static_trips_subquery.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | import polars as pl 3 | from polars.testing import assert_frame_equal 4 | from lamp_py.performance_manager.l1_cte_statements import static_trips_subquery_pl 5 | 6 | 7 | @patch( 8 | "lamp_py.performance_manager.l1_cte_statements.GTFS_ARCHIVE", "https://performancedata.mbta.com/lamp/gtfs_archive" 9 | ) 10 | def test_static_trips_subquery_pl() -> None: 11 | """ 12 | Passing unit test for static_trips_subquery implementation in polars/parquet 13 | """ 14 | 15 | static_trips_pl = static_trips_subquery_pl(20250410).sort(by="static_trip_id") 16 | 17 | compare_sql = pl.read_csv( 18 | "tests/test_files/replace_perf_mgr_query_test_data/staging_test_summary_sub.csv", infer_schema=False 19 | ) 20 | 21 | # need to do a few things because the csv output doesn't do types well 22 | static_trips_sql = compare_sql.with_columns( 23 | pl.col("static_stop_count").cast(pl.Int16), 24 | pl.col("static_start_time").cast(pl.Int32), 25 | pl.when(pl.col("direction_id") == "f").then(pl.lit(False)).otherwise(pl.lit(True)).alias("direction_id"), 26 | ) 27 | 28 | # assert against test csv for all rows 29 | assert_frame_equal(static_trips_pl, static_trips_sql, check_column_order=False) 30 | -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-05-04T15:59:48Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-05-04T15:59:48Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-05-05T16_00_15Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_VehiclePositions_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-05-05T16_00_15Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_VehiclePositions_enhanced.json.gz -------------------------------------------------------------------------------- 
/tests/test_files/INCOMING/2022-05-08T06:04:57Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-05-08T06:04:57Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-06-28T10_03_18Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_TripUpdates_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-06-28T10_03_18Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_TripUpdates_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-07-05T12:35:16Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-07-05T12:35:16Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/MBTA_GTFS.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/MBTA_GTFS.zip -------------------------------------------------------------------------------- /tests/test_files/INCOMING/empty.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/empty.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/one_blank_record.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/one_blank_record.json.gz -------------------------------------------------------------------------------- /tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet -------------------------------------------------------------------------------- /tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2023/routes.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2023/routes.parquet -------------------------------------------------------------------------------- /tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2024/routes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2024/routes.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/CALENDAR/timestamp=1682375024/f18c9f5747194660a793cf0cd6f9df90-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/CALENDAR/timestamp=1682375024/f18c9f5747194660a793cf0cd6f9df90-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/CALENDAR_DATES/timestamp=1682375024/7c0b0da47e284237a7b50df57e3ef33c-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/CALENDAR_DATES/timestamp=1682375024/7c0b0da47e284237a7b50df57e3ef33c-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/DIRECTIONS/timestamp=1682375024/562949d9931149f8a5d8f0cb2eb52c80-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/DIRECTIONS/timestamp=1682375024/562949d9931149f8a5d8f0cb2eb52c80-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/FEED_INFO/timestamp=1682375024/e84307ae774a4d8c8968c5e38e7affdc-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/FEED_INFO/timestamp=1682375024/e84307ae774a4d8c8968c5e38e7affdc-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/ROUTES/timestamp=1682375024/b4e038eb63da41fcb66eed81548f664a-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/ROUTES/timestamp=1682375024/b4e038eb63da41fcb66eed81548f664a-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/ROUTE_PATTERNS/timestamp=1682375024/57233d3677484fe1bd0373749c34cc63-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/ROUTE_PATTERNS/timestamp=1682375024/57233d3677484fe1bd0373749c34cc63-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_ALERTS/year=2020/month=2/day=9/hour=1/6ef6922c20064cb9a8f09a3b3b1d2783-0.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_ALERTS/year=2020/month=2/day=9/hour=1/6ef6922c20064cb9a8f09a3b3b1d2783-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=12/8e2c182968e24ecea3d37f03d6bae84d-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=12/8e2c182968e24ecea3d37f03d6bae84d-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=13/eaeee968b94b4a74b166df4b8ffd9f29-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=13/eaeee968b94b4a74b166df4b8ffd9f29-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=12/1613b49e4fa1459eabe9c83553ef1045-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=12/1613b49e4fa1459eabe9c83553ef1045-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=13/9a1bb1c5269042a284b2ed57b4dfebb9-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=13/9a1bb1c5269042a284b2ed57b4dfebb9-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=12/fcf91fbba92d418aa136d928c6243121-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=12/fcf91fbba92d418aa136d928c6243121-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=13/47ffb78637a5400aabdfd7c9c7142757-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=13/47ffb78637a5400aabdfd7c9c7142757-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/STOPS/timestamp=1682375024/920a42ad1b5e4ef0942c7a1bc2ef2fea-0.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/STOPS/timestamp=1682375024/920a42ad1b5e4ef0942c7a1bc2ef2fea-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/STOP_TIMES/timestamp=1682375024/88c016320de440789357f14df6399d4c-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/STOP_TIMES/timestamp=1682375024/88c016320de440789357f14df6399d4c-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240601.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240601.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240811.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240811.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_GEO_NODE.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_GEO_NODE.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_ROUTE.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_ROUTE.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_TRIP.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_TRIP.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_VEHICLE.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_VEHICLE.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TRIPS/timestamp=1682375024/cdca1ec8575c4705bb93bc76244c1a86-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TRIPS/timestamp=1682375024/cdca1ec8575c4705bb93bc76244c1a86-0.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_BUSLOC_TU.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_BUSLOC_TU.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_BUSLOC_VP.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_BUSLOC_VP.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_ALERT.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_ALERT.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_TU.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_TU.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_TU_OLD.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_TU_OLD.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_VP.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_VP.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_VP_OLD.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_VP_OLD.parquet -------------------------------------------------------------------------------- /tests/test_files/short_list.json: -------------------------------------------------------------------------------- 1 | [ 2 | "mbta-ctd-dataplatform-dev-springboard/lamp/RT_VEHICLE_POSITIONS/year=2022/month=7/day=20/hour=10/c7be65cb26f04b9c86874a8b40195a72-0.parquet", 3 | "mbta-ctd-dataplatform-dev-springboard/lamp/RT_TRIP_UPDATES/year=2022/month=7/day=20/hour=10/8dee9a06766042fb8adeb2fa2b999c1a-0.parquet", 4 | "mbta-ctd-dataplatform-dev-springboard/lamp/FEED_INFO/timestamp=1668795415/dd9e9eb9f5d746a8ad3a0c0dbc73c521-0.parquet" 5 | ] -------------------------------------------------------------------------------- /tests/test_files/tu_missing_start_date.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/tu_missing_start_date.parquet -------------------------------------------------------------------------------- /tests/test_files/vp_missing_start_date.csv: -------------------------------------------------------------------------------- 1 | 
vehicle.current_status,vehicle.current_stop_sequence,vehicle.stop_id,vehicle.timestamp,vehicle.trip.direction_id,vehicle.trip.route_id,vehicle.trip.start_date,vehicle.trip.start_time,vehicle.vehicle.id,vehicle.trip.trip_id,vehicle.vehicle.label,vehicle.vehicle.consist,vehicle.multi_carriage_details 2 | STOPPED_AT,1,70059,1683547153,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 3 | INCOMING_AT,10,70057,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 4 | STOPPED_AT,10,70057,1683547246,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 5 | INCOMING_AT,20,70055,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 6 | IN_TRANSIT_TO,20,70055,1683547299,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 7 | STOPPED_AT,20,70055,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 8 | INCOMING_AT,30,70053,1683547429,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 9 | STOPPED_AT,30,70053,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 10 | INCOMING_AT,40,70051,1683547531,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 11 | STOPPED_AT,40,70051,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 12 | IN_TRANSIT_TO,50,70049,1683547652,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 13 | INCOMING_AT,50,70049,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 14 | STOPPED_AT,50,70049,1683547786,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 15 | IN_TRANSIT_TO,60,70047,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 16 | INCOMING_AT,60,70047,1683547846,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 17 | STOPPED_AT,60,70047,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 18 | IN_TRANSIT_TO,70,70045,1683547970,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 19 | INCOMING_AT,70,70045,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 20 | STOPPED_AT,70,70045,1683548133,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 21 | -------------------------------------------------------------------------------- /tests/test_resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | import pyarrow 5 | from pyarrow import csv, parquet 6 | 7 | from lamp_py.runtime_utils.remote_files import ( 8 | S3_SPRINGBOARD, 9 | S3_INCOMING, 10 | ) 11 | 12 | test_files_dir = os.path.join(os.path.dirname(__file__), "test_files") 13 | 14 | 15 | def csv_to_vp_parquet(csv_filepath: str, parquet_filepath: str) -> None: 16 | """ 17 | read vehicle position data in csv format and write it to a parquet file 18 | """ 19 | vp_csv_options = csv.ConvertOptions( 20 | column_types={ 21 | "vehicle.current_status": pyarrow.string(), 22 | "vehicle.current_stop_sequence": pyarrow.uint32(), 23 | "vehicle.stop_id": pyarrow.string(), 24 | "vehicle.timestamp": pyarrow.uint64(), 25 | "vehicle.trip.direction_id": pyarrow.uint8(), 26 | "vehicle.trip.route_id": pyarrow.string(), 27 | "vehicle.trip.trip_id": pyarrow.string(), 28 | "vehicle.trip.start_date": pyarrow.string(), 29 | "vehicle.trip.start_time": pyarrow.string(), 30 | "vehicle.vehicle.id": pyarrow.string(), 31 | "vehicle.vehicle.consist": pyarrow.string(), 32 | }, 33 | # in our ingestion, if a key is missing, the value written to the 34 | # parquet file is null. mimic this behavior by making empty strings 35 | # null instead of ''. 
36 | strings_can_be_null=True, 37 | ) 38 | 39 | table = csv.read_csv(csv_filepath, convert_options=vp_csv_options) 40 | parquet.write_table(table, parquet_filepath) 41 | 42 | 43 | incoming_dir = os.path.join(test_files_dir, S3_INCOMING) 44 | springboard_dir = os.path.join(test_files_dir, S3_SPRINGBOARD) 45 | 46 | 47 | @dataclass 48 | class LocalS3Location: 49 | """replace an s3 location wrapper class so it can be used in testing""" 50 | 51 | bucket: str 52 | prefix: str 53 | 54 | @property 55 | def s3_uri(self) -> str: 56 | """generate the local path to the test file for this object""" 57 | return os.path.join(test_files_dir, self.bucket, self.prefix) 58 | 59 | 60 | rt_vehicle_positions = LocalS3Location( 61 | bucket=S3_SPRINGBOARD, 62 | prefix="RT_VEHICLE_POSITIONS", 63 | ) 64 | 65 | tm_stop_crossings = LocalS3Location( 66 | bucket=S3_SPRINGBOARD, 67 | prefix="TM/STOP_CROSSING", 68 | ) 69 | tm_geo_node_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_GEO_NODE.parquet") 70 | tm_route_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_ROUTE.parquet") 71 | tm_trip_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_TRIP.parquet") 72 | tm_vehicle_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_VEHICLE.parquet") 73 | -------------------------------------------------------------------------------- /tests/utils/test_date_range_builder.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from lamp_py.utils.date_range_builder import build_data_range_paths 3 | 4 | 5 | def test_simple_case() -> None: 6 | template = "year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 7 | 8 | out = build_data_range_paths(template, start_date=datetime(2025, 4, 1), end_date=datetime(2025, 4, 20)) 9 | print(out) 10 | 11 | assert out == [ 12 | "year=2025/month=4/day=1/2025-04-01T00:00:00.parquet", 13 | "year=2025/month=4/day=2/2025-04-02T00:00:00.parquet", 14 | "year=2025/month=4/day=3/2025-04-03T00:00:00.parquet", 15 | "year=2025/month=4/day=4/2025-04-04T00:00:00.parquet", 16 | "year=2025/month=4/day=5/2025-04-05T00:00:00.parquet", 17 | "year=2025/month=4/day=6/2025-04-06T00:00:00.parquet", 18 | "year=2025/month=4/day=7/2025-04-07T00:00:00.parquet", 19 | "year=2025/month=4/day=8/2025-04-08T00:00:00.parquet", 20 | "year=2025/month=4/day=9/2025-04-09T00:00:00.parquet", 21 | "year=2025/month=4/day=10/2025-04-10T00:00:00.parquet", 22 | "year=2025/month=4/day=11/2025-04-11T00:00:00.parquet", 23 | "year=2025/month=4/day=12/2025-04-12T00:00:00.parquet", 24 | "year=2025/month=4/day=13/2025-04-13T00:00:00.parquet", 25 | "year=2025/month=4/day=14/2025-04-14T00:00:00.parquet", 26 | "year=2025/month=4/day=15/2025-04-15T00:00:00.parquet", 27 | "year=2025/month=4/day=16/2025-04-16T00:00:00.parquet", 28 | "year=2025/month=4/day=17/2025-04-17T00:00:00.parquet", 29 | "year=2025/month=4/day=18/2025-04-18T00:00:00.parquet", 30 | "year=2025/month=4/day=19/2025-04-19T00:00:00.parquet", 31 | "year=2025/month=4/day=20/2025-04-20T00:00:00.parquet", 32 | ] 33 | 34 | 35 | def test_year_crossing() -> None: 36 | template = "year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 37 | 38 | out = build_data_range_paths(template, start_date=datetime(2024, 12, 30), end_date=datetime(2025, 1, 2)) 39 | print(out) 40 | 41 | assert out == [ 42 | "year=2024/month=12/day=30/2024-12-30T00:00:00.parquet", 43 | "year=2024/month=12/day=31/2024-12-31T00:00:00.parquet", 44 | 
"year=2025/month=1/day=1/2025-01-01T00:00:00.parquet", 45 | "year=2025/month=1/day=2/2025-01-02T00:00:00.parquet", 46 | ] 47 | 48 | 49 | def test_next_leap_year_2028() -> None: 50 | template = "year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 51 | 52 | out = build_data_range_paths(template, start_date=datetime(2028, 2, 26), end_date=datetime(2028, 3, 2)) 53 | print(out) 54 | 55 | assert out == [ 56 | "year=2028/month=2/day=26/2028-02-26T00:00:00.parquet", 57 | "year=2028/month=2/day=27/2028-02-27T00:00:00.parquet", 58 | "year=2028/month=2/day=28/2028-02-28T00:00:00.parquet", 59 | "year=2028/month=2/day=29/2028-02-29T00:00:00.parquet", 60 | "year=2028/month=3/day=1/2028-03-01T00:00:00.parquet", 61 | "year=2028/month=3/day=2/2028-03-02T00:00:00.parquet", 62 | ] 63 | -------------------------------------------------------------------------------- /tests/utils/test_filter_bank.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import itertools 3 | from typing import Optional 4 | from unittest.mock import patch 5 | import polars as pl 6 | 7 | from lamp_py.utils.filter_bank import HeavyRailFilter, LightRailFilter 8 | 9 | # list files 10 | # grab latest 11 | # assert hardcodes still hold 12 | # get latest 13 | 14 | 15 | def test_hardcoded_terminal_prediction_names() -> None: 16 | # the stops listed for these filters are retrieved dynamically from gtfs. 17 | # ensure that the expected list contains all of the expected terminal values 18 | 19 | # associated runner: 20 | # runners/run_gtfs_rt_parquet_converter.py 21 | 22 | def list_station_child_stops_from_gtfs( 23 | stops: pl.DataFrame, parent_station: str, additional_filter: Optional[pl.Expr] = None 24 | ) -> pl.DataFrame: 25 | """ 26 | Filter gtfs stops by parent_station string, and additional filter if available 27 | """ 28 | df_parent_station = stops.filter(pl.col("parent_station") == parent_station) 29 | if additional_filter is not None: 30 | df_parent_station = df_parent_station.filter(additional_filter) 31 | return df_parent_station 32 | 33 | terminal_stop_ids = [] 34 | heavy_rail_filter = pl.col("vehicle_type") == 1 35 | 36 | # check that all stops in Filter lists exist 37 | service_date = datetime.now() 38 | stops = pl.read_parquet(f"https://performancedata.mbta.com/lamp/gtfs_archive/{service_date.year}/stops.parquet") 39 | 40 | for place_name in HeavyRailFilter._terminal_stop_place_names: 41 | gtfs_stops = list_station_child_stops_from_gtfs(stops, place_name, heavy_rail_filter) 42 | terminal_stop_ids.extend(gtfs_stops["stop_id"].to_list()) 43 | 44 | assert set(terminal_stop_ids).issuperset(set(HeavyRailFilter.terminal_stop_ids)) 45 | 46 | for stop in HeavyRailFilter.terminal_stop_ids: 47 | bb = stops.filter(pl.col("stop_id") == stop) 48 | assert stops.filter(pl.col("stop_id") == stop).height == 1 49 | 50 | for stop in LightRailFilter.terminal_stop_ids: 51 | bb = stops.filter(pl.col("stop_id") == stop) 52 | assert stops.filter(pl.col("stop_id") == stop).height == 1 53 | -------------------------------------------------------------------------------- /tests/utils/test_gtfs_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from unittest import mock 3 | 4 | from lamp_py.utils.gtfs_utils import ( 5 | bus_route_ids_for_service_date, 6 | routes_for_service_date, 7 | ) 8 | 9 | 10 | @mock.patch("lamp_py.utils.gtfs_utils.object_exists") 11 | def 
test_bus_routes_for_service_date(exists_patch: mock.MagicMock) -> None: 12 | """ 13 | Test that bus routes can be generated for a given service date. For the 14 | generated list ensure 15 | * they don't contain Subway, Commuter Rail, or Ferry routes 16 | * don't have a leading zero 17 | * contain a subset of known routes 18 | """ 19 | exists_patch.return_value = True 20 | 21 | service_date = date(year=2023, month=2, day=1) 22 | bus_routes = bus_route_ids_for_service_date(service_date) 23 | 24 | # check that we're getting a non empty list 25 | assert len(bus_routes) > 0 26 | 27 | subway_routes = [ 28 | "Green-E", 29 | "Green-B", 30 | "Green-D", 31 | "Green-C", 32 | "Red", 33 | "Blue", 34 | "Orange", 35 | ] 36 | 37 | for route in bus_routes: 38 | # ensure no commuter rails are being passed through 39 | assert route[:2] != "CR" 40 | 41 | # ensure no ferries are being passed through 42 | assert route[:4] != "Boat" 43 | 44 | # ensure no subways are being passed through 45 | assert route not in subway_routes 46 | 47 | # ensure our routes don't have leading zeros 48 | assert route[0] != "0" 49 | 50 | known_routes = [ 51 | "741", # Silver Line 1 52 | "34E", # Walpole Center - Forest Hills Station 53 | "100", # Elm Street - Wellington Station 54 | "504", # Watertown Yard - Federal Street & Franklin Street 55 | ] 56 | 57 | for route in known_routes: 58 | assert route in bus_routes 59 | 60 | 61 | @mock.patch("lamp_py.utils.gtfs_utils.object_exists") 62 | def test_routes_for_service_date(exists_patch: mock.MagicMock) -> None: 63 | """ 64 | Test that routes can be generated for a given service date. For the 65 | generated list ensure that all the route types available are represented 66 | """ 67 | exists_patch.return_value = True 68 | 69 | service_date = date(year=2023, month=2, day=1) 70 | routes = routes_for_service_date(service_date) 71 | 72 | # check that we're getting a non empty list 73 | assert len(routes) > 0 74 | assert routes["route_type"].unique().to_list() == [0, 1, 2, 3, 4] 75 | -------------------------------------------------------------------------------- /tests/utils/timezones.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import pytest 3 | 4 | 5 | def test_timezone_typing_same_type() -> None: 6 | """ 7 | naive can compare with naive 8 | aware can compare with aware 9 | """ 10 | ts = ["2021-03-27 03:00", "2021-03-28 03:00"] 11 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 12 | assert (tz_naive == tz_naive).rename("naive_compared").all() 13 | tz_aware = tz_naive.dt.replace_time_zone("UTC").rename("tz_aware") 14 | assert (tz_aware == tz_aware).rename("naive_compared").all() 15 | 16 | 17 | def test_timezone_typing_us_eastern_vs_america_new_york_fail() -> None: 18 | """ 19 | US/Eastern can not compare with America/New_York even though they are both EDT/EST 20 | """ 21 | ts = ["2021-03-27 03:00", "2021-03-28 03:00"] 22 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 23 | tz_aware_ny = tz_naive.dt.replace_time_zone("America/New_York").rename("tz_aware_ny") 24 | tz_aware_eastern = tz_naive.dt.replace_time_zone("US/Eastern").rename("tz_aware_east") 25 | try: 26 | out_compared2 = (tz_aware_ny > tz_aware_eastern).rename("ny_vs_eastern") 27 | # this should fail...if it doesn't, something has gone awry 28 | assert False 29 | except pl.exceptions.SchemaError: 30 | assert True 31 | 32 | 33 | def test_timezone_typing_us_eastern_vs_utc_fail() -> None: 34 | """ 35 | UTC can not compare with America/New_York 36 | """ 37 | ts = 
["2021-03-27 03:00", "2021-03-28 03:00"] 38 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 39 | tz_aware = tz_naive.dt.replace_time_zone("UTC").rename("tz_aware") 40 | tz_aware_ny = tz_naive.dt.replace_time_zone("America/New_York").rename("tz_aware_ny") 41 | try: 42 | out_compared = (tz_aware_ny > tz_aware).rename("out compared") 43 | # this should fail...if it doesn't, something has gone awry 44 | assert False 45 | except pl.exceptions.SchemaError: 46 | assert True 47 | 48 | 49 | def test_timezone_typing_tz_vs_naive_fail() -> None: 50 | """ 51 | verify can't compare naive with aware 52 | """ 53 | ts = ["2021-03-27 03:00", "2021-03-28 03:00"] 54 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 55 | tz_aware = tz_naive.dt.replace_time_zone("UTC").rename("tz_aware") 56 | 57 | try: 58 | # can't compare naive with aware 59 | out_compared = tz_naive > tz_aware 60 | # this should fail...if it doesn't, something has gone awry 61 | assert False 62 | except pl.exceptions.SchemaError: 63 | assert True 64 | --------------------------------------------------------------------------------