├── .env ├── .envrc ├── .github ├── actions │ ├── python_deps │ │ └── action.yaml │ ├── run_task │ │ ├── action.yaml │ │ └── run_task.sh │ └── tools │ │ └── action.yaml ├── dependabot.yaml ├── pull_request_template.md └── workflows │ ├── ad_hoc_deploy_run.yml │ ├── asana-pr-merged.yml │ ├── asana-pr-opened.yml │ ├── change_task_count.yaml │ ├── ci-yaml.yaml │ ├── ci_python.yaml │ ├── deploy-base.yaml │ ├── deploy-prod.yaml │ ├── deploy-staging.yaml │ ├── manual-deploy.yaml │ └── run-task.yaml ├── .gitignore ├── .tool-versions ├── .yamllint.yml ├── Data_Dictionary.md ├── Dockerfile ├── LICENSE ├── README.md ├── alembic.ini ├── analysis ├── check_bus.py ├── check_bus_tableau.py ├── check_data_all_days_in_LAMP_ALL_RT_fields.py ├── prism.py └── sample_data.py ├── architecture.jpg ├── docker-compose.yml ├── poetry.lock ├── pyproject.toml ├── runners ├── run_glides_parquet_converter.py ├── run_gtfs_rt_parquet_converter.py ├── run_query_s3_with_date_range.py └── run_static_trips_subquery.py ├── src └── lamp_py │ ├── __version__.py │ ├── ad_hoc │ ├── __init__.py │ ├── pipeline.py │ └── runner_001.py │ ├── aws │ ├── __init__.py │ ├── ecs.py │ ├── kinesis.py │ └── s3.py │ ├── bus_performance_manager │ ├── README.md │ ├── __init__.py │ ├── event_files.py │ ├── events_gtfs_rt.py │ ├── events_gtfs_schedule.py │ ├── events_joined.py │ ├── events_metrics.py │ ├── events_tm.py │ ├── pipeline.py │ └── write_events.py │ ├── ingestion │ ├── README.md │ ├── __init__.py │ ├── compress_gtfs │ │ ├── __init__.py │ │ ├── gtfs_schema_map.py │ │ ├── gtfs_to_parquet.py │ │ ├── pipe.py │ │ ├── pq_to_sqlite.py │ │ └── schedule_details.py │ ├── config_busloc_trip.py │ ├── config_busloc_vehicle.py │ ├── config_rt_alerts.py │ ├── config_rt_trip.py │ ├── config_rt_vehicle.py │ ├── convert_gtfs.py │ ├── convert_gtfs_rt.py │ ├── converter.py │ ├── glides.py │ ├── gtfs_rt_detail.py │ ├── gtfs_rt_structs.py │ ├── ingest_gtfs.py │ ├── light_rail_gps.py │ ├── pipeline.py │ └── utils.py │ ├── ingestion_tm │ ├── ingest.py │ ├── jobs │ │ ├── parition_table.py │ │ └── whole_table.py │ ├── pipeline.py │ └── tm_export.py │ ├── migrations │ ├── README │ ├── __init__.py │ ├── env.py │ ├── migration_template_generator.py │ ├── script.py.mako │ └── versions │ │ ├── metadata_dev │ │ ├── 001_07903947aabe_initial_changes.py │ │ └── 002_26db393ea854_update_glides_location_column_names.py │ │ ├── metadata_prod │ │ ├── 001_07903947aabe_initial_changes.py │ │ ├── 002_cce8dfee767a_re_run_input_files_from_2024_04_03.py │ │ ├── 003_26db393ea854_update_glides_location_column_names.py │ │ └── 004_a08c5fd37dbd_reprocess_422_423.py │ │ ├── metadata_staging │ │ ├── 001_07903947aabe_initial_changes.py │ │ ├── 002_26db393ea854_update_glides_location_column_names.py │ │ └── 003_a08c5fd37dbd_reprocess_422_423.py │ │ ├── performance_manager_dev │ │ ├── 001_5d9a7ee21ae5_initial_prod_schema.py │ │ ├── 002_1b53fd278b10_fix_trip_id_length.py │ │ ├── 003_ae6c6e4b2df5_extend_service_id_view.py │ │ ├── 004_45dedc21086e_canon_stop_seq.py │ │ ├── 005_96187da84955_remove_metadata.py │ │ ├── 006_2dfbde5ec151_sync_stop_trunk.py │ │ ├── 007_896dedd8a4db_dwell_time_update.py │ │ ├── 008_32ba735d080c_add_revenue_columns.py │ │ └── 009_36e7a7aee148_upgrade_sequence.py │ │ ├── performance_manager_prod │ │ ├── 001_5d9a7ee21ae5_initial_prod_schema.py │ │ ├── 002_f09e853d5672_update_prod_stop_sync.py │ │ ├── 003_2dfbde5ec151_sync_stop_trunk.py │ │ ├── 004_896dedd8a4db_dwell_time_update.py │ │ ├── 005_32ba735d080c_add_revenue_columns.py │ │ ├── 
006_36e7a7aee148_upgrade_sequence.py │ │ ├── 007_da8f80a3dd90_upgrade_sequence.py │ │ ├── 008_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py │ │ └── sql_strings │ │ │ └── strings_001.py │ │ └── performance_manager_staging │ │ ├── 001_5d9a7ee21ae5_initial_prod_schema.py │ │ ├── 002_1b53fd278b10_fix_trip_id_length.py │ │ ├── 003_ae6c6e4b2df5_extend_service_id_view.py │ │ ├── 004_45dedc21086e_canon_stop_seq.py │ │ ├── 005_96187da84955_remove_metadata.py │ │ ├── 006_e20a4f3f8c03_fix_null_vehicle_consist.py │ │ ├── 007_2dfbde5ec151_sync_stop_trunk.py │ │ ├── 008_896dedd8a4db_dwell_time_update.py │ │ ├── 009_32ba735d080c_add_revenue_columns.py │ │ ├── 010_36e7a7aee148_upgrade_sequence.py │ │ ├── 011_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py │ │ ├── 012_9b461d7aa53a_backfill_rt_rail_2025_04_04_to_2025_04_22.py │ │ └── sql_strings │ │ ├── strings_001.py │ │ └── strings_003.py │ ├── mssql │ ├── __init__.py │ ├── mssql_utils.py │ └── test_connect.py │ ├── performance_manager │ ├── README.md │ ├── __init__.py │ ├── alerts.py │ ├── flat_file.py │ ├── gtfs_utils.py │ ├── l0_gtfs_rt_events.py │ ├── l0_gtfs_static_load.py │ ├── l0_gtfs_static_mod.py │ ├── l0_rt_trip_updates.py │ ├── l0_rt_vehicle_positions.py │ ├── l1_cte_statements.py │ ├── l1_rt_metrics.py │ ├── l1_rt_trips.py │ └── pipeline.py │ ├── postgres │ ├── __init__.py │ ├── metadata_schema.py │ ├── postgres_utils.py │ ├── rail_performance_manager_schema.py │ └── seed_metadata.py │ ├── publishing │ ├── __init__.py │ ├── index.html │ └── performancedata.py │ ├── runtime_utils │ ├── __init__.py │ ├── alembic_migration.py │ ├── env_validation.py │ ├── infinite_wait.py │ ├── lamp_exception.py │ ├── process_logger.py │ └── remote_files.py │ ├── tableau │ ├── README.md │ ├── __init__.py │ ├── conversions │ │ ├── convert_bus_performance_data.py │ │ ├── convert_gtfs_rt_trip_updates.py │ │ └── convert_gtfs_rt_vehicle_position.py │ ├── hyper.py │ ├── jobs │ │ ├── bus_performance.py │ │ ├── filtered_hyper.py │ │ ├── glides.py │ │ ├── gtfs_rail.py │ │ ├── rt_alerts.py │ │ └── rt_rail.py │ ├── pipeline.py │ └── server.py │ └── utils │ ├── __init__.py │ ├── clear_folder.py │ ├── date_range_builder.py │ ├── filter_bank.py │ └── gtfs_utils.py └── tests ├── __init__.py ├── aws ├── __init__.py └── test_s3_utils.py ├── bus_performance_manager ├── __init__.py ├── bus_test_gtfs.csv ├── test_bus_convert_for_tableau.py ├── test_gtfs.py ├── test_gtfs_rt_ingestion.py └── test_tm_ingestion.py ├── conftest.py ├── ingestion ├── __init__.py ├── test_configuration.py ├── test_gtfs_compress.py ├── test_gtfs_converter.py ├── test_gtfs_rt_converter.py ├── test_ingest.py └── test_light_rail_gps.py ├── ingestion_tm ├── __init__.py └── test_ingest.py ├── performance_manager ├── __init__.py ├── test_alerts.py ├── test_backup_trips_match.py ├── test_l0_gtfs_rt_events.py ├── test_performance_manager.py └── test_static_trips_subquery.py ├── test_files ├── INCOMING │ ├── 2019-12-12T00_00_10_https___mbta_gtfs_s3_dev.s3.amazonaws.com_concentrate_VehiclePositions_enhanced.json │ ├── 2019-12-12T00_00_57_https___mbta_gtfs_s3_dev.s3.amazonaws.com_concentrate_TripUpdates_enhanced.json │ ├── 2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz │ ├── 2022-05-04T15:59:48Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz │ ├── 2022-05-05T16_00_15Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_VehiclePositions_enhanced.json.gz │ ├── 2022-05-08T06:04:57Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz │ ├── 
2022-06-28T10_03_18Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_TripUpdates_enhanced.json.gz │ ├── 2022-07-05T12:35:16Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz │ ├── 2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz │ ├── MBTA_GTFS.zip │ ├── empty.json.gz │ ├── large_page_obj_response.json │ └── one_blank_record.json.gz ├── PUBLIC_ARCHIVE │ └── lamp │ │ ├── bus_vehicle_events │ │ └── test_events.parquet │ │ └── gtfs_archive │ │ ├── 2023 │ │ └── routes.parquet │ │ └── 2024 │ │ └── routes.parquet ├── SPRINGBOARD │ ├── CALENDAR │ │ └── timestamp=1682375024 │ │ │ └── f18c9f5747194660a793cf0cd6f9df90-0.parquet │ ├── CALENDAR_DATES │ │ └── timestamp=1682375024 │ │ │ └── 7c0b0da47e284237a7b50df57e3ef33c-0.parquet │ ├── DIRECTIONS │ │ └── timestamp=1682375024 │ │ │ └── 562949d9931149f8a5d8f0cb2eb52c80-0.parquet │ ├── FEED_INFO │ │ └── timestamp=1682375024 │ │ │ └── e84307ae774a4d8c8968c5e38e7affdc-0.parquet │ ├── ROUTES │ │ └── timestamp=1682375024 │ │ │ └── b4e038eb63da41fcb66eed81548f664a-0.parquet │ ├── ROUTE_PATTERNS │ │ └── timestamp=1682375024 │ │ │ └── 57233d3677484fe1bd0373749c34cc63-0.parquet │ ├── RT_ALERTS │ │ └── year=2020 │ │ │ └── month=2 │ │ │ └── day=9 │ │ │ └── hour=1 │ │ │ └── 6ef6922c20064cb9a8f09a3b3b1d2783-0.parquet │ ├── RT_TRIP_UPDATES │ │ └── year=2023 │ │ │ └── month=5 │ │ │ └── day=8 │ │ │ ├── hour=12 │ │ │ └── 8e2c182968e24ecea3d37f03d6bae84d-0.parquet │ │ │ └── hour=13 │ │ │ └── eaeee968b94b4a74b166df4b8ffd9f29-0.parquet │ ├── RT_VEHICLE_POSITIONS │ │ ├── year=2023 │ │ │ └── month=5 │ │ │ │ └── day=8 │ │ │ │ ├── hour=12 │ │ │ │ └── 1613b49e4fa1459eabe9c83553ef1045-0.parquet │ │ │ │ └── hour=13 │ │ │ │ └── 9a1bb1c5269042a284b2ed57b4dfebb9-0.parquet │ │ └── year=2024 │ │ │ └── month=6 │ │ │ └── day=1 │ │ │ ├── hour=12 │ │ │ └── fcf91fbba92d418aa136d928c6243121-0.parquet │ │ │ └── hour=13 │ │ │ └── 47ffb78637a5400aabdfd7c9c7142757-0.parquet │ ├── STOPS │ │ └── timestamp=1682375024 │ │ │ └── 920a42ad1b5e4ef0942c7a1bc2ef2fea-0.parquet │ ├── STOP_TIMES │ │ └── timestamp=1682375024 │ │ │ └── 88c016320de440789357f14df6399d4c-0.parquet │ ├── TM │ │ ├── STOP_CROSSING │ │ │ ├── 120240601.parquet │ │ │ └── 120240811.parquet │ │ ├── TMMAIN_GEO_NODE.parquet │ │ ├── TMMAIN_ROUTE.parquet │ │ ├── TMMAIN_TRIP.parquet │ │ └── TMMAIN_VEHICLE.parquet │ └── TRIPS │ │ └── timestamp=1682375024 │ │ └── cdca1ec8575c4705bb93bc76244c1a86-0.parquet ├── april_2023_filepaths.json ├── before_times_filepaths.json ├── ingestion_BUSLOC_TU.parquet ├── ingestion_BUSLOC_VP.parquet ├── ingestion_GTFS-RT_ALERT.parquet ├── ingestion_GTFS-RT_TU.parquet ├── ingestion_GTFS-RT_TU_OLD.parquet ├── ingestion_GTFS-RT_VP.parquet ├── ingestion_GTFS-RT_VP_OLD.parquet ├── july_17_filepaths.json ├── may_8.json ├── pipeline_flat_out.csv ├── process_vp_files_flat_out.csv ├── replace_perf_mgr_query_test_data │ ├── 20250415_rt_trips_for_backup_match_subquery.csv │ ├── 20250415_static_trips_subquery.csv │ ├── staging_test_summary_sub.csv │ └── summary_sub.sql ├── short_list.json ├── staging_dec_10.json ├── tu_missing_start_date.parquet ├── vehicle_positions_flat_input.csv ├── vp_missing_start_date.csv └── vp_missing_start_time.csv ├── test_resources.py └── utils ├── test_date_range_builder.py ├── test_filter_bank.py ├── test_gtfs_utils.py └── timezones.py /.env: -------------------------------------------------------------------------------- 1 | # helper to know if env is already loaded 2 | BOOTSTRAPPED=1 3 | 4 | # metadata database 5 | MD_DB_HOST=local_md_rds 6 | MD_DB_PORT=5433 7 
| MD_DB_NAME=metadata 8 | MD_DB_USER=postgres 9 | MD_DB_PASSWORD=postgres 10 | ALEMBIC_MD_DB_NAME=metadata_prod 11 | 12 | # performance manager database 13 | RPM_DB_HOST=local_rpm_rds 14 | RPM_DB_PORT=5434 15 | RPM_DB_NAME=performance_manager 16 | RPM_DB_USER=postgres 17 | RPM_DB_PASSWORD=postgres 18 | ALEMBIC_RPM_DB_NAME=performance_manager_prod 19 | 20 | # MSSQL TransitMaster database 21 | TM_DB_HOST=do_update 22 | TM_DB_NAME=do_update 23 | TM_DB_USER=do_update 24 | TM_DB_PASSWORD=do_update 25 | 26 | # s3 locations 27 | SPRINGBOARD_BUCKET=mbta-ctd-dataplatform-dev-springboard 28 | ARCHIVE_BUCKET=mbta-ctd-dataplatform-dev-archive 29 | ERROR_BUCKET=mbta-ctd-dataplatform-dev-error 30 | INCOMING_BUCKET=mbta-ctd-dataplatform-dev-incoming 31 | 32 | # mbta-performance with personal access 33 | PUBLIC_ARCHIVE_BUCKET=mbta-ctd-dataplatform-dev-archive 34 | 35 | # Tableau 36 | TABLEAU_USER=DOUPDATE 37 | TABLEAU_PASSWORD=DOUPDATE 38 | TABLEAU_SERVER=http://awtabDEV02.mbta.com 39 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | use asdf 2 | 3 | dotenv 4 | -------------------------------------------------------------------------------- /.github/actions/python_deps/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup Python Dependencies 2 | description: Loads python dependencies for a CI/CD job, install them if not cached 3 | 4 | runs: 5 | using: composite 6 | steps: 7 | - name: ASDF Tools Install 8 | uses: ./.github/actions/tools 9 | 10 | - name: Python Deps Cache 11 | uses: actions/cache@v3 12 | id: python-cache 13 | with: 14 | path: | 15 | ~/.cache/pypoetry 16 | **/.venv 17 | key: ${{ runner.os }}-poetry-${{ hashFiles('./poetry.lock') }} 18 | 19 | - name: Install Python Deps 20 | working-directory: . 
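# note: the python-cache key above hashes poetry.lock, so the cached virtualenv is reused until dependencies change and the install step below is skipped on a cache hit (see the `if` condition)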
21 | # env use python3.10 to force usage of python3.10 installed by asdf over system python version 22 | run: | 23 | poetry env use python3.10 24 | poetry install -v 25 | shell: bash 26 | if: "!steps.python-cache.outputs.cache-hit" 27 | -------------------------------------------------------------------------------- /.github/actions/run_task/action.yaml: -------------------------------------------------------------------------------- 1 | name: Manually Run ECS Task 2 | description: Run an existing task in an existing AWS Service and Cluster 3 | 4 | inputs: 5 | role-to-assume: 6 | description: IAM role 7 | required: true 8 | aws-region: 9 | description: AWS region to use 10 | required: true 11 | default: us-east-1 12 | cluster: 13 | description: ECS Cluster for Service 14 | required: true 15 | service: 16 | description: ECS Service for task to run 17 | required: true 18 | 19 | runs: 20 | using: composite 21 | steps: 22 | - name: Setup AWS Credentials 23 | uses: aws-actions/configure-aws-credentials@v4 24 | with: 25 | role-to-assume: ${{ inputs.role-to-assume }} 26 | aws-region: ${{ inputs.aws-region }} 27 | mask-aws-account-id: true 28 | - name: Start ECS Task 29 | run: ${{ github.action_path }}/run_task.sh 30 | shell: bash 31 | env: 32 | AWS_REGION: ${{ inputs.aws-region }} 33 | CLUSTER: ${{ inputs.cluster }} 34 | SERVICE: ${{ inputs.service }} 35 | -------------------------------------------------------------------------------- /.github/actions/run_task/run_task.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -u 3 | 4 | # uncomment to debug 5 | # set -x 6 | 7 | # Run an ECS Task from the provided CLUSTER and SERVICE 8 | 9 | # required environment varialbes 10 | # - CLUSTER 11 | # - SERVICE 12 | 13 | # Get the Security Groups that can run the task. 14 | echo "Retrieving SecurityGroups for SERVICE:${SERVICE} in CLUSTER:${CLUSTER}" 15 | SECURITY_GROUPS=$(aws ecs describe-services \ 16 | --services $SERVICE \ 17 | --cluster $CLUSTER \ 18 | --query services[0].networkConfiguration.awsvpcConfiguration.securityGroups \ 19 | --output text \ 20 | | sed 's/\t/,/g') 21 | echo "SECURITY GROUPS: ${SECURITY_GROUPS}" 22 | 23 | # Get the Subnets that the task runs on. 
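# (These security groups, together with the subnets fetched below, feed the awsvpc network configuration passed to `aws ecs run-task`; Fargate tasks cannot be launched without them.)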
24 | echo "Retrieving subnets for SERVICE:${SERVICE} in CLUSTER:${CLUSTER}" 25 | SUBNETS=$(aws ecs describe-services \ 26 | --services $SERVICE \ 27 | --cluster $CLUSTER \ 28 | --query services[0].networkConfiguration.awsvpcConfiguration.subnets \ 29 | --output text \ 30 | | sed 's/\t/,/g') 31 | echo "SUBNETS: ${SUBNETS}" 32 | 33 | # Run the ECS task 34 | aws ecs run-task \ 35 | --cluster $CLUSTER \ 36 | --task-definition $SERVICE \ 37 | --launch-type FARGATE \ 38 | --count 1 \ 39 | --network-configuration "awsvpcConfiguration={subnets=[$SUBNETS],securityGroups=[$SECURITY_GROUPS],assignPublicIp=DISABLED}" 40 | -------------------------------------------------------------------------------- /.github/actions/tools/action.yaml: -------------------------------------------------------------------------------- 1 | name: Setup ASDF Tools 2 | description: Loads ASDF tools for for a CI/CD job, installing them if not cached 3 | outputs: 4 | cache-hit: 5 | description: "Whether the ASDF cache was hit" 6 | value: ${{ steps.asdf-cache.outputs-cache-hit }} 7 | runs: 8 | using: composite 9 | steps: 10 | # cache the ASDF directory, using values from .tool-versions 11 | - name: ASDF Tools Cache 12 | uses: actions/cache@v3 13 | id: asdf-cache 14 | with: 15 | path: ~/.asdf 16 | # runner.os vs CACHE_UUID secret 17 | key: ${{ runner.os}}-asdf-${{ hashFiles('**/.tool-versions') }} 18 | 19 | - name: Install ASDF Tools 20 | uses: asdf-vm/actions/install@v2 21 | if: steps.asdf-cache.outputs.cache-hit != 'true' 22 | 23 | - name: Re-shim ASDF Install 24 | uses: mbta/actions/reshim-asdf@v1 25 | -------------------------------------------------------------------------------- /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: pip 5 | directory: "." 6 | schedule: 7 | interval: weekly 8 | time: "08:00" 9 | timezone: "America/New_York" 10 | open-pull-requests-limit: 5 11 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Asana Task: 2 | -------------------------------------------------------------------------------- /.github/workflows/ad_hoc_deploy_run.yml: -------------------------------------------------------------------------------- 1 | name: Ad-Hoc Deploy & Run 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | description: Environment 8 | type: choice 9 | options: 10 | - dev 11 | - staging 12 | - prod 13 | secrets: 14 | AWS_ROLE_ARN: 15 | description: AWS_ROLE_ARN 16 | required: true 17 | 18 | jobs: 19 | deploy: 20 | uses: ./.github/workflows/deploy-base.yaml 21 | with: 22 | # pass the inputs from the workflow dispatch through to the deploy base. 
the booleans are 23 | # converted to strings, so flip them back using fromJson function 24 | environment: ${{ github.event.inputs.environment }} 25 | deploy-ad-hoc: true 26 | secrets: inherit 27 | run_ad_hoc_task: 28 | needs: deploy 29 | runs-on: ubuntu-latest 30 | permissions: 31 | id-token: write 32 | contents: read 33 | steps: 34 | - name: Checkout Branch 35 | uses: actions/checkout@v3 36 | - name: Run Ad-Hoc Task 37 | uses: ./.github/actions/run_task 38 | with: 39 | role-to-assume: ${{ secrets.AWS_ROLE_ARN }} 40 | cluster: 'lamp' 41 | service: lamp-ad-hoc-${{ inputs.environment }} 42 | -------------------------------------------------------------------------------- /.github/workflows/asana-pr-merged.yml: -------------------------------------------------------------------------------- 1 | name: Move Asana Ticket after PR Merged 2 | on: 3 | pull_request: 4 | types: [closed] 5 | 6 | jobs: 7 | move-asana-ticket-to_done_job: 8 | runs-on: ubuntu-latest 9 | if: github.event.pull_request.merged == true 10 | steps: 11 | - name: Github-Asana Move Ticket Action 12 | uses: mbta/github-asana-action@v4.3.0 13 | with: 14 | asana-pat: ${{ secrets.ASANA_SECRET_FOR_MOVE_ACTION }} 15 | trigger-phrase: "Asana Task:" 16 | target-section: "Done" 17 | mark-complete: true 18 | 19 | move-asana-ticket-to_todo_job: 20 | runs-on: ubuntu-latest 21 | if: github.event.pull_request.merged == false 22 | steps: 23 | - name: Github-Asana Move Ticket Action 24 | uses: mbta/github-asana-action@v4.3.0 25 | with: 26 | asana-pat: ${{ secrets.ASANA_SECRET_FOR_MOVE_ACTION }} 27 | trigger-phrase: "Asana Task:" 28 | target-section: "To Do" 29 | -------------------------------------------------------------------------------- /.github/workflows/asana-pr-opened.yml: -------------------------------------------------------------------------------- 1 | name: Move Asana Ticket after PR Opened 2 | on: 3 | pull_request: 4 | types: [opened, reopened] 5 | 6 | jobs: 7 | move-asana-ticket-job: 8 | runs-on: ubuntu-latest 9 | if: ${{ !github.event.pull_request.head.repo.fork }} 10 | steps: 11 | - name: Github-Asana Move Ticket Action 12 | uses: mbta/github-asana-action@v4.3.0 13 | with: 14 | asana-pat: ${{ secrets.ASANA_SECRET_FOR_MOVE_ACTION }} 15 | trigger-phrase: "Asana Task:" 16 | target-section: "In Review" 17 | task-comment: "View Pull Request Here: " 18 | -------------------------------------------------------------------------------- /.github/workflows/change_task_count.yaml: -------------------------------------------------------------------------------- 1 | name: Change Task Count 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | type: choice 8 | description: What environment to change the task count for 9 | options: 10 | - dev 11 | - staging 12 | - prod 13 | new_count: 14 | description: 1 to turn on 0 to turn off 15 | required: true 16 | application_name: 17 | type: choice 18 | description: What application to adjust the task count for 19 | options: 20 | - ingestion 21 | - rail-performance-manager 22 | - bus-performance-manager 23 | 24 | jobs: 25 | set_count: 26 | if: | 27 | ( github.event.inputs.new_count == 0 || github.event.inputs.new_count == 1) 28 | 29 | runs-on: ubuntu-latest 30 | permissions: 31 | id-token: write 32 | contents: read 33 | 34 | steps: 35 | - name: Configure AWS Credentials 36 | uses: aws-actions/configure-aws-credentials@v4 37 | with: 38 | role-to-assume: ${{ secrets.AWS_ROLE_ARN }} 39 | aws-region: us-east-1 40 | - name: Run ECS Update Service Command 41 | # yamllint disable rule:line-length 
42 | run: > 43 | aws ecs update-service 44 | --cluster lamp 45 | --service lamp-${{ github.event.inputs.application_name }}-${{ github.event.inputs.environment }} 46 | --desired-count ${{ github.event.inputs.new_count }} 47 | # yamllint enable 48 | -------------------------------------------------------------------------------- /.github/workflows/ci-yaml.yaml: -------------------------------------------------------------------------------- 1 | name: Validate YAML 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - "**.yaml" 9 | pull_request: 10 | paths: 11 | - "**.yaml" 12 | 13 | jobs: 14 | build: 15 | name: Validate YAML actions 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | - run: yamllint . -f parsable --strict 21 | -------------------------------------------------------------------------------- /.github/workflows/ci_python.yaml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration (Python) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - 'src/**' 9 | - 'tests/**' 10 | - 'pyproject.toml' 11 | - 'poetry.lock' 12 | - '.github/workflows/ci_python.yaml' 13 | - '.github/python_deps/action.yaml' 14 | pull_request: 15 | paths: 16 | - 'src/**' 17 | - 'tests/**' 18 | - 'pyproject.toml' 19 | - 'poetry.lock' 20 | - '.github/workflows/ci_python.yaml' 21 | - '.github/python_deps/action.yaml' 22 | 23 | defaults: 24 | run: 25 | shell: bash 26 | working-directory: . 27 | 28 | concurrency: 29 | group: python-ci-${{ github.ref }} 30 | cancel-in-progress: true 31 | 32 | jobs: 33 | setup: 34 | name: Python Setup 35 | runs-on: ubuntu-22.04 36 | steps: 37 | - uses: actions/checkout@v3 38 | - uses: ./.github/actions/python_deps 39 | 40 | format: 41 | name: Format 42 | runs-on: ubuntu-22.04 43 | needs: setup 44 | steps: 45 | - uses: actions/checkout@v3 46 | - uses: ./.github/actions/python_deps 47 | 48 | - run: poetry run black . --check 49 | 50 | typing: 51 | name: Type Check 52 | runs-on: ubuntu-22.04 53 | needs: setup 54 | steps: 55 | - uses: actions/checkout@v3 56 | - uses: ./.github/actions/python_deps 57 | 58 | - run: poetry run mypy . 
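# the format/typing/lint/test jobs below each restore the poetry virtualenv cached by the setup job (via the python_deps composite action), so dependencies are only installed once per poetry.lock change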
59 | 60 | lint: 61 | name: Lint 62 | runs-on: ubuntu-22.04 63 | needs: setup 64 | steps: 65 | - uses: actions/checkout@v3 66 | - uses: ./.github/actions/python_deps 67 | 68 | - run: poetry run pylint src tests --rcfile pyproject.toml 69 | 70 | test: 71 | name: Test 72 | runs-on: ubuntu-22.04 73 | needs: setup 74 | env: 75 | BOOTSTRAPPED: 1 76 | MD_DB_HOST: local_rds 77 | MD_DB_PORT: 5433 78 | MD_DB_NAME: metadata 79 | MD_DB_USER: postgres 80 | MD_DB_PASSWORD: postgres 81 | ALEMBIC_MD_DB_NAME: metadata_prod 82 | RPM_DB_HOST: local_rds 83 | RPM_DB_PORT: 5434 84 | RPM_DB_NAME: performance_manager 85 | RPM_DB_USER: postgres 86 | RPM_DB_PASSWORD: postgres 87 | ALEMBIC_RPM_DB_NAME: performance_manager_prod 88 | services: 89 | rpm_postgres: 90 | image: postgres:14.4 91 | ports: 92 | - 5434:5432 93 | env: 94 | POSTGRES_PASSWORD: ${{env.RPM_DB_PASSWORD}} 95 | POSTGRES_USER: ${{env.RPM_DB_USER}} 96 | POSTGRES_DB: ${{env.RPM_DB_NAME}} 97 | options: 98 | --health-cmd pg_isready 99 | --health-interval 10s 100 | --health-timeout 5s 101 | --health-retries 5 102 | md_postgres: 103 | image: postgres:14.4 104 | ports: 105 | - 5433:5432 106 | env: 107 | POSTGRES_PASSWORD: ${{env.MD_DB_PASSWORD}} 108 | POSTGRES_USER: ${{env.MD_DB_USER}} 109 | POSTGRES_DB: ${{env.MD_DB_NAME}} 110 | options: 111 | --health-cmd pg_isready 112 | --health-interval 10s 113 | --health-timeout 5s 114 | --health-retries 5 115 | steps: 116 | - uses: actions/checkout@v3 117 | - uses: ./.github/actions/python_deps 118 | 119 | # Execute tests and generate coverage report 120 | - name: Run pytest With Coverage 121 | run: | 122 | poetry run pytest \ 123 | --cov-report lcov:coverage.info \ 124 | --cov-report term-missing \ 125 | --cov-branch \ 126 | --cov=lamp_py 127 | 128 | # Upload Coverage as an Artifact for Subsequent Jobs 129 | - name: Setup LCOV 130 | uses: hrishikesh-kadam/setup-lcov@v1 131 | - name: Report code coverage 132 | uses: mbta/github-actions-report-lcov@v4 133 | with: 134 | coverage-files: coverage.info 135 | artifact-name: python-code-coverage 136 | github-token: ${{ secrets.GITHUB_TOKEN }} 137 | -------------------------------------------------------------------------------- /.github/workflows/deploy-prod.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to Production 2 | 3 | on: 4 | # deploy when version tags are published 5 | push: 6 | tags: 7 | - v[0-9]+.[0-9]+.[0-9]+ 8 | 9 | jobs: 10 | deploy: 11 | name: Deploy to Production 12 | concurrency: 13 | group: prod 14 | uses: ./.github/workflows/deploy-base.yaml 15 | with: 16 | environment: prod 17 | deploy-ingestion: true 18 | deploy-rail-pm: true 19 | deploy-bus-pm: true 20 | deploy-tm-ingestion: true 21 | deploy-tableau-publisher: true 22 | secrets: inherit 23 | -------------------------------------------------------------------------------- /.github/workflows/deploy-staging.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to Staging 2 | 3 | on: 4 | # deploy when ci has been completed on main (should occur after new commits are added to main 5 | # directly or via pull request) 6 | workflow_run: 7 | workflows: ["Continuous Integration (Python)"] 8 | types: [completed] 9 | branches: 10 | - main 11 | 12 | jobs: 13 | deploy: 14 | name: Deploy to Staging 15 | concurrency: 16 | group: staging 17 | uses: ./.github/workflows/deploy-base.yaml 18 | with: 19 | environment: staging 20 | deploy-ingestion: true 21 | deploy-rail-pm: true 22 | deploy-bus-pm: true 23 | 
deploy-tm-ingestion: true 24 | deploy-tableau-publisher: true 25 | secrets: inherit 26 | -------------------------------------------------------------------------------- /.github/workflows/manual-deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Manual Deploy 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | description: Environment 8 | type: choice 9 | options: 10 | - dev 11 | - staging 12 | - prod 13 | deploy-ingestion: 14 | description: Deploy Ingestion 15 | default: false 16 | type: boolean 17 | deploy-rail-pm: 18 | description: Deploy Rail Performance Manager 19 | default: false 20 | type: boolean 21 | deploy-bus-pm: 22 | description: Deploy Bus Performance Manager 23 | default: false 24 | type: boolean 25 | deploy-tm-ingestion: 26 | description: Deploy TransitMaster Ingestion (not run on Dev) 27 | default: false 28 | type: boolean 29 | deploy-tableau-publisher: 30 | description: Deploy Tableau Publisher (not run on Dev) 31 | default: false 32 | type: boolean 33 | 34 | jobs: 35 | deploy: 36 | concurrency: 37 | group: github.event.inputs.environment 38 | uses: ./.github/workflows/deploy-base.yaml 39 | with: 40 | # pass the inputs from the workflow dispatch through to the deploy base. the booleans are 41 | # converted to strings, so flip them back using fromJson function 42 | environment: ${{ github.event.inputs.environment }} 43 | deploy-ingestion: ${{ fromJson(github.event.inputs.deploy-ingestion) }} 44 | deploy-rail-pm: ${{ fromJson(github.event.inputs.deploy-rail-pm) }} 45 | deploy-bus-pm: ${{ fromJson(github.event.inputs.deploy-bus-pm) }} 46 | deploy-tm-ingestion: ${{ fromJson(github.event.inputs.deploy-tm-ingestion) }} 47 | deploy-tableau-publisher: ${{ fromJson(github.event.inputs.deploy-tableau-publisher) }} 48 | secrets: inherit 49 | -------------------------------------------------------------------------------- /.github/workflows/run-task.yaml: -------------------------------------------------------------------------------- 1 | name: Run Task 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | environment: 7 | description: Environment 8 | type: choice 9 | options: 10 | - staging 11 | - prod 12 | task: 13 | description: Task 14 | type: choice 15 | options: 16 | - Tableau Publisher 17 | - Transit Master Ingestion 18 | 19 | jobs: 20 | run_task: 21 | runs-on: ubuntu-latest 22 | permissions: 23 | id-token: write 24 | contents: read 25 | steps: 26 | - name: Checkout Branch 27 | uses: actions/checkout@v3 28 | - name: Generate Task Name 29 | run: | 30 | if [ "${{ inputs.task }}" == "Tableau Publisher" ]; then 31 | echo "task_name=tableau-publisher" >> $GITHUB_ENV 32 | elif [ "${{ inputs.task }}" == "Transit Master Ingestion" ]; then 33 | echo "task_name=tm-ingestion" >> $GITHUB_ENV 34 | fi 35 | - name: Run Task Action 36 | uses: ./.github/actions/run_task 37 | with: 38 | role-to-assume: ${{ secrets.AWS_ROLE_ARN }} 39 | cluster: 'lamp' 40 | service: lamp-${{ env.task_name }}-${{ inputs.environment }} 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /cover 3 | /deps 4 | /doc 5 | /.fetch 6 | erl_crash.dump 7 | *.ez 8 | *.beam 9 | /config/*.secret.exs 10 | .elixir_ls/ 11 | /performance_manager/test.db 12 | /notebook 13 | /investigation 14 | 15 | 16 | __pycache__ 17 | venv 18 | *.ipynb 19 | .coverage 20 | htmlcov 21 | .DS_Store 22 | package 23 | dist 24 | *.sh 25 | 
.devcontainer/* 26 | .vscode/* 27 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | poetry 1.7.1 2 | python 3.10.13 3 | direnv 2.32.2 4 | -------------------------------------------------------------------------------- /.yamllint.yml: -------------------------------------------------------------------------------- 1 | extends: default 2 | 3 | ignore-from-file: .gitignore 4 | 5 | rules: 6 | document-start: disable 7 | line-length: 8 | max: 100 9 | truthy: 10 | check-keys: false 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-bookworm 2 | 3 | # Keeps Python from generating .pyc files in the container 4 | ENV PYTHONDONTWRITEBYTECODE 1 5 | # Turns off buffering for easier container logging 6 | ENV PYTHONUNBUFFERED 1 7 | 8 | # Install non python dependencies 9 | RUN apt-get update 10 | RUN apt-get install -y libpq-dev gcc curl gpg 11 | 12 | # Fetch Amazon RDS certificate chain 13 | RUN curl https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -o /usr/local/share/amazon-certs.pem 14 | RUN chmod a=r /usr/local/share/amazon-certs.pem 15 | 16 | # Install MSSQL ODBC 18 Driver 17 | # for TransitMaster DB connection and ingestion 18 | RUN mkdir -m 0755 -p /etc/apt/keyrings/ \ 19 | && curl -fsSL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor -o /etc/apt/keyrings/microsoft.gpg \ 20 | && echo "deb [signed-by=/etc/apt/keyrings/microsoft.gpg] https://packages.microsoft.com/debian/12/prod bookworm main" | tee /etc/apt/sources.list.d/mssql-release.list \ 21 | && apt-get update \ 22 | && ACCEPT_EULA=Y apt-get install -y msodbcsql18 23 | 24 | # modify openssl config to allow TLSv1 connection 25 | # moves [openssl_init] section and creates [ssl_default_sect] section to allow TLSv1 26 | # for TransitMaster DB connection and ingestion 27 | RUN sed -i 's/\[openssl_init\]/# [openssl_init]/' /etc/ssl/openssl.cnf \ 28 | && echo '\n\n[openssl_init]\nssl_conf = ssl_sect\n\n[ssl_sect]\nsystem_default = ssl_default_sect\n\n[ssl_default_sect]\nMinProtocol = TLSv1\nCipherString = DEFAULT@SECLEVEL=0\n' >> /etc/ssl/openssl.cnf 29 | 30 | # Create tmp directory that will mount to the ephemeral storage on ECS 31 | # Implemented to solve this problem: https://github.com/aws/amazon-ecs-agent/issues/3594 32 | # Where the reported memory usage reported up to ECS far exceeds the actual memory usage 33 | # when many reads/writes occur on a temp directory. 34 | # Related terraform changes are here: https://github.com/mbta/devops/pull/2727 35 | RUN mkdir -m 1777 -p /tmp 36 | VOLUME ["/tmp"] 37 | 38 | # Install poetry 39 | RUN pip install -U pip 40 | RUN pip install "poetry==1.7.1" 41 | 42 | # copy poetry and pyproject files and install dependencies 43 | WORKDIR /lamp/ 44 | COPY poetry.lock poetry.lock 45 | COPY pyproject.toml pyproject.toml 46 | 47 | # Tableau dependencies for arm64 cannot be resolved (since salesforce doesn't 48 | # support them yet). 
For that buildplatform build without those dependencies 49 | ARG TARGETARCH BUILDPLATFORM TARGETPLATFORM 50 | RUN echo "Installing python dependencies for build: ${BUILDPLATFORM} target: ${TARGETPLATFORM}" 51 | RUN if [ "$TARGETARCH" = "arm64" ]; then \ 52 | poetry install --without tableau --no-interaction --no-ansi -v ;\ 53 | else poetry install --no-interaction --no-ansi -v ;\ 54 | fi 55 | 56 | # Copy src directory to run against and build lamp py 57 | COPY src src 58 | COPY alembic.ini alembic.ini 59 | 60 | # Add Version information as an argument, it is provided by GHA and left to the 61 | # default for local development. 62 | ARG VERSION="v0.0.0-unknown" 63 | RUN echo "VERSION = '${VERSION}'" > src/lamp_py/__version__.py 64 | 65 | RUN poetry install --only main --no-interaction --no-ansi -v 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Massachusetts Bay Transportation Authority 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /analysis/check_data_all_days_in_LAMP_ALL_RT_fields.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import datetime 3 | import polars as pl 4 | from pyarrow import dataset as pd 5 | 6 | # ds = pd.dataset("s3://mbta-ctd-dataplatform-staging-archive/lamp/tableau/rail/LAMP_ALL_RT_fields.parquet") 7 | ds = pd.dataset("https://performancedata.mbta.com/lamp/tableau/rail/LAMP_ALL_RT_fields.parquet") 8 | dates = [] 9 | # todo - 30 days? 31 days? 
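# April 2025 has 30 days, so range(1, 31) below iterates days 1 through 30 and covers the full month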
10 | for day in range(1, 31): 11 | date = datetime.datetime(2025, 4, day) 12 | for bat in ds.to_batches( 13 | batch_size=500_000, batch_readahead=5, fragment_readahead=0, columns=["service_date", "route_id"] 14 | ): 15 | # breakpoint() 16 | pls = pl.from_arrow(bat) 17 | res = pls.filter(pl.col("service_date") == date) 18 | # breakpoint() 19 | if res.height > 0: 20 | dates.append(date) 21 | print(f"ok: {date}, {res.height}") 22 | # print(".")÷ 23 | 24 | assert all([(dates[i + 1] - dates[i]).days < 2 for i in range(len(dates) - 1)]) 25 | -------------------------------------------------------------------------------- /analysis/prism.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from lamp_py.aws.s3 import file_list_from_s3_with_details 4 | from lamp_py.runtime_utils.remote_files import S3_SPRINGBOARD 5 | 6 | # Problem: Looking into data quality issues is a very adhoc process right now. Expertise/knowledge 7 | # not centralized in code that is easily runnable (it's mostly in the app itself) 8 | 9 | # Solution: # Prism.py (working name...) "See the Rainbow" - WIP entry point to analysis suite to 10 | # organize tools for looking at LAMP data products inputs and outputs 11 | 12 | files = file_list_from_s3_with_details(bucket_name="mbta-ctd-dataplatform-staging-archive", file_prefix="lamp/tableau/") 13 | 14 | print(files) 15 | breakpoint() 16 | 17 | for f in files: 18 | print(f"{os.path.basename(f['s3_obj_path'])}: sz: {f['size_bytes']} last mod: {f['last_modified']}") 19 | 20 | # detect data source from what 21 | # returned object contains methods that are available given the input data 22 | 23 | # ideas... 24 | # e.g. prism(some_data_from_springboard) 25 | # - detect that it is Vehicle Positions file from path 26 | # - load it up 27 | # - implementations of various analysis chosen for VP 28 | 29 | # https://docs.python.org/3/library/functools.html#functools.singledispatch 30 | -------------------------------------------------------------------------------- /architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/architecture.jpg -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | rail_pm_rds: 6 | container_name: ${RPM_DB_HOST} 7 | image: postgres:14.4 8 | env_file: .env 9 | shm_size: '2gb' 10 | environment: 11 | POSTGRES_DB: ${RPM_DB_NAME} 12 | POSTGRES_PASSWORD: ${RPM_DB_PASSWORD} 13 | ports: 14 | - "${RPM_DB_PORT}:5432" 15 | command: ["postgres", "-c", "log_statement=all"] 16 | 17 | metadata_rds: 18 | container_name: ${MD_DB_HOST} 19 | image: postgres:15 20 | env_file: .env 21 | shm_size: '2gb' 22 | environment: 23 | POSTGRES_DB: ${MD_DB_NAME} 24 | POSTGRES_PASSWORD: ${MD_DB_PASSWORD} 25 | ports: 26 | - "${MD_DB_PORT}:5432" 27 | command: ["postgres", "-c", "log_statement=all"] 28 | 29 | performance_manager: 30 | container_name: performance_manager 31 | env_file: .env 32 | build: . 33 | depends_on: 34 | - rail_pm_rds 35 | - metadata_rds 36 | working_dir: /lamp 37 | volumes: 38 | - ~/.aws:/root/.aws:ro # map credentials to be used by boto3, read-only 39 | command: ["poetry", "run", "performance_manager"] 40 | 41 | bus_performance_manager: 42 | container_name: bus_performance_manager 43 | env_file: .env 44 | build: . 
45 | depends_on: 46 | - metadata_rds 47 | working_dir: /lamp 48 | volumes: 49 | - ~/.aws:/root/.aws:ro # map credentials to be used by boto3, read-only 50 | command: ["poetry", "run", "bus_performance_manager"] 51 | 52 | seed_metadata: 53 | container_name: seed_metadata 54 | env_file: .env 55 | build: . 56 | depends_on: 57 | - rail_pm_rds 58 | - metadata_rds 59 | working_dir: /lamp 60 | volumes: 61 | # map credentials to be used by boto3, read-only 62 | - ~/.aws:/root/.aws:ro 63 | # add in filepath json that will be the default seed file path 64 | - ./tests/test_files/staging_dec_10.json:/seed_paths.json 65 | # entrypoint passes in seed file thats added as a volume. if you want to use a different 66 | # filepath run 67 | # docker-compose run -v /path/to/files.json:/seed.json seed_metadata --seed-file /seed.json 68 | entrypoint: 69 | [ 70 | "poetry", 71 | "run", 72 | "seed_metadata", 73 | "--clear-static", 74 | "--clear-rt", 75 | "--seed-file", 76 | "/seed_paths.json" 77 | ] 78 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "lamp_py" 3 | version = "0.1.0" 4 | description = "Lightweight Application for Monitoring Performance" 5 | authors = [ 6 | "MBTA CTD ", 7 | "Ryan Rymarczyk ", 8 | "Mike Zappitello ", 9 | "Henry Huang ", 10 | ] 11 | 12 | [tool.poetry.scripts] 13 | ingestion = 'lamp_py.ingestion.pipeline:start' 14 | performance_manager = 'lamp_py.performance_manager.pipeline:start' 15 | bus_performance_manager = 'lamp_py.bus_performance_manager.pipeline:start' 16 | seed_metadata = 'lamp_py.postgres.seed_metadata:run' 17 | hyper_update = 'lamp_py.tableau.pipeline:start_hyper_updates' 18 | transit_master_ingestion = 'lamp_py.ingestion_tm.pipeline:start' 19 | ad_hoc = 'lamp_py.ad_hoc.pipeline:start' 20 | 21 | [tool.poetry.dependencies] 22 | python = "^3.10" 23 | SQLAlchemy = "^2.0.30" 24 | pyarrow = "^19.0.1" 25 | boto3 = "^1.35.2" 26 | pandas = "^2.2.1" 27 | numpy = "^1.26.4" 28 | psycopg2 = "^2.9.3" 29 | psutil = "^5.9.8" 30 | schedule = "^1.1.0" 31 | alembic = "^1.10.2" 32 | types-pytz = "^2024.1.0.20240203" 33 | pyodbc = "^5.1.0" 34 | polars = "^1.3.0" 35 | 36 | [tool.poetry.group.tableau] 37 | optional = false 38 | 39 | [tool.poetry.group.tableau.dependencies] 40 | tableauhyperapi = "^0.0.21408" 41 | tableauserverclient = "0.30" 42 | 43 | [tool.poetry.group.investigation] 44 | optional = true 45 | 46 | [tool.poetry.group.investigation.dependencies] 47 | ipykernel = "^6.29.4" 48 | matplotlib = "^3.9.0" 49 | seaborn = "^0.13.2" 50 | tabulate = "^0.9.0" 51 | 52 | [tool.poetry.group.dev.dependencies] 53 | black = "^24.3.0" 54 | mypy = "^1.1.1" 55 | pylint = "^3.2.6" 56 | pytest = "^8.3.2" 57 | pytest-cov = "^5.0.0" 58 | types-python-dateutil = "^2.9.0.20240316" 59 | pytest-env = "^1.1.3" 60 | 61 | [build-system] 62 | requires = ["poetry-core>=1.0.0"] 63 | build-backend = "poetry.core.masonry.api" 64 | 65 | [tool.black] 66 | line-length = 120 67 | target-version = ['py310'] 68 | 69 | [tool.mypy] 70 | disallow_untyped_defs = true 71 | ignore_missing_imports = true 72 | plugins = ["sqlalchemy.ext.mypy.plugin"] 73 | pretty = true 74 | python_version = "3.10" 75 | warn_unreachable = true 76 | warn_unused_ignores = true 77 | exclude = ["investigation/", "runners"] 78 | 79 | [tool.pytest] 80 | log_cli = true 81 | log_cli_level = "DEBUG" 82 | verbose = true 83 | 84 | [tool.pytest.ini_options] 85 | env = [ 86 | 
"SPRINGBOARD_BUCKET=SPRINGBOARD", 87 | "PUBLIC_ARCHIVE_BUCKET=PUBLIC_ARCHIVE", 88 | "INCOMING_BUCKET=INCOMING", 89 | ] 90 | 91 | [tool.pylint] 92 | disable = [ 93 | # disable doc string requirements 94 | "missing-module-docstring", 95 | # allow catching a generic exception 96 | "broad-except", 97 | # caught by black 98 | "line-too-long", 99 | # we're logging everything so its "ok" 100 | "lost-exception", 101 | # for some reason Iterable[type] is triggering this error on github 102 | "unsubscriptable-object", 103 | # Converter abstract base class only has one common function 104 | "too-few-public-methods", 105 | # l1_rt_trips.py over 1000 lines 106 | "too-many-lines", 107 | ] 108 | good-names = ["e", "i", "s"] 109 | max-line-length = 120 110 | min-similarity-lines = 10 111 | # ignore session maker as it gives pylint fits 112 | # https://github.com/PyCQA/pylint/issues/7090 113 | ignored-classes = ['sqlalchemy.orm.session.sessionmaker', 'pyarrow.compute'] 114 | # ignore the migrations directory. its going to have duplication and _that is ok_. 115 | ignore-paths = ["^src/lamp_py/migrations/.*$"] 116 | -------------------------------------------------------------------------------- /runners/run_glides_parquet_converter.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from lamp_py.tableau.hyper import HyperJob 4 | from lamp_py.tableau.jobs.glides import HyperGlidesOperatorSignIns, HyperGlidesTripUpdates 5 | 6 | 7 | # don't run this in pytest - environment variables in pyproject.toml point to local SPRINGBOARD/ARCHIVE 8 | # need the .env values to run 9 | def start_glides_parquet_updates() -> None: 10 | """Run all Glides Parquet Update jobs""" 11 | 12 | parquet_update_jobs: List[HyperJob] = [ 13 | HyperGlidesTripUpdates(), 14 | HyperGlidesOperatorSignIns(), 15 | ] 16 | 17 | for job in parquet_update_jobs: 18 | breakpoint() 19 | job.run_parquet(None) 20 | outs = job.create_local_hyper() 21 | print(outs) 22 | 23 | 24 | if __name__ == "__main__": 25 | start_glides_parquet_updates() 26 | -------------------------------------------------------------------------------- /runners/run_query_s3_with_date_range.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from lamp_py.aws.s3 import file_list_from_s3_date_range 4 | from lamp_py.runtime_utils.remote_files import LAMP, S3_SPRINGBOARD 5 | from lamp_py.runtime_utils.remote_files import springboard_rt_vehicle_positions 6 | 7 | template = "year={yy}/month={mm}/day={dd}/" 8 | end_date = datetime.now() 9 | end_date = datetime(year=2025, month=3, day=30) 10 | start_date = end_date - timedelta(days=15) # type: ignore 11 | 12 | breakpoint() 13 | s3_uris = file_list_from_s3_date_range( 14 | bucket_name=S3_SPRINGBOARD, 15 | file_prefix=springboard_rt_vehicle_positions.prefix, 16 | path_template=template, 17 | end_date=end_date, 18 | start_date=start_date, 19 | ) 20 | 21 | print(s3_uris) 22 | -------------------------------------------------------------------------------- /runners/run_static_trips_subquery.py: -------------------------------------------------------------------------------- 1 | from lamp_py.performance_manager.l1_cte_statements import static_trips_subquery_pl 2 | 3 | 4 | static_trips_sub_res = static_trips_subquery_pl(20250415).sort(by="static_trip_id") 5 | 6 | static_trips_sub_res.write_csv("20250415_static_trips_subquery.csv") 7 | 
-------------------------------------------------------------------------------- /src/lamp_py/__version__.py: -------------------------------------------------------------------------------- 1 | # this is just a stub needed for imports to work correctly. 2 | # 3 | # this file will be overwritten in a docker image 4 | VERSION = "v0.0.0-unknown" 5 | -------------------------------------------------------------------------------- /src/lamp_py/ad_hoc/__init__.py: -------------------------------------------------------------------------------- 1 | """location for all ad-hoc process runner scripts""" 2 | -------------------------------------------------------------------------------- /src/lamp_py/ad_hoc/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import os 5 | 6 | from lamp_py.aws.ecs import check_for_parallel_tasks 7 | from lamp_py.runtime_utils.env_validation import validate_environment 8 | 9 | from lamp_py.ad_hoc.runner_001 import runner 10 | 11 | logging.getLogger().setLevel("INFO") 12 | DESCRIPTION = """Entry Point For Ad-Hoc Runner""" 13 | 14 | 15 | def start() -> None: 16 | """configure and start the ad-hoc runner""" 17 | # configure the environment 18 | os.environ["SERVICE_NAME"] = "ad_hoc" 19 | 20 | validate_environment( 21 | required_variables=[ 22 | "ARCHIVE_BUCKET", 23 | "ERROR_BUCKET", 24 | "INCOMING_BUCKET", 25 | "PUBLIC_ARCHIVE_BUCKET", 26 | "SPRINGBOARD_BUCKET", 27 | ], 28 | db_prefixes=["MD", "RPM"], 29 | ) 30 | 31 | check_for_parallel_tasks() 32 | 33 | # run the main method 34 | runner() 35 | 36 | 37 | if __name__ == "__main__": 38 | start() 39 | -------------------------------------------------------------------------------- /src/lamp_py/aws/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities for dealing with AWS infrastructure """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/aws/ecs.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import sys 4 | from multiprocessing import Process 5 | from queue import Queue 6 | from typing import Any, Optional 7 | 8 | import boto3 9 | 10 | from lamp_py.runtime_utils.process_logger import ProcessLogger 11 | 12 | 13 | def handle_ecs_sigterm(_: int, __: Any) -> None: 14 | """ 15 | handler function for when the ECS task receives SIGTERM 16 | """ 17 | process_logger = ProcessLogger("sigterm_received") 18 | process_logger.log_start() 19 | os.environ["GOT_SIGTERM"] = "TRUE" 20 | process_logger.log_complete() 21 | 22 | 23 | def check_for_sigterm( 24 | metadata_queue: Optional[Queue[Optional[str]]] = None, 25 | rds_process: Optional[Process] = None, 26 | ) -> None: 27 | """ 28 | check if a SIGTERM was received from ECS. If found, terminate process.
29 | """ 30 | if os.environ.get("GOT_SIGTERM") is not None: 31 | process_logger = ProcessLogger("stopping_ecs") 32 | process_logger.log_start() 33 | 34 | # send signal to stop rds writer process and wait for exit 35 | if metadata_queue is not None: 36 | metadata_queue.put(None) 37 | if rds_process is not None: 38 | rds_process.join() 39 | 40 | process_logger.log_complete() 41 | 42 | # delay for log statements to write before ecs death 43 | time.sleep(5) 44 | 45 | sys.exit() 46 | 47 | 48 | def running_in_aws() -> bool: 49 | """ 50 | return True if running on aws, else False 51 | """ 52 | return bool(os.getenv("AWS_DEFAULT_REGION")) 53 | 54 | 55 | def check_for_parallel_tasks() -> None: 56 | """ 57 | Check that that this task is not already running on ECS 58 | """ 59 | if not running_in_aws(): 60 | return 61 | 62 | process_logger = ProcessLogger("check_for_tasks") 63 | process_logger.log_start() 64 | 65 | client = boto3.client("ecs") 66 | ecs_cluster = os.environ["ECS_CLUSTER"] 67 | ecs_task_group = os.environ["ECS_TASK_GROUP"] 68 | 69 | try: 70 | # get all of the tasks running on the cluster 71 | task_arns = client.list_tasks(cluster=ecs_cluster)["taskArns"] 72 | 73 | # if tasks are running on the cluster, get their descriptions and check to 74 | # count matches the ecs task group. 75 | match_count = 0 76 | if task_arns: 77 | running_tasks = client.describe_tasks(cluster=ecs_cluster, tasks=task_arns)["tasks"] 78 | 79 | for task in running_tasks: 80 | if ecs_task_group == task["group"]: 81 | match_count += 1 82 | 83 | # if the group matches, raise an exception that will terminate the process 84 | if match_count > 1: 85 | raise SystemError(f"Multiple {ecs_task_group} ECS Tasks Running in {ecs_cluster}") 86 | 87 | except Exception as exception: 88 | process_logger.log_failure(exception) 89 | raise exception 90 | 91 | process_logger.log_complete() 92 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/README.md: -------------------------------------------------------------------------------- 1 | # Bus Performance Manager 2 | 3 | The Bus Performance Manager is an application to measure bus performance on the MBTA transit system. 
4 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for creating bus performance manager metrics 3 | """ 4 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import sched 7 | import signal 8 | import sys 9 | import time 10 | from typing import List 11 | 12 | from lamp_py.aws.ecs import handle_ecs_sigterm, check_for_sigterm 13 | from lamp_py.runtime_utils.env_validation import validate_environment 14 | from lamp_py.runtime_utils.process_logger import ProcessLogger 15 | from lamp_py.bus_performance_manager.write_events import write_bus_metrics 16 | from lamp_py.tableau.pipeline import start_bus_parquet_updates 17 | 18 | logging.getLogger().setLevel("INFO") 19 | 20 | DESCRIPTION = """Entry Point to Bus Performance Manager""" 21 | 22 | 23 | def parse_args(args: List[str]) -> argparse.Namespace: 24 | """parse args for running this entrypoint script""" 25 | parser = argparse.ArgumentParser(description=DESCRIPTION) 26 | parser.add_argument( 27 | "--interval", 28 | default=300, 29 | dest="interval", 30 | help="interval to run event loop on", 31 | ) 32 | 33 | return parser.parse_args(args) 34 | 35 | 36 | def main(args: argparse.Namespace) -> None: 37 | """entrypoint into performance manager event loop""" 38 | main_process_logger = ProcessLogger("main", **vars(args)) 39 | main_process_logger.log_start() 40 | 41 | # schedule object that will control the "event loop" 42 | scheduler = sched.scheduler(time.time, time.sleep) 43 | 44 | # function to call each time on the event loop, rescheduling the loop at the 45 | # end of each iteration 46 | def iteration() -> None: 47 | """function to invoke on a scheduled routine""" 48 | check_for_sigterm() 49 | process_logger = ProcessLogger("event_loop") 50 | process_logger.log_start() 51 | try: 52 | write_bus_metrics() 53 | start_bus_parquet_updates() 54 | process_logger.log_complete() 55 | except Exception as exception: 56 | process_logger.log_failure(exception) 57 | finally: 58 | scheduler.enter(int(args.interval), 1, iteration) 59 | 60 | # schedule the initial loop and start the scheduler 61 | scheduler.enter(0, 1, iteration) 62 | scheduler.run() 63 | main_process_logger.log_complete() 64 | 65 | 66 | def start() -> None: 67 | """configure and start the bus performance manager process""" 68 | # parse arguments from the command line 69 | parsed_args = parse_args(sys.argv[1:]) 70 | 71 | # setup handling shutdown commands 72 | signal.signal(signal.SIGTERM, handle_ecs_sigterm) 73 | 74 | # configure the environment 75 | os.environ["SERVICE_NAME"] = "bus_performance_manager" 76 | validate_environment( 77 | required_variables=[ 78 | "SPRINGBOARD_BUCKET", 79 | "PUBLIC_ARCHIVE_BUCKET", 80 | "SERVICE_NAME", 81 | ], 82 | ) 83 | 84 | # run main method 85 | main(parsed_args) 86 | 87 | 88 | if __name__ == "__main__": 89 | start() 90 | -------------------------------------------------------------------------------- /src/lamp_py/bus_performance_manager/write_events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | from lamp_py.bus_performance_manager.event_files import 
event_files_to_load 5 | from lamp_py.bus_performance_manager.events_metrics import bus_performance_metrics 6 | from lamp_py.runtime_utils.lamp_exception import LampExpectedNotFoundError, LampInvalidProcessingError 7 | from lamp_py.runtime_utils.remote_files import bus_events 8 | from lamp_py.runtime_utils.remote_files import VERSION_KEY 9 | from lamp_py.runtime_utils.process_logger import ProcessLogger 10 | from lamp_py.aws.s3 import upload_file 11 | 12 | 13 | def write_bus_metrics() -> None: 14 | """ 15 | Write bus-performance parquet files to S3 for service dates needing to be processed 16 | """ 17 | logger = ProcessLogger("write_bus_metrics") 18 | logger.log_start() 19 | 20 | event_files = event_files_to_load() 21 | logger.add_metadata(service_date_count=len(event_files)) 22 | 23 | for service_date in event_files.keys(): 24 | gtfs_files = event_files[service_date]["gtfs_rt"] 25 | tm_files = event_files[service_date]["transit_master"] 26 | 27 | day_logger = ProcessLogger( 28 | "write_bus_metrics_day", 29 | service_date=service_date, 30 | gtfs_file_count=len(gtfs_files), 31 | tm_file_count=len(tm_files), 32 | ) 33 | day_logger.log_start() 34 | 35 | # need gtfs_rt files to run process 36 | if len(gtfs_files) == 0: 37 | day_logger.log_failure(FileNotFoundError(f"No RT_VEHICLE_POSITION files found for {service_date}")) 38 | continue 39 | 40 | try: 41 | events_df = bus_performance_metrics(service_date, gtfs_files, tm_files) 42 | day_logger.add_metadata(bus_performance_rows=events_df.shape[0]) 43 | 44 | with tempfile.TemporaryDirectory() as tempdir: 45 | write_file = f"{service_date.strftime('%Y%m%d')}.parquet" 46 | events_df.write_parquet(os.path.join(tempdir, write_file), use_pyarrow=True) 47 | 48 | upload_file( 49 | file_name=os.path.join(tempdir, write_file), 50 | object_path=os.path.join(bus_events.s3_uri, write_file), 51 | extra_args={"Metadata": {VERSION_KEY: bus_events.version}}, 52 | ) 53 | 54 | except LampExpectedNotFoundError as exception: 55 | # service_date not found = ExpectedNotFound 56 | day_logger.add_metadata(skipped_day=exception) 57 | continue 58 | except LampInvalidProcessingError as exception: 59 | # num service date > 1 = InvalidProcessing (this should never happen) 60 | day_logger.log_failure(exception) 61 | except Exception as exception: 62 | day_logger.log_failure(exception) 63 | 64 | day_logger.log_complete() 65 | 66 | logger.log_complete() 67 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/README.md: -------------------------------------------------------------------------------- 1 | # Ingestion 2 | 3 | Ingestion is an application to transform and aggregate GTFS-RT and GTFS Static files into parquet files for storage in AWS S3 buckets. 4 | 5 | ## Application Operation 6 | 7 | Ingestion operates on a chronological event loop with a 5 minute delay between each iteration. 8 | 9 | Ingestion connects to the [Performance Manager](../performance_manager/README.md) application via the `metadata_log` table of the Metadata RDS. When Ingestion creates a new parquet file, the S3 path of that file is written to the `metadata_log` table for Performance Manager to process. 10 | 11 | On each event loop iteration, GTFS Static files are processed prior to any GTFS-RT files, when available. 12 | 13 | ## Event Loop Summary 14 | 15 | 1. List all files from `incoming` S3 bucket 16 | 2. Bucket files into applicable `Converter` class 17 | 3. Start `converter` loop of each `Converter` class, creating parquet files 18 | 4. 
Write parquet file to S3 Bucket 19 | 5. Write S3 path of parquet file to `metadata_log` table for Performance Manager 20 | 6. Move successfully processed `incoming` files to `archive` bucket 21 | 7. Move un-successfully processed `incoming` files to `error` bucket 22 | 23 | # GTFS Static 24 | 25 | [GTFS Static](https://www.mbta.com/developers/gtfs) Zip files are generated by MBTA for internal and external distribution. 26 | 27 | This application converts GTFS Zip files to partitioned parquet files that are exported to an S3 bucket. This is done with the [GTFS Converter Class](./convert_gtfs.py). 28 | 29 | GTFS Static parquet files are written to S3 with the following partitioning: 30 | 31 | * [GTFS File Type](https://github.com/mbta/gtfs-documentation/blob/master/reference/gtfs.md#gtfs-files) 32 | * timestamp = datetime extracted from `feed_version` column of [feed_info.txt](https://github.com/mbta/gtfs-documentation/blob/master/reference/gtfs.md#feed_infotxt), converted to UNIX timestamp 33 | 34 | # GTFS-RT Data 35 | 36 | [GTFS-realtime](https://www.mbta.com/developers/gtfs-realtime) (GTFS-RT) is provided by MBTA as an industry standard for distributing realtime transit data. 37 | 38 | The CTD [Delta](https://github.com/mbta/delta) application is responsible for reading GTFS-RT updates from the MBTA [V3 API](https://www.mbta.com/developers/v3-api) and saving them to an AWS S3 Bucket, as gzipped JSON files, for use by LAMP. 39 | 40 | This application aggregates gzipped GTFS-RT update files, saved on S3 by Delta, into partitioned parquet files that are exported to an S3 bucket. The parquet files are partitioned daily, by GTFS-RT feed type. This is done with the [GTFS-RT Converter Class](./convert_gtfs_rt.py) 41 | 42 | GTFS-RT parquet files are transformed and partitioned based on their `Converter Class` configuration: 43 | 44 | * [Busloc Trip Updates](./config_busloc_trip.py) 45 | * [Busloc Vehicle Positions](./config_busloc_vehicle.py) 46 | * [Realtime Vehicle Positions](./config_rt_vehicle.py) 47 | * [Realtime Trip Updates](./config_rt_trip.py) 48 | * [Sevice Alerts](./config_rt_alerts.py) 49 | 50 | # Compressed GTFS Archive Files 51 | 52 | GTFS Zip files are converted to yearly partitioned parquet files, using a differential compression process, and exported to AWS S3 for publishing/storage. 53 | 54 | For more Information about these files, please see: https://performancedata.mbta.com/ 55 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for processing ingesting GTFS static schedule files and GTFS real time 3 | files from an s3 bucket. The realtime files are collapsed into parquet files 4 | for long term storage. 
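A minimal sketch of reading that partitioned output back with pyarrow; the bucket name, prefix, and layout below are assumptions for illustration only, not the real S3 structure:

    import pyarrow.dataset as pds

    # Hypothetical prefix; the real buckets are configured through environment
    # variables such as SPRINGBOARD_BUCKET.
    ds = pds.dataset(
        "s3://example-bucket/lamp/RT_VEHICLE_POSITIONS/",
        format="parquet",
    )
    table = ds.to_table()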
5 | """ 6 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/compress_gtfs/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools to compress GTFS schedules into parquet files""" 2 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/compress_gtfs/pipe.py: -------------------------------------------------------------------------------- 1 | from lamp_py.ingestion.compress_gtfs.gtfs_to_parquet import gtfs_to_parquet 2 | 3 | if __name__ == "__main__": 4 | gtfs_to_parquet() 5 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/compress_gtfs/pq_to_sqlite.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | 4 | import pyarrow 5 | import pyarrow.dataset as pd 6 | 7 | from lamp_py.runtime_utils.process_logger import ProcessLogger 8 | from lamp_py.ingestion.utils import gzip_file 9 | 10 | 11 | def sqlite_type(pq_type: str) -> str: 12 | """ 13 | return SQLITE type from pyarrow Field type 14 | """ 15 | if "int" in pq_type: 16 | return "INTEGER" 17 | if "bool" in pq_type: 18 | return "INTEGER" 19 | if "float" in pq_type: 20 | return "REAL" 21 | if "double" in pq_type: 22 | return "REAL" 23 | return "TEXT" 24 | 25 | 26 | def sqlite_table_query(table_name: str, schema: pyarrow.Schema) -> str: 27 | """ 28 | return CREATE TABLE query for sqlite table from pyarrow schema 29 | """ 30 | logger = ProcessLogger("sqlite_create_table") 31 | logger.log_start() 32 | field_list = [f"{field.name} {sqlite_type(str(field.type))}" for field in schema] 33 | query = f""" 34 | CREATE TABLE 35 | IF NOT EXISTS 36 | {table_name} 37 | ( 38 | {','.join(field_list)} 39 | ); 40 | """ 41 | logger.log_complete() 42 | return query 43 | 44 | 45 | def pq_folder_to_sqlite(year_path: str) -> None: 46 | """ 47 | load all files from year_path folder into SQLITE3 db file 48 | """ 49 | logger = ProcessLogger("pq_to_sqlite", year_path=year_path) 50 | logger.log_start() 51 | 52 | db_path = os.path.join(year_path, "GTFS_ARCHIVE.db") 53 | if os.path.exists(db_path): 54 | os.remove(db_path) 55 | try: 56 | for file in os.listdir(year_path): 57 | if ".parquet" not in file: 58 | continue 59 | logger.add_metadata(current_file=file) 60 | 61 | ds = pd.dataset(os.path.join(year_path, file)) 62 | 63 | table = file.replace(".parquet", "") 64 | columns = [f":{col}" for col in ds.schema.names] 65 | insert_query = f"INSERT INTO {table} VALUES({','.join(columns)});" 66 | 67 | conn = sqlite3.connect(db_path) 68 | with conn: 69 | conn.execute(sqlite_table_query(table, ds.schema)) 70 | with conn: 71 | for batch in ds.to_batches(batch_size=250_000): 72 | conn.executemany(insert_query, batch.to_pylist()) 73 | conn.close() 74 | 75 | gzip_file(db_path) 76 | 77 | logger.log_complete() 78 | except Exception as exception: 79 | logger.log_failure(exception) 80 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_busloc_trip.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from lamp_py.ingestion.gtfs_rt_detail import GTFSRTDetail 5 | from lamp_py.ingestion.gtfs_rt_structs import ( 6 | trip_descriptor, 7 | vehicle_descriptor, 8 | stop_time_event, 9 | ) 10 | from lamp_py.ingestion.utils import explode_table_column, flatten_schema 11 | 12 
| 13 | class RtBusTripDetail(GTFSRTDetail): 14 | """ 15 | Detail for how to convert RT GTFS Trip Updates from json entries into 16 | parquet tables. 17 | """ 18 | 19 | def transform_for_write(self, table: pyarrow.table) -> pyarrow.table: 20 | """modify table schema before write to parquet""" 21 | return flatten_schema(explode_table_column(flatten_schema(table), "trip_update.stop_time_update")) 22 | 23 | @property 24 | def partition_column(self) -> str: 25 | return "trip_update.trip.route_id" 26 | 27 | @property 28 | def import_schema(self) -> pyarrow.schema: 29 | return pyarrow.schema( 30 | [ 31 | ("id", pyarrow.string()), 32 | ( 33 | "trip_update", 34 | pyarrow.struct( 35 | [ 36 | ( 37 | "timestamp", 38 | pyarrow.uint64(), 39 | ), # Not currently provided by Busloc 40 | ( 41 | "delay", 42 | pyarrow.int32(), 43 | ), # Not currently provided by Busloc 44 | ( 45 | "trip", 46 | trip_descriptor, 47 | ), # Busloc currently only provides trip_id, route_id and schedule_relationship 48 | ( 49 | "vehicle", 50 | vehicle_descriptor, 51 | ), # Busloc currently only provides id and label 52 | ( 53 | "stop_time_update", 54 | pyarrow.list_( 55 | pyarrow.struct( 56 | [ 57 | ("stop_sequence", pyarrow.uint32()), 58 | ("stop_id", pyarrow.string()), 59 | ("arrival", stop_time_event), 60 | ("departure", stop_time_event), 61 | ( 62 | "schedule_relationship", 63 | pyarrow.string(), 64 | ), 65 | ("cause_id", pyarrow.uint16()), 66 | ( 67 | "cause_description", 68 | pyarrow.string(), 69 | ), 70 | ("remark", pyarrow.string()), 71 | ] 72 | ) 73 | ), 74 | ), 75 | ] 76 | ), 77 | ), 78 | ] 79 | ) 80 | 81 | @property 82 | def table_sort_order(self) -> List[Tuple[str, str]]: 83 | return [ 84 | ("trip_update.trip.route_pattern_id", "ascending"), 85 | ("trip_update.trip.direction_id", "ascending"), 86 | ("trip_update.vehicle.id", "ascending"), 87 | ] 88 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_busloc_vehicle.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from .gtfs_rt_detail import GTFSRTDetail 5 | from .gtfs_rt_structs import ( 6 | position, 7 | vehicle_descriptor, 8 | trip_descriptor, 9 | ) 10 | 11 | 12 | class RtBusVehicleDetail(GTFSRTDetail): 13 | """ 14 | Detail for how to convert RT GTFS Bus Vehicle Positions from json 15 | entries into parquet tables. 
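The conversion itself is driven elsewhere (see convert_gtfs_rt.py). The stand-alone sketch below only illustrates, with made-up values and a hive-style layout assumed for the example, how a partition column such as vehicle.trip.route_id turns into a partitioned parquet write:

    import pyarrow as pa
    import pyarrow.dataset as pds

    # Toy stand-in for a flattened vehicle-positions table.
    table = pa.table(
        {
            "vehicle.trip.route_id": ["1", "1", "66"],
            "vehicle.vehicle.id": ["y1234", "y1235", "y2001"],
        }
    )

    # One directory is produced per route value, e.g.
    # /tmp/busloc_vehicle_demo/vehicle.trip.route_id=66/part-0.parquet
    pds.write_dataset(
        table,
        base_dir="/tmp/busloc_vehicle_demo",
        format="parquet",
        partitioning=pds.partitioning(
            pa.schema([("vehicle.trip.route_id", pa.string())]), flavor="hive"
        ),
    )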
16 | """ 17 | 18 | @property 19 | def partition_column(self) -> str: 20 | return "vehicle.trip.route_id" 21 | 22 | @property 23 | def import_schema(self) -> pyarrow.schema: 24 | return pyarrow.schema( 25 | [ 26 | ("id", pyarrow.string()), 27 | ("is_deleted", pyarrow.bool_()), 28 | ( 29 | "vehicle", 30 | pyarrow.struct( 31 | [ 32 | ("position", position), 33 | ("location_source", pyarrow.string()), 34 | ("timestamp", pyarrow.uint64()), 35 | ("trip", trip_descriptor), 36 | ("vehicle", vehicle_descriptor), 37 | ( 38 | "operator", 39 | pyarrow.struct( 40 | [ 41 | ("id", pyarrow.string()), 42 | ("first_name", pyarrow.string()), 43 | ("last_name", pyarrow.string()), 44 | ("name", pyarrow.string()), 45 | ("logon_time", pyarrow.uint64()), 46 | ] 47 | ), 48 | ), 49 | ("block_id", pyarrow.string()), 50 | ("run_id", pyarrow.string()), 51 | ("stop_id", pyarrow.string()), 52 | ("current_stop_sequence", pyarrow.uint32()), 53 | ("revenue", pyarrow.bool_()), 54 | ("current_status", pyarrow.string()), 55 | ("load", pyarrow.uint16()), 56 | ("capacity", pyarrow.uint16()), 57 | ("occupancy_percentage", pyarrow.uint16()), 58 | ("occupancy_status", pyarrow.string()), 59 | ] 60 | ), 61 | ), 62 | ] 63 | ) 64 | 65 | @property 66 | def table_sort_order(self) -> List[Tuple[str, str]]: 67 | return [ 68 | ("vehicle.block_id", "ascending"), 69 | ("vehicle.vehicle.id", "ascending"), 70 | ("feed_timestamp", "ascending"), 71 | ] 72 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_rt_trip.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from lamp_py.ingestion.gtfs_rt_detail import GTFSRTDetail 5 | from lamp_py.ingestion.gtfs_rt_structs import ( 6 | trip_descriptor, 7 | vehicle_descriptor, 8 | stop_time_event, 9 | ) 10 | from lamp_py.ingestion.utils import explode_table_column, flatten_schema 11 | 12 | 13 | class RtTripDetail(GTFSRTDetail): 14 | """ 15 | Detail for how to convert RT GTFS Trip Updates from json entries into 16 | parquet tables. 
17 | """ 18 | 19 | def transform_for_write(self, table: pyarrow.table) -> pyarrow.table: 20 | """modify table schema before write to parquet""" 21 | return flatten_schema(explode_table_column(flatten_schema(table), "trip_update.stop_time_update")) 22 | 23 | @property 24 | def partition_column(self) -> str: 25 | return "trip_update.trip.route_id" 26 | 27 | @property 28 | def import_schema(self) -> pyarrow.schema: 29 | return pyarrow.schema( 30 | [ 31 | ("id", pyarrow.string()), 32 | ( 33 | "trip_update", 34 | pyarrow.struct( 35 | [ 36 | ("trip", trip_descriptor), 37 | ("vehicle", vehicle_descriptor), 38 | ( 39 | "stop_time_update", 40 | pyarrow.list_( 41 | pyarrow.struct( 42 | [ 43 | ("stop_sequence", pyarrow.uint32()), 44 | ("stop_id", pyarrow.string()), 45 | ("arrival", stop_time_event), 46 | ("departure", stop_time_event), 47 | ( 48 | "schedule_relationship", 49 | pyarrow.string(), 50 | ), 51 | ( 52 | "boarding_status", 53 | pyarrow.string(), 54 | ), # MBTA Enhanced Field 55 | ] 56 | ) 57 | ), 58 | ), 59 | ("timestamp", pyarrow.uint64()), 60 | ("delay", pyarrow.int32()), 61 | ] 62 | ), 63 | ), 64 | ] 65 | ) 66 | 67 | # pylint: disable=R0801 68 | # Similar lines in 2 files 69 | @property 70 | def table_sort_order(self) -> List[Tuple[str, str]]: 71 | return [ 72 | ("trip_update.trip.route_pattern_id", "ascending"), 73 | ("trip_update.trip.direction_id", "ascending"), 74 | ("trip_update.vehicle.id", "ascending"), 75 | ] 76 | 77 | # pylint: enable=R0801 78 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/config_rt_vehicle.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | import pyarrow 3 | 4 | from .gtfs_rt_detail import GTFSRTDetail 5 | from .gtfs_rt_structs import position, trip_descriptor, vehicle_descriptor 6 | 7 | 8 | class RtVehicleDetail(GTFSRTDetail): 9 | """ 10 | Detail for how to convert RT GTFS Vehicle Positions from json entries into 11 | parquet tables. 
12 | """ 13 | 14 | @property 15 | def partition_column(self) -> str: 16 | return "vehicle.trip.route_id" 17 | 18 | @property 19 | def import_schema(self) -> pyarrow.schema: 20 | return pyarrow.schema( 21 | [ 22 | ("id", pyarrow.string()), 23 | ( 24 | "vehicle", 25 | pyarrow.struct( 26 | [ 27 | ("trip", trip_descriptor), 28 | ("vehicle", vehicle_descriptor), 29 | ("position", position), 30 | ("current_stop_sequence", pyarrow.uint32()), 31 | ("stop_id", pyarrow.string()), 32 | ("current_status", pyarrow.string()), 33 | ("timestamp", pyarrow.uint64()), 34 | ("congestion_level", pyarrow.string()), 35 | ("occupancy_status", pyarrow.string()), 36 | ("occupancy_percentage", pyarrow.uint32()), 37 | ( 38 | "multi_carriage_details", 39 | pyarrow.list_( 40 | pyarrow.struct( 41 | [ 42 | ("id", pyarrow.string()), 43 | ("label", pyarrow.string()), 44 | ( 45 | "occupancy_status", 46 | pyarrow.string(), 47 | ), 48 | ( 49 | "occupancy_percentage", 50 | pyarrow.int32(), 51 | ), 52 | ( 53 | "carriage_sequence", 54 | pyarrow.uint32(), 55 | ), 56 | ] 57 | ) 58 | ), 59 | ), 60 | ] 61 | ), 62 | ), 63 | ] 64 | ) 65 | 66 | @property 67 | def table_sort_order(self) -> List[Tuple[str, str]]: 68 | return [ 69 | ("vehicle.vehicle.id", "ascending"), 70 | ("vehicle.trip.direction_id", "ascending"), 71 | ] 72 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/gtfs_rt_detail.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | from typing import Optional, List, Tuple 4 | 5 | import pyarrow 6 | 7 | from lamp_py.ingestion.utils import flatten_schema 8 | 9 | 10 | class GTFSRTDetail(ABC): 11 | """ 12 | Abstract Base Class for all GTFSRTDetail implementations. 13 | 14 | GTFSRTDetail classes must implement all methods and properties that are 15 | defined. 
16 | """ 17 | 18 | def transform_for_write(self, table: pyarrow.table) -> pyarrow.table: 19 | """modify table schema before write to parquet""" 20 | return flatten_schema(table) 21 | 22 | @property 23 | @abstractmethod 24 | def partition_column(self) -> str: 25 | """Column used to partition parquet files for this config""" 26 | 27 | @property 28 | @abstractmethod 29 | def import_schema(self) -> pyarrow.schema: 30 | """Get the import schema for the parquet table generated by this config""" 31 | 32 | @property 33 | def table_sort_order(self) -> Optional[List[Tuple[str, str]]]: 34 | """ 35 | Provide list of fields to sort pyarrow table before writing to parquet 36 | 37 | table_sort_order should be configured to optimize parquet file size 38 | when writing to disk 39 | 40 | Currently specified sort orders were determined by a small amount of experimentation 41 | 42 | TODO: perform additional experiments to optimize sort order of all parquet file types # pylint: disable=fixme 43 | """ 44 | return None 45 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/gtfs_rt_structs.py: -------------------------------------------------------------------------------- 1 | import pyarrow 2 | 3 | position = pyarrow.struct( 4 | [ 5 | ("bearing", pyarrow.uint16()), 6 | ("latitude", pyarrow.float64()), 7 | ("longitude", pyarrow.float64()), 8 | ("speed", pyarrow.float64()), 9 | ("odometer", pyarrow.float64()), 10 | ] 11 | ) 12 | 13 | trip_descriptor = pyarrow.struct( 14 | [ 15 | ("trip_id", pyarrow.string()), 16 | ("route_id", pyarrow.string()), 17 | ("direction_id", pyarrow.uint8()), 18 | ("start_time", pyarrow.string()), 19 | ("start_date", pyarrow.string()), 20 | ("schedule_relationship", pyarrow.string()), 21 | ("route_pattern_id", pyarrow.string()), # MBTA Enhanced Field 22 | ("tm_trip_id", pyarrow.string()), # Only used by Busloc 23 | ("overload_id", pyarrow.int64()), # Only used by Busloc 24 | ("overload_offset", pyarrow.int64()), # Only used by Busloc 25 | ("revenue", pyarrow.bool_()), # MBTA Enhanced Field 26 | ("last_trip", pyarrow.bool_()), # MBTA Enhanced Field 27 | ] 28 | ) 29 | 30 | vehicle_descriptor = pyarrow.struct( 31 | [ 32 | ("id", pyarrow.string()), 33 | ("label", pyarrow.string()), 34 | ("license_plate", pyarrow.string()), 35 | ( 36 | "consist", 37 | pyarrow.list_( 38 | pyarrow.struct( 39 | [ 40 | ("label", pyarrow.string()), 41 | ] 42 | ), 43 | ), 44 | ), # MBTA Enhanced Field 45 | ("assignment_status", pyarrow.string()), # Only used by Busloc 46 | ] 47 | ) 48 | 49 | translated_string = pyarrow.struct( 50 | [ 51 | ( 52 | "translation", 53 | pyarrow.list_( 54 | pyarrow.struct( 55 | [ 56 | ("text", pyarrow.string()), 57 | ("language", pyarrow.string()), 58 | ] 59 | ) 60 | ), 61 | ) 62 | ] 63 | ) 64 | 65 | stop_time_event = pyarrow.struct( 66 | [ 67 | ("delay", pyarrow.int32()), 68 | ("time", pyarrow.int64()), 69 | ("uncertainty", pyarrow.int32()), 70 | ] 71 | ) 72 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | import logging 6 | import signal 7 | 8 | from lamp_py.aws.ecs import handle_ecs_sigterm, check_for_sigterm 9 | from lamp_py.aws.kinesis import KinesisReader 10 | from lamp_py.postgres.postgres_utils import start_rds_writer_process 11 | from lamp_py.runtime_utils.alembic_migration import alembic_upgrade_to_head 12 | from 
lamp_py.runtime_utils.env_validation import validate_environment 13 | from lamp_py.runtime_utils.process_logger import ProcessLogger 14 | 15 | from lamp_py.ingestion.ingest_gtfs import ingest_gtfs 16 | from lamp_py.ingestion.glides import ingest_glides_events 17 | 18 | # from lamp_py.ingestion.light_rail_gps import ingest_light_rail_gps 19 | from lamp_py.runtime_utils.remote_files import LAMP 20 | from lamp_py.utils.clear_folder import clear_folder 21 | 22 | logging.getLogger().setLevel("INFO") 23 | DESCRIPTION = """Entry Point For GTFS Ingestion Scripts""" 24 | 25 | 26 | def main() -> None: 27 | """ 28 | run the ingestion pipeline 29 | 30 | * setup metadata queue metadata writer process 31 | * setup a glides kinesis reader 32 | * on a loop 33 | * check to see if the pipeline should be terminated 34 | * ingest files from incoming s3 bucket 35 | * ingest glides events from kinesis 36 | """ 37 | # start rds writer process 38 | # this will create only one rds engine while app is running 39 | metadata_queue, rds_process = start_rds_writer_process() 40 | 41 | # connect to the glides kinesis stream 42 | glides_reader = KinesisReader(stream_name="ctd-glides-prod") 43 | 44 | # run the event loop every 30 seconds 45 | while True: 46 | process_logger = ProcessLogger(process_name="main") 47 | process_logger.log_start() 48 | bucket_filter = LAMP 49 | check_for_sigterm(metadata_queue, rds_process) 50 | # ingest_light_rail_gps(bucket_filter=bucket_filter) 51 | ingest_gtfs(metadata_queue, bucket_filter=bucket_filter) 52 | ingest_glides_events(glides_reader, metadata_queue) 53 | check_for_sigterm(metadata_queue, rds_process) 54 | 55 | process_logger.log_complete() 56 | 57 | time.sleep(30) 58 | 59 | 60 | def start() -> None: 61 | """configure and start the ingestion process""" 62 | clear_folder("/tmp") 63 | # setup handling shutdown commands 64 | signal.signal(signal.SIGTERM, handle_ecs_sigterm) 65 | 66 | # configure the environment 67 | os.environ["SERVICE_NAME"] = "ingestion" 68 | 69 | validate_environment( 70 | required_variables=[ 71 | "ARCHIVE_BUCKET", 72 | "ERROR_BUCKET", 73 | "INCOMING_BUCKET", 74 | "PUBLIC_ARCHIVE_BUCKET", 75 | "SPRINGBOARD_BUCKET", 76 | "ALEMBIC_MD_DB_NAME", 77 | ], 78 | db_prefixes=["MD", "RPM"], 79 | ) 80 | 81 | # run metadata rds migrations 82 | alembic_upgrade_to_head(db_name=os.environ["ALEMBIC_MD_DB_NAME"]) 83 | 84 | # run the main method 85 | main() 86 | 87 | 88 | if __name__ == "__main__": 89 | start() 90 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion_tm/ingest.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from lamp_py.mssql.mssql_utils import MSSQLManager 4 | from lamp_py.ingestion_tm.tm_export import TMExport 5 | from lamp_py.ingestion_tm.jobs.whole_table import ( 6 | TMMainGeoNode, 7 | TMMainRoute, 8 | TMMainTrip, 9 | TMMainVehicle, 10 | TMMainBlock, 11 | TMMainOperator, 12 | TMMainRun, 13 | TMMainWorkPiece, 14 | TMDailyLogDailySchedAdhereWaiver, 15 | ) 16 | from lamp_py.ingestion_tm.jobs.parition_table import ( 17 | TMDailyLogStopCrossing, 18 | TMDailyLogDailyWorkPiece, 19 | ) 20 | 21 | 22 | def get_ingestion_jobs() -> List[TMExport]: 23 | """ 24 | get a list of all ingestion jobs that 25 | """ 26 | return [ 27 | TMMainGeoNode(), 28 | TMMainRoute(), 29 | TMMainTrip(), 30 | TMMainVehicle(), 31 | TMMainBlock(), 32 | TMMainOperator(), 33 | TMMainRun(), 34 | TMMainWorkPiece(), 35 | TMDailyLogStopCrossing(), 36 | TMDailyLogDailyWorkPiece(), 
37 | TMDailyLogDailySchedAdhereWaiver(), 38 | ] 39 | 40 | 41 | def ingest_tables() -> None: 42 | """ 43 | ingest tables from transmaster database 44 | """ 45 | tm_db = MSSQLManager(verbose=True) 46 | jobs: List[TMExport] = get_ingestion_jobs() 47 | 48 | for job in jobs: 49 | job.run_export(tm_db) 50 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion_tm/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import os 5 | 6 | from lamp_py.aws.ecs import check_for_parallel_tasks 7 | from lamp_py.runtime_utils.env_validation import validate_environment 8 | 9 | from lamp_py.ingestion_tm.ingest import ingest_tables 10 | 11 | logging.getLogger().setLevel("INFO") 12 | DESCRIPTION = """Entry Point For TM Ingestion Scripts""" 13 | 14 | 15 | def start() -> None: 16 | """configure and start the transitmaster ingestion process""" 17 | # configure the environment 18 | os.environ["SERVICE_NAME"] = "ingestion_tm" 19 | 20 | validate_environment( 21 | required_variables=[ 22 | "SPRINGBOARD_BUCKET", 23 | "TM_DB_HOST", 24 | "TM_DB_NAME", 25 | "TM_DB_USER", 26 | "TM_DB_PASSWORD", 27 | "TM_DB_PORT", 28 | "ECS_CLUSTER", 29 | "ECS_TASK_GROUP", 30 | ], 31 | private_variables=[ 32 | "TM_DB_PASSWORD", 33 | ], 34 | ) 35 | 36 | check_for_parallel_tasks() 37 | 38 | # run the main method 39 | ingest_tables() 40 | 41 | 42 | if __name__ == "__main__": 43 | start() 44 | -------------------------------------------------------------------------------- /src/lamp_py/ingestion_tm/tm_export.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from abc import abstractmethod 3 | 4 | import pyarrow 5 | 6 | from lamp_py.mssql.mssql_utils import MSSQLManager 7 | 8 | 9 | class TMExport(ABC): 10 | """ 11 | Abstract Base Class for TM Export jobs 12 | """ 13 | 14 | @property 15 | @abstractmethod 16 | def export_schema(self) -> pyarrow.schema: 17 | """Schema for export""" 18 | 19 | @abstractmethod 20 | def run_export(self, tm_db: MSSQLManager) -> None: 21 | """ 22 | Business logic to create new exprot parquet file 23 | """ 24 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /src/lamp_py/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/src/lamp_py/migrations/__init__.py -------------------------------------------------------------------------------- /src/lamp_py/migrations/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from alembic import context 4 | 5 | from lamp_py.postgres.postgres_utils import DatabaseIndex 6 | 7 | # this is the Alembic Config object, which provides 8 | # access to the values within the .ini file in use. 
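# For example (section names are illustrative; they must exist in alembic.ini
# and resolve to an entry in db_details below):
#
#   alembic -n performance_manager_staging upgrade head
#   alembic -n metadata_prod history
#
# Plain `alembic upgrade head` falls back to the default "alembic" section and
# is rejected by the gate below.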
9 | config = context.config 10 | 11 | # gate to make sure alembic is run using -n flag 12 | if config.config_ini_section == "alembic": 13 | raise SyntaxError("Run alembic with -n flag to specifiy Database name.") 14 | 15 | # get database name from -n flag when alembic is run from cmd line 16 | db_name_env = config.config_ini_section 17 | 18 | # Interpret the config file for Python logging. 19 | # This line sets up loggers basically. 20 | if config.config_file_name is not None: 21 | fileConfig(config.config_file_name) 22 | 23 | # add your model's MetaData object here 24 | # for 'autogenerate' support 25 | # from myapp import mymodel 26 | # target_metadata = mymodel.Base.metadata 27 | from lamp_py.postgres.rail_performance_manager_schema import RpmSqlBase 28 | from lamp_py.postgres.metadata_schema import MetadataSqlBase 29 | 30 | # using dictionary for engine and target_metadata to support migrating multiple dbs 31 | # each dictionary name should have a section defined in alembic.ini that 32 | # matches the key used in the db_details dictionary 33 | rpm_psql_args = DatabaseIndex.RAIL_PERFORMANCE_MANAGER.get_args_from_env() 34 | md_psql_args = DatabaseIndex.METADATA.get_args_from_env() 35 | db_details = { 36 | "performance_manager": { 37 | "engine": rpm_psql_args.get_local_engine(), 38 | "target_metadata": RpmSqlBase.metadata, 39 | }, 40 | "metadata": { 41 | "engine": md_psql_args.get_local_engine(), 42 | "target_metadata": MetadataSqlBase.metadata, 43 | }, 44 | } 45 | 46 | # other values from the config, defined by the needs of env.py, 47 | # can be acquired: 48 | # my_important_option = config.get_main_option("my_important_option") 49 | # ... etc. 50 | 51 | 52 | # def run_migrations_offline() -> None: 53 | # """Run migrations in 'offline' mode. 54 | 55 | # This configures the context with just a URL 56 | # and not an Engine, though an Engine is acceptable 57 | # here as well. By skipping the Engine creation 58 | # we don't even need a DBAPI to be available. 59 | 60 | # Calls to context.execute() here emit the given string to the 61 | # script output. 62 | 63 | # """ 64 | # url = config.get_main_option("sqlalchemy.url") 65 | # context.configure( 66 | # url=url, 67 | # target_metadata=target_metadata, 68 | # literal_binds=True, 69 | # dialect_opts={"paramstyle": "named"}, 70 | # ) 71 | 72 | # with context.begin_transaction(): 73 | # context.run_migrations() 74 | 75 | 76 | def run_migrations_online() -> None: 77 | """Run migrations in 'online' mode. 78 | 79 | In this scenario we need to create an Engine 80 | and associate a connection with the context. 81 | 82 | """ 83 | # strip off the environment name at the end of the db_name_env. 
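# for instance, "performance_manager_staging" maps to the "performance_manager"
# entry in db_details, and "metadata_prod" maps to "metadata".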
84 | # expected format is "_" 85 | db_name = db_name_env.rsplit("_", 1)[0] 86 | connectable = db_details[db_name]["engine"] 87 | 88 | with connectable.connect() as connection: 89 | context.configure( 90 | connection=connection, 91 | target_metadata=db_details[db_name]["target_metadata"], 92 | ) 93 | 94 | with context.begin_transaction(): 95 | context.run_migrations() 96 | 97 | 98 | if context.is_offline_mode(): 99 | raise NotImplementedError("Alembic offline migration not implemented.") 100 | # run_migrations_offline() 101 | else: 102 | run_migrations_online() 103 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/migration_template_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | 5 | def pick_n_directories(options: list[str]) -> list[str]: 6 | """ 7 | Given a list of directories, show them to user to select the set of directories to return as a list 8 | """ 9 | 10 | print("Select options (enter numbers separated by spaces):") 11 | for i, option in enumerate(options): 12 | print(f"{i + 1}. {option}") 13 | 14 | while True: 15 | try: 16 | choices = input("> ") 17 | selected_indices = [int(c) - 1 for c in choices.split()] 18 | if not all(0 <= i < len(options) for i in selected_indices): 19 | raise ValueError 20 | return [options[i] for i in selected_indices] 21 | except ValueError: 22 | print("Invalid input. Please enter numbers separated by spaces, corresponding to the options.") 23 | 24 | 25 | def migration_template( 26 | current_id: str, 27 | previous_id: str, 28 | date_string: str, 29 | alembic_string: str, 30 | detail_desc: str, 31 | upgrade_desc: str, 32 | downgrade_desc: str, 33 | ) -> str: 34 | """ 35 | Fillable template for a generic migration. This gets populated and 36 | filled out with directory dependent curr/prev id and data. WIP 37 | """ 38 | return f'''"""{alembic_string} 39 | 40 | Revision ID: {current_id} 41 | Revises: {previous_id} 42 | Create Date: {date_string} 43 | 44 | Details: {detail_desc} 45 | 46 | * upgrade -> {upgrade_desc} 47 | * downgrade -> {downgrade_desc} 48 | """ 49 | 50 | import logging 51 | import os 52 | import tempfile 53 | import polars as pl 54 | import pyarrow as pa 55 | import pyarrow.parquet as pq 56 | from typing import List 57 | 58 | from alembic import op 59 | import sqlalchemy as sa 60 | from sqlalchemy.exc import ProgrammingError 61 | 62 | from lamp_py.aws.s3 import download_file, upload_file 63 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 64 | 65 | # revision identifiers, used by Alembic. 
66 | revision = "{current_id}" 67 | down_revision = "{previous_id}" 68 | branch_labels = None # tbd 69 | depends_on = None #tbd 70 | 71 | 72 | def upgrade() -> None: 73 | pass 74 | 75 | def downgrade() -> None: 76 | pass 77 | ''' 78 | 79 | 80 | if __name__ == "__main__": 81 | import uuid 82 | 83 | short_desc = "reprocess_422_423" 84 | uuid_new = uuid.uuid4().hex[-12:] 85 | 86 | versions_dir = "/Users/hhuang/lamp/lamp/src/lamp_py/migrations/versions" 87 | 88 | # List directories in the versions directory 89 | if os.path.exists(versions_dir): 90 | directories = sorted([d for d in os.listdir(versions_dir) if os.path.isdir(os.path.join(versions_dir, d))]) 91 | print("Directories in 'versions':", directories) 92 | else: 93 | print(f"The directory '{versions_dir}' does not exist.") 94 | 95 | options = pick_n_directories(directories) 96 | 97 | print(options) 98 | for o in options: 99 | latest_migration = sorted([d for d in os.listdir(os.path.join(versions_dir, o)) if not d.startswith("sql")])[-1] 100 | parts = os.path.basename(latest_migration).split("_") 101 | breakpoint() 102 | increment_migration_count = str(int(parts[0]) + 1).zfill(3) 103 | uuid_prev = parts[1] 104 | 105 | with open(f"{versions_dir}/{o}/{increment_migration_count}_{uuid_new}_{short_desc}.py", "w") as f: 106 | f.write( 107 | migration_template( 108 | current_id=uuid_new, 109 | previous_id=uuid_prev, 110 | alembic_string=short_desc, 111 | date_string=str(datetime.datetime.now()), 112 | detail_desc="FILL ME IN", 113 | upgrade_desc="test upgrade", 114 | downgrade_desc="None", 115 | ) 116 | ) 117 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_dev/001_07903947aabe_initial_changes.py: -------------------------------------------------------------------------------- 1 | """initial changes 2 | 3 | Revision ID: 07903947aabe 4 | Revises: 5 | Create Date: 2023-12-11 15:12:47.261091 6 | 7 | """ 8 | 9 | from alembic import op 10 | from sqlalchemy.exc import ProgrammingError 11 | from sqlalchemy.sql import text 12 | import logging 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 16 | from lamp_py.postgres.metadata_schema import MetadataLog 17 | 18 | # revision identifiers, used by Alembic. 19 | revision = "07903947aabe" 20 | down_revision = None 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! 
### 27 | op.create_table( 28 | "metadata_log", 29 | sa.Column("pk_id", sa.Integer(), nullable=False), 30 | sa.Column("rail_pm_processed", sa.Boolean(), nullable=True), 31 | sa.Column("rail_pm_process_fail", sa.Boolean(), nullable=True), 32 | sa.Column("path", sa.String(length=256), nullable=False), 33 | sa.Column( 34 | "created_on", 35 | sa.DateTime(timezone=True), 36 | server_default=sa.text("now()"), 37 | nullable=True, 38 | ), 39 | sa.PrimaryKeyConstraint("pk_id"), 40 | sa.UniqueConstraint("path"), 41 | ) 42 | op.create_index( 43 | "ix_metadata_log_not_processed", 44 | "metadata_log", 45 | ["path"], 46 | unique=False, 47 | postgresql_where=sa.text("rail_pm_processed = false"), 48 | ) 49 | 50 | # ### end Alembic commands ### 51 | 52 | 53 | def downgrade() -> None: 54 | # ### commands auto generated by Alembic - please adjust! ### 55 | op.drop_index( 56 | "ix_metadata_log_not_processed", 57 | table_name="metadata_log", 58 | ) 59 | op.drop_table("metadata_log") 60 | # ### end Alembic commands ### 61 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_dev/002_26db393ea854_update_glides_location_column_names.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 26db393ea854 4 | Revises: 07903947aabe 5 | Create Date: 2024-07-09 12:12:04.325358 6 | 7 | Details 8 | * upgrade -> for each glides parquet file: 9 | * rename columns to match api. replace gtfsID with gtfsId and todsID with 10 | todsId 11 | * unique each dataset based on the 'id' uuid field. 12 | 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import os 17 | import tempfile 18 | import polars as pl 19 | import pyarrow as pa 20 | import pyarrow.parquet as pq 21 | from typing import List 22 | 23 | from lamp_py.aws.s3 import download_file, upload_file 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "26db393ea854" 27 | down_revision = "07903947aabe" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | def update_glides_archive(temp_dir: str, base_filename: str) -> None: 34 | """ 35 | * download the remote file to a local temp dir 36 | * rename columns with "gtfsID" or "todsID" in them to use "Id" 37 | * unique columns 38 | * sort the dataset based on 'time' column 39 | """ 40 | remote_path = f"s3://{os.environ['SPRINGBOARD_BUCKET']}/lamp/GLIDES/{base_filename}" 41 | old_local_path = os.path.join(temp_dir, f"old_{base_filename}") 42 | new_local_path = os.path.join(temp_dir, f"new_{base_filename}") 43 | 44 | file_exists = download_file(remote_path, old_local_path) 45 | if not file_exists: 46 | return 47 | 48 | old_table = pq.read_table(old_local_path) 49 | 50 | # build the new schema by converting names and keeping types 51 | fields: List[pa.Field] = [] 52 | for column in old_table.schema: 53 | if "gtfsID" in column.name: 54 | new_name = column.name.replace("gtfsID", "gtfsId") 55 | new_field = pa.field(new_name, column.type) 56 | fields.append(new_field) 57 | elif "todsID" in column.name: 58 | new_name = column.name.replace("todsID", "todsId") 59 | new_field = pa.field(new_name, column.type) 60 | fields.append(new_field) 61 | else: 62 | fields.append(column) 63 | 64 | schema = pa.schema(fields) 65 | 66 | # rename columns to match new schema 67 | # unique the records 68 | # cast to new schema (polars converts things) 69 | new_table = ( 70 | pl.DataFrame(old_table.rename_columns(schema.names)).unique().sort(by=["time"]).to_arrow().cast(schema) 71 | ) 72 | 73 | pq.write_table(new_table, new_local_path) 74 | upload_file(new_local_path, remote_path) 75 | 76 | files_to_update = [ 77 | "editor_changes.parquet", 78 | "operator_sign_ins.parquet", 79 | "trip_updates.parquet", 80 | ] 81 | 82 | with tempfile.TemporaryDirectory() as temp_dir: 83 | for filename in files_to_update: 84 | update_glides_archive(temp_dir, filename) 85 | 86 | 87 | def downgrade() -> None: 88 | pass 89 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/001_07903947aabe_initial_changes.py: -------------------------------------------------------------------------------- 1 | """initial changes 2 | 3 | Revision ID: 07903947aabe 4 | Revises: 5 | Create Date: 2023-12-11 15:12:47.261091 6 | 7 | """ 8 | 9 | from alembic import op 10 | from sqlalchemy.exc import ProgrammingError 11 | from sqlalchemy.sql import text 12 | import logging 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 16 | from lamp_py.postgres.metadata_schema import MetadataLog 17 | 18 | # revision identifiers, used by Alembic. 19 | revision = "07903947aabe" 20 | down_revision = None 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! 
### 27 | op.create_table( 28 | "metadata_log", 29 | sa.Column("pk_id", sa.Integer(), nullable=False), 30 | sa.Column("rail_pm_processed", sa.Boolean(), nullable=True), 31 | sa.Column("rail_pm_process_fail", sa.Boolean(), nullable=True), 32 | sa.Column("path", sa.String(length=256), nullable=False), 33 | sa.Column( 34 | "created_on", 35 | sa.DateTime(timezone=True), 36 | server_default=sa.text("now()"), 37 | nullable=True, 38 | ), 39 | sa.PrimaryKeyConstraint("pk_id"), 40 | sa.UniqueConstraint("path"), 41 | ) 42 | op.create_index( 43 | "ix_metadata_log_not_processed", 44 | "metadata_log", 45 | ["path"], 46 | unique=False, 47 | postgresql_where=sa.text("rail_pm_processed = false"), 48 | ) 49 | 50 | # pull metadata from the rail performance manager database into the 51 | # metadata database. the table may or may not exist, so wrap this in a try 52 | # except 53 | try: 54 | rpm_db_manager = DatabaseManager(db_index=DatabaseIndex.RAIL_PERFORMANCE_MANAGER) 55 | 56 | insert_data = [] 57 | # pull metadata from the rail performance manager database via direct 58 | # sql query. the metadata_log table may or may not exist. 59 | with rpm_db_manager.session.begin() as session: 60 | result = session.execute(text("SELECT path, processed, process_fail FROM metadata_log")) 61 | for row in result: 62 | (path, processed, process_fail) = row 63 | insert_data.append( 64 | { 65 | "path": path, 66 | "rail_pm_processed": processed, 67 | "rail_pm_process_fail": process_fail, 68 | } 69 | ) 70 | 71 | except ProgrammingError as error: 72 | # Error 42P01 is an 'Undefined Table' error. This occurs when there is 73 | # no metadata_log table in the rail performance manager database 74 | # 75 | # Raise all other sql errors 76 | insert_data = [] 77 | original_error = error.orig 78 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 79 | logging.info("No Metadata Table in Rail Performance Manager") 80 | else: 81 | raise 82 | 83 | # insert data into the metadata database 84 | if insert_data: 85 | op.bulk_insert(MetadataLog.__table__, insert_data) 86 | 87 | # ### end Alembic commands ### 88 | 89 | 90 | def downgrade() -> None: 91 | # ### commands auto generated by Alembic - please adjust! ### 92 | op.drop_index( 93 | "ix_metadata_log_not_processed", 94 | table_name="metadata_log", 95 | ) 96 | op.drop_table("metadata_log") 97 | # ### end Alembic commands ### 98 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/002_cce8dfee767a_re_run_input_files_from_2024_04_03.py: -------------------------------------------------------------------------------- 1 | """re-run input files from 2024-04-03 2 | 3 | Revision ID: cce8dfee767a 4 | Revises: 07903947aabe 5 | Create Date: 2024-04-04 11:50:55.161259 6 | 7 | Details 8 | * upgrade -> update metdata table to re-process failed parquet files from April 3, 2024 9 | 10 | * downgrade -> Nothing 11 | 12 | """ 13 | 14 | from alembic import op 15 | import sqlalchemy as sa 16 | 17 | 18 | # revision identifiers, used by Alembic. 
19 | revision = "cce8dfee767a" 20 | down_revision = "07903947aabe" 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | update_query = """ 27 | UPDATE 28 | public.metadata_log 29 | SET 30 | rail_pm_process_fail = false 31 | , rail_pm_processed = false 32 | WHERE 33 | created_on > '2024-04-03 09:00:00' 34 | and created_on < '2024-04-03 15:00:00' 35 | and ( 36 | path LIKE '%RT_TRIP_UPDATES%' 37 | or path LIKE '%RT_VEHICLE_POSITION%' 38 | ) 39 | ; 40 | """ 41 | op.execute(update_query) 42 | 43 | 44 | def downgrade() -> None: 45 | pass 46 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/003_26db393ea854_update_glides_location_column_names.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 26db393ea854 4 | Revises: cce8dfee767a 5 | Create Date: 2024-07-09 12:12:04.325358 6 | 7 | Details 8 | * upgrade -> for each glides parquet file: 9 | * rename columns to match api. replace gtfsID with gtfsId and todsID with 10 | todsId 11 | * unique each dataset based on the 'id' uuid field. 12 | 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import os 17 | import tempfile 18 | import polars as pl 19 | import pyarrow as pa 20 | import pyarrow.parquet as pq 21 | from typing import List 22 | 23 | from lamp_py.aws.s3 import download_file, upload_file 24 | 25 | # revision identifiers, used by Alembic. 26 | revision = "26db393ea854" 27 | down_revision = "cce8dfee767a" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | def update_glides_archive(temp_dir: str, base_filename: str) -> None: 34 | """ 35 | * download the remote file to a local temp dir 36 | * rename columns with "gtfsID" or "todsID" in them to use "Id" 37 | * unique columns 38 | * sort the dataset based on 'time' column 39 | """ 40 | remote_path = f"s3://{os.environ['SPRINGBOARD_BUCKET']}/lamp/GLIDES/{base_filename}" 41 | old_local_path = os.path.join(temp_dir, f"old_{base_filename}") 42 | new_local_path = os.path.join(temp_dir, f"new_{base_filename}") 43 | 44 | file_exists = download_file(remote_path, old_local_path) 45 | if not file_exists: 46 | return 47 | 48 | old_table = pq.read_table(old_local_path) 49 | 50 | # build the new schema by converting names and keeping types 51 | fields: List[pa.Field] = [] 52 | for column in old_table.schema: 53 | if "gtfsID" in column.name: 54 | new_name = column.name.replace("gtfsID", "gtfsId") 55 | new_field = pa.field(new_name, column.type) 56 | fields.append(new_field) 57 | elif "todsID" in column.name: 58 | new_name = column.name.replace("todsID", "todsId") 59 | new_field = pa.field(new_name, column.type) 60 | fields.append(new_field) 61 | else: 62 | fields.append(column) 63 | 64 | schema = pa.schema(fields) 65 | 66 | # rename columns to match new schema 67 | # unique the records 68 | # cast to new schema (polars converts things) 69 | new_table = ( 70 | pl.DataFrame(old_table.rename_columns(schema.names)).unique().sort(by=["time"]).to_arrow().cast(schema) 71 | ) 72 | 73 | pq.write_table(new_table, new_local_path) 74 | upload_file(new_local_path, remote_path) 75 | 76 | files_to_update = [ 77 | "editor_changes.parquet", 78 | "operator_sign_ins.parquet", 79 | "trip_updates.parquet", 80 | ] 81 | 82 | with tempfile.TemporaryDirectory() as temp_dir: 83 | for filename in files_to_update: 84 | update_glides_archive(temp_dir, filename) 85 | 86 | 87 | def downgrade() 
-> None: 88 | pass 89 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_prod/004_a08c5fd37dbd_reprocess_422_423.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: a08c5fd37dbd 4 | Revises: 26db393ea854 5 | Create Date: 2025-05-01 00:00:00 6 | 7 | Details: Reprocess 4/22 because it is missing. Include 4/22 and 4/23 because of UTC vs EST 8 | 9 | * upgrade -> reset processed flags in metadata for 4/22 and 4/23 10 | * downgrade -> None 11 | """ 12 | 13 | import logging 14 | import os 15 | import tempfile 16 | import polars as pl 17 | import pyarrow as pa 18 | import pyarrow.parquet as pq 19 | from typing import List 20 | 21 | from alembic import op 22 | import sqlalchemy as sa 23 | from sqlalchemy.exc import ProgrammingError 24 | 25 | from lamp_py.aws.s3 import download_file, upload_file 26 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 27 | 28 | # revision identifiers, used by Alembic. 29 | revision = "a08c5fd37dbd" 30 | down_revision = "26db393ea854" 31 | branch_labels = None 32 | depends_on = None 33 | 34 | 35 | def upgrade() -> None: 36 | pass 37 | 38 | # lamp_metadata=> SELECT path, created_on, rail_pm_processed, rail_pm_process_fail 39 | # FROM public.metadata_log 40 | # WHERE substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 41 | # and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 42 | # and ( 43 | # path LIKE '%/RT_TRIP_UPDATES/%' 44 | # or path LIKE '%/RT_VEHICLE_POSITIONS/%' 45 | # ) 46 | # ORDER BY created_on; 47 | 48 | update_md_query = """ 49 | UPDATE 50 | metadata_log 51 | SET 52 | rail_pm_process_fail = false 53 | , rail_pm_processed = false 54 | WHERE 55 | substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 56 | and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 57 | and ( 58 | path LIKE '%/RT_TRIP_UPDATES/%' 59 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 60 | ) 61 | ; 62 | """ 63 | op.execute(update_md_query) 64 | 65 | 66 | def downgrade() -> None: 67 | pass 68 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_staging/001_07903947aabe_initial_changes.py: -------------------------------------------------------------------------------- 1 | """initial changes 2 | 3 | Revision ID: 07903947aabe 4 | Revises: 5 | Create Date: 2023-12-11 15:12:47.261091 6 | 7 | """ 8 | 9 | from alembic import op 10 | from sqlalchemy.exc import ProgrammingError 11 | from sqlalchemy.sql import text 12 | import logging 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 16 | from lamp_py.postgres.metadata_schema import MetadataLog 17 | 18 | # revision identifiers, used by Alembic. 19 | revision = "07903947aabe" 20 | down_revision = None 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | # ### commands auto generated by Alembic - please adjust! 
### 27 | op.create_table( 28 | "metadata_log", 29 | sa.Column("pk_id", sa.Integer(), nullable=False), 30 | sa.Column("rail_pm_processed", sa.Boolean(), nullable=True), 31 | sa.Column("rail_pm_process_fail", sa.Boolean(), nullable=True), 32 | sa.Column("path", sa.String(length=256), nullable=False), 33 | sa.Column( 34 | "created_on", 35 | sa.DateTime(timezone=True), 36 | server_default=sa.text("now()"), 37 | nullable=True, 38 | ), 39 | sa.PrimaryKeyConstraint("pk_id"), 40 | sa.UniqueConstraint("path"), 41 | ) 42 | op.create_index( 43 | "ix_metadata_log_not_processed", 44 | "metadata_log", 45 | ["path"], 46 | unique=False, 47 | postgresql_where=sa.text("rail_pm_processed = false"), 48 | ) 49 | 50 | # pull metadata from the rail performance manager database into the 51 | # metadata database. the table may or may not exist, so wrap this in a try 52 | # except 53 | try: 54 | rpm_db_manager = DatabaseManager(db_index=DatabaseIndex.RAIL_PERFORMANCE_MANAGER) 55 | 56 | insert_data = [] 57 | # pull metadata from the rail performance manager database via direct 58 | # sql query. the metadata_log table may or may not exist. 59 | with rpm_db_manager.session.begin() as session: 60 | result = session.execute(text("SELECT path, processed, process_fail FROM metadata_log")) 61 | for row in result: 62 | (path, processed, process_fail) = row 63 | insert_data.append( 64 | { 65 | "path": path, 66 | "rail_pm_processed": processed, 67 | "rail_pm_process_fail": process_fail, 68 | } 69 | ) 70 | 71 | except ProgrammingError as error: 72 | # Error 42P01 is an 'Undefined Table' error. This occurs when there is 73 | # no metadata_log table in the rail performance manager database 74 | # 75 | # Raise all other sql errors 76 | insert_data = [] 77 | original_error = error.orig 78 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 79 | logging.info("No Metadata Table in Rail Performance Manager") 80 | else: 81 | raise 82 | 83 | # insert data into the metadata database 84 | if insert_data: 85 | op.bulk_insert(MetadataLog.__table__, insert_data) 86 | 87 | # ### end Alembic commands ### 88 | 89 | 90 | def downgrade() -> None: 91 | # ### commands auto generated by Alembic - please adjust! ### 92 | op.drop_index( 93 | "ix_metadata_log_not_processed", 94 | table_name="metadata_log", 95 | ) 96 | op.drop_table("metadata_log") 97 | # ### end Alembic commands ### 98 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_staging/002_26db393ea854_update_glides_location_column_names.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 26db393ea854 4 | Revises: 07903947aabe 5 | Create Date: 2024-07-09 12:12:04.325358 6 | 7 | Details 8 | * upgrade -> for each glides parquet file: 9 | * rename columns to match api. replace gtfsID with gtfsId and todsID with 10 | todsId 11 | * unique each dataset based on the 'id' uuid field. 12 | 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import os 17 | import tempfile 18 | import polars as pl 19 | import pyarrow as pa 20 | import pyarrow.parquet as pq 21 | from typing import List 22 | 23 | from lamp_py.aws.s3 import download_file, upload_file 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "26db393ea854" 27 | down_revision = "07903947aabe" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | def update_glides_archive(temp_dir: str, base_filename: str) -> None: 34 | """ 35 | * download the remote file to a local temp dir 36 | * rename columns with "gtfsID" or "todsID" in them to use "Id" 37 | * unique columns 38 | * sort the dataset based on 'time' column 39 | """ 40 | remote_path = f"s3://{os.environ['SPRINGBOARD_BUCKET']}/lamp/GLIDES/{base_filename}" 41 | old_local_path = os.path.join(temp_dir, f"old_{base_filename}") 42 | new_local_path = os.path.join(temp_dir, f"new_{base_filename}") 43 | 44 | file_exists = download_file(remote_path, old_local_path) 45 | if not file_exists: 46 | return 47 | 48 | old_table = pq.read_table(old_local_path) 49 | 50 | # build the new schema by converting names and keeping types 51 | fields: List[pa.Field] = [] 52 | for column in old_table.schema: 53 | if "gtfsID" in column.name: 54 | new_name = column.name.replace("gtfsID", "gtfsId") 55 | new_field = pa.field(new_name, column.type) 56 | fields.append(new_field) 57 | elif "todsID" in column.name: 58 | new_name = column.name.replace("todsID", "todsId") 59 | new_field = pa.field(new_name, column.type) 60 | fields.append(new_field) 61 | else: 62 | fields.append(column) 63 | 64 | schema = pa.schema(fields) 65 | 66 | # rename columns to match new schema 67 | # unique the records 68 | # cast to new schema (polars converts things) 69 | new_table = ( 70 | pl.DataFrame(old_table.rename_columns(schema.names)).unique().sort(by=["time"]).to_arrow().cast(schema) 71 | ) 72 | 73 | pq.write_table(new_table, new_local_path) 74 | upload_file(new_local_path, remote_path) 75 | 76 | files_to_update = [ 77 | "editor_changes.parquet", 78 | "operator_sign_ins.parquet", 79 | "trip_updates.parquet", 80 | ] 81 | 82 | with tempfile.TemporaryDirectory() as temp_dir: 83 | for filename in files_to_update: 84 | update_glides_archive(temp_dir, filename) 85 | 86 | 87 | def downgrade() -> None: 88 | pass 89 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/metadata_staging/003_a08c5fd37dbd_reprocess_422_423.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: a08c5fd37dbd 4 | Revises: 26db393ea854 5 | Create Date: 2025-05-01 00:00:00 6 | 7 | Details: Reprocess 4/22 because it is missing. Include 4/22 and 4/23 because of UTC vs EST 8 | 9 | * upgrade -> reset processed flags in metadata for 4/22 and 4/23 10 | * downgrade -> None 11 | """ 12 | 13 | import logging 14 | import os 15 | import tempfile 16 | import polars as pl 17 | import pyarrow as pa 18 | import pyarrow.parquet as pq 19 | from typing import List 20 | 21 | from alembic import op 22 | import sqlalchemy as sa 23 | from sqlalchemy.exc import ProgrammingError 24 | 25 | from lamp_py.aws.s3 import download_file, upload_file 26 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 27 | 28 | # revision identifiers, used by Alembic. 
29 | revision = "a08c5fd37dbd" 30 | down_revision = "26db393ea854" 31 | branch_labels = None 32 | depends_on = None 33 | 34 | 35 | def upgrade() -> None: 36 | pass 37 | 38 | # lamp_metadata=> SELECT path, created_on, rail_pm_processed, rail_pm_process_fail 39 | # FROM public.metadata_log 40 | # WHERE substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 41 | # and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 42 | # and ( 43 | # path LIKE '%/RT_TRIP_UPDATES/%' 44 | # or path LIKE '%/RT_VEHICLE_POSITIONS/%' 45 | # ) 46 | # ORDER BY created_on; 47 | 48 | update_md_query = """ 49 | UPDATE 50 | metadata_log 51 | SET 52 | rail_pm_process_fail = false 53 | , rail_pm_processed = false 54 | WHERE 55 | substring(path, '\d{4}-\d{2}-\d{2}')::date >= '2025-04-22' 56 | and substring(path, '\d{4}-\d{2}-\d{2}')::date <= '2025-04-23' 57 | and ( 58 | path LIKE '%/RT_TRIP_UPDATES/%' 59 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 60 | ) 61 | ; 62 | """ 63 | op.execute(update_md_query) 64 | 65 | 66 | def downgrade() -> None: 67 | pass 68 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/002_1b53fd278b10_fix_trip_id_length.py: -------------------------------------------------------------------------------- 1 | """fix trip id length 2 | 3 | Revision ID: 1b53fd278b10 4 | Revises: 5d9a7ee21ae5 5 | Create Date: 2023-11-27 16:25:42.657967 6 | 7 | Details 8 | * upgrade -> change "trip_id" field from length 128 to 512 9 | * downgrade -> change "trip_id" field from length 512 to 128 10 | """ 11 | 12 | from alembic import op 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_001 import ( 16 | view_opmi_all_rt_fields_joined, 17 | ) 18 | 19 | # revision identifiers, used by Alembic. 20 | revision = "1b53fd278b10" 21 | down_revision = "5d9a7ee21ae5" 22 | branch_labels = None 23 | depends_on = None 24 | 25 | 26 | def upgrade() -> None: 27 | # ### commands auto generated by Alembic - please adjust! ### 28 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 29 | op.alter_column( 30 | "static_route_patterns", 31 | "representative_trip_id", 32 | existing_type=sa.VARCHAR(length=128), 33 | type_=sa.String(length=512), 34 | existing_nullable=False, 35 | ) 36 | op.alter_column( 37 | "static_stop_times", 38 | "trip_id", 39 | existing_type=sa.VARCHAR(length=128), 40 | type_=sa.String(length=512), 41 | existing_nullable=False, 42 | ) 43 | op.alter_column( 44 | "static_trips", 45 | "trip_id", 46 | existing_type=sa.VARCHAR(length=128), 47 | type_=sa.String(length=512), 48 | existing_nullable=False, 49 | ) 50 | op.alter_column( 51 | "temp_event_compare", 52 | "trip_id", 53 | existing_type=sa.VARCHAR(length=128), 54 | type_=sa.String(length=512), 55 | existing_nullable=False, 56 | ) 57 | op.alter_column( 58 | "vehicle_trips", 59 | "trip_id", 60 | existing_type=sa.VARCHAR(length=128), 61 | type_=sa.String(length=512), 62 | existing_nullable=False, 63 | ) 64 | op.alter_column( 65 | "vehicle_trips", 66 | "static_trip_id_guess", 67 | existing_type=sa.VARCHAR(length=128), 68 | type_=sa.String(length=512), 69 | existing_nullable=True, 70 | ) 71 | op.execute(view_opmi_all_rt_fields_joined) 72 | # ### end Alembic commands ### 73 | 74 | 75 | def downgrade() -> None: 76 | # ### commands auto generated by Alembic - please adjust! 
### 77 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 78 | op.alter_column( 79 | "vehicle_trips", 80 | "static_trip_id_guess", 81 | existing_type=sa.String(length=512), 82 | type_=sa.VARCHAR(length=128), 83 | existing_nullable=True, 84 | ) 85 | op.alter_column( 86 | "vehicle_trips", 87 | "trip_id", 88 | existing_type=sa.String(length=512), 89 | type_=sa.VARCHAR(length=128), 90 | existing_nullable=False, 91 | ) 92 | op.alter_column( 93 | "temp_event_compare", 94 | "trip_id", 95 | existing_type=sa.String(length=512), 96 | type_=sa.VARCHAR(length=128), 97 | existing_nullable=False, 98 | ) 99 | op.alter_column( 100 | "static_trips", 101 | "trip_id", 102 | existing_type=sa.String(length=512), 103 | type_=sa.VARCHAR(length=128), 104 | existing_nullable=False, 105 | ) 106 | op.alter_column( 107 | "static_stop_times", 108 | "trip_id", 109 | existing_type=sa.String(length=512), 110 | type_=sa.VARCHAR(length=128), 111 | existing_nullable=False, 112 | ) 113 | op.alter_column( 114 | "static_route_patterns", 115 | "representative_trip_id", 116 | existing_type=sa.String(length=512), 117 | type_=sa.VARCHAR(length=128), 118 | existing_nullable=False, 119 | ) 120 | op.execute(view_opmi_all_rt_fields_joined) 121 | # ### end Alembic commands ### 122 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/003_ae6c6e4b2df5_extend_service_id_view.py: -------------------------------------------------------------------------------- 1 | """extend service_id_by_date_and_route 2 | 3 | Revision ID: ae6c6e4b2df5 4 | Revises: 1b53fd278b10 5 | Create Date: 2023-12-17 06:56:17.330783 6 | 7 | Details 8 | * upgrade -> extend service_id_by_date_and_route VIEW to generate values past current date 9 | * upgrade -> update canonical_stop_sequence to use row_number function instead of direct from static schedule 10 | 11 | * downgrade -> Nothing 12 | """ 13 | 14 | from alembic import op 15 | 16 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_003 import ( 17 | view_service_id_by_date_and_route, 18 | ) 19 | 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "ae6c6e4b2df5" 23 | down_revision = "1b53fd278b10" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | op.execute("DROP VIEW IF EXISTS service_id_by_date_and_route;") 30 | op.execute(view_service_id_by_date_and_route) 31 | 32 | op.create_index( 33 | "ix_static_trips_composite_4", 34 | "static_trips", 35 | ["static_version_key", "service_id"], 36 | unique=False, 37 | ) 38 | 39 | update_stop_sequences = ( 40 | "UPDATE vehicle_events " 41 | "SET canonical_stop_sequence = static_canon.stop_sequence " 42 | "FROM vehicle_events AS ve " 43 | "JOIN vehicle_trips AS vt " 44 | "ON ve.pm_trip_id = vt.pm_trip_id " 45 | "JOIN " 46 | "(" 47 | " select " 48 | " srp.direction_id " 49 | " , coalesce(st.branch_route_id, st.trunk_route_id) AS route_id " 50 | " , ROW_NUMBER () OVER (PARTITION BY srp.static_version_key, srp.direction_id, coalesce(st.branch_route_id, st.trunk_route_id) ORDER BY sst.stop_sequence) AS stop_sequence" 51 | " , ss.parent_station " 52 | " , srp.static_version_key " 53 | " from static_route_patterns srp " 54 | " JOIN static_trips st " 55 | " ON srp.representative_trip_id = st.trip_id " 56 | " AND srp.static_version_key = st.static_version_key " 57 | " JOIN static_stop_times sst " 58 | " ON srp.representative_trip_id = sst.trip_id " 59 | " AND srp.static_version_key = sst.static_version_key " 60 | " JOIN static_stops ss " 61 | " ON sst.stop_id = ss.stop_id " 62 | " AND sst.static_version_key = ss.static_version_key " 63 | " WHERE " 64 | " srp.route_pattern_typicality = 1" 65 | ") AS static_canon " 66 | "ON ve.parent_station = static_canon.parent_station " 67 | "AND vt.static_version_key = static_canon.static_version_key " 68 | "AND vt.direction_id = static_canon.direction_id " 69 | "AND coalesce(vt.branch_route_id, vt.trunk_route_id) = static_canon.route_id " 70 | "WHERE vehicle_events.pm_trip_id = ve.pm_trip_id " 71 | "AND vehicle_events.parent_station = static_canon.parent_station " 72 | ";" 73 | ) 74 | op.execute(update_stop_sequences) 75 | 76 | 77 | def downgrade() -> None: 78 | op.drop_index("ix_static_trips_composite_4", table_name="static_trips") 79 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/005_96187da84955_remove_metadata.py: -------------------------------------------------------------------------------- 1 | """remove_metadata 2 | 3 | Revision ID: 96187da84955 4 | Revises: 45dedc21086e 5 | Create Date: 2023-12-28 12:18:25.412282 6 | 7 | check that all information in the metadata table has been copied to the 8 | metadata database before dropping the table and its indexes entirely. 9 | """ 10 | 11 | import time 12 | 13 | from alembic import op 14 | from sqlalchemy.dialects import postgresql 15 | from sqlalchemy.exc import ProgrammingError 16 | from sqlalchemy.sql import text 17 | import logging 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 21 | from lamp_py.postgres.metadata_schema import MetadataLog 22 | 23 | # revision identifiers, used by Alembic. 24 | revision = "96187da84955" 25 | down_revision = "45dedc21086e" 26 | branch_labels = None 27 | depends_on = None 28 | 29 | 30 | def upgrade() -> None: 31 | # ### commands auto generated by Alembic - please adjust! 
### 32 | op.drop_index("ix_metadata_log_not_processed", table_name="metadata_log") 33 | op.drop_table("metadata_log") 34 | # ### end Alembic commands ### 35 | 36 | 37 | def downgrade() -> None: 38 | # ### commands auto generated by Alembic - please adjust! ### 39 | op.create_table( 40 | "metadata_log", 41 | sa.Column("pk_id", sa.INTEGER(), autoincrement=True, nullable=False), 42 | sa.Column("processed", sa.BOOLEAN(), autoincrement=False, nullable=True), 43 | sa.Column("process_fail", sa.BOOLEAN(), autoincrement=False, nullable=True), 44 | sa.Column("path", sa.VARCHAR(length=256), autoincrement=False, nullable=False), 45 | sa.Column( 46 | "created_on", 47 | postgresql.TIMESTAMP(timezone=True), 48 | server_default=sa.text("now()"), 49 | autoincrement=False, 50 | nullable=True, 51 | ), 52 | sa.PrimaryKeyConstraint("pk_id", name="metadata_log_pkey"), 53 | sa.UniqueConstraint("path", name="metadata_log_path_key"), 54 | ) 55 | op.create_index( 56 | "ix_metadata_log_not_processed", 57 | "metadata_log", 58 | ["path"], 59 | unique=False, 60 | postgresql_where="(processed = false)", 61 | ) 62 | # ### end Alembic commands ### 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/008_32ba735d080c_add_revenue_columns.py: -------------------------------------------------------------------------------- 1 | """add revenue columns 2 | 3 | Revision ID: 32ba735d080c 4 | Revises: 896dedd8a4db 5 | Create Date: 2024-09-20 08:47:52.784591 6 | 7 | This change adds a boolean revenue column to the vehcile_trips table. 8 | Initially this will be filled with True and back-filled by a seperate operation 9 | 10 | Details 11 | * upgrade -> drop triggers and indexes from table and add revenue column 12 | 13 | * downgrade -> drop revenue column 14 | 15 | """ 16 | 17 | from alembic import op 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.rail_performance_manager_schema import ( 21 | TempEventCompare, 22 | VehicleTrips, 23 | ) 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "32ba735d080c" 27 | down_revision = "896dedd8a4db" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER rt_trips_update_branch_trunk;") 34 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER update_vehicle_trips_modified;") 35 | op.drop_index("ix_vehicle_trips_composite_1", table_name="vehicle_trips") 36 | op.drop_constraint("vehicle_trips_unique_trip", table_name="vehicle_trips") 37 | 38 | op.add_column("temp_event_compare", sa.Column("revenue", sa.Boolean(), nullable=True)) 39 | op.add_column("vehicle_trips", sa.Column("revenue", sa.Boolean(), nullable=True)) 40 | op.execute(sa.update(TempEventCompare).values(revenue=True)) 41 | op.execute(sa.update(VehicleTrips).values(revenue=True)) 42 | op.alter_column("temp_event_compare", "revenue", nullable=False) 43 | op.alter_column("vehicle_trips", "revenue", nullable=False) 44 | 45 | op.create_unique_constraint( 46 | "vehicle_trips_unique_trip", 47 | "vehicle_trips", 48 | ["service_date", "route_id", "trip_id"], 49 | ) 50 | op.create_index( 51 | "ix_vehicle_trips_composite_1", 52 | "vehicle_trips", 53 | ["route_id", "direction_id", "vehicle_id"], 54 | unique=False, 55 | ) 56 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER rt_trips_update_branch_trunk;") 57 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER update_vehicle_trips_modified;") 58 | 59 | 60 | def downgrade() -> None: 61 | op.drop_column("vehicle_trips", "revenue") 62 | op.drop_column("temp_event_compare", "revenue") 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_dev/009_36e7a7aee148_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: 36e7a7aee148 4 | Revises: 32ba735d080c 5 | Create Date: 2025-01-07 13:57:50.433896 6 | 7 | This change upgrades the pm_event_id sequence type to bigint to avoid running out of keys 8 | 9 | Details 10 | * upgrade -> drop opmi view, upgrade sequence, update sequence storage columns 11 | 12 | * downgrade -> not possible, can't go from bigint to int 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.migrations.versions.performance_manager_prod.sql_strings.strings_001 import view_opmi_all_rt_fields_joined 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "36e7a7aee148" 23 | down_revision = "32ba735d080c" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | # Upgrade sequence to BIGINT 30 | op.execute("ALTER SEQUENCE vehicle_events_pm_event_id_seq as bigint MAXVALUE 9223372036854775807;") 31 | # DROP VIEW before upgrading columns 32 | drop_opmi_all_rt_fields_joined = "DROP VIEW IF EXISTS opmi_all_rt_fields_joined;" 33 | op.execute(drop_opmi_all_rt_fields_joined) 34 | # Upgrade event_id columns to BIGINT 35 | op.alter_column( 36 | "vehicle_events", 37 | "pm_event_id", 38 | existing_type=sa.INTEGER(), 39 | type_=sa.BigInteger(), 40 | existing_nullable=False, 41 | autoincrement=True, 42 | ) 43 | op.alter_column( 44 | "vehicle_events", 45 | "previous_trip_stop_pm_event_id", 46 | existing_type=sa.INTEGER(), 47 | type_=sa.BigInteger(), 48 | existing_nullable=True, 49 | ) 50 | op.alter_column( 51 | "vehicle_events", 52 | "next_trip_stop_pm_event_id", 53 | existing_type=sa.INTEGER(), 54 | type_=sa.BigInteger(), 55 | existing_nullable=True, 56 | ) 57 | op.execute(view_opmi_all_rt_fields_joined) 58 | 59 | 60 | def downgrade() -> None: 61 | # Can not migrate from INT to BIGINT without losing data. 62 | pass 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/005_32ba735d080c_add_revenue_columns.py: -------------------------------------------------------------------------------- 1 | """add revenue columns 2 | 3 | Revision ID: 32ba735d080c 4 | Revises: 896dedd8a4db 5 | Create Date: 2024-09-20 08:47:52.784591 6 | 7 | This change adds a boolean revenue column to the vehcile_trips table. 8 | Initially this will be filled with True and back-filled by a seperate operation 9 | 10 | Details 11 | * upgrade -> drop triggers and indexes from table and add revenue column 12 | 13 | * downgrade -> drop revenue column 14 | 15 | """ 16 | 17 | from alembic import op 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.rail_performance_manager_schema import ( 21 | TempEventCompare, 22 | VehicleTrips, 23 | ) 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "32ba735d080c" 27 | down_revision = "896dedd8a4db" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER rt_trips_update_branch_trunk;") 34 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER update_vehicle_trips_modified;") 35 | op.drop_index("ix_vehicle_trips_composite_1", table_name="vehicle_trips") 36 | op.drop_constraint("vehicle_trips_unique_trip", table_name="vehicle_trips") 37 | 38 | op.add_column("temp_event_compare", sa.Column("revenue", sa.Boolean(), nullable=True)) 39 | op.add_column("vehicle_trips", sa.Column("revenue", sa.Boolean(), nullable=True)) 40 | op.execute(sa.update(TempEventCompare).values(revenue=True)) 41 | op.execute(sa.update(VehicleTrips).values(revenue=True)) 42 | op.alter_column("temp_event_compare", "revenue", nullable=False) 43 | op.alter_column("vehicle_trips", "revenue", nullable=False) 44 | 45 | op.create_unique_constraint( 46 | "vehicle_trips_unique_trip", 47 | "vehicle_trips", 48 | ["service_date", "route_id", "trip_id"], 49 | ) 50 | op.create_index( 51 | "ix_vehicle_trips_composite_1", 52 | "vehicle_trips", 53 | ["route_id", "direction_id", "vehicle_id"], 54 | unique=False, 55 | ) 56 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER rt_trips_update_branch_trunk;") 57 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER update_vehicle_trips_modified;") 58 | 59 | 60 | def downgrade() -> None: 61 | op.drop_column("vehicle_trips", "revenue") 62 | op.drop_column("temp_event_compare", "revenue") 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/006_36e7a7aee148_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: 36e7a7aee148 4 | Revises: 32ba735d080c 5 | Create Date: 2025-01-07 13:57:50.433896 6 | 7 | This change upgrades the pm_event_id sequence type to bigint to avoid running out of keys 8 | 9 | Details 10 | * upgrade -> drop opmi view, upgrade sequence, update sequence storage columns 11 | 12 | * downgrade -> not possible, can't go from bigint to int 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.migrations.versions.performance_manager_prod.sql_strings.strings_001 import view_opmi_all_rt_fields_joined 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "36e7a7aee148" 23 | down_revision = "32ba735d080c" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | # Upgrade sequence to BIGINT 30 | op.execute("ALTER SEQUENCE vehicle_events_pm_event_id_seq as bigint MAXVALUE 9223372036854775807;") 31 | # DROP VIEW before upgrading columns 32 | drop_opmi_all_rt_fields_joined = "DROP VIEW IF EXISTS opmi_all_rt_fields_joined;" 33 | op.execute(drop_opmi_all_rt_fields_joined) 34 | # Upgrade event_id columns to BIGINT 35 | op.alter_column( 36 | "vehicle_events", 37 | "pm_event_id", 38 | existing_type=sa.INTEGER(), 39 | type_=sa.BigInteger(), 40 | existing_nullable=False, 41 | autoincrement=True, 42 | ) 43 | op.alter_column( 44 | "vehicle_events", 45 | "previous_trip_stop_pm_event_id", 46 | existing_type=sa.INTEGER(), 47 | type_=sa.BigInteger(), 48 | existing_nullable=True, 49 | ) 50 | op.alter_column( 51 | "vehicle_events", 52 | "next_trip_stop_pm_event_id", 53 | existing_type=sa.INTEGER(), 54 | type_=sa.BigInteger(), 55 | existing_nullable=True, 56 | ) 57 | op.execute(view_opmi_all_rt_fields_joined) 58 | 59 | 60 | def downgrade() -> None: 61 | # Can not migrate from INT to BIGINT without losing data. 62 | pass 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/007_da8f80a3dd90_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: da8f80a3dd90 4 | Revises: 36e7a7aee148 5 | Create Date: 2025-04-11 09:43:50.433896 6 | 7 | This change re-indexes all PROD table indexes in an attempt to resolve DB query degradation. 8 | 9 | Details 10 | * upgrade -> REINDEX all indexes on PRDO 11 | 12 | * downgrade -> None 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.runtime_utils.process_logger import ProcessLogger 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "da8f80a3dd90" 23 | down_revision = "36e7a7aee148" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | # REINDEX all tables 30 | tables = [ 31 | "vehicle_events", 32 | "vehicle_trips", 33 | "static_feed_info", 34 | "static_trips", 35 | "static_routes", 36 | "static_stops", 37 | "static_stop_times", 38 | "static_calendar", 39 | "static_calendar_dates", 40 | "static_directions", 41 | "static_route_patterns", 42 | ] 43 | for table in tables: 44 | try: 45 | log = ProcessLogger(f"reindex_{table}") 46 | log.log_start() 47 | op.execute(sa.text(f"REINDEX TABLE {table};")) 48 | log.log_complete() 49 | except Exception as e: 50 | log.log_failure(e) 51 | 52 | 53 | def downgrade() -> None: 54 | # No downgrade 55 | pass 56 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_prod/008_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: 5e3066f113ff 4 | Revises: da8f80a3dd90 5 | Create Date: Wed Apr 23 11:16:12 EDT 2025 6 | 7 | Details 8 | This will clean up missing data from RDS performance issues/outage from 4/14-4/17 9 | This will also clean up duplication of data in prod from 4/17-4/22 10 | 11 | This is the same as staging/012_9b461d7aa53a_backfill_rt_rail_2025_04_04_to_2025_04_22.py 12 | 13 | * upgrade -> Delete all records from 4/4 to 4/23 in vehicle events and vehicle_trips 14 | -> Set all flags to "unprocessed" in metadata log from 4/4 to 4/22 15 | * downgrade -> Nothing 16 | """ 17 | 18 | import os 19 | import tempfile 20 | import logging 21 | 22 | import polars as pl 23 | import pyarrow as pa 24 | import pyarrow.parquet as pq 25 | from typing import List 26 | 27 | from alembic import op 28 | import sqlalchemy as sa 29 | from sqlalchemy.exc import ProgrammingError 30 | 31 | from lamp_py.aws.s3 import download_file, upload_file 32 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 33 | 34 | # revision identifiers, used by Alembic. 
35 | revision = "5e3066f113ff" 36 | down_revision = "da8f80a3dd90" 37 | branch_labels = None 38 | depends_on = None 39 | 40 | 41 | def upgrade() -> None: 42 | 43 | # SELECT FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250423;" 44 | 45 | clear_events = "DELETE FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250422;" 46 | op.execute(clear_events) 47 | 48 | clear_trips = "DELETE FROM vehicle_trips WHERE service_date >= 20250404 AND service_date <= 20250422;" 49 | op.execute(clear_trips) 50 | 51 | # Query to Check 52 | # SELECT created_on, rail_pm_processed, rail_pm_process_fail 53 | # FROM public.metadata_log 54 | # WHERE created_on > '2025-04-04' and created_on < '2025-04-22 23:59:59' 55 | # AND (path LIKE '%/RT_TRIP_UPDATES/%' or path LIKE '%/RT_VEHICLE_POSITIONS/%') 56 | # ORDER BY created_on; 57 | 58 | try: 59 | update_md_query = """ 60 | UPDATE 61 | metadata_log 62 | SET 63 | rail_pm_process_fail = false 64 | , rail_pm_processed = false 65 | WHERE 66 | created_on > '2025-04-04 00:00:00' 67 | and created_on < '2025-04-22 23:59:59' 68 | and ( 69 | path LIKE '%/RT_TRIP_UPDATES/%' 70 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 71 | ) 72 | ; 73 | """ 74 | md_manager = DatabaseManager(DatabaseIndex.METADATA) 75 | md_manager.execute(sa.text(update_md_query)) 76 | 77 | except ProgrammingError as error: 78 | # Error 42P01 is an 'Undefined Table' error. This occurs when there is 79 | # no metadata_log table in the rail performance manager database 80 | # 81 | # Raise all other sql errors 82 | original_error = error.orig 83 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 84 | logging.info("No Metadata Table in Rail Performance Manager") 85 | else: 86 | raise 87 | 88 | 89 | def downgrade() -> None: 90 | pass 91 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/002_1b53fd278b10_fix_trip_id_length.py: -------------------------------------------------------------------------------- 1 | """fix trip id length 2 | 3 | Revision ID: 1b53fd278b10 4 | Revises: 5d9a7ee21ae5 5 | Create Date: 2023-11-27 16:25:42.657967 6 | 7 | Details 8 | * upgrade -> change "trip_id" field from length 128 to 512 9 | * downgrade -> change "trip_id" field from length 512 to 128 10 | """ 11 | 12 | from alembic import op 13 | import sqlalchemy as sa 14 | 15 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_001 import ( 16 | view_opmi_all_rt_fields_joined, 17 | ) 18 | 19 | # revision identifiers, used by Alembic. 20 | revision = "1b53fd278b10" 21 | down_revision = "5d9a7ee21ae5" 22 | branch_labels = None 23 | depends_on = None 24 | 25 | 26 | def upgrade() -> None: 27 | # ### commands auto generated by Alembic - please adjust! 
### 28 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 29 | op.alter_column( 30 | "static_route_patterns", 31 | "representative_trip_id", 32 | existing_type=sa.VARCHAR(length=128), 33 | type_=sa.String(length=512), 34 | existing_nullable=False, 35 | ) 36 | op.alter_column( 37 | "static_stop_times", 38 | "trip_id", 39 | existing_type=sa.VARCHAR(length=128), 40 | type_=sa.String(length=512), 41 | existing_nullable=False, 42 | ) 43 | op.alter_column( 44 | "static_trips", 45 | "trip_id", 46 | existing_type=sa.VARCHAR(length=128), 47 | type_=sa.String(length=512), 48 | existing_nullable=False, 49 | ) 50 | op.alter_column( 51 | "temp_event_compare", 52 | "trip_id", 53 | existing_type=sa.VARCHAR(length=128), 54 | type_=sa.String(length=512), 55 | existing_nullable=False, 56 | ) 57 | op.alter_column( 58 | "vehicle_trips", 59 | "trip_id", 60 | existing_type=sa.VARCHAR(length=128), 61 | type_=sa.String(length=512), 62 | existing_nullable=False, 63 | ) 64 | op.alter_column( 65 | "vehicle_trips", 66 | "static_trip_id_guess", 67 | existing_type=sa.VARCHAR(length=128), 68 | type_=sa.String(length=512), 69 | existing_nullable=True, 70 | ) 71 | op.execute(view_opmi_all_rt_fields_joined) 72 | # ### end Alembic commands ### 73 | 74 | 75 | def downgrade() -> None: 76 | # ### commands auto generated by Alembic - please adjust! ### 77 | op.execute("DROP VIEW IF EXISTS opmi_all_rt_fields_joined;") 78 | op.alter_column( 79 | "vehicle_trips", 80 | "static_trip_id_guess", 81 | existing_type=sa.String(length=512), 82 | type_=sa.VARCHAR(length=128), 83 | existing_nullable=True, 84 | ) 85 | op.alter_column( 86 | "vehicle_trips", 87 | "trip_id", 88 | existing_type=sa.String(length=512), 89 | type_=sa.VARCHAR(length=128), 90 | existing_nullable=False, 91 | ) 92 | op.alter_column( 93 | "temp_event_compare", 94 | "trip_id", 95 | existing_type=sa.String(length=512), 96 | type_=sa.VARCHAR(length=128), 97 | existing_nullable=False, 98 | ) 99 | op.alter_column( 100 | "static_trips", 101 | "trip_id", 102 | existing_type=sa.String(length=512), 103 | type_=sa.VARCHAR(length=128), 104 | existing_nullable=False, 105 | ) 106 | op.alter_column( 107 | "static_stop_times", 108 | "trip_id", 109 | existing_type=sa.String(length=512), 110 | type_=sa.VARCHAR(length=128), 111 | existing_nullable=False, 112 | ) 113 | op.alter_column( 114 | "static_route_patterns", 115 | "representative_trip_id", 116 | existing_type=sa.String(length=512), 117 | type_=sa.VARCHAR(length=128), 118 | existing_nullable=False, 119 | ) 120 | op.execute(view_opmi_all_rt_fields_joined) 121 | # ### end Alembic commands ### 122 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/003_ae6c6e4b2df5_extend_service_id_view.py: -------------------------------------------------------------------------------- 1 | """extend service_id_by_date_and_route 2 | 3 | Revision ID: ae6c6e4b2df5 4 | Revises: 1b53fd278b10 5 | Create Date: 2023-12-17 06:56:17.330783 6 | 7 | Details 8 | * upgrade -> extend service_id_by_date_and_route VIEW to generate values past current date 9 | * upgrade -> update canonical_stop_sequence to use row_number function instead of direct from static schedule 10 | 11 | * downgrade -> Nothing 12 | """ 13 | 14 | from alembic import op 15 | 16 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_003 import ( 17 | view_service_id_by_date_and_route, 18 | ) 19 | 20 | 21 | # revision identifiers, used by Alembic. 
22 | revision = "ae6c6e4b2df5" 23 | down_revision = "1b53fd278b10" 24 | branch_labels = None 25 | depends_on = None 26 | 27 | 28 | def upgrade() -> None: 29 | op.execute("DROP VIEW IF EXISTS service_id_by_date_and_route;") 30 | op.execute(view_service_id_by_date_and_route) 31 | 32 | op.create_index( 33 | "ix_static_trips_composite_4", 34 | "static_trips", 35 | ["static_version_key", "service_id"], 36 | unique=False, 37 | ) 38 | 39 | update_stop_sequences = ( 40 | "UPDATE vehicle_events " 41 | "SET canonical_stop_sequence = static_canon.stop_sequence " 42 | "FROM vehicle_events AS ve " 43 | "JOIN vehicle_trips AS vt " 44 | "ON ve.pm_trip_id = vt.pm_trip_id " 45 | "JOIN " 46 | "(" 47 | " select " 48 | " srp.direction_id " 49 | " , coalesce(st.branch_route_id, st.trunk_route_id) AS route_id " 50 | " , ROW_NUMBER () OVER (PARTITION BY srp.static_version_key, srp.direction_id, coalesce(st.branch_route_id, st.trunk_route_id) ORDER BY sst.stop_sequence) AS stop_sequence" 51 | " , ss.parent_station " 52 | " , srp.static_version_key " 53 | " from static_route_patterns srp " 54 | " JOIN static_trips st " 55 | " ON srp.representative_trip_id = st.trip_id " 56 | " AND srp.static_version_key = st.static_version_key " 57 | " JOIN static_stop_times sst " 58 | " ON srp.representative_trip_id = sst.trip_id " 59 | " AND srp.static_version_key = sst.static_version_key " 60 | " JOIN static_stops ss " 61 | " ON sst.stop_id = ss.stop_id " 62 | " AND sst.static_version_key = ss.static_version_key " 63 | " WHERE " 64 | " srp.route_pattern_typicality = 1" 65 | ") AS static_canon " 66 | "ON ve.parent_station = static_canon.parent_station " 67 | "AND vt.static_version_key = static_canon.static_version_key " 68 | "AND vt.direction_id = static_canon.direction_id " 69 | "AND coalesce(vt.branch_route_id, vt.trunk_route_id) = static_canon.route_id " 70 | "WHERE vehicle_events.pm_trip_id = ve.pm_trip_id " 71 | "AND vehicle_events.parent_station = static_canon.parent_station " 72 | ";" 73 | ) 74 | op.execute(update_stop_sequences) 75 | 76 | 77 | def downgrade() -> None: 78 | op.drop_index("ix_static_trips_composite_4", table_name="static_trips") 79 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/006_e20a4f3f8c03_fix_null_vehicle_consist.py: -------------------------------------------------------------------------------- 1 | """fix null vehicle consist 2 | 3 | Revision ID: e20a4f3f8c03 4 | Revises: 96187da84955 5 | Create Date: 2024-03-07 15:44:22.989929 6 | 7 | On March 5th 2024, the vehicle consist field was removed from the VehiclePositions GTFS-RT feed 8 | this broke our data pipeline requiring a switch to the multi_carriage_details field 9 | this migration should re-process our realtime data from March 5th to present to fix missing 10 | vehicle consist values 11 | """ 12 | 13 | from alembic import op 14 | import sqlalchemy as sa 15 | 16 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 17 | 18 | # revision identifiers, used by Alembic. 
19 | revision = "e20a4f3f8c03" 20 | down_revision = "96187da84955" 21 | branch_labels = None 22 | depends_on = None 23 | 24 | 25 | def upgrade() -> None: 26 | clear_events = "DELETE FROM vehicle_events WHERE service_date >= 20240305;" 27 | op.execute(clear_events) 28 | 29 | clear_trips = "DELETE FROM vehicle_trips WHERE service_date >= 20240305;" 30 | op.execute(clear_trips) 31 | 32 | update_md_query = """ 33 | UPDATE 34 | metadata_log 35 | SET rail_pm_processed = false 36 | WHERE 37 | ( 38 | "path" like '%RT_VEHICLE_POSITIONS%' 39 | OR "path" like '%RT_TRIP_UPDATES%' 40 | ) 41 | AND 42 | (substring("path", 'year=(\d+)') || '-' || substring("path", 'month=(\d+)') || '-' || substring("path", 'day=(\d+)'))::date >= '2024-3-5'::date 43 | ; 44 | """ 45 | md_manager = DatabaseManager(DatabaseIndex.METADATA) 46 | md_manager.execute(sa.text(update_md_query)) 47 | 48 | 49 | def downgrade() -> None: 50 | pass 51 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/009_32ba735d080c_add_revenue_columns.py: -------------------------------------------------------------------------------- 1 | """add revenue columns 2 | 3 | Revision ID: 32ba735d080c 4 | Revises: 896dedd8a4db 5 | Create Date: 2024-09-20 08:47:52.784591 6 | 7 | This change adds a boolean revenue column to the vehcile_trips table. 8 | Initially this will be filled with True and back-filled by a seperate operation 9 | 10 | Details 11 | * upgrade -> drop triggers and indexes from table and add revenue column 12 | 13 | * downgrade -> drop revenue column 14 | 15 | """ 16 | 17 | from alembic import op 18 | import sqlalchemy as sa 19 | 20 | from lamp_py.postgres.rail_performance_manager_schema import ( 21 | TempEventCompare, 22 | VehicleTrips, 23 | ) 24 | 25 | # revision identifiers, used by Alembic. 
26 | revision = "32ba735d080c" 27 | down_revision = "896dedd8a4db" 28 | branch_labels = None 29 | depends_on = None 30 | 31 | 32 | def upgrade() -> None: 33 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER rt_trips_update_branch_trunk;") 34 | op.execute(f"ALTER TABLE public.vehicle_trips DISABLE TRIGGER update_vehicle_trips_modified;") 35 | op.drop_index("ix_vehicle_trips_composite_1", table_name="vehicle_trips") 36 | op.drop_constraint("vehicle_trips_unique_trip", table_name="vehicle_trips") 37 | 38 | op.add_column("temp_event_compare", sa.Column("revenue", sa.Boolean(), nullable=True)) 39 | op.add_column("vehicle_trips", sa.Column("revenue", sa.Boolean(), nullable=True)) 40 | op.execute(sa.update(TempEventCompare).values(revenue=True)) 41 | op.execute(sa.update(VehicleTrips).values(revenue=True)) 42 | op.alter_column("temp_event_compare", "revenue", nullable=False) 43 | op.alter_column("vehicle_trips", "revenue", nullable=False) 44 | 45 | op.create_unique_constraint( 46 | "vehicle_trips_unique_trip", 47 | "vehicle_trips", 48 | ["service_date", "route_id", "trip_id"], 49 | ) 50 | op.create_index( 51 | "ix_vehicle_trips_composite_1", 52 | "vehicle_trips", 53 | ["route_id", "direction_id", "vehicle_id"], 54 | unique=False, 55 | ) 56 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER rt_trips_update_branch_trunk;") 57 | op.execute(f"ALTER TABLE public.vehicle_trips ENABLE TRIGGER update_vehicle_trips_modified;") 58 | 59 | 60 | def downgrade() -> None: 61 | op.drop_column("vehicle_trips", "revenue") 62 | op.drop_column("temp_event_compare", "revenue") 63 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/010_36e7a7aee148_upgrade_sequence.py: -------------------------------------------------------------------------------- 1 | """upgrade sequence 2 | 3 | Revision ID: 36e7a7aee148 4 | Revises: 32ba735d080c 5 | Create Date: 2025-01-07 13:57:50.433896 6 | 7 | This change upgrades the pm_event_id sequence type to bigint to avoid running out of keys 8 | 9 | Details 10 | * upgrade -> drop opmi view, upgrade sequence, update sequence storage columns 11 | 12 | * downgrade -> not possible, can't go from bigint to int 13 | 14 | """ 15 | 16 | from alembic import op 17 | import sqlalchemy as sa 18 | 19 | from lamp_py.migrations.versions.performance_manager_staging.sql_strings.strings_001 import ( 20 | view_opmi_all_rt_fields_joined, 21 | ) 22 | 23 | # revision identifiers, used by Alembic. 
24 | revision = "36e7a7aee148" 25 | down_revision = "32ba735d080c" 26 | branch_labels = None 27 | depends_on = None 28 | 29 | 30 | def upgrade() -> None: 31 | # Upgrade sequence to BIGINT 32 | op.execute("ALTER SEQUENCE vehicle_events_pm_event_id_seq as bigint MAXVALUE 9223372036854775807;") 33 | # DROP VIEW before upgrading columns 34 | drop_opmi_all_rt_fields_joined = "DROP VIEW IF EXISTS opmi_all_rt_fields_joined;" 35 | op.execute(drop_opmi_all_rt_fields_joined) 36 | # Upgrade event_id columns to BIGINT 37 | op.alter_column( 38 | "vehicle_events", 39 | "pm_event_id", 40 | existing_type=sa.INTEGER(), 41 | type_=sa.BigInteger(), 42 | existing_nullable=False, 43 | autoincrement=True, 44 | ) 45 | op.alter_column( 46 | "vehicle_events", 47 | "previous_trip_stop_pm_event_id", 48 | existing_type=sa.INTEGER(), 49 | type_=sa.BigInteger(), 50 | existing_nullable=True, 51 | ) 52 | op.alter_column( 53 | "vehicle_events", 54 | "next_trip_stop_pm_event_id", 55 | existing_type=sa.INTEGER(), 56 | type_=sa.BigInteger(), 57 | existing_nullable=True, 58 | ) 59 | op.execute(view_opmi_all_rt_fields_joined) 60 | 61 | 62 | def downgrade() -> None: 63 | # Can not migrate from INT to BIGINT without losing data. 64 | pass 65 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/011_5e3066f113ff_backfill_rt_rail_2025_04_04_to_2025_04_18.py: -------------------------------------------------------------------------------- 1 | """update_glides_location_column_names 2 | 3 | Revision ID: 5e3066f113ff 4 | Revises: 36e7a7aee148 5 | Create Date: Wed Apr 23 11:16:12 EDT 2025 6 | 7 | Details 8 | This will clean up missing data from RDS performance issues/outage from 4/14-4/17 9 | This will also clean up duplication of data in prod from 4/17-4/22 10 | 11 | * upgrade -> Delete all records from 4/4 to 4/23 in vehicle events and vehicle_trips 12 | -> Set all flags to "unprocessed" in metadata log from 4/4 to 4/22 13 | * downgrade -> Nothing 14 | """ 15 | 16 | import logging 17 | import os 18 | import tempfile 19 | import polars as pl 20 | import pyarrow as pa 21 | import pyarrow.parquet as pq 22 | from typing import List 23 | 24 | from alembic import op 25 | import sqlalchemy as sa 26 | from sqlalchemy.exc import ProgrammingError 27 | 28 | from lamp_py.aws.s3 import download_file, upload_file 29 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 30 | 31 | # revision identifiers, used by Alembic. 
32 | revision = "5e3066f113ff" 33 | down_revision = "36e7a7aee148" 34 | branch_labels = None 35 | depends_on = None 36 | 37 | 38 | def upgrade() -> None: 39 | # this migration partially failed due to a typo in the date range - 40 | # deleting the contents to make clear this was NOT successfully run 41 | # this job was rerun in the subsequent migration 42 | pass 43 | 44 | 45 | def downgrade() -> None: 46 | pass 47 | -------------------------------------------------------------------------------- /src/lamp_py/migrations/versions/performance_manager_staging/012_9b461d7aa53a_backfill_rt_rail_2025_04_04_to_2025_04_22.py: -------------------------------------------------------------------------------- 1 | """backfill_rt_rail_data_0404_to_0422 2 | 3 | Revision ID: 9b461d7aa53a 4 | Revises: 5e3066f113ff 5 | Create Date: Wed Apr 23 11:16:12 EDT 2025 6 | 7 | Details 8 | This will clean up missing data from RDS performance issues/outage from 4/14-4/17 9 | This will also clean up duplication of data in prod from 4/17-4/22 10 | 11 | This is a rerun due to incorrectly specified query in 5e3066f113ff for the metadata query. 12 | We are correcting that error and rerunning the whole migration again. 13 | 14 | * upgrade -> Delete all records from 4/4 to 4/23 in vehicle events and vehicle_trips 15 | -> Set all flags to "unprocessed" in metadata log from 4/4 to 4/22 16 | * downgrade -> Nothing 17 | """ 18 | 19 | import logging 20 | import os 21 | import tempfile 22 | import polars as pl 23 | import pyarrow as pa 24 | import pyarrow.parquet as pq 25 | from typing import List 26 | 27 | from alembic import op 28 | import sqlalchemy as sa 29 | from sqlalchemy.exc import ProgrammingError 30 | 31 | from lamp_py.aws.s3 import download_file, upload_file 32 | from lamp_py.postgres.postgres_utils import DatabaseIndex, DatabaseManager 33 | 34 | # revision identifiers, used by Alembic. 35 | revision = "9b461d7aa53a" 36 | down_revision = "5e3066f113ff" 37 | branch_labels = None 38 | depends_on = None 39 | 40 | 41 | def upgrade() -> None: 42 | 43 | # SELECT FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250422;" 44 | # ~ (974142 rows) 45 | clear_events = "DELETE FROM vehicle_events WHERE service_date >= 20250404 AND service_date <= 20250422;" 46 | op.execute(clear_events) 47 | 48 | # ~ (75788 rows) 49 | clear_trips = "DELETE FROM vehicle_trips WHERE service_date >= 20250404 AND service_date <= 20250422;" 50 | op.execute(clear_trips) 51 | 52 | # Query to Check 53 | # SELECT 54 | # created_on, 55 | # rail_pm_process_fail, 56 | # rail_pm_processed 57 | # FROM public.metadata_log 58 | # WHERE 59 | # created_on > '2025-04-04 00:00:00' 60 | # and created_on < '2025-04-22 23:59:59' 61 | # and ( 62 | # path LIKE '%/RT_TRIP_UPDATES/%' 63 | # or path LIKE '%/RT_VEHICLE_POSITIONS/%' 64 | # ) 65 | # ; 66 | 67 | try: 68 | update_md_query = """ 69 | UPDATE 70 | metadata_log 71 | SET 72 | rail_pm_process_fail = false 73 | , rail_pm_processed = false 74 | WHERE 75 | created_on > '2025-04-04 00:00:00' 76 | and created_on < '2025-04-22 23:59:59' 77 | and ( 78 | path LIKE '%/RT_TRIP_UPDATES/%' 79 | or path LIKE '%/RT_VEHICLE_POSITIONS/%' 80 | ) 81 | ; 82 | """ 83 | md_manager = DatabaseManager(DatabaseIndex.METADATA) 84 | md_manager.execute(sa.text(update_md_query)) 85 | 86 | except ProgrammingError as error: 87 | # Error 42P01 is an 'Undefined Table' error. 
This occurs when there is 88 | # no metadata_log table in the rail performance manager database 89 | # 90 | # Raise all other sql errors 91 | original_error = error.orig 92 | if original_error is not None and hasattr(original_error, "pgcode") and original_error.pgcode == "42P01": 93 | logging.info("No Metadata Table in Rail Performance Manager") 94 | else: 95 | raise 96 | 97 | 98 | def downgrade() -> None: 99 | pass 100 | -------------------------------------------------------------------------------- /src/lamp_py/mssql/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities for interacting with microsoft sql database """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/mssql/test_connect.py: -------------------------------------------------------------------------------- 1 | import sqlalchemy as sa 2 | from lamp_py.mssql.mssql_utils import MSSQLManager 3 | 4 | 5 | def start() -> None: 6 | """ 7 | Test MSSQL DB Connection 8 | """ 9 | db = MSSQLManager(verbose=True) 10 | select_query = sa.text("SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE';") 11 | for record in db.select_as_list(select_query): 12 | print(record) 13 | 14 | 15 | if __name__ == "__main__": 16 | start() 17 | -------------------------------------------------------------------------------- /src/lamp_py/performance_manager/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for consuming GTFS realtime parquet files and converting them into 3 | trip summaries that are compared to trips that are planned in the GTFS static 4 | schedule 5 | """ 6 | -------------------------------------------------------------------------------- /src/lamp_py/postgres/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities for interacting with postgres database """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/postgres/metadata_schema.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import sqlalchemy as sa 4 | from sqlalchemy.orm import declarative_base 5 | from sqlalchemy.sql.functions import now 6 | 7 | MetadataSqlBase: Any = declarative_base(name="Metadata") 8 | 9 | 10 | class MetadataLog(MetadataSqlBase): # pylint: disable=too-few-public-methods 11 | """Table for keeping track of parquet files in S3""" 12 | 13 | __tablename__ = "metadata_log" 14 | 15 | pk_id = sa.Column(sa.Integer, primary_key=True) 16 | rail_pm_processed = sa.Column(sa.Boolean, default=sa.false()) 17 | rail_pm_process_fail = sa.Column(sa.Boolean, default=sa.false()) 18 | path = sa.Column(sa.String(256), nullable=False, unique=True) 19 | created_on = sa.Column(sa.DateTime(timezone=True), server_default=now()) 20 | 21 | 22 | sa.Index( 23 | "ix_metadata_log_not_processed", 24 | MetadataLog.path, 25 | postgresql_where=(MetadataLog.rail_pm_processed == sa.false()), 26 | ) 27 | -------------------------------------------------------------------------------- /src/lamp_py/publishing/__init__.py: -------------------------------------------------------------------------------- 1 | """ Anything and Everything related to publicly publishing LAMP data """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/publishing/performancedata.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lamp_py.aws.s3 import upload_file 4 | from lamp_py.runtime_utils.remote_files import S3_PUBLIC 5 | 6 | 7 | def publish_performance_index() -> None: 8 | """ 9 | Upload index.html to https://performancedata.mbta.com bucket 10 | """ 11 | here = os.path.dirname(os.path.abspath(__file__)) 12 | index_file = "index.html" 13 | 14 | if "unset" in S3_PUBLIC: 15 | return 16 | 17 | local_index_path = os.path.join(here, index_file) 18 | upload_index_path = os.path.join(S3_PUBLIC, index_file) 19 | 20 | extra_args = { 21 | "ContentType": "text/html", 22 | } 23 | 24 | upload_file( 25 | file_name=local_index_path, 26 | object_path=upload_index_path, 27 | extra_args=extra_args, 28 | ) 29 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ Suite of utilities used when running data pipelines """ 2 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/alembic_migration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | from alembic.config import Config 5 | from alembic import command 6 | 7 | 8 | def get_alembic_config(db_name: str) -> Config: 9 | """ 10 | get alembic configuration for specified db_name 11 | 12 | will raise NotImplementedError if db_name is not supported 13 | """ 14 | here = os.path.dirname(os.path.abspath(__file__)) 15 | alembic_cfg_file = os.path.join(here, "..", "..", "..", "alembic.ini") 16 | alembic_cfg_file = os.path.abspath(alembic_cfg_file) 17 | logging.info("getting alembic config for %s from %s", db_name, alembic_cfg_file) 18 | 19 | db_names = ( 20 | "performance_manager_dev", 21 | "performance_manager_staging", 22 | "performance_manager_prod", 23 | "metadata_dev", 24 | "metadata_staging", 25 | "metadata_prod", 26 | ) 27 | 28 | if db_name not in db_names: 29 | raise NotImplementedError(f"Migration for {db_name} not implemented.") 30 | 31 | return Config(alembic_cfg_file, ini_section=db_name) 32 | 33 | 34 | def alembic_upgrade_to_head(db_name: str) -> None: 35 | """ 36 | upgrade db_name to head revision 37 | """ 38 | # load alembic configuation for db_name 39 | alembic_cfg = get_alembic_config(db_name) 40 | 41 | command.upgrade(alembic_cfg, revision="head") 42 | 43 | 44 | def alembic_downgrade_to_base(db_name: str) -> None: 45 | """ 46 | downgrade db_name to base revision 47 | """ 48 | # load alembic configuation for db_name 49 | alembic_cfg = get_alembic_config(db_name) 50 | 51 | command.downgrade(alembic_cfg, revision="base") 52 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/env_validation.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Iterable, List, Optional 3 | 4 | from lamp_py.runtime_utils.process_logger import ProcessLogger 5 | from lamp_py.__version__ import VERSION 6 | 7 | 8 | def validate_environment( 9 | required_variables: List[str], 10 | private_variables: Optional[List[str]] = None, 11 | optional_variables: Optional[List[str]] = None, 12 | db_prefixes: Iterable[str] = (), 13 | ) -> None: 14 | """ 15 | ensure that the environment has all the variables its required to have 16 | before starting triggering main, making certain errors easier to 
debug. 17 | """ 18 | process_logger = ProcessLogger("validate_env") 19 | process_logger.log_start() 20 | 21 | if private_variables is None: 22 | private_variables = [] 23 | 24 | metadata = {"lamp_version": VERSION} 25 | 26 | # every pipeline needs a service name for logging 27 | required_variables.append("SERVICE_NAME") 28 | 29 | # add required database variables 30 | for prefix in db_prefixes: 31 | required_variables += [ 32 | f"{prefix}_DB_HOST", 33 | f"{prefix}_DB_NAME", 34 | f"{prefix}_DB_PORT", 35 | f"{prefix}_DB_USER", 36 | ] 37 | # if db password is missing, db region is required to generate a 38 | # token to use as the password to the cloud database 39 | if os.environ.get(f"{prefix}_DB_PASSWORD", None) is None: 40 | required_variables.append("DB_REGION") 41 | 42 | # check for missing variables. add found variables to our logs. 43 | missing_required = [] 44 | for key in required_variables: 45 | value = os.environ.get(key, None) 46 | if value is None: 47 | missing_required.append(key) 48 | 49 | # do not log private variables 50 | if key in private_variables: 51 | value = "**********" 52 | metadata[key] = value 53 | 54 | # for optional variables, access ones that exist and add them to logs. 55 | if optional_variables: 56 | for key in optional_variables: 57 | value = os.environ.get(key, None) 58 | if value is not None: 59 | # do not log private variables 60 | if key in private_variables: 61 | value = "**********" 62 | metadata[key] = value 63 | 64 | process_logger.add_metadata(**metadata) 65 | 66 | # if required variables are missing, log a failure and throw. 67 | if missing_required: 68 | exception = EnvironmentError(f"Missing required environment variables {missing_required}") 69 | process_logger.log_failure(exception) 70 | raise exception 71 | 72 | process_logger.log_complete() 73 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/infinite_wait.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from lamp_py.aws.ecs import check_for_sigterm 5 | 6 | 7 | def infinite_wait(reason: str) -> None: 8 | """ 9 | When running on ECS, propagating an exception up the call stack and killing 10 | the processes will result in the process being restarted, to keep the task 11 | count at one. This method should be called instead when we want to pause 12 | the process for intervention before restarting. 
13 | """ 14 | # amount of time to sleep between logging statements 15 | sleep_time = 60 16 | count = 0 17 | 18 | while True: 19 | check_for_sigterm() 20 | 21 | # log every ten minutes 22 | if count == 10: 23 | logging.error("Pausing for %s", reason) 24 | count = 0 25 | 26 | # sleep 27 | time.sleep(sleep_time) 28 | count += 1 29 | -------------------------------------------------------------------------------- /src/lamp_py/runtime_utils/lamp_exception.py: -------------------------------------------------------------------------------- 1 | class GTFSIngestException(Exception): 2 | """ 3 | Generic exception for the py gtfs_rt_ingestion library 4 | """ 5 | 6 | 7 | class ConfigTypeFromFilenameException(GTFSIngestException): 8 | """ 9 | Unable to derive config type from a filename 10 | """ 11 | 12 | def __init__(self, filename: str): 13 | message = f"Unable to deduce Configuration Type from {filename}" 14 | super().__init__(message) 15 | self.filename = filename 16 | 17 | 18 | class ArgumentException(GTFSIngestException): 19 | """ 20 | General Error to throw when incoming events are malformed 21 | """ 22 | 23 | 24 | class NoImplException(GTFSIngestException): 25 | """ 26 | General Error for things LAMP hasn't implemented yet 27 | """ 28 | 29 | 30 | class IgnoreIngestion(GTFSIngestException): 31 | """ 32 | General Error for files GTFS Ingestion should ignore 33 | """ 34 | 35 | 36 | class AWSException(GTFSIngestException): 37 | """ 38 | General Error for raising with any AWS errors encountered. 39 | """ 40 | 41 | 42 | class LampExpectedNotFoundError(Exception): 43 | """ 44 | Exception raised when expected inputs are not available 45 | """ 46 | 47 | 48 | class LampInvalidProcessingError(Exception): 49 | """ 50 | Exception raised when invalid processing state is reached with inputs 51 | """ 52 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/README.md: -------------------------------------------------------------------------------- 1 | # Tableau Publisher 2 | 3 | The Tableau Publisher is an application that takes data created by the Rail Performance Manager application as parquet files and publishes them to the ITD Managed Tableau Instance as hyper files. 4 | 5 | ## Application Operation 6 | 7 | The application itself is run via a CloudWatch event that is set to trigger on a cron-like schedule. 8 | 9 | On each run, it iterates through a list of jobs that generate hyper files and upload them to the ITD Tableau server, where they can be used to generate dashboards and reports for external users. To generate a hyper file, the job reads a parquet file that has been created by upstream LAMP applications and converts it using the [Tableau Hyper API](https://www.tableau.com/developer/tools/hyper-api). The file is generated on local storage, and then uploaded to the ITD Managed Tableau server using the [Tableau Server Client](https://tableau.github.io/server-client-python/), a Python library wrapping the [Tableau REST API](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api.htm). 10 | 11 | ### Upstream Applications 12 | 13 | To simplify the conversion from parquet to hyper, the schemas for both are defined within this module. We also store the hardcoded S3 filepaths. Because of this, components of this library are used by other applications when writing the parquet files. 14 | 15 | ## Developer Note 16 | 17 | The Tableau Hyper API is not currently supported on Apple Silicon.
This means that local execution on Mac OSX with arm64 processors will not work without emulation. In light of that, imports from this directory will trigger `ModuleNotFound` exceptions if running on the wrong system. To avoid that, the `__init__.py` file includes a wrapper around components that are consumed by other applications. These functions will log an error when run without the desired dependencies. 18 | 19 | ### Installation without Tableau dependencies 20 | 21 | In `pyproject.toml`, there is an additional dependency group that contains the tableau dependencies. It is not marked optional, so these modules will be installed with `poetry install`. If you are on an arm64 architecture, you can avoid installing the tableau dependencies with `poetry install --without tableau`. This behavior is encoded in the `.envrc`, `docker-compose.yml`, and `Dockerfile` files in this repository, so you should get the desired behavior without additional arguments. 22 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/__init__.py: -------------------------------------------------------------------------------- 1 | """Utilities for Interacting with Tableau and Hyper files""" 2 | 3 | import logging 4 | from types import ModuleType 5 | from typing import Optional 6 | 7 | from lamp_py.postgres.postgres_utils import DatabaseManager 8 | 9 | # pylint: disable=C0103 (invalid-name) 10 | # pylint wants pipeline to conform to an UPPER_CASE constant naming style. its 11 | # a module though, so disabling to allow it to use normal import rules. 12 | pipeline: Optional[ModuleType] 13 | 14 | try: 15 | from . import pipeline 16 | except ModuleNotFoundError: 17 | pipeline = None 18 | 19 | # pylint: enable=C0103 (invalid-name) 20 | 21 | 22 | def start_parquet_updates(db_manager: DatabaseManager) -> None: 23 | """ 24 | wrapper around pipeline.start_parquet_updates function. if a module not 25 | found error occurs (which happens when using osx arm64 dependencies), log 26 | an error and do nothing. else, run the function. 27 | """ 28 | if pipeline is None: 29 | logging.error("Unable to run parquet files on this machine due to Module Not Found error") 30 | else: 31 | pipeline.start_parquet_updates(db_manager=db_manager) 32 | 33 | 34 | def clean_parquet_paths() -> None: 35 | """ 36 | wrapper around pipeline.clean_parquet_paths function. if a module not 37 | found error occurs (which happens when using osx arm64 dependencies), log 38 | an error and do nothing. else, run the function.
39 | """ 40 | if pipeline is None: 41 | logging.error("Unable to run parquet files on this machine due to Module Not Found error") 42 | else: 43 | pipeline.clean_parquet_paths() 44 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/conversions/convert_bus_performance_data.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from pyarrow import Table 3 | 4 | 5 | def apply_bus_analysis_conversions(polars_df: pl.DataFrame) -> Table: 6 | """ 7 | Function to apply final conversions to lamp data before outputting for tableau consumption 8 | """ 9 | # Convert datetime to Eastern Time 10 | polars_df = polars_df.with_columns( 11 | pl.col("stop_arrival_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 12 | pl.col("stop_departure_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 13 | pl.col("gtfs_travel_to_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 14 | pl.col("tm_scheduled_time_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 15 | pl.col("tm_actual_arrival_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 16 | pl.col("tm_actual_departure_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 17 | pl.col("gtfs_sort_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 18 | pl.col("gtfs_departure_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 19 | pl.col("gtfs_arrival_dt").dt.convert_time_zone(time_zone="America/New_York").dt.replace_time_zone(None), 20 | ) 21 | 22 | # Convert seconds columns to be aligned with Eastern Time 23 | polars_df = polars_df.with_columns( 24 | (pl.col("gtfs_travel_to_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d")) 25 | .dt.total_seconds() 26 | .alias("gtfs_travel_to_seconds"), 27 | (pl.col("stop_arrival_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d")) 28 | .dt.total_seconds() 29 | .alias("stop_arrival_seconds"), 30 | (pl.col("stop_departure_dt") - pl.col("service_date").str.strptime(pl.Date, "%Y%m%d")) 31 | .dt.total_seconds() 32 | .alias("stop_departure_seconds"), 33 | ) 34 | 35 | polars_df = polars_df.with_columns(pl.col("service_date").str.strptime(pl.Date, "%Y%m%d", strict=False)) 36 | 37 | return polars_df.to_arrow() 38 | -------------------------------------------------------------------------------- /src/lamp_py/tableau/jobs/rt_alerts.py: -------------------------------------------------------------------------------- 1 | import pyarrow 2 | 3 | from lamp_py.aws.s3 import download_file 4 | from lamp_py.performance_manager.alerts import AlertsS3Info 5 | from lamp_py.postgres.postgres_utils import DatabaseManager 6 | from lamp_py.tableau.hyper import HyperJob 7 | 8 | 9 | class HyperRtAlerts(HyperJob): 10 | """HyperJob for LAMP Alerts dataset""" 11 | 12 | def __init__(self) -> None: 13 | HyperJob.__init__( 14 | self, 15 | hyper_file_name="LAMP_ALERTS.hyper", 16 | remote_parquet_path=AlertsS3Info.s3_path, 17 | lamp_version=AlertsS3Info.file_version, 18 | ) 19 | 20 | @property 21 | def parquet_schema(self) -> pyarrow.schema: 22 | return AlertsS3Info.parquet_schema 23 | 24 | def create_parquet(self, _: DatabaseManager) -> None: 25 | raise NotImplementedError("Alerts Hyper Job does not create parquet file") 26 | 27 | def update_parquet(self, _: DatabaseManager) -> 
bool: 28 | download_file( 29 | object_path=self.remote_parquet_path, 30 | file_name=self.local_parquet_path, 31 | ) 32 | return False 33 | -------------------------------------------------------------------------------- /src/lamp_py/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/src/lamp_py/utils/__init__.py -------------------------------------------------------------------------------- /src/lamp_py/utils/clear_folder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | 5 | def clear_folder(folder: str) -> None: 6 | """ 7 | Delete contents of entire folder. 8 | """ 9 | for filename in os.listdir(folder): 10 | file_path = os.path.join(folder, filename) 11 | try: 12 | if os.path.isfile(file_path) or os.path.islink(file_path): 13 | os.unlink(file_path) 14 | elif os.path.isdir(file_path): 15 | shutil.rmtree(file_path) 16 | except Exception as _: # best-effort cleanup: ignore paths that cannot be removed 17 | pass 18 | -------------------------------------------------------------------------------- /src/lamp_py/utils/date_range_builder.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | 4 | # Build a list of dated file paths from a path template 5 | def build_data_range_paths(template_string: str, start_date: datetime, end_date: datetime) -> list[str]: 6 | """ 7 | Given a format-string template, fill in the {yy}/{mm}/{dd} placeholders with all the days between 8 | start_date and end_date (inclusive) and return the result as a list of strings 9 | """ 10 | 11 | # add 1 for inclusive 12 | date_diff_days = (start_date - end_date).days * -1 + 1 13 | 14 | date_paths = [] 15 | # walk one day at a time from start_date through end_date 16 | for i in range(0, date_diff_days): 17 | tmp = start_date + timedelta(days=i) 18 | 19 | # wrong format - good for delta though 20 | # prefix_date_part = f"{yy}/{mm:02d}/{dd:02d}" 21 | 22 | # prefix_date_part = f"year={yy}/month={mm}/day={dd}/" 23 | # prefix_whole_path = f"year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 24 | 25 | formatted = template_string.format(yy=tmp.year, mm=tmp.month, dd=tmp.day) 26 | date_paths.append(formatted) 27 | return date_paths 28 | -------------------------------------------------------------------------------- /src/lamp_py/utils/gtfs_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from typing import List 3 | import polars as pl 4 | 5 | from lamp_py.aws.s3 import object_exists 6 | from lamp_py.runtime_utils.process_logger import ProcessLogger 7 | from lamp_py.runtime_utils.remote_files import compressed_gtfs 8 | 9 | 10 | def gtfs_from_parquet(file: str, service_date: date) -> pl.DataFrame: 11 | """ 12 | Get GTFS data from specified file and service date 13 | 14 | This will read from s3_uri of file 15 | 16 | :param file: gtfs file to access (i.e. 
"feed_info") 17 | :param service_date: service date of requested GTFS data 18 | 19 | :return dataframe: 20 | data columns of parquet file for service_date 21 | """ 22 | logger = ProcessLogger("gtfs_from_parquet", file=file, service_date=service_date) 23 | logger.log_start() 24 | 25 | gtfs_year = service_date.year 26 | service_date_int = int(service_date.strftime("%Y%m%d")) 27 | 28 | gtfs_file = compressed_gtfs.parquet_path(gtfs_year, file).s3_uri 29 | 30 | if not object_exists(gtfs_file): 31 | gtfs_file = compressed_gtfs.parquet_path(gtfs_year - 1, file).s3_uri 32 | if not object_exists(gtfs_file): 33 | exception = FileNotFoundError(f"No GTFS archive files available for {service_date}") 34 | logger.log_failure(exception) 35 | raise exception 36 | 37 | logger.add_metadata(gtfs_file=gtfs_file) 38 | 39 | gtfs_df = ( 40 | pl.read_parquet(gtfs_file) 41 | .filter( 42 | (pl.col("gtfs_active_date") <= service_date_int), 43 | (pl.col("gtfs_end_date") >= service_date_int), 44 | ) 45 | .drop(["gtfs_active_date", "gtfs_end_date"]) 46 | ) 47 | logger.add_metadata(gtfs_row_count=gtfs_df.shape[0]) 48 | logger.log_complete() 49 | return gtfs_df 50 | 51 | 52 | def bus_route_ids_for_service_date(service_date: date) -> List[str]: 53 | """get a list of bus route ids for a given service date""" 54 | bus_routes = ( 55 | gtfs_from_parquet("routes", service_date).filter((pl.col("route_type") == 3)).get_column("route_id").unique() 56 | ) 57 | 58 | return bus_routes.to_list() 59 | 60 | 61 | def routes_for_service_date(service_date: date) -> pl.DataFrame: 62 | """get a list of all routes for a given service date""" 63 | routes = gtfs_from_parquet("routes", service_date) 64 | 65 | return routes 66 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/__init__.py -------------------------------------------------------------------------------- /tests/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/aws/__init__.py -------------------------------------------------------------------------------- /tests/bus_performance_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/bus_performance_manager/__init__.py -------------------------------------------------------------------------------- /tests/bus_performance_manager/test_bus_convert_for_tableau.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import polars as pl 4 | import pytest 5 | 6 | from lamp_py.tableau.conversions.convert_bus_performance_data import apply_bus_analysis_conversions 7 | 8 | 9 | # poetry run pytest -s tests/bus_performance_manager/test_bus_convert_for_tableau.py 10 | @pytest.mark.skip("temp skip - re-enable asap - need new data - Jun 2025") 11 | def test_apply_bus_analysis_conversions() -> None: 12 | """ 13 | Test extracted conversions for tableau user view 14 | """ 15 | df = pl.read_parquet("tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet") 16 | table = apply_bus_analysis_conversions(polars_df=df) 17 | print(df) 18 | print(table) 19 | 
-------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | this file contains fixtures that are intended to be used across multiple test 3 | files 4 | """ 5 | 6 | from typing import ( 7 | Iterator, 8 | List, 9 | Optional, 10 | Union, 11 | ) 12 | 13 | import pytest 14 | from _pytest.monkeypatch import MonkeyPatch 15 | from pyarrow import fs 16 | import pyarrow.dataset as pd 17 | 18 | from .test_resources import LocalS3Location 19 | 20 | 21 | @pytest.fixture(autouse=True, name="get_pyarrow_dataset_patch") 22 | def fixture_get_pyarrow_dataset_patch( 23 | monkeypatch: MonkeyPatch, 24 | ) -> Iterator[None]: 25 | """ 26 | the aws.s3 `_get_pyarrow_dataset` function reads parquet files from 27 | s3 and returns a pyarrow dataset. when testing on our github machines, we 28 | don't have access to s3, so all tests must be run against local files. 29 | monkeypatch the function to read from a local filepath. 30 | """ 31 | 32 | def mock__get_pyarrow_dataset( 33 | filename: Union[str, List[str]], 34 | filters: Optional[pd.Expression] = None, 35 | ) -> pd.Dataset: 36 | active_fs = fs.LocalFileSystem() 37 | 38 | if isinstance(filename, list): 39 | to_load = filename 40 | else: 41 | to_load = [filename] 42 | 43 | if len(to_load) == 0: 44 | return pd.dataset([]) 45 | 46 | ds = pd.dataset(to_load, filesystem=active_fs, partitioning="hive") 47 | if filters is not None: 48 | ds = ds.filter(filters) 49 | 50 | return ds 51 | 52 | monkeypatch.setattr("lamp_py.aws.s3._get_pyarrow_dataset", mock__get_pyarrow_dataset) 53 | 54 | yield 55 | 56 | 57 | @pytest.fixture(autouse=True, name="remote_file_locations_patch") 58 | def fixture_remote_file_locations_patch( 59 | monkeypatch: MonkeyPatch, 60 | ) -> Iterator[None]: 61 | """ 62 | We define S3 Filepaths in the S3Location class in remote_files.py 63 | that can be used in our different applications. When testing on github, we 64 | don't have access to s3, so tests need to be run against local files. Use 65 | monkeypatch to redefine how these utilities work. 
66 | """ 67 | monkeypatch.setattr("lamp_py.runtime_utils.remote_files.S3Location", LocalS3Location) 68 | 69 | yield 70 | -------------------------------------------------------------------------------- /tests/ingestion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/ingestion/__init__.py -------------------------------------------------------------------------------- /tests/ingestion/test_configuration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from lamp_py.ingestion.converter import ConfigType 4 | from lamp_py.runtime_utils.lamp_exception import ConfigTypeFromFilenameException 5 | 6 | UPDATE_FILENAME = "2022-01-01T00:00:02Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz" 7 | 8 | VEHICLE_POSITIONS_FILENAME = "2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz" 9 | 10 | ALERTS_FILENAME = "2022-01-01T00:00:38Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz" 11 | 12 | 13 | def test_filname_parsing() -> None: 14 | """ 15 | Check that we are able to get the correct Configuration type for multiple 16 | filenames 17 | """ 18 | trip_updates_type = ConfigType.from_filename(UPDATE_FILENAME) 19 | assert trip_updates_type == ConfigType.RT_TRIP_UPDATES 20 | 21 | vehicle_positions_type = ConfigType.from_filename(VEHICLE_POSITIONS_FILENAME) 22 | assert vehicle_positions_type == ConfigType.RT_VEHICLE_POSITIONS 23 | 24 | alerts_type = ConfigType.from_filename(ALERTS_FILENAME) 25 | assert alerts_type == ConfigType.RT_ALERTS 26 | 27 | with pytest.raises(ConfigTypeFromFilenameException): 28 | ConfigType.from_filename("this.is.a.bad.filename.json.gz") 29 | -------------------------------------------------------------------------------- /tests/ingestion/test_ingest.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=[W0621, W0611] 2 | # disable these warnings that are triggered by pylint not understanding how test 3 | # fixtures work. https://stackoverflow.com/q/59664605 4 | 5 | import os 6 | from queue import Queue 7 | import pytest 8 | 9 | from lamp_py.ingestion.converter import ConfigType 10 | from lamp_py.runtime_utils.lamp_exception import NoImplException 11 | from lamp_py.runtime_utils.lamp_exception import IgnoreIngestion 12 | from lamp_py.ingestion.convert_gtfs_rt import GtfsRtConverter 13 | 14 | 15 | TEST_FILE_DIR = os.path.join(os.path.dirname(__file__), "test_files") 16 | 17 | 18 | def test_each_config_type() -> None: 19 | """ 20 | Test that each config type maps to a converter instance and that they map 21 | correctly. 
22 | """ 23 | config_type_map = { 24 | ConfigType.RT_ALERTS: GtfsRtConverter, 25 | ConfigType.RT_TRIP_UPDATES: GtfsRtConverter, 26 | ConfigType.RT_VEHICLE_POSITIONS: GtfsRtConverter, 27 | ConfigType.BUS_TRIP_UPDATES: GtfsRtConverter, 28 | ConfigType.BUS_VEHICLE_POSITIONS: GtfsRtConverter, 29 | } 30 | for config_type, converter_type in config_type_map.items(): 31 | converter = GtfsRtConverter(config_type, Queue()) 32 | assert isinstance(converter, converter_type) 33 | 34 | bad_config_types = [ 35 | ConfigType.VEHICLE_COUNT, 36 | ConfigType.ERROR, 37 | ConfigType.SCHEDULE, 38 | ] 39 | 40 | for config_type in bad_config_types: 41 | with pytest.raises(NoImplException): 42 | converter = GtfsRtConverter(config_type, Queue()) 43 | 44 | # with pytest.raises(IgnoreIngestion): 45 | # converter = GtfsRtConverter(ConfigType.LIGHT_RAIL, Queue()) 46 | -------------------------------------------------------------------------------- /tests/ingestion/test_light_rail_gps.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lamp_py.ingestion.light_rail_gps import raw_gps_schema 4 | from lamp_py.ingestion.light_rail_gps import dataframe_from_gz 5 | 6 | from ..test_resources import test_files_dir 7 | 8 | mock_file_list = [ 9 | os.path.join( 10 | test_files_dir, 11 | "INCOMING/2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz", 12 | ) 13 | ] 14 | 15 | 16 | def test_light_rail_gps() -> None: 17 | """ 18 | test gtfs_events_for_date pipeline 19 | """ 20 | dataframe, archive_files, error_files = dataframe_from_gz(mock_file_list) 21 | 22 | assert len(archive_files) == 1 23 | 24 | assert len(error_files) == 0 25 | 26 | assert dataframe.schema == raw_gps_schema 27 | 28 | assert dataframe.shape[0] == 190 29 | -------------------------------------------------------------------------------- /tests/ingestion_tm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/ingestion_tm/__init__.py -------------------------------------------------------------------------------- /tests/ingestion_tm/test_ingest.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Set, List 3 | 4 | from lamp_py.ingestion_tm.tm_export import TMExport 5 | from lamp_py.ingestion_tm.ingest import get_ingestion_jobs 6 | 7 | 8 | def get_tm_export_subclasses( 9 | cls: type[TMExport] = TMExport, 10 | ) -> Set[type[TMExport]]: 11 | """ 12 | recursively get all of the concrete TMExport child classes 13 | """ 14 | subclasses: List[type[TMExport]] = [] 15 | for subclass in cls.__subclasses__(): 16 | if inspect.isabstract(subclass): 17 | subclasses += get_tm_export_subclasses(subclass) 18 | else: 19 | subclasses.append(subclass) 20 | 21 | return set(subclasses) 22 | 23 | 24 | def test_ingestion_job_count() -> None: 25 | """ 26 | test that the ingestion pipeline is aware of each tm export class 27 | """ 28 | # get all of the jobs run in ingestion, assert its not empty 29 | ingestion_jobs = get_ingestion_jobs() 30 | job_types = {type(job) for job in ingestion_jobs} 31 | assert job_types 32 | 33 | # get all potential jobs based on subclasses. 
assert its not empty 34 | all_job_types = get_tm_export_subclasses() 35 | assert all_job_types 36 | 37 | # ensure all job types are accounted for in ingestion 38 | assert all_job_types == job_types, f"Missing instances for subclasses: {all_job_types - job_types}" 39 | -------------------------------------------------------------------------------- /tests/performance_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/performance_manager/__init__.py -------------------------------------------------------------------------------- /tests/performance_manager/test_backup_trips_match.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | import polars as pl 3 | from lamp_py.performance_manager.l1_cte_statements import static_trips_subquery_pl 4 | from lamp_py.performance_manager.l1_rt_trips import backup_trips_match_pl 5 | 6 | 7 | @patch( 8 | "lamp_py.performance_manager.l1_cte_statements.GTFS_ARCHIVE", "https://performancedata.mbta.com/lamp/gtfs_archive" 9 | ) 10 | def test_backup_trips_match() -> None: 11 | """ 12 | test backup_trips_match 13 | """ 14 | # ┌─────────────────────────┬──────────────┬───────────────────┬───────────────────┬────────────────┐ 15 | # │ static_trip_id ┆ direction_id ┆ static_stop_count ┆ static_start_time ┆ route_id │ 16 | # │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ 17 | # │ str ┆ i64 ┆ u32 ┆ str ┆ str │ 18 | # ╞═════════════════════════╪══════════════╪═══════════════════╪═══════════════════╪════════════════╡ 19 | rt_trips_raw = pl.read_csv( 20 | "tests/test_files/replace_perf_mgr_query_test_data/20250415_rt_trips_for_backup_match_subquery.csv", 21 | infer_schema=False, 22 | ) 23 | rt_trips = rt_trips_raw.with_columns( 24 | pl.when(pl.col("direction_id") == "f").then(pl.lit(False)).otherwise(pl.lit(True)).alias("direction_id"), 25 | pl.col("start_time").cast(pl.Int32).alias("start_time"), 26 | ) 27 | 28 | static_trips = static_trips_subquery_pl(20250415) 29 | backup_matched_trips = backup_trips_match_pl(rt_trips, static_trips) 30 | 31 | assert backup_matched_trips.height == 1299 32 | -------------------------------------------------------------------------------- /tests/performance_manager/test_l0_gtfs_rt_events.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | from lamp_py.performance_manager.l0_rt_vehicle_positions import ( 5 | get_vp_dataframe, 6 | transform_vp_datatypes, 7 | ) 8 | from lamp_py.performance_manager.l0_rt_trip_updates import ( 9 | get_and_unwrap_tu_dataframe, 10 | ) 11 | from lamp_py.performance_manager.gtfs_utils import ( 12 | add_missing_service_dates, 13 | service_date_from_timestamp, 14 | ) 15 | 16 | from ..test_resources import test_files_dir, csv_to_vp_parquet 17 | 18 | 19 | def test_service_date_from_timestamp() -> None: 20 | """ 21 | test that the service date from timestamp function correctly handles 22 | timestamps around the threshold when the service date switches over. 
23 | """ 24 | dst_expected = { 25 | # dst started on 8 march 2020, the clock goes from 1:59 -> 3:00 26 | 20200307: [ 27 | 1583650200, # 1:50 am 28 | 1583650740, # 1:59 am 29 | 1583650799, # 1:59:59 am 30 | ], 31 | 20200308: [ 32 | 1583650800, # 3:00 am 33 | 1583651400, # 3:10 am 34 | ], 35 | # dst ended on 1 nov 2020, the clock goes from 2:00 -> 1:00 36 | 20201031: [ 37 | 1604209800, # 1:50 am 38 | 1604210340, # 1:59 am 39 | 1604210399, # 1:59:59 am 40 | 1604210400, # 1:00 am (second time) 41 | 1604214000, # 2:00 am 42 | 1604214000, # 2:00 am 43 | 1604217000, # 2:50 am 44 | 1604217540, # 2:59 am 45 | 1604217599, # 2:59:59 am 46 | ], 47 | 20201101: [ 48 | 1604217600, # 3:00 am 49 | 1604218200, # 3:10 am 50 | ], 51 | } 52 | 53 | for service_date, timestamps in dst_expected.items(): 54 | for timestamp in timestamps: 55 | assert service_date == service_date_from_timestamp(timestamp) 56 | 57 | 58 | def test_vp_missing_service_date(tmp_path: pathlib.Path) -> None: 59 | """ 60 | test that missing service dates in gtfs-rt vehicle position files can be 61 | correctly backfilled. 62 | """ 63 | csv_file = os.path.join(test_files_dir, "vp_missing_start_date.csv") 64 | 65 | parquet_folder = tmp_path.joinpath("RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=11") 66 | parquet_folder.mkdir(parents=True) 67 | parquet_file = str(parquet_folder.joinpath("flat_file.parquet")) 68 | 69 | csv_to_vp_parquet(csv_file, parquet_file) 70 | 71 | events = get_vp_dataframe(to_load=[parquet_file], route_ids=["Blue"]) 72 | events = transform_vp_datatypes(events) 73 | 74 | # ensure that there are NaN service dates 75 | assert events["service_date"].hasnans 76 | 77 | # add the service dates that are missing 78 | events = add_missing_service_dates(events, timestamp_key="vehicle_timestamp") 79 | 80 | # check that new service dates match existing and are numbers 81 | assert len(events["service_date"].unique()) == 1 82 | assert not events["service_date"].hasnans 83 | 84 | 85 | def test_tu_missing_service_date() -> None: 86 | """ 87 | test that trip update gtfs data with missing service dates can be processed 88 | correctly. 
89 | """ 90 | parquet_file = os.path.join(test_files_dir, "tu_missing_start_date.parquet") 91 | events = get_and_unwrap_tu_dataframe([parquet_file], route_ids=["Blue"]) 92 | 93 | # check that NaN service dates exist from reading the file 94 | assert events["service_date"].hasnans 95 | 96 | events = add_missing_service_dates(events_dataframe=events, timestamp_key="timestamp") 97 | 98 | # check that all service dates exist and are the same 99 | assert not events["service_date"].hasnans 100 | assert len(events["service_date"].unique()) == 1 101 | -------------------------------------------------------------------------------- /tests/performance_manager/test_static_trips_subquery.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | import polars as pl 3 | from polars.testing import assert_frame_equal 4 | from lamp_py.performance_manager.l1_cte_statements import static_trips_subquery_pl 5 | 6 | 7 | @patch( 8 | "lamp_py.performance_manager.l1_cte_statements.GTFS_ARCHIVE", "https://performancedata.mbta.com/lamp/gtfs_archive" 9 | ) 10 | def test_static_trips_subquery_pl() -> None: 11 | """ 12 | Passing unit test for static_trips_subquery implementation in polars/parquet 13 | """ 14 | 15 | static_trips_pl = static_trips_subquery_pl(20250410).sort(by="static_trip_id") 16 | 17 | compare_sql = pl.read_csv( 18 | "tests/test_files/replace_perf_mgr_query_test_data/staging_test_summary_sub.csv", infer_schema=False 19 | ) 20 | 21 | # need to do a few things because the csv output doesn't do types well 22 | static_trips_sql = compare_sql.with_columns( 23 | pl.col("static_stop_count").cast(pl.Int16), 24 | pl.col("static_start_time").cast(pl.Int32), 25 | pl.when(pl.col("direction_id") == "f").then(pl.lit(False)).otherwise(pl.lit(True)).alias("direction_id"), 26 | ) 27 | 28 | # assert against test csv for all rows 29 | assert_frame_equal(static_trips_pl, static_trips_sql, check_column_order=False) 30 | -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-01-01T00:00:03Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-05-04T15:59:48Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-05-04T15:59:48Z_https_cdn.mbta.com_realtime_Alerts_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-05-05T16_00_15Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_VehiclePositions_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-05-05T16_00_15Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_VehiclePositions_enhanced.json.gz -------------------------------------------------------------------------------- 
/tests/test_files/INCOMING/2022-05-08T06:04:57Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-05-08T06:04:57Z_https_cdn.mbta.com_realtime_TripUpdates_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-06-28T10_03_18Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_TripUpdates_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-06-28T10_03_18Z_https_mbta_busloc_s3.s3.amazonaws.com_prod_TripUpdates_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2022-07-05T12:35:16Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2022-07-05T12:35:16Z_https_cdn.mbta.com_realtime_VehiclePositions_enhanced.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/2024-05-01T02:30:11Z_s3_mbta_ctd_trc_data_rtr_prod_LightRailRawGPS.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/MBTA_GTFS.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/MBTA_GTFS.zip -------------------------------------------------------------------------------- /tests/test_files/INCOMING/empty.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/empty.json.gz -------------------------------------------------------------------------------- /tests/test_files/INCOMING/one_blank_record.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/INCOMING/one_blank_record.json.gz -------------------------------------------------------------------------------- /tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/PUBLIC_ARCHIVE/lamp/bus_vehicle_events/test_events.parquet -------------------------------------------------------------------------------- /tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2023/routes.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2023/routes.parquet -------------------------------------------------------------------------------- /tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2024/routes.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/PUBLIC_ARCHIVE/lamp/gtfs_archive/2024/routes.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/CALENDAR/timestamp=1682375024/f18c9f5747194660a793cf0cd6f9df90-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/CALENDAR/timestamp=1682375024/f18c9f5747194660a793cf0cd6f9df90-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/CALENDAR_DATES/timestamp=1682375024/7c0b0da47e284237a7b50df57e3ef33c-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/CALENDAR_DATES/timestamp=1682375024/7c0b0da47e284237a7b50df57e3ef33c-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/DIRECTIONS/timestamp=1682375024/562949d9931149f8a5d8f0cb2eb52c80-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/DIRECTIONS/timestamp=1682375024/562949d9931149f8a5d8f0cb2eb52c80-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/FEED_INFO/timestamp=1682375024/e84307ae774a4d8c8968c5e38e7affdc-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/FEED_INFO/timestamp=1682375024/e84307ae774a4d8c8968c5e38e7affdc-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/ROUTES/timestamp=1682375024/b4e038eb63da41fcb66eed81548f664a-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/ROUTES/timestamp=1682375024/b4e038eb63da41fcb66eed81548f664a-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/ROUTE_PATTERNS/timestamp=1682375024/57233d3677484fe1bd0373749c34cc63-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/ROUTE_PATTERNS/timestamp=1682375024/57233d3677484fe1bd0373749c34cc63-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_ALERTS/year=2020/month=2/day=9/hour=1/6ef6922c20064cb9a8f09a3b3b1d2783-0.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_ALERTS/year=2020/month=2/day=9/hour=1/6ef6922c20064cb9a8f09a3b3b1d2783-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=12/8e2c182968e24ecea3d37f03d6bae84d-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=12/8e2c182968e24ecea3d37f03d6bae84d-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=13/eaeee968b94b4a74b166df4b8ffd9f29-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_TRIP_UPDATES/year=2023/month=5/day=8/hour=13/eaeee968b94b4a74b166df4b8ffd9f29-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=12/1613b49e4fa1459eabe9c83553ef1045-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=12/1613b49e4fa1459eabe9c83553ef1045-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=13/9a1bb1c5269042a284b2ed57b4dfebb9-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2023/month=5/day=8/hour=13/9a1bb1c5269042a284b2ed57b4dfebb9-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=12/fcf91fbba92d418aa136d928c6243121-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=12/fcf91fbba92d418aa136d928c6243121-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=13/47ffb78637a5400aabdfd7c9c7142757-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/RT_VEHICLE_POSITIONS/year=2024/month=6/day=1/hour=13/47ffb78637a5400aabdfd7c9c7142757-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/STOPS/timestamp=1682375024/920a42ad1b5e4ef0942c7a1bc2ef2fea-0.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/STOPS/timestamp=1682375024/920a42ad1b5e4ef0942c7a1bc2ef2fea-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/STOP_TIMES/timestamp=1682375024/88c016320de440789357f14df6399d4c-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/STOP_TIMES/timestamp=1682375024/88c016320de440789357f14df6399d4c-0.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240601.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240601.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240811.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/STOP_CROSSING/120240811.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_GEO_NODE.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_GEO_NODE.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_ROUTE.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_ROUTE.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_TRIP.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_TRIP.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TM/TMMAIN_VEHICLE.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TM/TMMAIN_VEHICLE.parquet -------------------------------------------------------------------------------- /tests/test_files/SPRINGBOARD/TRIPS/timestamp=1682375024/cdca1ec8575c4705bb93bc76244c1a86-0.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/SPRINGBOARD/TRIPS/timestamp=1682375024/cdca1ec8575c4705bb93bc76244c1a86-0.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_BUSLOC_TU.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_BUSLOC_TU.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_BUSLOC_VP.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_BUSLOC_VP.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_ALERT.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_ALERT.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_TU.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_TU.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_TU_OLD.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_TU_OLD.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_VP.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_VP.parquet -------------------------------------------------------------------------------- /tests/test_files/ingestion_GTFS-RT_VP_OLD.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/ingestion_GTFS-RT_VP_OLD.parquet -------------------------------------------------------------------------------- /tests/test_files/short_list.json: -------------------------------------------------------------------------------- 1 | [ 2 | "mbta-ctd-dataplatform-dev-springboard/lamp/RT_VEHICLE_POSITIONS/year=2022/month=7/day=20/hour=10/c7be65cb26f04b9c86874a8b40195a72-0.parquet", 3 | "mbta-ctd-dataplatform-dev-springboard/lamp/RT_TRIP_UPDATES/year=2022/month=7/day=20/hour=10/8dee9a06766042fb8adeb2fa2b999c1a-0.parquet", 4 | "mbta-ctd-dataplatform-dev-springboard/lamp/FEED_INFO/timestamp=1668795415/dd9e9eb9f5d746a8ad3a0c0dbc73c521-0.parquet" 5 | ] -------------------------------------------------------------------------------- /tests/test_files/tu_missing_start_date.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mbta/lamp/85a285f8db862885377df9c3211c2ae042676979/tests/test_files/tu_missing_start_date.parquet -------------------------------------------------------------------------------- /tests/test_files/vp_missing_start_date.csv: -------------------------------------------------------------------------------- 1 | 
vehicle.current_status,vehicle.current_stop_sequence,vehicle.stop_id,vehicle.timestamp,vehicle.trip.direction_id,vehicle.trip.route_id,vehicle.trip.start_date,vehicle.trip.start_time,vehicle.vehicle.id,vehicle.trip.trip_id,vehicle.vehicle.label,vehicle.vehicle.consist,vehicle.multi_carriage_details 2 | STOPPED_AT,1,70059,1683547153,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 3 | INCOMING_AT,10,70057,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 4 | STOPPED_AT,10,70057,1683547246,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 5 | INCOMING_AT,20,70055,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 6 | IN_TRANSIT_TO,20,70055,1683547299,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 7 | STOPPED_AT,20,70055,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 8 | INCOMING_AT,30,70053,1683547429,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 9 | STOPPED_AT,30,70053,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 10 | INCOMING_AT,40,70051,1683547531,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 11 | STOPPED_AT,40,70051,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 12 | IN_TRANSIT_TO,50,70049,1683547652,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 13 | INCOMING_AT,50,70049,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 14 | STOPPED_AT,50,70049,1683547786,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 15 | IN_TRANSIT_TO,60,70047,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 16 | INCOMING_AT,60,70047,1683547846,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 17 | STOPPED_AT,60,70047,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 18 | IN_TRANSIT_TO,70,70045,1683547970,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 19 | INCOMING_AT,70,70045,,0,Blue,20230508,07:38:00,B-54768A0A,55458882,0713,, 20 | STOPPED_AT,70,70045,1683548133,0,Blue,,07:38:00,B-54768A0A,55458882,0713,, 21 | -------------------------------------------------------------------------------- /tests/test_resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | import pyarrow 5 | from pyarrow import csv, parquet 6 | 7 | from lamp_py.runtime_utils.remote_files import ( 8 | S3_SPRINGBOARD, 9 | S3_INCOMING, 10 | ) 11 | 12 | test_files_dir = os.path.join(os.path.dirname(__file__), "test_files") 13 | 14 | 15 | def csv_to_vp_parquet(csv_filepath: str, parquet_filepath: str) -> None: 16 | """ 17 | read vehicle position data in csv format and write it to a parquet file 18 | """ 19 | vp_csv_options = csv.ConvertOptions( 20 | column_types={ 21 | "vehicle.current_status": pyarrow.string(), 22 | "vehicle.current_stop_sequence": pyarrow.uint32(), 23 | "vehicle.stop_id": pyarrow.string(), 24 | "vehicle.timestamp": pyarrow.uint64(), 25 | "vehicle.trip.direction_id": pyarrow.uint8(), 26 | "vehicle.trip.route_id": pyarrow.string(), 27 | "vehicle.trip.trip_id": pyarrow.string(), 28 | "vehicle.trip.start_date": pyarrow.string(), 29 | "vehicle.trip.start_time": pyarrow.string(), 30 | "vehicle.vehicle.id": pyarrow.string(), 31 | "vehicle.vehicle.consist": pyarrow.string(), 32 | }, 33 | # in our ingestion, if a key is missing, the value written to the 34 | # parquet file is null. mimic this behavior by making empty strings 35 | # null instead of ''. 
36 | strings_can_be_null=True, 37 | ) 38 | 39 | table = csv.read_csv(csv_filepath, convert_options=vp_csv_options) 40 | parquet.write_table(table, parquet_filepath) 41 | 42 | 43 | incoming_dir = os.path.join(test_files_dir, S3_INCOMING) 44 | springboard_dir = os.path.join(test_files_dir, S3_SPRINGBOARD) 45 | 46 | 47 | @dataclass 48 | class LocalS3Location: 49 | """replace an s3 location wrapper class so it can be used in testing""" 50 | 51 | bucket: str 52 | prefix: str 53 | 54 | @property 55 | def s3_uri(self) -> str: 56 | """generate the local path to the test file for this object""" 57 | return os.path.join(test_files_dir, self.bucket, self.prefix) 58 | 59 | 60 | rt_vehicle_positions = LocalS3Location( 61 | bucket=S3_SPRINGBOARD, 62 | prefix="RT_VEHICLE_POSITIONS", 63 | ) 64 | 65 | tm_stop_crossings = LocalS3Location( 66 | bucket=S3_SPRINGBOARD, 67 | prefix="TM/STOP_CROSSING", 68 | ) 69 | tm_geo_node_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_GEO_NODE.parquet") 70 | tm_route_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_ROUTE.parquet") 71 | tm_trip_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_TRIP.parquet") 72 | tm_vehicle_file = LocalS3Location(bucket=S3_SPRINGBOARD, prefix="TM/TMMAIN_VEHICLE.parquet") 73 | -------------------------------------------------------------------------------- /tests/utils/test_date_range_builder.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from lamp_py.utils.date_range_builder import build_data_range_paths 3 | 4 | 5 | def test_simple_case() -> None: 6 | template = "year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 7 | 8 | out = build_data_range_paths(template, start_date=datetime(2025, 4, 1), end_date=datetime(2025, 4, 20)) 9 | print(out) 10 | 11 | assert out == [ 12 | "year=2025/month=4/day=1/2025-04-01T00:00:00.parquet", 13 | "year=2025/month=4/day=2/2025-04-02T00:00:00.parquet", 14 | "year=2025/month=4/day=3/2025-04-03T00:00:00.parquet", 15 | "year=2025/month=4/day=4/2025-04-04T00:00:00.parquet", 16 | "year=2025/month=4/day=5/2025-04-05T00:00:00.parquet", 17 | "year=2025/month=4/day=6/2025-04-06T00:00:00.parquet", 18 | "year=2025/month=4/day=7/2025-04-07T00:00:00.parquet", 19 | "year=2025/month=4/day=8/2025-04-08T00:00:00.parquet", 20 | "year=2025/month=4/day=9/2025-04-09T00:00:00.parquet", 21 | "year=2025/month=4/day=10/2025-04-10T00:00:00.parquet", 22 | "year=2025/month=4/day=11/2025-04-11T00:00:00.parquet", 23 | "year=2025/month=4/day=12/2025-04-12T00:00:00.parquet", 24 | "year=2025/month=4/day=13/2025-04-13T00:00:00.parquet", 25 | "year=2025/month=4/day=14/2025-04-14T00:00:00.parquet", 26 | "year=2025/month=4/day=15/2025-04-15T00:00:00.parquet", 27 | "year=2025/month=4/day=16/2025-04-16T00:00:00.parquet", 28 | "year=2025/month=4/day=17/2025-04-17T00:00:00.parquet", 29 | "year=2025/month=4/day=18/2025-04-18T00:00:00.parquet", 30 | "year=2025/month=4/day=19/2025-04-19T00:00:00.parquet", 31 | "year=2025/month=4/day=20/2025-04-20T00:00:00.parquet", 32 | ] 33 | 34 | 35 | def test_year_crossing() -> None: 36 | template = "year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 37 | 38 | out = build_data_range_paths(template, start_date=datetime(2024, 12, 30), end_date=datetime(2025, 1, 2)) 39 | print(out) 40 | 41 | assert out == [ 42 | "year=2024/month=12/day=30/2024-12-30T00:00:00.parquet", 43 | "year=2024/month=12/day=31/2024-12-31T00:00:00.parquet", 44 | 
"year=2025/month=1/day=1/2025-01-01T00:00:00.parquet", 45 | "year=2025/month=1/day=2/2025-01-02T00:00:00.parquet", 46 | ] 47 | 48 | 49 | def test_next_leap_year_2028() -> None: 50 | template = "year={yy}/month={mm}/day={dd}/{yy}-{mm:02d}-{dd:02d}T00:00:00.parquet" 51 | 52 | out = build_data_range_paths(template, start_date=datetime(2028, 2, 26), end_date=datetime(2028, 3, 2)) 53 | print(out) 54 | 55 | assert out == [ 56 | "year=2028/month=2/day=26/2028-02-26T00:00:00.parquet", 57 | "year=2028/month=2/day=27/2028-02-27T00:00:00.parquet", 58 | "year=2028/month=2/day=28/2028-02-28T00:00:00.parquet", 59 | "year=2028/month=2/day=29/2028-02-29T00:00:00.parquet", 60 | "year=2028/month=3/day=1/2028-03-01T00:00:00.parquet", 61 | "year=2028/month=3/day=2/2028-03-02T00:00:00.parquet", 62 | ] 63 | -------------------------------------------------------------------------------- /tests/utils/test_filter_bank.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import itertools 3 | from typing import Optional 4 | from unittest.mock import patch 5 | import polars as pl 6 | 7 | from lamp_py.utils.filter_bank import HeavyRailFilter, LightRailFilter 8 | 9 | # list files 10 | # grab latest 11 | # assert hardcodes still hold 12 | # get latest 13 | 14 | 15 | def test_hardcoded_terminal_prediction_names() -> None: 16 | # the stops listed for these filters are retrieved dynamically from gtfs. 17 | # ensure that the expected list contains all of the expected terminal values 18 | 19 | # associated runner: 20 | # runners/run_gtfs_rt_parquet_converter.py 21 | 22 | def list_station_child_stops_from_gtfs( 23 | stops: pl.DataFrame, parent_station: str, additional_filter: Optional[pl.Expr] = None 24 | ) -> pl.DataFrame: 25 | """ 26 | Filter gtfs stops by parent_station string, and additional filter if available 27 | """ 28 | df_parent_station = stops.filter(pl.col("parent_station") == parent_station) 29 | if additional_filter is not None: 30 | df_parent_station = df_parent_station.filter(additional_filter) 31 | return df_parent_station 32 | 33 | terminal_stop_ids = [] 34 | heavy_rail_filter = pl.col("vehicle_type") == 1 35 | 36 | # check that all stops in Filter lists exist 37 | service_date = datetime.now() 38 | stops = pl.read_parquet(f"https://performancedata.mbta.com/lamp/gtfs_archive/{service_date.year}/stops.parquet") 39 | 40 | for place_name in HeavyRailFilter._terminal_stop_place_names: 41 | gtfs_stops = list_station_child_stops_from_gtfs(stops, place_name, heavy_rail_filter) 42 | terminal_stop_ids.extend(gtfs_stops["stop_id"].to_list()) 43 | 44 | assert set(terminal_stop_ids).issuperset(set(HeavyRailFilter.terminal_stop_ids)) 45 | 46 | for stop in HeavyRailFilter.terminal_stop_ids: 47 | bb = stops.filter(pl.col("stop_id") == stop) 48 | assert stops.filter(pl.col("stop_id") == stop).height == 1 49 | 50 | for stop in LightRailFilter.terminal_stop_ids: 51 | bb = stops.filter(pl.col("stop_id") == stop) 52 | assert stops.filter(pl.col("stop_id") == stop).height == 1 53 | -------------------------------------------------------------------------------- /tests/utils/test_gtfs_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from unittest import mock 3 | 4 | from lamp_py.utils.gtfs_utils import ( 5 | bus_route_ids_for_service_date, 6 | routes_for_service_date, 7 | ) 8 | 9 | 10 | @mock.patch("lamp_py.utils.gtfs_utils.object_exists") 11 | def 
test_bus_routes_for_service_date(exists_patch: mock.MagicMock) -> None: 12 | """ 13 | Test that bus routes can be generated for a given service date. For the 14 | generated list ensure 15 | * they don't contain Subway, Commuter Rail, or Ferry routes 16 | * don't have a leading zero 17 | * contain a subset of known routes 18 | """ 19 | exists_patch.return_value = True 20 | 21 | service_date = date(year=2023, month=2, day=1) 22 | bus_routes = bus_route_ids_for_service_date(service_date) 23 | 24 | # check that we're getting a non empty list 25 | assert len(bus_routes) > 0 26 | 27 | subway_routes = [ 28 | "Green-E", 29 | "Green-B", 30 | "Green-D", 31 | "Green-C", 32 | "Red", 33 | "Blue", 34 | "Orange", 35 | ] 36 | 37 | for route in bus_routes: 38 | # ensure no commuter rails are being passed through 39 | assert route[:2] != "CR" 40 | 41 | # ensure no ferries are being passed through 42 | assert route[:4] != "Boat" 43 | 44 | # ensure no subways are being passed through 45 | assert route not in subway_routes 46 | 47 | # ensure our routes don't have leading zeros 48 | assert route[0] != "0" 49 | 50 | known_routes = [ 51 | "741", # Silver Line 1 52 | "34E", # Walpole Center - Forest Hills Station 53 | "100", # Elm Street - Wellington Station 54 | "504", # Watertown Yard - Federal Street & Franklin Street 55 | ] 56 | 57 | for route in known_routes: 58 | assert route in bus_routes 59 | 60 | 61 | @mock.patch("lamp_py.utils.gtfs_utils.object_exists") 62 | def test_routes_for_service_date(exists_patch: mock.MagicMock) -> None: 63 | """ 64 | Test that routes can be generated for a given service date. For the 65 | generated list ensure that all the route types available are represented 66 | """ 67 | exists_patch.return_value = True 68 | 69 | service_date = date(year=2023, month=2, day=1) 70 | routes = routes_for_service_date(service_date) 71 | 72 | # check that we're getting a non empty list 73 | assert len(routes) > 0 74 | assert routes["route_type"].unique().to_list() == [0, 1, 2, 3, 4] 75 | -------------------------------------------------------------------------------- /tests/utils/timezones.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | import pytest 3 | 4 | 5 | def test_timezone_typing_same_type() -> None: 6 | """ 7 | naive can compare with naive 8 | aware can compare with aware 9 | """ 10 | ts = ["2021-03-27 03:00", "2021-03-28 03:00"] 11 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 12 | assert (tz_naive == tz_naive).rename("naive_compared").all() 13 | tz_aware = tz_naive.dt.replace_time_zone("UTC").rename("tz_aware") 14 | assert (tz_aware == tz_aware).rename("naive_compared").all() 15 | 16 | 17 | def test_timezone_typing_us_eastern_vs_america_new_york_fail() -> None: 18 | """ 19 | US/Eastern can not compare with America/New_York even though they are both EDT/EST 20 | """ 21 | ts = ["2021-03-27 03:00", "2021-03-28 03:00"] 22 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 23 | tz_aware_ny = tz_naive.dt.replace_time_zone("America/New_York").rename("tz_aware_ny") 24 | tz_aware_eastern = tz_naive.dt.replace_time_zone("US/Eastern").rename("tz_aware_east") 25 | try: 26 | out_compared2 = (tz_aware_ny > tz_aware_eastern).rename("ny_vs_eastern") 27 | # this should fail...if it doesn't, something has gone awry 28 | assert False 29 | except pl.exceptions.SchemaError: 30 | assert True 31 | 32 | 33 | def test_timezone_typing_us_eastern_vs_utc_fail() -> None: 34 | """ 35 | UTC can not compare with America/New_York 36 | """ 37 | ts = 
["2021-03-27 03:00", "2021-03-28 03:00"] 38 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 39 | tz_aware = tz_naive.dt.replace_time_zone("UTC").rename("tz_aware") 40 | tz_aware_ny = tz_naive.dt.replace_time_zone("America/New_York").rename("tz_aware_ny") 41 | try: 42 | out_compared = (tz_aware_ny > tz_aware).rename("out compared") 43 | # this should fail...if it doesn't, something has gone awry 44 | assert False 45 | except pl.exceptions.SchemaError: 46 | assert True 47 | 48 | 49 | def test_timezone_typing_tz_vs_naive_fail() -> None: 50 | """ 51 | verify can't compare naive with aware 52 | """ 53 | ts = ["2021-03-27 03:00", "2021-03-28 03:00"] 54 | tz_naive = pl.Series("tz_naive", ts).str.to_datetime() 55 | tz_aware = tz_naive.dt.replace_time_zone("UTC").rename("tz_aware") 56 | 57 | try: 58 | # can't compare naive with aware 59 | out_compared = tz_naive > tz_aware 60 | # this should fail...if it doesn't, something has gone awry 61 | assert False 62 | except pl.exceptions.SchemaError: 63 | assert True 64 | --------------------------------------------------------------------------------