├── .codecov.yml ├── .github ├── CODEOWNERS └── workflows │ ├── jobstats-metrics-bench.yml │ ├── scrape-memory-metrics-bench.yml │ ├── lustre-metrics-bench.yml │ ├── quota-parsing.yml │ ├── testing.yml │ └── release.yml ├── clippy.toml ├── lustrefs-exporter ├── debian │ ├── docs │ ├── install │ ├── service │ ├── control │ ├── rules │ └── changelog ├── fixtures │ ├── jobstats_only │ │ ├── 2.14.0_162.txt │ │ ├── 2.14.0_164.txt │ │ ├── co-vm03.txt │ │ ├── ds86.txt │ │ └── some_empty.txt │ ├── lnetctl_stats.txt │ └── lnetctl_net_show.txt ├── src │ ├── historical_snapshots │ │ ├── lustrefs_exporter__tests__valid_fixture_lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.histsnap │ │ └── lustrefs_exporter__tests__valid_fixture_lustre-2-14-0__client__llite_client.txt.histsnap │ ├── snapshots │ │ ├── lustrefs_exporter__tests__valid_fixture_otel@lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.snap │ │ ├── lustrefs_exporter__routes__tests__lnet_stats_output_with_mock.snap │ │ ├── lustrefs_exporter__tests__valid_fixture_otel@lustre-2-14-0__client__llite_client.txt.snap │ │ └── lustrefs_exporter__routes__tests__net_show_output_with_mock.snap │ ├── otel_snapshots │ │ ├── lustrefs_exporter__tests__valid_fixture_otel@lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.otelsnap │ │ └── lustrefs_exporter__tests__valid_fixture_otel@lustre-2-14-0__client__llite_client.txt.otelsnap │ ├── main.rs │ ├── llite.rs │ ├── service.rs │ ├── host.rs │ ├── metrics.rs │ ├── lnet.rs │ └── quota.rs ├── Makefile ├── lustrefs_exporter.service ├── README.md ├── testcmds │ ├── cmds_test_lnet_stats_output_with_mock.json │ ├── cmds_test_concurrent_requests.json │ ├── cmds_test_net_show_output_with_mock.json │ └── cmds_test_app_routes.json ├── lustrefs_exporter.spec ├── LICENSE ├── Cargo.toml └── benches │ ├── common │ └── mod.rs │ ├── lustre_metrics.rs │ └── jobstats.rs ├── .taplo.toml ├── Makefile ├── rust-toolchain ├── lustre-collector ├── src │ ├── snapshots │ │ ├── 
lustre_collector__lnetctl_parser__tests__lnet_down.snap │ │ ├── lustre_collector__stats_parser__tests__empty_mdstats.snap │ │ ├── lustre_collector__time__tests__time_triple.snap │ │ ├── lustre_collector__node_stats_parsers__tests__empty_input.snap │ │ ├── lustre_collector__time__tests__time_triple_back_compat.snap │ │ ├── lustre_collector__recovery_status_parser__tests__target_recovery_stats.snap │ │ ├── lustre_collector__recovery_status_parser__tests__target_recovery_stats2.snap │ │ ├── lustre_collector__stats_parser__tests__mdstats.snap │ │ ├── lustre_collector__node_stats_parsers__tests__parse_meminfo_line.snap │ │ ├── lustre_collector__parser__tests__valid_fixture_lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.snap │ │ ├── lustre_collector__lnetctl_parser__tests__lnet_stats_parse.snap │ │ ├── lustre_collector__brw_stats_parser__tests__brw_stats_with_start_and_elapsed_time.snap │ │ ├── lustre_collector__node_stats_parsers__tests__cpu_stats.snap │ │ ├── lustre_collector__node_stats_parsers__tests__parse_meminfo.snap │ │ ├── lustre_collector__mdd_parser__tests__mdd_stats.snap │ │ ├── lustre_collector__tests__params.snap │ │ ├── lustre_collector__lnetctl_parser__tests__lnet_export_parse_no_bonding.snap │ │ ├── lustre_collector__lnetctl_parser__tests__lnet_net_parse.snap │ │ ├── lustre_collector__recovery_status_parser__tests__multiple.snap │ │ ├── lustre_collector__parser__tests__params.snap │ │ ├── lustre_collector__lnetctl_parser__tests__lnet_parse2.snap │ │ ├── lustre_collector__recovery_status_parser__tests__waiting_for_clients.snap │ │ ├── lustre_collector__recovery_status_parser__tests__multiple_recovering.snap │ │ └── lustre_collector__stats_parser__tests__stats.snap │ ├── mds │ │ ├── snapshots │ │ │ ├── lustre_collector__mds__client_count_parser__test__is_client.snap │ │ │ ├── lustre_collector__mds__client_count_parser__test__is_not_client.snap │ │ │ ├── lustre_collector__mds__client_count_parser__test__export_param.snap │ │ │ ├── 
lustre_collector__mds__client_count_parser__test__interface_clients.snap │ │ │ ├── lustre_collector__mds__client_count_parser__test__no_interface_clients.snap │ │ │ ├── lustre_collector__mds__client_count_parser__test__multiple_interface_clients.snap │ │ │ ├── lustre_collector__mds__client_count_parser__test__client_count_parser_two_clients.snap │ │ │ ├── lustre_collector__mds__client_count_parser__test__client_count_parser_zero_clients.snap │ │ │ ├── lustre_collector__mds__client_count_parser__test__client_count_parser_multiple_fs.snap │ │ │ └── lustre_collector__mds__client_count_parser__test__client_count_parser_one_client.snap │ │ ├── mdt_parser.rs │ │ └── mod.rs │ ├── mgs │ │ ├── mod.rs │ │ └── mgs_parser.rs │ ├── fixtures │ │ ├── mdd.txt │ │ ├── valid │ │ │ ├── lustre-2.14.0_ddn145 │ │ │ │ └── 2.14.0_ddn145_recovery.txt │ │ │ └── lustre-2-14-0 │ │ │ │ └── client │ │ │ │ └── llite_client.txt │ │ ├── recovery-multiple.txt │ │ ├── recovery-waiting-for-clients.txt │ │ ├── brw_stats_empty.txt │ │ ├── recovery-multiple-recovering.txt │ │ ├── brw_stats_with_start_and_elapsed_time.txt │ │ ├── brw_stats_with_data.txt │ │ └── osd.txt │ ├── quota │ │ └── mod.rs │ ├── ldlm │ │ ├── mod.rs │ │ └── ldlm_service_parser.rs │ ├── error.rs │ ├── exports_parser.rs │ ├── time.rs │ ├── llite │ │ └── mod.rs │ ├── oss │ │ ├── mod.rs │ │ ├── oss_parser.rs │ │ └── obdfilter_parser.rs │ ├── lib.rs │ ├── base_parsers.rs │ ├── mdd_parser.rs │ └── main.rs ├── benches │ └── combine_performance.rs ├── Cargo.toml ├── LICENSE └── combine_mem_usage.json ├── .editorconfig ├── .gitignore ├── .gitattributes ├── .cargo └── config.toml └── Cargo.toml /.codecov.yml: -------------------------------------------------------------------------------- 1 | flag_management: 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jgrund 2 | 
-------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- 1 | allow-unwrap-in-tests = true 2 | -------------------------------------------------------------------------------- /lustrefs-exporter/debian/docs: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | -------------------------------------------------------------------------------- /lustrefs-exporter/debian/install: -------------------------------------------------------------------------------- 1 | /usr/bin/prometheus-lustrefs-exporter 2 | -------------------------------------------------------------------------------- /.taplo.toml: -------------------------------------------------------------------------------- 1 | include = ["**/*/Cargo.toml"] 2 | 3 | [formatting] 4 | reorder_keys = true 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: rpm 2 | rpm: 3 | $(MAKE) -C lustrefs-exporter rpm 4 | 5 | .PHONY: deb 6 | deb: 7 | $(MAKE) -C lustrefs-exporter deb 8 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "1.89.0" 3 | profile = "default" 4 | targets = ["x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl"] 5 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__lnetctl_parser__tests__lnet_down.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/lnetctl_parser.rs 3 | expression: x 4 | --- 5 | [] 6 | -------------------------------------------------------------------------------- 
/lustre-collector/src/snapshots/lustre_collector__stats_parser__tests__empty_mdstats.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/stats_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | [], 7 | "", 8 | ) 9 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__time__tests__time_triple.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/time.rs 3 | expression: result 4 | --- 5 | ( 6 | "1684948453.142852820", 7 | "", 8 | ) 9 | -------------------------------------------------------------------------------- /lustrefs-exporter/fixtures/jobstats_only/2.14.0_162.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:099ac60512738d1c71ffc1fbf52c5a3bf6f688c65ef5014578c24dd9a53b924d 3 | size 37136 4 | -------------------------------------------------------------------------------- /lustrefs-exporter/fixtures/jobstats_only/2.14.0_164.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b29b4bde4a76ae920546cf398ca2de3244c82b613e95bfe0b6ad6f0b89eb0b8f 3 | size 42206 4 | -------------------------------------------------------------------------------- /lustrefs-exporter/fixtures/jobstats_only/co-vm03.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b58e8039b5f13ef7d2c2b91b2cebfed0e88c0cfe7c18c0f57a69de29cdac5bd0 3 | size 105020838 4 | -------------------------------------------------------------------------------- /lustrefs-exporter/fixtures/jobstats_only/ds86.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | 
oid sha256:0b7958a47457c04d1f83eb873fe151ad28b1b524a0fc6e52890f951342bb3c5a 3 | size 449170582 4 | -------------------------------------------------------------------------------- /lustrefs-exporter/fixtures/jobstats_only/some_empty.txt: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c3aaa9e9f0daf3ca55b889bec0b1b57efdeaf2864da58b8ac6196956a48fb43e 3 | size 2260 4 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__is_client.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | 1, 7 | "\n", 8 | ) 9 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__is_not_client.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | 0, 7 | "\n", 8 | ) 9 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/historical_snapshots/lustrefs_exporter__tests__valid_fixture_lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.histsnap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/main.rs 3 | expression: x 4 | --- 5 | 6 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__export_param.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | "es01a-MDT0000", 7 | "", 
8 | ) 9 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | max_line_length = off 11 | -------------------------------------------------------------------------------- /lustre-collector/src/mgs/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | pub mod mgs_fs_parser; 6 | pub mod mgs_parser; 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | perf.* 2 | stacks.folded 3 | flamegraph.svg 4 | profile.json.gz 5 | *~ 6 | 7 | # Generated by Cargo 8 | # will have compiled files and executables 9 | /target/ 10 | 11 | # These are backup files generated by rustfmt 12 | **/*.rs.bk 13 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__node_stats_parsers__tests__empty_input.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/node_stats_parsers.rs 3 | expression: "parse_meminfo().easy_parse(\"\")" 4 | --- 5 | Ok( 6 | ( 7 | [], 8 | "", 9 | ), 10 | ) 11 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__interface_clients.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: 
result 4 | --- 5 | ( 6 | ( 7 | "fs-MDT0000", 8 | 1, 9 | ), 10 | "", 11 | ) 12 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__time__tests__time_triple_back_compat.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/time.rs 3 | expression: result 4 | --- 5 | ( 6 | "1596728874.484750908", 7 | "req_waittime 31280 samples [usec] 11 2695 5020274 1032267156\n\n", 8 | ) 9 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__no_interface_clients.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | ( 7 | "fs-MDT0000", 8 | 0, 9 | ), 10 | "", 11 | ) 12 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__multiple_interface_clients.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | ( 7 | "fs-MDT0000", 8 | 5, 9 | ), 10 | "", 11 | ) 12 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__recovery_status_parser__tests__target_recovery_stats.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/recovery_status_parser.rs 3 | expression: records 4 | --- 5 | [ 6 | Status( 7 | Complete, 8 | ), 9 | Completed( 10 | 4, 11 | ), 12 | ] 13 | -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/mdd.txt: 
-------------------------------------------------------------------------------- 1 | mdd.ai400x2-MDT0000.changelog_users= 2 | current_index: 0 3 | ID index (idle) mask 4 | cl1 0 (327) 5 | mdd.ai400x2-MDT0001.changelog_users= 6 | current_index: 0 7 | ID index (idle) mask 8 | -------------------------------------------------------------------------------- /lustrefs-exporter/debian/service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus exporter for Lustre 3 | Documentation=https://github.com/whamcloud/lustrefs-exporter 4 | 5 | [Service] 6 | Restart=on-failure 7 | ExecStart=/usr/bin/prometheus-lustrefs-exporter 8 | 9 | [Install] 10 | WantedBy=multi-user.target 11 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/snapshots/lustrefs_exporter__tests__valid_fixture_otel@lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/lib.rs 3 | expression: x 4 | input_file: lustre-collector/src/fixtures/valid/lustre-2.14.0_ddn145/2.14.0_ddn145_recovery.txt 5 | --- 6 | # EOF 7 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__recovery_status_parser__tests__target_recovery_stats2.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/recovery_status_parser.rs 3 | expression: records 4 | --- 5 | [ 6 | Status( 7 | Recovering, 8 | ), 9 | Connected( 10 | 3, 11 | ), 12 | Completed( 13 | 3, 14 | ), 15 | ] 16 | -------------------------------------------------------------------------------- /lustrefs-exporter/Makefile: -------------------------------------------------------------------------------- 1 | RPM_OPTS = -bb -D '_topdir ${CURDIR}/_rpm' -D '_sourcedir .' -D '_builddir .' 
2 | 3 | .PHONY: rpm 4 | rpm: 5 | rpmbuild ${RPM_OPTS} lustrefs_exporter.spec 6 | 7 | 8 | export DH_DESTDIR = ${CURDIR}/_deb 9 | 10 | .PHONY: deb 11 | deb: 12 | mkdir -p '${DH_DESTDIR}' 13 | ./debian/rules clean 14 | fakeroot ./debian/rules binary 15 | ./debian/rules clean 16 | 17 | -------------------------------------------------------------------------------- /lustrefs-exporter/lustrefs_exporter.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Prometheus exporter for Lustre filesystem 3 | Documentation=https://github.com/whamcloud/lustrefs-exporter 4 | 5 | [Service] 6 | Environment=RUST_LOG=info,opentelemetry_sdk=warn 7 | Restart=on-failure 8 | ExecStart=/usr/bin/lustrefs_exporter 9 | MemoryHigh=2750M 10 | MemoryMax=3G 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /lustrefs-exporter/debian/control: -------------------------------------------------------------------------------- 1 | Source: prometheus-lustrefs-exporter 2 | Section: admin 3 | Priority: optional 4 | Maintainer: EMF Team 5 | Standards-Version: 4.5.1 6 | Build-Depends: debhelper-compat (= 12) 7 | 8 | Package: prometheus-lustrefs-exporter 9 | Architecture: any 10 | Depends: ${shlibs:Depends}, ${misc:Depends} 11 | Description: prometheus exporter for the Lustre filesystem 12 | 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | lustrefs-exporter/fixtures/ds86.txt filter=lfs diff=lfs merge=lfs -text 2 | lustrefs-exporter/fixtures/co-vm03.txt filter=lfs diff=lfs merge=lfs -text 3 | lustrefs-exporter/fixtures/jobstats_only/co-vm03.txt filter=lfs diff=lfs merge=lfs -text 4 | lustrefs-exporter/fixtures/jobstats_only/ds86.txt filter=lfs diff=lfs merge=lfs -text 5 | lustrefs-exporter/fixtures/jobstats_only/*.txt 
filter=lfs diff=lfs merge=lfs -text 6 | -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/valid/lustre-2.14.0_ddn145/2.14.0_ddn145_recovery.txt: -------------------------------------------------------------------------------- 1 | obdfilter.fs-OST0000.recovery_status=status: INACTIVE 2 | obdfilter.fs-OST0001.recovery_status=status: INACTIVE 3 | mdt.fs-MDT0000.recovery_status= 4 | status: COMPLETE 5 | recovery_start: 1715712434 6 | recovery_duration: 0 7 | completed_clients: 1/1 8 | replayed_requests: 0 9 | last_transno: 4294967298 10 | VBR: DISABLED 11 | IR: DISABLED 12 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__stats_parser__tests__mdstats.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/stats_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | [ 7 | Stat { 8 | name: "statfs", 9 | units: "reqs", 10 | samples: 16360, 11 | min: None, 12 | max: None, 13 | sum: None, 14 | sumsquare: None, 15 | }, 16 | ], 17 | "", 18 | ) 19 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/otel_snapshots/lustrefs_exporter__tests__valid_fixture_otel@lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.otelsnap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/main.rs 3 | expression: x 4 | input_file: lustre-collector/src/fixtures/valid/lustre-2.14.0_ddn145/2.14.0_ddn145_recovery.txt 5 | --- 6 | # HELP target_info Target metadata 7 | # TYPE target_info gauge 8 | target_info{service_name="lustrefs-exporter",telemetry_sdk_language="rust",telemetry_sdk_name="opentelemetry",telemetry_sdk_version="0.29.0"} 1 9 | -------------------------------------------------------------------------------- /.cargo/config.toml: 
-------------------------------------------------------------------------------- 1 | [target.x86_64-unknown-linux-musl] 2 | linker = "rust-lld" 3 | 4 | [alias] 5 | build-musl = "build --release --target x86_64-unknown-linux-musl" 6 | 7 | build-musl-lustrefs-exporter = "build-musl -p lustrefs-exporter" 8 | 9 | build-musl-lustre-collector = "build-musl -p lustre_collector" 10 | 11 | # Run clippy checks 12 | ci_clippy = """clippy \ 13 | --locked \ 14 | --all-features \ 15 | --all-targets \ 16 | -- \ 17 | --deny warnings \ 18 | --deny clippy::unwrap_used \ 19 | --deny clippy::cast_possible_wrap""" 20 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__node_stats_parsers__tests__parse_meminfo_line.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/node_stats_parsers.rs 3 | expression: parse_meminfo_line().parse(x) 4 | --- 5 | Ok( 6 | ( 7 | Node( 8 | MemTotal( 9 | NodeStat { 10 | param: Param( 11 | "mem_total", 12 | ), 13 | value: 5943788, 14 | }, 15 | ), 16 | ), 17 | "\n", 18 | ), 19 | ) 20 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__parser__tests__valid_fixture_lustre-2.14.0_ddn145__2.14.0_ddn145_recovery.txt.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustre-collector/src/parser.rs 3 | expression: result 4 | --- 5 | ( 6 | [], 7 | "obdfilter.fs-OST0000.recovery_status=status: INACTIVE\nobdfilter.fs-OST0001.recovery_status=status: INACTIVE\nmdt.fs-MDT0000.recovery_status=\nstatus: COMPLETE\nrecovery_start: 1715712434\nrecovery_duration: 0\ncompleted_clients: 1/1\nreplayed_requests: 0\nlast_transno: 4294967298\nVBR: DISABLED\nIR: DISABLED\n", 8 | ) 9 | -------------------------------------------------------------------------------- /lustrefs-exporter/debian/rules: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | export DH_VERBOSE=1 4 | export DEB_BUILD_OPTIONS=noddebs 5 | 6 | DH_DESTDIR ?= .. 7 | 8 | %: 9 | dh $@ --destdir='$(DH_DESTDIR)' 10 | 11 | 12 | override_dh_auto_build: 13 | cargo build --release 14 | 15 | override_dh_auto_install: 16 | install -v -d debian/tmp/usr/bin 17 | install -v -T ../target/release/lustrefs-exporter debian/tmp/usr/bin/prometheus-lustrefs-exporter 18 | 19 | 20 | override_dh_auto_clean: 21 | cargo clean 22 | 23 | # No-ops: 24 | override_dh_auto_test: 25 | override_dh_update_autotools_config: 26 | override_dh_autoreconf: 27 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__client_count_parser_two_clients.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | 5 | --- 6 | ( 7 | [ 8 | Target( 9 | ConnectedClients( 10 | TargetStat { 11 | kind: Mdt, 12 | param: Param( 13 | "connected_clients", 14 | ), 15 | target: Target( 16 | "fs-MDT0000", 17 | ), 18 | value: 2, 19 | }, 20 | ), 21 | ), 22 | ], 23 | "", 24 | ) 25 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__client_count_parser_zero_clients.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | 5 | --- 6 | ( 7 | [ 8 | Target( 9 | ConnectedClients( 10 | TargetStat { 11 | kind: Mdt, 12 | param: Param( 13 | "connected_clients", 14 | ), 15 | target: Target( 16 | "fs-MDT0000", 17 | ), 18 | value: 0, 19 | }, 20 | ), 21 | ), 22 | ], 23 | "", 24 | ) 25 | -------------------------------------------------------------------------------- 
/lustrefs-exporter/README.md: -------------------------------------------------------------------------------- 1 | # lustrefs-exporter 2 | 3 | Prometheus exporter for Lustre 4 | 5 | ## Building Packages For Musl 6 | 7 | Building packages with `musl` creates statically-linked binaries that can run on any 8 | Linux platform without dependencies. This is especially useful when testing new features while 9 | developing on macOS. 10 | 11 | ### Building 12 | 13 | Build the desired binary by running one of the following commands: 14 | 15 | 1. `cargo build-musl-lustrefs-exporter` - Builds the lustrefs-exporter binary 16 | 1. `cargo build-musl-lustre-collector` - Builds the lustre-collector binary 17 | 18 | The compiled binaries will be located under: `target/x86_64-unknown-linux-musl/release/` 19 | -------------------------------------------------------------------------------- /lustre-collector/src/quota/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{Record, base_parsers::period}; 6 | use combine::{ParseError, Parser, Stream, parser::char::string}; 7 | 8 | pub(crate) mod quota_parser; 9 | 10 | pub(crate) const QMT: &str = "qmt"; 11 | 12 | pub(crate) fn params() -> Vec { 13 | quota_parser::params() 14 | } 15 | 16 | pub fn parse() -> impl Parser 17 | where 18 | I: Stream, 19 | I::Error: ParseError, 20 | { 21 | (string(QMT), period()).with(quota_parser::qmt_parse()) 22 | } 23 | -------------------------------------------------------------------------------- /lustrefs-exporter/fixtures/lnetctl_stats.txt: -------------------------------------------------------------------------------- 1 | statistics: 2 | msgs_alloc: 0 3 | msgs_max: 39 4 | rst_alloc: 15 5 | errors: 0 6 | send_count: 1110532 7 | resend_count: 0 8 | response_timeout_count: 0 9 | local_interrupt_count: 0 10 | local_dropped_count: 0 11 | local_aborted_count: 0 12 | local_no_route_count: 0 13 | local_timeout_count: 0 14 | local_error_count: 0 15 | remote_dropped_count: 0 16 | remote_error_count: 0 17 | remote_timeout_count: 0 18 | network_timeout_count: 0 19 | recv_count: 1110512 20 | route_count: 0 21 | drop_count: 11 22 | send_length: 261865376 23 | recv_length: 254091008 24 | route_length: 0 25 | drop_length: 4832 26 | -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/recovery-multiple.txt: -------------------------------------------------------------------------------- 1 | obdfilter.fs-OST0000.recovery_status= 2 | status: COMPLETE 3 | recovery_start: 1620410016 4 | recovery_duration: 150 5 | completed_clients: 4/8 6 | replayed_requests: 0 7 | last_transno: 4294967296 8 | VBR: ENABLED 9 | IR: ENABLED 10 | obdfilter.fs-OST0001.recovery_status= 11 | status: COMPLETE 12 | recovery_start: 1620410016 13 | recovery_duration: 150 14 | completed_clients: 4/8 15 | replayed_requests: 0 16 | last_transno: 4294967296 17 | VBR: ENABLED 18 | IR: ENABLED 19 | 
mdt.fs-MDT0000.recovery_status= 20 | status: COMPLETE 21 | recovery_start: 1620410024 22 | recovery_duration: 1760 23 | completed_clients: 3/7 24 | replayed_requests: 0 25 | last_transno: 4294967310 26 | VBR: ENABLED 27 | IR: ENABLED -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/recovery-waiting-for-clients.txt: -------------------------------------------------------------------------------- 1 | mdt.fs-MDT0000.recovery_status= 2 | status: COMPLETE 3 | recovery_start: 1620410024 4 | recovery_duration: 1760 5 | completed_clients: 3/7 6 | replayed_requests: 0 7 | last_transno: 4294967310 8 | VBR: ENABLED 9 | IR: ENABLED 10 | mdt.fs-MDT0002.recovery_status=status: WAITING_FOR_CLIENTS 11 | obdfilter.fs-OST0000.recovery_status= 12 | status: COMPLETE 13 | recovery_start: 1620410016 14 | recovery_duration: 150 15 | completed_clients: 4/8 16 | replayed_requests: 0 17 | last_transno: 4294967296 18 | VBR: ENABLED 19 | IR: ENABLED 20 | obdfilter.fs-OST0001.recovery_status= 21 | status: COMPLETE 22 | recovery_start: 1620410016 23 | recovery_duration: 150 24 | completed_clients: 4/8 25 | replayed_requests: 0 26 | last_transno: 4294967296 27 | VBR: ENABLED 28 | IR: ENABLED 29 | obdfilter.fs-OST0004.recovery_status=status: WAITING_FOR_CLIENTS -------------------------------------------------------------------------------- /lustrefs-exporter/src/snapshots/lustrefs_exporter__routes__tests__lnet_stats_output_with_mock.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/routes.rs 3 | expression: "String::from_utf8(output.stdout)?" 
4 | --- 5 | statistics: 6 | msgs_alloc: 0 7 | msgs_max: 39 8 | rst_alloc: 15 9 | errors: 0 10 | send_count: 1110532 11 | resend_count: 0 12 | response_timeout_count: 0 13 | local_interrupt_count: 0 14 | local_dropped_count: 0 15 | local_aborted_count: 0 16 | local_no_route_count: 0 17 | local_timeout_count: 0 18 | local_error_count: 0 19 | remote_dropped_count: 0 20 | remote_error_count: 0 21 | remote_timeout_count: 0 22 | network_timeout_count: 0 23 | recv_count: 1110512 24 | route_count: 0 25 | drop_count: 11 26 | send_length: 261865376 27 | recv_length: 254091008 28 | route_length: 0 29 | drop_length: 4832 30 | -------------------------------------------------------------------------------- /lustrefs-exporter/debian/changelog: -------------------------------------------------------------------------------- 1 | prometheus-lustrefs-exporter (0.12.1-0.1) unstable; urgency=medium 2 | 3 | * Ignore LQA quotas 4 | 5 | -- EMF Team Fri, 31 Oct 2025 17:00:00 +0200 6 | 7 | prometheus-lustrefs-exporter (0.12.0-0.1) unstable; urgency=medium 8 | 9 | * Include lustre OSD cache statistics in the exporter 10 | 11 | -- EMF Team Mon, 27 Oct 2025 02:02:00 +0200 12 | 13 | prometheus-lustrefs-exporter (0.11.0-0.1) unstable; urgency=medium 14 | 15 | * Integrate Bencher 16 | * Add metrics for operation time statistics 17 | * Add per-node stats collection 18 | * Change otel dependencies to prometheus-client 19 | 20 | -- EMF Team Thu, 11 Sep 2025 09:48:22 +0200 21 | 22 | prometheus-lustrefs-exporter (0.10.2-0.1) unstable; urgency=medium 23 | 24 | * New release. 
25 | 26 | -- EMF Team Wed, 05 Apr 2023 09:48:22 +0200 27 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__lnetctl_parser__tests__lnet_stats_parse.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/lnetctl_parser.rs 3 | expression: x 4 | --- 5 | [ 6 | LNetStat( 7 | SendLength( 8 | LNetStatGlobal { 9 | param: Param( 10 | "send_length", 11 | ), 12 | value: 62502714567608, 13 | }, 14 | ), 15 | ), 16 | LNetStat( 17 | RecvLength( 18 | LNetStatGlobal { 19 | param: Param( 20 | "recv_length", 21 | ), 22 | value: 17084716480056, 23 | }, 24 | ), 25 | ), 26 | LNetStat( 27 | DropLength( 28 | LNetStatGlobal { 29 | param: Param( 30 | "drop_length", 31 | ), 32 | value: 568792, 33 | }, 34 | ), 35 | ), 36 | ] 37 | -------------------------------------------------------------------------------- /lustrefs-exporter/testcmds/cmds_test_lnet_stats_output_with_mock.json: -------------------------------------------------------------------------------- 1 | { 2 | "commands": { 3 | "lnetctl:stats show": [ 4 | { 5 | "binary_name": "lnetctl", 6 | "args": [ 7 | "stats", 8 | "show" 9 | ], 10 | "stdout": "statistics:\n msgs_alloc: 0\n msgs_max: 39\n rst_alloc: 15\n errors: 0\n send_count: 1110532\n resend_count: 0\n response_timeout_count: 0\n local_interrupt_count: 0\n local_dropped_count: 0\n local_aborted_count: 0\n local_no_route_count: 0\n local_timeout_count: 0\n local_error_count: 0\n remote_dropped_count: 0\n remote_error_count: 0\n remote_timeout_count: 0\n network_timeout_count: 0\n recv_count: 1110512\n route_count: 0\n drop_count: 11\n send_length: 261865376\n recv_length: 254091008\n route_length: 0\n drop_length: 4832\n", 11 | "stderr": "", 12 | "exit_code": 0 13 | } 14 | ] 15 | } 16 | } -------------------------------------------------------------------------------- /lustre-collector/src/ldlm/mod.rs: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use crate::{Record, base_parsers::period}; 6 | use combine::{ParseError, Parser, Stream, attempt, parser::char::string}; 7 | 8 | mod ldlm_namespace_parser; 9 | mod ldlm_service_parser; 10 | 11 | pub(crate) const LDLM: &str = "ldlm"; 12 | 13 | pub(crate) fn params() -> Vec { 14 | ldlm_namespace_parser::params() 15 | .into_iter() 16 | .chain(ldlm_service_parser::params()) 17 | .collect() 18 | } 19 | 20 | pub(crate) fn parse() -> impl Parser 21 | where 22 | I: Stream, 23 | I::Error: ParseError, 24 | { 25 | (attempt(string(LDLM)), period()) 26 | .with(ldlm_namespace_parser::parse().or(ldlm_service_parser::parse())) 27 | .message("while parsing ldlm") 28 | } 29 | -------------------------------------------------------------------------------- /lustrefs-exporter/lustrefs_exporter.spec: -------------------------------------------------------------------------------- 1 | Name: lustrefs_exporter 2 | Version: 0.12.1 3 | Release: 1%{?dist} 4 | Summary: prometheus exporter for lustre 5 | License: MIT 6 | 7 | Requires(pre): shadow-utils 8 | 9 | %description 10 | Prometheus exporter for the Lustre filesystem 11 | 12 | %global debug_package %{nil} 13 | 14 | %prep 15 | 16 | %build 17 | cargo build --release 18 | 19 | %install 20 | install -v -d %{buildroot}%{_bindir} 21 | install -v -d %{buildroot}%{_unitdir} 22 | install -v -m 0644 lustrefs_exporter.service %{buildroot}%{_unitdir} 23 | install -v ../target/release/lustrefs-exporter %{buildroot}%{_bindir} 24 | %{__ln_s} lustrefs-exporter %{buildroot}%{_bindir}/lustrefs_exporter 25 | 26 | %files 27 | %{_bindir}/lustrefs-exporter 28 | %{_bindir}/lustrefs_exporter 29 | %{_unitdir}/lustrefs_exporter.service 30 | 31 | %post 32 | %systemd_post %{name}.service 33 | 34 | %preun 35 | 
%systemd_preun %{name}.service 36 | 37 | %postun 38 | %systemd_postun %{name}.service 39 | -------------------------------------------------------------------------------- /lustrefs-exporter/testcmds/cmds_test_concurrent_requests.json: -------------------------------------------------------------------------------- 1 | { 2 | "commands": { 3 | "lnetctl:stats show": [ 4 | { 5 | "binary_name": "lnetctl", 6 | "args": [ 7 | "stats", 8 | "show" 9 | ], 10 | "stdout": "", 11 | "stderr": "cat: /usr/local/bin/../fixtures/lnetctl_stats.txt: No such file or directory\n", 12 | "exit_code": 1 13 | }, 14 | { 15 | "binary_name": "lnetctl", 16 | "args": [ 17 | "stats", 18 | "show" 19 | ], 20 | "stdout": "", 21 | "stderr": "cat: /usr/local/bin/../fixtures/lnetctl_stats.txt: No such file or directory\n", 22 | "exit_code": 1 23 | }, 24 | { 25 | "binary_name": "lnetctl", 26 | "args": [ 27 | "stats", 28 | "show" 29 | ], 30 | "stdout": "", 31 | "stderr": "cat: /usr/local/bin/../fixtures/lnetctl_stats.txt: No such file or directory\n", 32 | "exit_code": 1 33 | } 34 | ] 35 | } 36 | } -------------------------------------------------------------------------------- /lustre-collector/benches/combine_performance.rs: -------------------------------------------------------------------------------- 1 | use combine::parser::EasyParser; 2 | use criterion::{Criterion, criterion_group, criterion_main}; 3 | use lustre_collector::quota::parse as combine_parse; 4 | use std::{fs::File, io::Read, time::Duration}; 5 | 6 | pub fn combine_perf(c: &mut Criterion) { 7 | let mut group = c.benchmark_group("parse_benchmarks"); 8 | 9 | group.sample_size(10); 10 | group.measurement_time(Duration::from_secs(90)); // Allow more time 11 | 12 | let mut raw = String::new(); 13 | File::open("benches/quotas.yml") 14 | .expect("Failed to open file") 15 | .read_to_string(&mut raw) 16 | .expect("Failed to read file"); 17 | 18 | group.bench_with_input("combine_performance", &raw, |b, input| { 19 | b.iter(|| { 20 | let mut 
needle = input.as_str(); 21 | while let Ok((_, e)) = combine_parse().easy_parse(needle) { 22 | needle = e; 23 | } 24 | }) 25 | }); 26 | } 27 | 28 | criterion_group!(benches, combine_perf); 29 | criterion_main!(benches); 30 | -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/brw_stats_empty.txt: -------------------------------------------------------------------------------- 1 | 2 | snapshot_time: 1534429278.185762481 (secs.nsecs) 3 | 4 | read | write 5 | pages per bulk r/w rpcs % cum % | rpcs % cum % 6 | 7 | read | write 8 | discontiguous pages rpcs % cum % | rpcs % cum % 9 | 10 | read | write 11 | discontiguous blocks rpcs % cum % | rpcs % cum % 12 | 13 | read | write 14 | disk fragmented I/Os ios % cum % | ios % cum % 15 | 16 | read | write 17 | disk I/Os in flight ios % cum % | ios % cum % 18 | 19 | read | write 20 | I/O time (1/1000s) ios % cum % | ios % cum % 21 | 22 | read | write 23 | disk I/O size ios % cum % | ios % cum % 24 | 25 | read | write 26 | block maps msec maps % cum % | maps % cum % 27 | -------------------------------------------------------------------------------- /lustre-collector/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors.workspace = true 3 | description = "Scrapes Lustre stats and aggregates into JSON or YAML" 4 | edition.workspace = true 5 | license.workspace = true 6 | name = "lustre_collector" 7 | version.workspace = true 8 | 9 | [dependencies] 10 | clap = { workspace = true, features = ["derive"] } 11 | combine.workspace = true 12 | serde = { workspace = true, features = ["derive"] } 13 | serde_json.workspace = true 14 | serde_yaml.workspace = true 15 | thiserror.workspace = true 16 | tracing.workspace = true 17 | tracing-subscriber.workspace = true 18 | 19 | [dev-dependencies] 20 | include_dir.workspace = true 21 | insta.workspace = true 22 | tokio = { workspace = true, features = ["full"] } 23 | 
criterion = { workspace = true, features = ["html_reports", "async_tokio"] } 24 | sysinfo.workspace = true 25 | 26 | [lib] 27 | bench = false 28 | 29 | [[bin]] 30 | bench = false 31 | name = "lustre_collector" 32 | path = "src/main.rs" 33 | 34 | [[bench]] 35 | harness = false 36 | name = "combine_performance" 37 | 38 | [[bench]] 39 | harness = false 40 | name = "combine_memory" 41 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__client_count_parser_multiple_fs.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | 5 | --- 6 | ( 7 | [ 8 | Target( 9 | ConnectedClients( 10 | TargetStat { 11 | kind: Mdt, 12 | param: Param( 13 | "connected_clients", 14 | ), 15 | target: Target( 16 | "fs-MDT0000", 17 | ), 18 | value: 2, 19 | }, 20 | ), 21 | ), 22 | Target( 23 | ConnectedClients( 24 | TargetStat { 25 | kind: Mdt, 26 | param: Param( 27 | "connected_clients", 28 | ), 29 | target: Target( 30 | "fs2-MDT0000", 31 | ), 32 | value: 2, 33 | }, 34 | ), 35 | ), 36 | ], 37 | "", 38 | ) 39 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/snapshots/lustre_collector__mds__client_count_parser__test__client_count_parser_one_client.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mds/client_count_parser.rs 3 | expression: result 4 | 5 | --- 6 | ( 7 | [ 8 | Target( 9 | ConnectedClients( 10 | TargetStat { 11 | kind: Mdt, 12 | param: Param( 13 | "connected_clients", 14 | ), 15 | target: Target( 16 | "es01a-MDT0000", 17 | ), 18 | value: 1, 19 | }, 20 | ), 21 | ), 22 | Target( 23 | ConnectedClients( 24 | TargetStat { 25 | kind: Mdt, 26 | param: Param( 27 | "connected_clients", 28 | ), 29 | target: Target( 30 | "fs-MDT0000", 31 | ), 32 | value: 0, 
33 | }, 34 | ), 35 | ), 36 | ], 37 | "", 38 | ) 39 | -------------------------------------------------------------------------------- /lustre-collector/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 DDN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lustrefs-exporter/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 whamCloud 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lustre-collector/combine_mem_usage.json: -------------------------------------------------------------------------------- 1 | { 2 | "scrape_allocations": { 3 | "start_rss_mib": { 4 | "value": 318.8093196902655 5 | }, 6 | "peak_rss_mib": { 7 | "value": 359.84375 8 | }, 9 | "end_rss_mib": { 10 | "value": 320.30074668141594 11 | }, 12 | "memory_growth_mib": { 13 | "value": 1.4914269911504425 14 | }, 15 | "peak_over_start_rss_ratio": { 16 | "value": 1.0788454764864892 17 | }, 18 | "avg_runtime_rss_mib": { 19 | "value": 319.2835635533252, 20 | "lower_value": 24.296875, 21 | "upper_value": 359.84375 22 | }, 23 | "start_virtual_mib": { 24 | "value": 402485.0271017699 25 | }, 26 | "peak_virtual_mib": { 27 | "value": 402526.0625 28 | }, 29 | "end_virtual_mib": { 30 | "value": 402497.05365044245 31 | }, 32 | "virtual_growth_mib": { 33 | "value": 12.026548672566372 34 | }, 35 | "peak_over_start_virtual_ratio": { 36 | "value": 1.0000299443068315 37 | }, 38 | "avg_runtime_virtual_mib": { 39 | "value": 402490.81627407606, 40 | "lower_value": 401167.0625, 41 | "upper_value": 402526.0625 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/recovery-multiple-recovering.txt: -------------------------------------------------------------------------------- 1 | obdfilter.fs-OST0000.recovery_status= 2 | status: COMPLETE 3 | recovery_start: 1620410016 4 | recovery_duration: 150 5 | completed_clients: 4/8 6 | replayed_requests: 0 7 | last_transno: 4294967296 8 | VBR: ENABLED 9 | IR: ENABLED 10 | obdfilter.fs-OST0001.recovery_status= 11 | status: COMPLETE 12 | recovery_start: 1620410016 13 | recovery_duration: 150 14 | completed_clients: 4/8 15 | replayed_requests: 0 16 | last_transno: 4294967296 17 | VBR: ENABLED 18 | IR: ENABLED 19 | obdfilter.fs-OST0004.recovery_status= 20 | status: COMPLETE 21 | recovery_start: 1620920834 
22 | recovery_duration: 13 23 | completed_clients: 8/8 24 | replayed_requests: 0 25 | last_transno: 42949672960 26 | VBR: DISABLED 27 | IR: ENABLED 28 | mdt.fs-MDT0000.recovery_status= 29 | status: COMPLETE 30 | recovery_start: 1620410024 31 | recovery_duration: 1760 32 | completed_clients: 3/7 33 | replayed_requests: 0 34 | last_transno: 4294967310 35 | VBR: ENABLED 36 | IR: ENABLED 37 | mdt.fs-MDT0002.recovery_status= 38 | status: RECOVERING 39 | recovery_start: 1620920843 40 | time_remaining: 119 41 | connected_clients: 3/7 42 | req_replay_clients: 0 43 | lock_repay_clients: 0 44 | completed_clients: 3 -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/brw_stats_with_start_and_elapsed_time.txt: -------------------------------------------------------------------------------- 1 | 2 | snapshot_time: 1684867636.682465202 secs.nsecs 3 | start_time: 1684865295.727058577 secs.nsecs 4 | elapsed_time: 2340.955406625 secs.nsecs 5 | 6 | read | write 7 | pages per bulk r/w rpcs % cum % | rpcs % cum % 8 | 9 | read | write 10 | discontiguous pages rpcs % cum % | rpcs % cum % 11 | 12 | read | write 13 | discontiguous blocks rpcs % cum % | rpcs % cum % 14 | 15 | read | write 16 | disk fragmented I/Os ios % cum % | ios % cum % 17 | 18 | read | write 19 | disk I/Os in flight ios % cum % | ios % cum % 20 | 21 | read | write 22 | I/O time (1/1000s) ios % cum % | ios % cum % 23 | 24 | read | write 25 | disk I/O size ios % cum % | ios % cum % 26 | 27 | read | write 28 | block maps msec maps % cum % | maps % cum % 29 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/main.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use clap::Parser; 6 | use lustrefs_exporter::{Error, dump_stats, routes::app}; 7 | use std::net::SocketAddr; 8 | 9 | const LUSTREFS_EXPORTER_PORT: &str = "32221"; 10 | 11 | #[derive(Debug, Parser)] 12 | pub struct CommandOpts { 13 | /// Port that exporter will listen to 14 | #[clap(short, long, env = "LUSTREFS_EXPORTER_PORT", default_value = LUSTREFS_EXPORTER_PORT)] 15 | pub port: u16, 16 | 17 | /// Dump stats as raw string and exit 18 | #[clap(long, hide = true)] 19 | dump: bool, 20 | } 21 | 22 | #[tokio::main] 23 | async fn main() -> Result<(), Error> { 24 | tracing_subscriber::fmt::init(); 25 | 26 | let opts = CommandOpts::parse(); 27 | 28 | if opts.dump { 29 | dump_stats().await?; 30 | } else { 31 | let addr = SocketAddr::from(([0, 0, 0, 0], opts.port)); 32 | 33 | tracing::info!("Listening on http://{addr}/metrics"); 34 | 35 | let listener = tokio::net::TcpListener::bind(("0.0.0.0", opts.port)).await?; 36 | 37 | axum::serve(listener, app()).await?; 38 | } 39 | 40 | Ok(()) 41 | } 42 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ['lustre-collector', 'lustrefs-exporter'] 3 | resolver = "3" 4 | 5 | [workspace.package] 6 | authors = ["EMF Team "] 7 | edition = "2024" 8 | license = "MIT" 9 | version = "0.12.1" 10 | 11 | [workspace.lints.rust] 12 | unreachable_pub = "deny" 13 | 14 | [workspace.lints.clippy] 15 | cast_lossless = "warn" 16 | cast_possible_wrap = "warn" 17 | unwrap_used = "warn" 18 | 19 | [workspace.dependencies] 20 | axum = "0.8" 21 | clap = "4" 22 | combine = "=4.6.7" 23 | commandeer-test = "0.1.1" 24 | const_format = "0.2" 25 | criterion = "0.7" 26 | iai-callgrind = "0.16" 27 | include_dir = { version = "0.7", features = ["glob"] } 28 | insta = "1" 29 | pretty_assertions = "1.4.1" 30 | prometheus-client = { git = "https://github.com/whamcloud/client_rust", branch = 
"whamcloud-08-12-2025" } 31 | prometheus-parse = "0.2.5" 32 | reqwest = "0.11" 33 | serde = "1" 34 | serde_json = "1" 35 | serde_yaml = "0.9" 36 | serial_test = "3.2" 37 | sysinfo = "0.29" 38 | thiserror = "2" 39 | tokio = "1" 40 | tower = "0.5" 41 | tower-http = "0.6" 42 | tracing = "0.1" 43 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 44 | 45 | [profile.release] 46 | lto = true 47 | 48 | [profile.profiling] 49 | debug = true 50 | inherits = "release" 51 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/llite.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use crate::Family; 6 | use lustre_collector::LliteStat; 7 | use prometheus_client::{metrics::counter::Counter, registry::Registry}; 8 | use std::ops::Deref; 9 | 10 | #[derive(Debug, Default)] 11 | pub struct LliteMetrics { 12 | client_stats: Family>, 13 | } 14 | 15 | impl LliteMetrics { 16 | pub fn register_metric(&self, registry: &mut Registry) { 17 | registry.register_without_auto_suffix( 18 | "lustre_client_stats", 19 | "Lustre client interface stats", 20 | self.client_stats.clone(), 21 | ); 22 | } 23 | } 24 | 25 | pub fn build_llite_stats(x: &LliteStat, metrics: &mut LliteMetrics) { 26 | let LliteStat { 27 | target, 28 | param: _, 29 | stats, 30 | } = x; 31 | 32 | for stat in stats { 33 | metrics 34 | .client_stats 35 | .get_or_create(&vec![ 36 | ("operation", stat.name.deref().to_string()), 37 | ("target", target.deref().to_string()), 38 | ]) 39 | .inc_by(stat.samples); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /lustre-collector/src/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. 
All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use combine::error::StringStreamError; 6 | use std::{io, str}; 7 | use thiserror::Error; 8 | 9 | #[derive(Debug, Error)] 10 | pub enum LustreCollectorError { 11 | #[error(transparent)] 12 | IoError(#[from] io::Error), 13 | #[error(transparent)] 14 | SerdeJsonError(#[from] serde_json::error::Error), 15 | #[error(transparent)] 16 | SerdeYamlError(#[from] serde_yaml::Error), 17 | #[error(transparent)] 18 | StringStreamError(#[from] StringStreamError), 19 | #[error(transparent)] 20 | CombineEasyError(combine::stream::easy::Errors), 21 | #[error(transparent)] 22 | Utf8Error(#[from] str::Utf8Error), 23 | #[error("{0}")] 24 | ConversionError(String), 25 | #[error("Cannot convert timestamp {0} to a u64 of milliseconds")] 26 | InvalidTime(String), 27 | } 28 | 29 | impl From> for LustreCollectorError { 30 | fn from(err: combine::stream::easy::Errors) -> Self { 31 | LustreCollectorError::CombineEasyError(err.map_range(|_| "")) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__brw_stats_parser__tests__brw_stats_with_start_and_elapsed_time.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/brw_stats_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | [ 7 | BrwStats { 8 | name: "pages", 9 | unit: "rpcs", 10 | buckets: [], 11 | }, 12 | BrwStats { 13 | name: "discont_pages", 14 | unit: "rpcs", 15 | buckets: [], 16 | }, 17 | BrwStats { 18 | name: "discont_blocks", 19 | unit: "rpcs", 20 | buckets: [], 21 | }, 22 | BrwStats { 23 | name: "dio_frags", 24 | unit: "ios", 25 | buckets: [], 26 | }, 27 | BrwStats { 28 | name: "rpc_hist", 29 | unit: "ios", 30 | buckets: [], 31 | }, 32 | BrwStats { 33 | name: "io_time", 34 | unit: "ios", 35 | buckets: [], 36 | }, 37 | BrwStats { 38 | name: 
"disk_iosize", 39 | unit: "ios", 40 | buckets: [], 41 | }, 42 | BrwStats { 43 | name: "block_maps_msec", 44 | unit: "maps", 45 | buckets: [], 46 | }, 47 | ], 48 | "", 49 | ) 50 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__node_stats_parsers__tests__cpu_stats.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/node_stats_parsers.rs 3 | expression: parse_cpustats().easy_parse(x) 4 | --- 5 | Ok( 6 | ( 7 | [ 8 | Node( 9 | CpuTotal( 10 | NodeStat { 11 | param: Param( 12 | "cpu_total", 13 | ), 14 | value: 140868629, 15 | }, 16 | ), 17 | ), 18 | Node( 19 | CpuUser( 20 | NodeStat { 21 | param: Param( 22 | "cpu_user", 23 | ), 24 | value: 370338, 25 | }, 26 | ), 27 | ), 28 | Node( 29 | CpuIowait( 30 | NodeStat { 31 | param: Param( 32 | "cpu_iowait", 33 | ), 34 | value: 6313, 35 | }, 36 | ), 37 | ), 38 | Node( 39 | CpuSystem( 40 | NodeStat { 41 | param: Param( 42 | "cpu_system", 43 | ), 44 | value: 481420, 45 | }, 46 | ), 47 | ), 48 | ], 49 | "", 50 | ), 51 | ) 52 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__node_stats_parsers__tests__parse_meminfo.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/node_stats_parsers.rs 3 | expression: parse_meminfo().easy_parse(PROC_MEMINFO) 4 | --- 5 | Ok( 6 | ( 7 | [ 8 | Node( 9 | MemTotal( 10 | NodeStat { 11 | param: Param( 12 | "mem_total", 13 | ), 14 | value: 5943788, 15 | }, 16 | ), 17 | ), 18 | Node( 19 | MemFree( 20 | NodeStat { 21 | param: Param( 22 | "mem_free", 23 | ), 24 | value: 4420248, 25 | }, 26 | ), 27 | ), 28 | Node( 29 | SwapTotal( 30 | NodeStat { 31 | param: Param( 32 | "swap_total", 33 | ), 34 | value: 2097148, 35 | }, 36 | ), 37 | ), 38 | Node( 39 | SwapFree( 40 | NodeStat { 41 | param: Param( 42 | "swap_free", 43 | ), 44 | value: 
2097148, 45 | }, 46 | ), 47 | ), 48 | ], 49 | "", 50 | ), 51 | ) 52 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__mdd_parser__tests__mdd_stats.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/mdd_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | [ 7 | Target( 8 | Changelog( 9 | TargetStat { 10 | kind: Mdt, 11 | param: Param( 12 | "changelog_users", 13 | ), 14 | target: Target( 15 | "ai400x2-MDT0000", 16 | ), 17 | value: ChangelogStat { 18 | current_index: 0, 19 | users: [ 20 | ChangeLogUser { 21 | user: "cl1", 22 | index: 0, 23 | idle_secs: 327, 24 | }, 25 | ], 26 | }, 27 | }, 28 | ), 29 | ), 30 | Target( 31 | Changelog( 32 | TargetStat { 33 | kind: Mdt, 34 | param: Param( 35 | "changelog_users", 36 | ), 37 | target: Target( 38 | "ai400x2-MDT0001", 39 | ), 40 | value: ChangelogStat { 41 | current_index: 0, 42 | users: [], 43 | }, 44 | }, 45 | ), 46 | ), 47 | ], 48 | "", 49 | ) 50 | -------------------------------------------------------------------------------- /lustre-collector/src/exports_parser.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{ 6 | ExportStats, 7 | base_parsers::{equals, period}, 8 | stats_parser::stats, 9 | }; 10 | use combine::{ 11 | Parser, attempt, 12 | error::ParseError, 13 | many, many1, 14 | parser::char::{alpha_num, string}, 15 | stream::Stream, 16 | token, 17 | }; 18 | 19 | /// Parses a single nid 20 | pub(crate) fn nid() -> impl Parser 21 | where 22 | I: Stream, 23 | I::Error: ParseError, 24 | { 25 | ( 26 | many1::(alpha_num().or(period())), 27 | token('@'), 28 | many1::(alpha_num()), 29 | ) 30 | .map(|(ip, _, lnet)| format!("{ip}@{lnet}")) 31 | .message("while parsing nid") 32 | } 33 | 34 | /// Parses a single obdfilter.*OST*.exports.*.stats line 35 | fn exports_stat() -> impl Parser 36 | where 37 | I: Stream, 38 | I::Error: ParseError, 39 | { 40 | attempt(( 41 | nid().skip(period()), 42 | string("stats").skip(equals()), 43 | stats(), 44 | )) 45 | .map(|(nid, _, stats)| ExportStats { nid, stats }) 46 | .message("while parsing export_stats") 47 | } 48 | 49 | /// Parses multiple obdfilter.*OST*.exports.*.stats lines 50 | pub(crate) fn exports_stats() -> impl Parser> 51 | where 52 | I: Stream, 53 | I::Error: ParseError, 54 | { 55 | (many(exports_stat())).map(|x| x) 56 | } 57 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__tests__params.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustre-collector/src/lib.rs 3 | expression: "xs.join(\" \")" 4 | --- 5 | memused memused_max lnet_memused health_check mdt.*.exports.*.uuid osd-*.*.stats osd-*.*.filesfree osd-*.*.filestotal osd-*.*.fstype osd-*.*.kbytesavail osd-*.*.kbytesfree osd-*.*.kbytestotal osd-*.*.brw_stats osd-*.*.quota_slave.acct_group osd-*.*.quota_slave.acct_user osd-*.*.quota_slave.acct_project mgs.*.mgs.stats mgs.*.mgs.threads_max mgs.*.mgs.threads_min mgs.*.mgs.threads_started mgs.*.num_exports obdfilter.*OST*.stats obdfilter.*OST*.num_exports 
obdfilter.*OST*.tot_dirty obdfilter.*OST*.tot_granted obdfilter.*OST*.tot_pending obdfilter.*OST*.exports.*.stats ost.OSS.ost.stats ost.OSS.ost_io.stats ost.OSS.ost_create.stats ost.OSS.ost_out.stats ost.OSS.ost_seq.stats mds.MDS.mdt.stats mds.MDS.mdt_fld.stats mds.MDS.mdt_io.stats mds.MDS.mdt_out.stats mds.MDS.mdt_readpage.stats mds.MDS.mdt_seqm.stats mds.MDS.mdt_seqs.stats mds.MDS.mdt_setattr.stats mdt.*.md_stats mdt.*MDT*.num_exports mdt.*MDT*.exports.*.stats ldlm.namespaces.{mdt-,filter-}*.contended_locks ldlm.namespaces.{mdt-,filter-}*.contention_seconds ldlm.namespaces.{mdt-,filter-}*.ctime_age_limit ldlm.namespaces.{mdt-,filter-}*.early_lock_cancel ldlm.namespaces.{mdt-,filter-}*.lock_count ldlm.namespaces.{mdt-,filter-}*.lock_timeouts ldlm.namespaces.{mdt-,filter-}*.lock_unused_count ldlm.namespaces.{mdt-,filter-}*.lru_max_age ldlm.namespaces.{mdt-,filter-}*.lru_size ldlm.namespaces.{mdt-,filter-}*.max_nolock_bytes ldlm.namespaces.{mdt-,filter-}*.max_parallel_ast ldlm.namespaces.{mdt-,filter-}*.resource_count ldlm.services.ldlm_canceld.stats ldlm.services.ldlm_cbd.stats llite.*.stats mdd.*.changelog_users qmt.*.{dt,md}-*.glb-usr qmt.*.{dt,md}-*.glb-prj qmt.*.{dt,md}-*.glb-grp nodemap.*.dt_stats nodemap.*.md_stats 6 | -------------------------------------------------------------------------------- /lustrefs-exporter/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors.workspace = true 3 | edition.workspace = true 4 | name = "lustrefs-exporter" 5 | version.workspace = true 6 | 7 | [dependencies] 8 | axum = { workspace = true, features = ["http2"] } 9 | clap = { workspace = true, features = ["derive", "env", "wrap_help", "string"] } 10 | lustre_collector.path = "../lustre-collector" 11 | prometheus-client.workspace = true 12 | regex = { version = "1", default-features = false, features = [ 13 | "perf", 14 | "std", 15 | "perf-dfa-full", 16 | ] } 17 | serde = { version = "1", features = ["derive"] 
} 18 | thiserror.workspace = true 19 | tokio = { workspace = true, features = [ 20 | "macros", 21 | "process", 22 | "rt-multi-thread", 23 | ] } 24 | tower = { workspace = true, features = ["timeout", "load-shed", "limit"] } 25 | tower-http = { workspace = true, features = ["compression-full"] } 26 | tracing.workspace = true 27 | tracing-subscriber = { workspace = true, features = ["env-filter"] } 28 | 29 | [dev-dependencies] 30 | combine.workspace = true 31 | commandeer-test.workspace = true 32 | const_format.workspace = true 33 | criterion = { workspace = true, features = ["html_reports", "async_tokio"] } 34 | iai-callgrind.workspace = true 35 | insta = { workspace = true, features = ["glob"] } 36 | pretty_assertions.workspace = true 37 | prometheus-parse.workspace = true 38 | reqwest.workspace = true 39 | serde_json.workspace = true 40 | serial_test.workspace = true 41 | sysinfo.workspace = true 42 | 43 | [lib] 44 | bench = false 45 | 46 | [[bin]] 47 | bench = false 48 | name = "lustrefs-exporter" 49 | path = "src/main.rs" 50 | 51 | [[bench]] 52 | harness = false 53 | name = "jobstats" 54 | 55 | [[bench]] 56 | harness = false 57 | name = "lustre_metrics" 58 | 59 | [[bench]] 60 | harness = false 61 | name = "scrape_memory_metrics" 62 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/service.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::Family; 6 | use lustre_collector::LustreServiceStats; 7 | use prometheus_client::{metrics::counter::Counter, registry::Registry}; 8 | use std::ops::Deref; 9 | 10 | #[derive(Debug, Default)] 11 | pub struct ServiceMetrics { 12 | ldlm_canceld_stats: Family>, 13 | ldlm_cbd_stats: Family>, 14 | } 15 | 16 | impl ServiceMetrics { 17 | pub fn register_metric(&self, registry: &mut Registry) { 18 | registry.register_without_auto_suffix( 19 | "lustre_ldlm_canceld_stats", 20 | "Gives information about LDLM Canceld service", 21 | self.ldlm_canceld_stats.clone(), 22 | ); 23 | 24 | registry.register_without_auto_suffix( 25 | "lustre_ldlm_cbd_stats", 26 | "Gives information about LDLM Callback service", 27 | self.ldlm_cbd_stats.clone(), 28 | ); 29 | } 30 | } 31 | 32 | pub fn build_service_stats(x: &LustreServiceStats, service: &mut ServiceMetrics) { 33 | match x { 34 | LustreServiceStats::LdlmCanceld(xs) => { 35 | for s in xs { 36 | service 37 | .ldlm_canceld_stats 38 | .get_or_create(&vec![("operation", s.name.deref().to_string())]) 39 | .inc_by(s.samples); 40 | } 41 | } 42 | LustreServiceStats::LdlmCbd(xs) => { 43 | for s in xs { 44 | service 45 | .ldlm_cbd_stats 46 | .get_or_create(&vec![("operation", s.name.deref().to_string())]) 47 | .inc_by(s.samples); 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__lnetctl_parser__tests__lnet_export_parse_no_bonding.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/lnetctl_parser.rs 3 | expression: x 4 | 5 | --- 6 | [ 7 | LNetStat( 8 | SendCount( 9 | LNetStat { 10 | nid: "0@lo", 11 | param: Param( 12 | "send_count", 13 | ), 14 | value: 9, 15 | }, 16 | ), 17 | ), 18 | LNetStat( 19 | RecvCount( 20 | LNetStat { 21 | nid: "0@lo", 22 | param: Param( 23 | "recv_count", 24 | ), 25 | value: 8, 26 | }, 27 | ), 28 | ), 29 | LNetStat( 30 | DropCount( 31 | 
LNetStat { 32 | nid: "0@lo", 33 | param: Param( 34 | "drop_count", 35 | ), 36 | value: 1, 37 | }, 38 | ), 39 | ), 40 | LNetStat( 41 | SendCount( 42 | LNetStat { 43 | nid: "10.36.4.130@tcp", 44 | param: Param( 45 | "send_count", 46 | ), 47 | value: 0, 48 | }, 49 | ), 50 | ), 51 | LNetStat( 52 | RecvCount( 53 | LNetStat { 54 | nid: "10.36.4.130@tcp", 55 | param: Param( 56 | "recv_count", 57 | ), 58 | value: 0, 59 | }, 60 | ), 61 | ), 62 | LNetStat( 63 | DropCount( 64 | LNetStat { 65 | nid: "10.36.4.130@tcp", 66 | param: Param( 67 | "drop_count", 68 | ), 69 | value: 0, 70 | }, 71 | ), 72 | ), 73 | ] 74 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__lnetctl_parser__tests__lnet_net_parse.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/lnetctl_parser.rs 3 | expression: x 4 | 5 | --- 6 | [ 7 | LNetStat( 8 | SendCount( 9 | LNetStat { 10 | nid: "0@lo", 11 | param: Param( 12 | "send_count", 13 | ), 14 | value: 942, 15 | }, 16 | ), 17 | ), 18 | LNetStat( 19 | RecvCount( 20 | LNetStat { 21 | nid: "0@lo", 22 | param: Param( 23 | "recv_count", 24 | ), 25 | value: 942, 26 | }, 27 | ), 28 | ), 29 | LNetStat( 30 | DropCount( 31 | LNetStat { 32 | nid: "0@lo", 33 | param: Param( 34 | "drop_count", 35 | ), 36 | value: 0, 37 | }, 38 | ), 39 | ), 40 | LNetStat( 41 | SendCount( 42 | LNetStat { 43 | nid: "10.73.20.11@tcp", 44 | param: Param( 45 | "send_count", 46 | ), 47 | value: 3825, 48 | }, 49 | ), 50 | ), 51 | LNetStat( 52 | RecvCount( 53 | LNetStat { 54 | nid: "10.73.20.11@tcp", 55 | param: Param( 56 | "recv_count", 57 | ), 58 | value: 3736, 59 | }, 60 | ), 61 | ), 62 | LNetStat( 63 | DropCount( 64 | LNetStat { 65 | nid: "10.73.20.11@tcp", 66 | param: Param( 67 | "drop_count", 68 | ), 69 | value: 30, 70 | }, 71 | ), 72 | ), 73 | ] 74 | -------------------------------------------------------------------------------- 
/.github/workflows/jobstats-metrics-bench.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | pull_request: 7 | types: [opened, reopened, edited, synchronize] 8 | 9 | jobs: 10 | benchmark_jobstats_metrics: 11 | name: Benchmark Jobstats Metrics 12 | permissions: 13 | checks: write 14 | pull-requests: write 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - uses: bencherdev/bencher@main 20 | 21 | - name: Install Rust 22 | uses: actions-rust-lang/setup-rust-toolchain@v1 23 | 24 | - name: Track Jobstats Metrics Benchmarks on main branch 25 | if: github.ref == 'refs/heads/main' && github.event_name == 'push' 26 | run: | 27 | bencher run \ 28 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 29 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 30 | --branch main \ 31 | --testbed ci-runner \ 32 | --threshold-measure Latency \ 33 | --threshold-test t_test \ 34 | --threshold-max-sample-size 64 \ 35 | --threshold-lower-boundary 0.95 \ 36 | --threshold-upper-boundary 0.95 \ 37 | --err \ 38 | --adapter rust_criterion \ 39 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 40 | "cargo bench --bench jobstats" 41 | 42 | - name: Compare Jobstats Metrics Benchmarks with main branch 43 | if: github.event_name == 'pull_request' 44 | run: | 45 | bencher run \ 46 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 47 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 48 | --branch '${{ github.head_ref }}' \ 49 | --start-point main \ 50 | --testbed ci-runner \ 51 | --start-point-clone-thresholds \ 52 | --err \ 53 | --adapter rust_criterion \ 54 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 55 | "cargo bench --bench jobstats" 56 | -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/valid/lustre-2-14-0/client/llite_client.txt: -------------------------------------------------------------------------------- 1 | 
memused=35721854085 2 | memused_max=41220347397 3 | lnet_memused=22228358 4 | health_check=healthy 5 | ldlm.services.ldlm_cbd.stats= 6 | snapshot_time 1710759783.270541554 secs.nsecs 7 | req_waittime 41083956 samples [usecs] 1 33890 255153949 6336801119 8 | req_qdepth 41083956 samples [reqs] 0 7 429581 432557 9 | req_active 41083956 samples [reqs] 1 14 53101359 86233965 10 | req_timeout 41083956 samples [secs] 15 15 616259340 9243890100 11 | reqbuf_avail 87745428 samples [bufs] 0 3 80539671 80656909 12 | ldlm_bl_callback 41083956 samples [usecs] 1 1623 446351106 6062478214 13 | llite.ai400x2-ff47bce9ca35d800.stats= 14 | snapshot_time 1710759783.271040288 secs.nsecs 15 | ioctl 114 samples [reqs] 16 | open 35955554 samples [usecs] 0 172939 3393651645 8836730033307 17 | close 35955554 samples [usecs] 63 194037 5402095148 11956065309788 18 | readdir 60 samples [usecs] 0 2242 34741 52343869 19 | getattr 35955647 samples [usecs] 0 85505 2317024138 904252691326 20 | unlink 17977752 samples [usecs] 107 148413 3846291010 6948473454642 21 | mkdir 17987059 samples [usecs] 104 1306648 23731812831 644644512447919 22 | rmdir 17987079 samples [usecs] 95 1478009 37000527306 744761005599966 23 | mknod 17977752 samples [usecs] 119 193901 4144162753 7852936850197 24 | statfs 17864 samples [usecs] 0 88480 10257950 80137794724 25 | setxattr 1 samples [usecs] 8760 8760 8760 76737600 26 | inode_permission 629305628 samples [usecs] 0 147014 196719589 76130795547 27 | opencount 35955576 samples [reqs] 1 2 40433335 49388853 28 | openclosetime 17977772 samples [usecs] 34302 20804335332 4185265858629453 2084188699388296969 29 | -------------------------------------------------------------------------------- /lustre-collector/src/ldlm/ldlm_service_parser.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 
2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use crate::{ 6 | LustreServiceStats, Record, 7 | base_parsers::{param, period}, 8 | ldlm::LDLM, 9 | stats_parser::stats, 10 | }; 11 | use combine::{ParseError, Parser, Stream, attempt, choice, parser::char::string}; 12 | 13 | pub(crate) const LDLM_CANCELD: &str = "ldlm_canceld"; 14 | pub(crate) const LDLM_CBD: &str = "ldlm_cbd"; 15 | 16 | pub(crate) const SERVICES: &str = "services"; 17 | 18 | pub(crate) const STATS: &str = "stats"; 19 | 20 | pub(crate) fn params() -> Vec { 21 | [LDLM_CANCELD, LDLM_CBD] 22 | .into_iter() 23 | .map(|x| format!("{LDLM}.{SERVICES}.{x}.{STATS}")) 24 | .collect() 25 | } 26 | 27 | pub(crate) fn parse() -> impl Parser 28 | where 29 | I: Stream, 30 | I::Error: ParseError, 31 | { 32 | attempt((string(SERVICES), period())) 33 | .with(choice((ldlm_canceld_parser(), ldlm_cbd_parser()))) 34 | .map(Record::LustreService) 35 | } 36 | 37 | pub(crate) fn ldlm_canceld_parser() -> impl Parser 38 | where 39 | I: Stream, 40 | I::Error: ParseError, 41 | { 42 | attempt((string(LDLM_CANCELD), period(), param(STATS))) 43 | .with(stats()) 44 | .map(LustreServiceStats::LdlmCanceld) 45 | .message("While parsing ldlm_canceld.stats") 46 | } 47 | 48 | pub(crate) fn ldlm_cbd_parser() -> impl Parser 49 | where 50 | I: Stream, 51 | I::Error: ParseError, 52 | { 53 | (string(LDLM_CBD), period(), param(STATS)) 54 | .with(stats()) 55 | .map(LustreServiceStats::LdlmCbd) 56 | .message("While parsing ldlm_cbd.stats") 57 | } 58 | -------------------------------------------------------------------------------- /lustrefs-exporter/benches/common/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use std::time::Duration; 6 | use tokio::{task::JoinSet, time::Instant}; 7 | 8 | // Issue one real HTTP GET against the exporter's `/metrics` endpoint (jobstats enabled) and 9 | // return the response body. The exporter must already be listening on localhost:12345. 10 | async fn make_single_request() -> Result> { 11 | let body = reqwest::get("http://localhost:12345/metrics?jobstats=true") 12 | .await? 13 | .text() 14 | .await?; 15 | 16 | Ok(body) 17 | } 18 | 19 | // Use a JoinSet as a sliding window: up to `concurrency` requests are in flight at once, and a new 20 | // one is spawned as soon as any finishes, until `total_requests` have been issued. Returns the elapsed time. 21 | pub async fn load_test_concurrent(concurrency: usize, total_requests: usize) -> Duration { 22 | let start = Instant::now(); 23 | 24 | let mut spawned_requests = 0; 25 | let mut successful_requests = 0; 26 | let mut failed_requests = 0; 27 | 28 | let mut join_set = JoinSet::new(); 29 | 30 | // Initially spawn up to `concurrency` requests, but never more than `total_requests` 31 | for _ in 0..concurrency.min(total_requests) { 32 | join_set.spawn(async move { make_single_request().await }); 33 | 34 | spawned_requests += 1; 35 | } 36 | 37 | while let Some(result) = join_set.join_next().await { 38 | match result { 39 | Ok(Ok(_)) => successful_requests += 1, 40 | Ok(Err(_)) => failed_requests += 1, 41 | Err(_) => failed_requests += 1, 42 | } 43 | 44 | // Immediately spawn a new request if there are more to be made 45 | if spawned_requests < total_requests { 46 | join_set.spawn(async move { make_single_request().await }); 47 | 48 | spawned_requests += 1; 49 | } 50 | } 51 | 52 | let elapsed = start.elapsed(); 53 | 54 | println!( 55 | "Load test completed: {successful_requests} successful, {failed_requests} failed requests in {elapsed:?}", 56 | ); 57 | 58 | elapsed 59 | } 60 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__recovery_status_parser__tests__multiple.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/recovery_status_parser.rs 3 | 
expression: records 4 | --- 5 | [ 6 | Target( 7 | RecoveryStatus( 8 | TargetStat { 9 | kind: Ost, 10 | param: Param( 11 | "recovery_status", 12 | ), 13 | target: Target( 14 | "fs-OST0000", 15 | ), 16 | value: Complete, 17 | }, 18 | ), 19 | ), 20 | Target( 21 | RecoveryCompletedClients( 22 | TargetStat { 23 | kind: Ost, 24 | param: Param( 25 | "recovery_status", 26 | ), 27 | target: Target( 28 | "fs-OST0000", 29 | ), 30 | value: 4, 31 | }, 32 | ), 33 | ), 34 | Target( 35 | RecoveryStatus( 36 | TargetStat { 37 | kind: Ost, 38 | param: Param( 39 | "recovery_status", 40 | ), 41 | target: Target( 42 | "fs-OST0001", 43 | ), 44 | value: Complete, 45 | }, 46 | ), 47 | ), 48 | Target( 49 | RecoveryCompletedClients( 50 | TargetStat { 51 | kind: Ost, 52 | param: Param( 53 | "recovery_status", 54 | ), 55 | target: Target( 56 | "fs-OST0001", 57 | ), 58 | value: 4, 59 | }, 60 | ), 61 | ), 62 | Target( 63 | RecoveryStatus( 64 | TargetStat { 65 | kind: Mdt, 66 | param: Param( 67 | "recovery_status", 68 | ), 69 | target: Target( 70 | "fs-MDT0000", 71 | ), 72 | value: Complete, 73 | }, 74 | ), 75 | ), 76 | Target( 77 | RecoveryCompletedClients( 78 | TargetStat { 79 | kind: Mdt, 80 | param: Param( 81 | "recovery_status", 82 | ), 83 | target: Target( 84 | "fs-MDT0000", 85 | ), 86 | value: 3, 87 | }, 88 | ), 89 | ), 90 | ] 91 | -------------------------------------------------------------------------------- /.github/workflows/scrape-memory-metrics-bench.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | pull_request: 7 | types: [opened, reopened, edited, synchronize] 8 | 9 | jobs: 10 | benchmark_scrape_operation: 11 | name: Benchmark Scrape Operation 12 | permissions: 13 | checks: write 14 | pull-requests: write 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - uses: bencherdev/bencher@main 20 | 21 | - name: Install Rust 22 | uses: 
actions-rust-lang/setup-rust-toolchain@v1 23 | 24 | - uses: actions/cache@v4 25 | with: 26 | path: | 27 | ~/.cargo/bin/ 28 | ~/.cargo/registry/index/ 29 | ~/.cargo/registry/cache/ 30 | ~/.cargo/git/db/ 31 | target/ 32 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 33 | 34 | - name: Run Scrape Memory Metrics Benchmark 35 | run: | 36 | # Run the benchmark first (suppress Criterion output) 37 | cargo bench --bench scrape_memory_metrics 2>&1 > /dev/null 38 | 39 | - name: Track scrape memory metrics benchmark on main branch 40 | if: github.ref == 'refs/heads/main' && github.event_name == 'push' 41 | run: | 42 | bencher run \ 43 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 44 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 45 | --branch main \ 46 | --testbed ci-runner \ 47 | --threshold-measure peak_rss_mib \ 48 | --threshold-test t_test \ 49 | --threshold-max-sample-size 64 \ 50 | --threshold-lower-boundary 0.95 \ 51 | --threshold-upper-boundary 0.95 \ 52 | --err \ 53 | --adapter json \ 54 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 55 | --file lustrefs-exporter/scrape_allocations_results.json 56 | 57 | - name: Compare scrape memory metrics with main branch 58 | if: github.event_name == 'pull_request' 59 | run: | 60 | bencher run \ 61 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 62 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 63 | --branch '${{ github.head_ref }}' \ 64 | --start-point main \ 65 | --testbed ci-runner \ 66 | --start-point-clone-thresholds \ 67 | --err \ 68 | --adapter json \ 69 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 70 | --file lustrefs-exporter/scrape_allocations_results.json 71 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__parser__tests__params.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustre-collector/src/parser.rs 3 | expression: params() 4 | --- 5 | [ 6 | 
"memused", 7 | "memused_max", 8 | "lnet_memused", 9 | "health_check", 10 | "mdt.*.exports.*.uuid", 11 | "osd-*.*.stats", 12 | "osd-*.*.filesfree", 13 | "osd-*.*.filestotal", 14 | "osd-*.*.fstype", 15 | "osd-*.*.kbytesavail", 16 | "osd-*.*.kbytesfree", 17 | "osd-*.*.kbytestotal", 18 | "osd-*.*.brw_stats", 19 | "osd-*.*.quota_slave.acct_group", 20 | "osd-*.*.quota_slave.acct_user", 21 | "osd-*.*.quota_slave.acct_project", 22 | "mgs.*.mgs.stats", 23 | "mgs.*.mgs.threads_max", 24 | "mgs.*.mgs.threads_min", 25 | "mgs.*.mgs.threads_started", 26 | "mgs.*.num_exports", 27 | "obdfilter.*OST*.stats", 28 | "obdfilter.*OST*.num_exports", 29 | "obdfilter.*OST*.tot_dirty", 30 | "obdfilter.*OST*.tot_granted", 31 | "obdfilter.*OST*.tot_pending", 32 | "obdfilter.*OST*.exports.*.stats", 33 | "ost.OSS.ost.stats", 34 | "ost.OSS.ost_io.stats", 35 | "ost.OSS.ost_create.stats", 36 | "ost.OSS.ost_out.stats", 37 | "ost.OSS.ost_seq.stats", 38 | "mds.MDS.mdt.stats", 39 | "mds.MDS.mdt_fld.stats", 40 | "mds.MDS.mdt_io.stats", 41 | "mds.MDS.mdt_out.stats", 42 | "mds.MDS.mdt_readpage.stats", 43 | "mds.MDS.mdt_seqm.stats", 44 | "mds.MDS.mdt_seqs.stats", 45 | "mds.MDS.mdt_setattr.stats", 46 | "mdt.*.md_stats", 47 | "mdt.*MDT*.num_exports", 48 | "mdt.*MDT*.exports.*.stats", 49 | "ldlm.namespaces.{mdt-,filter-}*.contended_locks", 50 | "ldlm.namespaces.{mdt-,filter-}*.contention_seconds", 51 | "ldlm.namespaces.{mdt-,filter-}*.ctime_age_limit", 52 | "ldlm.namespaces.{mdt-,filter-}*.early_lock_cancel", 53 | "ldlm.namespaces.{mdt-,filter-}*.lock_count", 54 | "ldlm.namespaces.{mdt-,filter-}*.lock_timeouts", 55 | "ldlm.namespaces.{mdt-,filter-}*.lock_unused_count", 56 | "ldlm.namespaces.{mdt-,filter-}*.lru_max_age", 57 | "ldlm.namespaces.{mdt-,filter-}*.lru_size", 58 | "ldlm.namespaces.{mdt-,filter-}*.max_nolock_bytes", 59 | "ldlm.namespaces.{mdt-,filter-}*.max_parallel_ast", 60 | "ldlm.namespaces.{mdt-,filter-}*.resource_count", 61 | "ldlm.services.ldlm_canceld.stats", 62 | 
"ldlm.services.ldlm_cbd.stats", 63 | "llite.*.stats", 64 | "mdd.*.changelog_users", 65 | "qmt.*.{dt,md}-*.glb-usr", 66 | "qmt.*.{dt,md}-*.glb-prj", 67 | "qmt.*.{dt,md}-*.glb-grp", 68 | "nodemap.*.dt_stats", 69 | "nodemap.*.md_stats", 70 | ] 71 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/historical_snapshots/lustrefs_exporter__tests__valid_fixture_lustre-2-14-0__client__llite_client.txt.histsnap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/main.rs 3 | expression: x 4 | --- 5 | # HELP lustre_client_stats Lustre client interface stats. 6 | # TYPE lustre_client_stats counter 7 | lustre_client_stats{operation="ioctl",target="ai400x2-ff47bce9ca35d800"} 114 8 | lustre_client_stats{operation="open",target="ai400x2-ff47bce9ca35d800"} 35955554 9 | lustre_client_stats{operation="close",target="ai400x2-ff47bce9ca35d800"} 35955554 10 | lustre_client_stats{operation="readdir",target="ai400x2-ff47bce9ca35d800"} 60 11 | lustre_client_stats{operation="getattr",target="ai400x2-ff47bce9ca35d800"} 35955647 12 | lustre_client_stats{operation="unlink",target="ai400x2-ff47bce9ca35d800"} 17977752 13 | lustre_client_stats{operation="mkdir",target="ai400x2-ff47bce9ca35d800"} 17987059 14 | lustre_client_stats{operation="rmdir",target="ai400x2-ff47bce9ca35d800"} 17987079 15 | lustre_client_stats{operation="mknod",target="ai400x2-ff47bce9ca35d800"} 17977752 16 | lustre_client_stats{operation="statfs",target="ai400x2-ff47bce9ca35d800"} 17864 17 | lustre_client_stats{operation="setxattr",target="ai400x2-ff47bce9ca35d800"} 1 18 | lustre_client_stats{operation="inode_permission",target="ai400x2-ff47bce9ca35d800"} 629305628 19 | lustre_client_stats{operation="opencount",target="ai400x2-ff47bce9ca35d800"} 35955576 20 | lustre_client_stats{operation="openclosetime",target="ai400x2-ff47bce9ca35d800"} 17977772 21 | 22 | # HELP lustre_health_healthy Indicates 
whether the Lustre target is healthy or not. 1 is healthy, 0 is unhealthy. 23 | # TYPE lustre_health_healthy gauge 24 | lustre_health_healthy 1 25 | 26 | # HELP lustre_ldlm_cbd_stats Gives information about LDLM Callback service. 27 | # TYPE lustre_ldlm_cbd_stats counter 28 | lustre_ldlm_cbd_stats{operation="req_waittime"} 41083956 29 | lustre_ldlm_cbd_stats{operation="req_qdepth"} 41083956 30 | lustre_ldlm_cbd_stats{operation="req_active"} 41083956 31 | lustre_ldlm_cbd_stats{operation="req_timeout"} 41083956 32 | lustre_ldlm_cbd_stats{operation="reqbuf_avail"} 87745428 33 | lustre_ldlm_cbd_stats{operation="ldlm_bl_callback"} 41083956 34 | 35 | # HELP lustre_lnet_mem_used Gives information about Lustre LNet memory usage. 36 | # TYPE lustre_lnet_mem_used gauge 37 | lustre_lnet_mem_used 22228358 38 | 39 | # HELP lustre_mem_used Gives information about Lustre memory usage. 40 | # TYPE lustre_mem_used gauge 41 | lustre_mem_used 35721854085 42 | 43 | # HELP lustre_mem_used_max Gives information about Lustre maximum memory usage. 
44 | # TYPE lustre_mem_used_max counter 45 | lustre_mem_used_max 41220347397 46 | -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/brw_stats_with_data.txt: -------------------------------------------------------------------------------- 1 | 2 | snapshot_time: 1534158712.738772898 (secs.nsecs) 3 | 4 | read | write 5 | pages per bulk r/w rpcs % cum % | rpcs % cum % 6 | 32: 0 0 0 | 1 11 11 7 | 64: 0 0 0 | 0 0 11 8 | 128: 0 0 0 | 0 0 11 9 | 256: 1 2 3 | 0 0 11 10 | 512: 0 0 0 | 0 0 11 11 | 1K: 0 0 0 | 8 88 100 12 | 13 | read | write 14 | discontiguous pages rpcs % cum % | rpcs % cum % 15 | 0: 0 0 0 | 6 66 66 16 | 1: 0 0 0 | 3 33 100 17 | 18 | read | write 19 | discontiguous blocks rpcs % cum % | rpcs % cum % 20 | 0: 0 0 0 | 9 100 100 21 | 22 | read | write 23 | disk fragmented I/Os ios % cum % | ios % cum % 24 | 1: 0 0 0 | 1 11 11 25 | 2: 0 0 0 | 0 0 11 26 | 3: 0 0 0 | 0 0 11 27 | 4: 0 0 0 | 8 88 100 28 | 29 | read | write 30 | disk I/Os in flight ios % cum % | ios % cum % 31 | 1: 0 0 0 | 3 9 9 32 | 2: 0 0 0 | 3 9 18 33 | 3: 0 0 0 | 3 9 27 34 | 4: 0 0 0 | 3 9 36 35 | 5: 0 0 0 | 3 9 45 36 | 6: 0 0 0 | 3 9 54 37 | 7: 0 0 0 | 3 9 63 38 | 8: 0 0 0 | 3 9 72 39 | 9: 0 0 0 | 2 6 78 40 | 10: 0 0 0 | 2 6 84 41 | 11: 0 0 0 | 2 6 90 42 | 12: 0 0 0 | 2 6 96 43 | 13: 0 0 0 | 1 3 100 44 | 45 | read | write 46 | I/O time (1/1000s) ios % cum % | ios % cum % 47 | 32: 0 0 0 | 1 11 11 48 | 64: 0 0 0 | 0 0 11 49 | 128: 0 0 0 | 2 22 33 50 | 256: 0 0 0 | 6 66 100 51 | 52 | read | write 53 | disk I/O size ios % cum % | ios % cum % 54 | 128K: 0 0 0 | 1 3 3 55 | 256K: 0 0 0 | 0 0 3 56 | 512K: 0 0 0 | 0 0 3 57 | 1M: 0 0 0 | 32 96 100 58 | 59 | read | write 60 | block maps msec maps % cum % | maps % cum % 61 | 1: 12689 100 100 | 0 0 0 62 | -------------------------------------------------------------------------------- 
/lustrefs-exporter/src/snapshots/lustrefs_exporter__tests__valid_fixture_otel@lustre-2-14-0__client__llite_client.txt.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/lib.rs 3 | expression: x 4 | input_file: lustre-collector/src/fixtures/valid/lustre-2-14-0/client/llite_client.txt 5 | --- 6 | # HELP lustre_health_healthy Indicates whether the Lustre server is healthy or not. 1 is healthy, 0 is unhealthy. 7 | # TYPE lustre_health_healthy gauge 8 | lustre_health_healthy{} 1 9 | # HELP lustre_lnet_mem_used Gives information about Lustre LNet memory usage. 10 | # TYPE lustre_lnet_mem_used gauge 11 | lustre_lnet_mem_used{} 22228358 12 | # HELP lustre_mem_used Gives information about Lustre memory usage. 13 | # TYPE lustre_mem_used gauge 14 | lustre_mem_used{} 35721854085 15 | # HELP lustre_mem_used_max Gives information about Lustre maximum memory usage. 16 | # TYPE lustre_mem_used_max counter 17 | lustre_mem_used_max{} 41220347397 18 | # HELP lustre_ldlm_cbd_stats Gives information about LDLM Callback service. 19 | # TYPE lustre_ldlm_cbd_stats counter 20 | lustre_ldlm_cbd_stats{operation="ldlm_bl_callback"} 41083956 21 | lustre_ldlm_cbd_stats{operation="req_active"} 41083956 22 | lustre_ldlm_cbd_stats{operation="req_qdepth"} 41083956 23 | lustre_ldlm_cbd_stats{operation="req_timeout"} 41083956 24 | lustre_ldlm_cbd_stats{operation="req_waittime"} 41083956 25 | lustre_ldlm_cbd_stats{operation="reqbuf_avail"} 87745428 26 | # HELP lustre_client_stats Lustre client interface stats. 
27 | # TYPE lustre_client_stats counter 28 | lustre_client_stats{operation="close",target="ai400x2-ff47bce9ca35d800"} 35955554 29 | lustre_client_stats{operation="getattr",target="ai400x2-ff47bce9ca35d800"} 35955647 30 | lustre_client_stats{operation="inode_permission",target="ai400x2-ff47bce9ca35d800"} 629305628 31 | lustre_client_stats{operation="ioctl",target="ai400x2-ff47bce9ca35d800"} 114 32 | lustre_client_stats{operation="mkdir",target="ai400x2-ff47bce9ca35d800"} 17987059 33 | lustre_client_stats{operation="mknod",target="ai400x2-ff47bce9ca35d800"} 17977752 34 | lustre_client_stats{operation="open",target="ai400x2-ff47bce9ca35d800"} 35955554 35 | lustre_client_stats{operation="openclosetime",target="ai400x2-ff47bce9ca35d800"} 17977772 36 | lustre_client_stats{operation="opencount",target="ai400x2-ff47bce9ca35d800"} 35955576 37 | lustre_client_stats{operation="readdir",target="ai400x2-ff47bce9ca35d800"} 60 38 | lustre_client_stats{operation="rmdir",target="ai400x2-ff47bce9ca35d800"} 17987079 39 | lustre_client_stats{operation="setxattr",target="ai400x2-ff47bce9ca35d800"} 1 40 | lustre_client_stats{operation="statfs",target="ai400x2-ff47bce9ca35d800"} 17864 41 | lustre_client_stats{operation="unlink",target="ai400x2-ff47bce9ca35d800"} 17977752 42 | # EOF 43 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/host.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::Family; 6 | use lustre_collector::HostStats; 7 | use prometheus_client::{ 8 | metrics::{counter::Counter, gauge::Gauge}, 9 | registry::Registry, 10 | }; 11 | use std::{ops::Deref, sync::atomic::AtomicU64}; 12 | 13 | #[derive(Debug, Default)] 14 | pub struct HostMetrics { 15 | lustre_targets_healthy: Family>, 16 | lnet_mem_used: Family>, 17 | mem_used: Family>, 18 | mem_used_max: Family>, 19 | } 20 | 21 | impl HostMetrics { 22 | pub fn register_metric(&self, registry: &mut Registry) { 23 | registry.register( 24 | "lustre_health_healthy", 25 | "Indicates whether the Lustre server is healthy or not. 1 is healthy, 0 is unhealthy", 26 | self.lustre_targets_healthy.clone(), 27 | ); 28 | 29 | registry.register( 30 | "lustre_lnet_mem_used", 31 | "Gives information about Lustre LNet memory usage", 32 | self.lnet_mem_used.clone(), 33 | ); 34 | 35 | registry.register( 36 | "lustre_mem_used", 37 | "Gives information about Lustre memory usage", 38 | self.mem_used.clone(), 39 | ); 40 | 41 | registry.register_without_auto_suffix( 42 | "lustre_mem_used_max", 43 | "Gives information about Lustre maximum memory usage", 44 | self.mem_used_max.clone(), 45 | ); 46 | } 47 | } 48 | 49 | pub fn build_host_stats(stats: &HostStats, metrics: &mut HostMetrics) { 50 | match stats { 51 | HostStats::HealthCheck(x) => { 52 | let healthy = x.value.healthy; 53 | 54 | metrics 55 | .lustre_targets_healthy 56 | .get_or_create(&vec![]) 57 | .set(if healthy { 1 } else { 0 }); 58 | 59 | for target in &x.value.targets { 60 | metrics 61 | .lustre_targets_healthy 62 | .get_or_create(&vec![("target", target.deref().to_string())]) 63 | .set(if healthy { 1 } else { 0 }); 64 | } 65 | } 66 | HostStats::LNetMemUsed(x) => { 67 | metrics.lnet_mem_used.get_or_create(&vec![]).set(x.value); 68 | } 69 | HostStats::Memused(x) => { 70 | metrics.mem_used.get_or_create(&vec![]).set(x.value); 71 | } 72 | HostStats::MemusedMax(x) => { 73 | metrics.mem_used_max.get_or_create(&vec![]).inc_by(x.value); 74 | } 
75 | } 76 | } 77 | -------------------------------------------------------------------------------- /lustrefs-exporter/benches/lustre_metrics.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use iai_callgrind::{ 6 | Callgrind, CallgrindMetrics, FlamegraphConfig, FlamegraphKind, LibraryBenchmarkConfig, 7 | OutputFormat, library_benchmark, library_benchmark_group, main, 8 | }; 9 | use lustre_collector::{Record, parse_lnetctl_output, parse_lnetctl_stats}; 10 | use lustrefs_exporter::metrics::{Metrics, build_lustre_stats}; 11 | use prometheus_client::{encoding::text::encode, registry::Registry}; 12 | use std::hint::black_box; 13 | 14 | fn generate_records() -> Vec { 15 | let mut records = Vec::new(); 16 | 17 | let lustre_metrics = include_str!( 18 | "../../lustre-collector/src/fixtures/valid/lustre-2.14.0_ddn133/2.14.0_ddn133_quota.txt" 19 | ); 20 | let mut lustre_metrics_records = lustre_collector::parse_lctl_output(lustre_metrics.as_bytes()) 21 | .expect("Failed to parse lustre metrics"); 22 | records.append(&mut lustre_metrics_records); 23 | 24 | let net_show = include_bytes!("../fixtures/lnetctl_net_show.txt"); 25 | let mut net_show_records = 26 | parse_lnetctl_output(net_show).expect("Failed to parse lnetctl net show"); 27 | records.append(&mut net_show_records); 28 | 29 | let net_stats = include_bytes!("../fixtures/lnetctl_stats.txt"); 30 | let mut net_stats_records = 31 | parse_lnetctl_stats(net_stats).expect("Failed to parse lnetctl stats"); 32 | records.append(&mut net_stats_records); 33 | 34 | records 35 | } 36 | 37 | fn encode_metrics(records: Vec) -> String { 38 | let mut registry = Registry::default(); 39 | let mut metrics = Metrics::default(); 40 | 41 | // Build metrics 42 | build_lustre_stats(&records, &mut metrics); 43 | 44 | 
metrics.register_metric(&mut registry); 45 | 46 | let mut output = String::new(); 47 | 48 | encode(&mut output, &registry).expect("Failed to encode metrics"); 49 | 50 | output 51 | } 52 | 53 | #[library_benchmark] 54 | #[benches::with_setup(setup = generate_records)] 55 | fn bench_encode_lustre_metrics(records: Vec) -> String { 56 | black_box(encode_metrics(records)) 57 | } 58 | 59 | library_benchmark_group!(name = memory_benches; benchmarks = bench_encode_lustre_metrics); 60 | main!( 61 | config = LibraryBenchmarkConfig::default() 62 | .tool(Callgrind::default() 63 | .format([CallgrindMetrics::All]) 64 | .flamegraph(FlamegraphConfig::default().kind(FlamegraphKind::Differential))) 65 | .output_format(OutputFormat::default() 66 | .truncate_description(None) 67 | ); 68 | library_benchmark_groups = memory_benches 69 | ); 70 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/metrics.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{ 6 | Family, 7 | brw_stats::{BrwStatsMetrics, build_target_stats}, 8 | host::{HostMetrics, build_host_stats}, 9 | llite::LliteMetrics, 10 | lnet::{LNetMetrics, build_lnet_stats}, 11 | quota::QuotaMetrics, 12 | service::{ServiceMetrics, build_service_stats}, 13 | stats::StatsMetrics, 14 | }; 15 | use lustre_collector::Record; 16 | use prometheus_client::{metrics::gauge::Gauge, registry::Registry}; 17 | use std::{collections::HashSet, sync::atomic::AtomicU64}; 18 | 19 | #[derive(Debug, Default)] 20 | pub struct Metrics { 21 | pub host: HostMetrics, 22 | pub quota: QuotaMetrics, 23 | pub service: ServiceMetrics, 24 | pub brw: BrwStatsMetrics, 25 | pub llite: LliteMetrics, 26 | pub lnet: LNetMetrics, 27 | pub stats: StatsMetrics, 28 | pub export: StatsMetrics, 29 | pub mds: StatsMetrics, // Reusing the Stats structure for MDS metrics 30 | target_info: Family>, 31 | } 32 | 33 | impl Metrics { 34 | pub fn register_metric(&self, registry: &mut Registry) { 35 | self.host.register_metric(registry); 36 | self.quota.register_metric(registry); 37 | self.service.register_metric(registry); 38 | self.brw.register_metric(registry); 39 | self.llite.register_metric(registry); 40 | self.lnet.register_metric(registry); 41 | self.stats.register_metric(registry); 42 | self.export.register_metric(registry); 43 | self.mds.register_metric(registry); 44 | 45 | // prometheus_client does not automatically include the `target_info` metric. 46 | // Add it manually. 
47 | registry.register("target_info", "Target metadata", self.target_info.clone()); 48 | } 49 | } 50 | 51 | pub fn build_lustre_stats(output: &Vec, metrics: &mut Metrics) { 52 | // This set is used to store the possible duplicate target stats 53 | let mut set = HashSet::new(); 54 | 55 | for x in output { 56 | match x { 57 | lustre_collector::Record::Host(x) => { 58 | build_host_stats(x, &mut metrics.host); 59 | } 60 | lustre_collector::Record::LNetStat(x) => { 61 | build_lnet_stats(x, &mut metrics.lnet); 62 | } 63 | lustre_collector::Record::Target(x) => { 64 | build_target_stats(x, metrics, &mut set); 65 | } 66 | lustre_collector::Record::LustreService(x) => { 67 | build_service_stats(x, &mut metrics.service); 68 | } 69 | _ => {} 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__lnetctl_parser__tests__lnet_parse2.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/lnetctl_parser.rs 3 | expression: x 4 | 5 | --- 6 | [ 7 | LNetStat( 8 | SendCount( 9 | LNetStat { 10 | nid: "0@lo", 11 | param: Param( 12 | "send_count", 13 | ), 14 | value: 0, 15 | }, 16 | ), 17 | ), 18 | LNetStat( 19 | RecvCount( 20 | LNetStat { 21 | nid: "0@lo", 22 | param: Param( 23 | "recv_count", 24 | ), 25 | value: 0, 26 | }, 27 | ), 28 | ), 29 | LNetStat( 30 | DropCount( 31 | LNetStat { 32 | nid: "0@lo", 33 | param: Param( 34 | "drop_count", 35 | ), 36 | value: 0, 37 | }, 38 | ), 39 | ), 40 | LNetStat( 41 | SendCount( 42 | LNetStat { 43 | nid: "172.16.0.24@o2ib", 44 | param: Param( 45 | "send_count", 46 | ), 47 | value: 0, 48 | }, 49 | ), 50 | ), 51 | LNetStat( 52 | RecvCount( 53 | LNetStat { 54 | nid: "172.16.0.24@o2ib", 55 | param: Param( 56 | "recv_count", 57 | ), 58 | value: 0, 59 | }, 60 | ), 61 | ), 62 | LNetStat( 63 | DropCount( 64 | LNetStat { 65 | nid: "172.16.0.24@o2ib", 66 | param: Param( 67 | "drop_count", 68 | ), 69 | 
value: 0, 70 | }, 71 | ), 72 | ), 73 | LNetStat( 74 | SendCount( 75 | LNetStat { 76 | nid: "172.16.0.28@o2ib", 77 | param: Param( 78 | "send_count", 79 | ), 80 | value: 0, 81 | }, 82 | ), 83 | ), 84 | LNetStat( 85 | RecvCount( 86 | LNetStat { 87 | nid: "172.16.0.28@o2ib", 88 | param: Param( 89 | "recv_count", 90 | ), 91 | value: 0, 92 | }, 93 | ), 94 | ), 95 | LNetStat( 96 | DropCount( 97 | LNetStat { 98 | nid: "172.16.0.28@o2ib", 99 | param: Param( 100 | "drop_count", 101 | ), 102 | value: 0, 103 | }, 104 | ), 105 | ), 106 | ] 107 | -------------------------------------------------------------------------------- /.github/workflows/lustre-metrics-bench.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | pull_request: 7 | types: [opened, reopened, edited, synchronize] 8 | 9 | jobs: 10 | benchmark_lustre_metrics: 11 | name: Benchmark Lustre Metrics 12 | permissions: 13 | checks: write 14 | pull-requests: write 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - uses: bencherdev/bencher@main 20 | 21 | - name: Install Rust 22 | uses: actions-rust-lang/setup-rust-toolchain@v1 23 | 24 | - uses: actions/cache@v4 25 | with: 26 | path: | 27 | ~/.cargo/bin/ 28 | ~/.cargo/registry/index/ 29 | ~/.cargo/registry/cache/ 30 | ~/.cargo/git/db/ 31 | target/ 32 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 33 | 34 | - name: Install Valgrind 35 | run: sudo apt-get update && sudo apt-get install -y valgrind 36 | 37 | # Callgrind runner version must match iai-callgrind version specified in Cargo.toml 38 | - name: Install iai-callgrind-runner 39 | run: | 40 | version=$(cargo metadata --format-version=1 |\ 41 | jq '.packages[] | select(.name == "iai-callgrind").version' |\ 42 | tr -d '"' 43 | ) 44 | cargo install iai-callgrind-runner --version $version 45 | 46 | - name: Track Lustre Metrics Benchmarks on main branch 47 | if: github.ref == 
'refs/heads/main' && github.event_name == 'push' 48 | run: | 49 | bencher run \ 50 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 51 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 52 | --branch main \ 53 | --testbed ci-runner \ 54 | --threshold-measure Instructions \ 55 | --threshold-test t_test \ 56 | --threshold-max-sample-size 64 \ 57 | --threshold-lower-boundary 0.95 \ 58 | --threshold-upper-boundary 0.95 \ 59 | --err \ 60 | --adapter rust_iai_callgrind \ 61 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 62 | "cargo bench --bench lustre_metrics" 63 | 64 | - name: Compare Lustre Metrics Benchmarks with main branch 65 | if: github.event_name == 'pull_request' 66 | run: | 67 | bencher run \ 68 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 69 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 70 | --branch '${{ github.head_ref }}' \ 71 | --start-point main \ 72 | --testbed ci-runner \ 73 | --start-point-clone-thresholds \ 74 | --err \ 75 | --adapter rust_iai_callgrind \ 76 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 77 | "cargo bench --bench lustre_metrics" 78 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/mdt_parser.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{ 6 | ExportStats, 7 | base_parsers::{digits, param, param_period, period, target}, 8 | exports_parser::exports_stats, 9 | oss::obdfilter_parser::{EXPORTS, EXPORTS_PARAMS}, 10 | stats_parser::stats, 11 | types::{Param, Record, Stat, Target, TargetStat, TargetStats, TargetVariant}, 12 | }; 13 | use combine::{ 14 | Parser, attempt, choice, 15 | error::ParseError, 16 | parser::char::{newline, string}, 17 | stream::Stream, 18 | }; 19 | 20 | pub(crate) const STATS: &str = "md_stats"; 21 | pub(crate) const NUM_EXPORTS: &str = "num_exports"; 22 | 23 | enum MdtStat { 24 | Stats(Vec), 25 | NumExports(u64), 26 | ExportStats(Vec), 27 | } 28 | 29 | fn mdt_stat() -> impl Parser 30 | where 31 | I: Stream, 32 | I::Error: ParseError, 33 | { 34 | choice(( 35 | ( 36 | param(NUM_EXPORTS), 37 | digits().skip(newline()).map(MdtStat::NumExports), 38 | ), 39 | (param(STATS), stats().map(MdtStat::Stats)).message("while parsing mdt_stat"), 40 | ( 41 | param_period(EXPORTS), 42 | exports_stats().map(MdtStat::ExportStats), 43 | ), 44 | )) 45 | } 46 | 47 | pub(crate) fn params() -> Vec { 48 | [ 49 | format!("mdt.*.{STATS}"), 50 | format!("mdt.*MDT*.{NUM_EXPORTS}"), 51 | format!("mdt.*MDT*.{EXPORTS_PARAMS}"), 52 | ] 53 | .into_iter() 54 | .collect() 55 | } 56 | 57 | fn target_name() -> impl Parser 58 | where 59 | I: Stream, 60 | I::Error: ParseError, 61 | { 62 | ( 63 | attempt(string("mdt")).skip(period()), 64 | target().skip(period()), 65 | ) 66 | .map(|(_, x)| x) 67 | .message("while parsing target_name") 68 | } 69 | 70 | pub(crate) fn parse() -> impl Parser 71 | where 72 | I: Stream, 73 | I::Error: ParseError, 74 | { 75 | (target_name(), mdt_stat()) 76 | .map(|(target, (param, value))| match value { 77 | MdtStat::Stats(value) => TargetStats::Stats(TargetStat { 78 | kind: TargetVariant::Mdt, 79 | target, 80 | param, 81 | value, 82 | }), 83 | MdtStat::NumExports(value) => TargetStats::NumExports(TargetStat { 84 | kind: TargetVariant::Mdt, 85 | target, 86 | param, 87 | 
value, 88 | }), 89 | MdtStat::ExportStats(value) => TargetStats::ExportStats(TargetStat { 90 | kind: TargetVariant::Mdt, 91 | target, 92 | param, 93 | value, 94 | }), 95 | }) 96 | .map(Record::Target) 97 | .message("while parsing mdt") 98 | } 99 | -------------------------------------------------------------------------------- /lustre-collector/src/time.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use crate::base_parsers::{digits, till_newline}; 6 | use combine::stream::Stream; 7 | use combine::{Parser, optional, token}; 8 | use combine::{ 9 | attempt, 10 | parser::char::{spaces, string}, 11 | }; 12 | use combine::{error::ParseError, parser::char::newline}; 13 | 14 | fn time(name: &'static str) -> impl Parser 15 | where 16 | I: Stream, 17 | I::Error: ParseError, 18 | { 19 | ( 20 | string(name).skip(optional(token(':'))), 21 | spaces(), 22 | digits().skip(token('.')), 23 | digits().skip(till_newline()), 24 | ) 25 | .map(|(_, _, secs, nsecs)| format!("{secs}.{nsecs}")) 26 | } 27 | 28 | pub(crate) fn time_triple() -> impl Parser 29 | where 30 | I: Stream, 31 | I::Error: ParseError, 32 | { 33 | ( 34 | time("snapshot_time") 35 | .message("While parsing snapshot_time") 36 | .skip(newline()), 37 | optional( 38 | attempt( 39 | time("start_time") 40 | .skip(newline()) 41 | .message("While parsing start_time"), 42 | ) 43 | .and( 44 | time("elapsed_time") 45 | .skip(newline()) 46 | .message("While parsing elapsed_time"), 47 | ), 48 | ), 49 | ) 50 | .map(|(time, _)| time) 51 | } 52 | 53 | #[cfg(test)] 54 | mod tests { 55 | use combine::EasyParser; 56 | use insta::assert_debug_snapshot; 57 | 58 | use super::*; 59 | 60 | #[test] 61 | fn test_time() { 62 | let x = r#"snapshot_time: 1534158712.738772898 (secs.nsecs) 63 | "#; 64 | 65 | let result = 
time("snapshot_time").parse(x); 66 | 67 | assert_eq!(result, Ok(("1534158712.738772898".to_string(), "\n",))); 68 | } 69 | #[test] 70 | fn test_time_no_colon() { 71 | let x = r#"snapshot_time 1534769431.137892896 secs.nsecs 72 | "#; 73 | 74 | let result = time("snapshot_time").parse(x); 75 | 76 | assert_eq!(result, Ok(("1534769431.137892896".to_string(), "\n"))); 77 | } 78 | 79 | #[test] 80 | fn test_time_triple() { 81 | let x = r#"snapshot_time 1684948453.142852820 secs.nsecs 82 | start_time 1684946875.504329012 secs.nsecs 83 | elapsed_time 1577.638523808 secs.nsecs 84 | "#; 85 | 86 | let result = time_triple().easy_parse(x).unwrap(); 87 | 88 | assert_debug_snapshot!(result); 89 | } 90 | 91 | #[test] 92 | fn test_time_triple_back_compat() { 93 | let x = r#"snapshot_time 1596728874.484750908 secs.nsecs 94 | req_waittime 31280 samples [usec] 11 2695 5020274 1032267156 95 | 96 | "#; 97 | 98 | let result = time_triple().easy_parse(x).unwrap(); 99 | 100 | assert_debug_snapshot!(result); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__recovery_status_parser__tests__waiting_for_clients.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/recovery_status_parser.rs 3 | expression: records 4 | --- 5 | [ 6 | Target( 7 | RecoveryStatus( 8 | TargetStat { 9 | kind: Mdt, 10 | param: Param( 11 | "recovery_status", 12 | ), 13 | target: Target( 14 | "fs-MDT0000", 15 | ), 16 | value: Complete, 17 | }, 18 | ), 19 | ), 20 | Target( 21 | RecoveryCompletedClients( 22 | TargetStat { 23 | kind: Mdt, 24 | param: Param( 25 | "recovery_status", 26 | ), 27 | target: Target( 28 | "fs-MDT0000", 29 | ), 30 | value: 3, 31 | }, 32 | ), 33 | ), 34 | Target( 35 | RecoveryStatus( 36 | TargetStat { 37 | kind: Mdt, 38 | param: Param( 39 | "recovery_status", 40 | ), 41 | target: Target( 42 | "fs-MDT0002", 43 | ), 44 | value: WaitingForClients, 
45 | }, 46 | ), 47 | ), 48 | Target( 49 | RecoveryStatus( 50 | TargetStat { 51 | kind: Ost, 52 | param: Param( 53 | "recovery_status", 54 | ), 55 | target: Target( 56 | "fs-OST0000", 57 | ), 58 | value: Complete, 59 | }, 60 | ), 61 | ), 62 | Target( 63 | RecoveryCompletedClients( 64 | TargetStat { 65 | kind: Ost, 66 | param: Param( 67 | "recovery_status", 68 | ), 69 | target: Target( 70 | "fs-OST0000", 71 | ), 72 | value: 4, 73 | }, 74 | ), 75 | ), 76 | Target( 77 | RecoveryStatus( 78 | TargetStat { 79 | kind: Ost, 80 | param: Param( 81 | "recovery_status", 82 | ), 83 | target: Target( 84 | "fs-OST0001", 85 | ), 86 | value: Complete, 87 | }, 88 | ), 89 | ), 90 | Target( 91 | RecoveryCompletedClients( 92 | TargetStat { 93 | kind: Ost, 94 | param: Param( 95 | "recovery_status", 96 | ), 97 | target: Target( 98 | "fs-OST0001", 99 | ), 100 | value: 4, 101 | }, 102 | ), 103 | ), 104 | Target( 105 | RecoveryStatus( 106 | TargetStat { 107 | kind: Ost, 108 | param: Param( 109 | "recovery_status", 110 | ), 111 | target: Target( 112 | "fs-OST0004", 113 | ), 114 | value: WaitingForClients, 115 | }, 116 | ), 117 | ), 118 | ] 119 | -------------------------------------------------------------------------------- /lustre-collector/src/llite/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{ 6 | Param, Record, Stat, Target, TargetStats, 7 | base_parsers::{param, period, target}, 8 | stats_parser::stats, 9 | }; 10 | use combine::{ParseError, Parser, Stream, parser::char::string}; 11 | 12 | pub(crate) const LLITE: &str = "llite"; 13 | pub(crate) const STATS: &str = "stats"; 14 | 15 | pub(crate) fn params() -> Vec { 16 | [STATS] 17 | .into_iter() 18 | .map(|x| format!("{LLITE}.*.{x}")) 19 | .collect() 20 | } 21 | 22 | fn target_name() -> impl Parser 23 | where 24 | I: Stream, 25 | I::Error: ParseError, 26 | { 27 | (string(LLITE).skip(period()), target().skip(period())) 28 | .map(|(_, x)| x) 29 | .message("while parsing llite target_name") 30 | } 31 | 32 | enum LliteStat { 33 | Stats(Vec), 34 | } 35 | 36 | fn llite_stat() -> impl Parser 37 | where 38 | I: Stream, 39 | I::Error: ParseError, 40 | { 41 | (param(STATS), stats().map(LliteStat::Stats)).message("while parsing llite_stat") 42 | } 43 | 44 | pub(crate) fn parse() -> impl Parser 45 | where 46 | I: Stream, 47 | I::Error: ParseError, 48 | { 49 | (target_name(), llite_stat()) 50 | .map(|(target, (param, value))| match value { 51 | LliteStat::Stats(stats) => TargetStats::Llite(crate::types::LliteStat { 52 | target, 53 | param, 54 | stats, 55 | }), 56 | }) 57 | .map(Record::Target) 58 | .message("while parsing llite") 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::*; 64 | use combine::many; 65 | use insta::assert_debug_snapshot; 66 | 67 | #[test] 68 | fn test_parse() { 69 | let x = r#"llite.ai400x2-ffff9440f1003000.stats= 70 | snapshot_time 1689697369.331040915 secs.nsecs 71 | ioctl 2 samples [reqs] 72 | open 13812423 samples [usec] 1 725287 1027077752 8835364169944 73 | close 13812423 samples [usec] 47 778498 1320315612 17542973849370 74 | readdir 12 samples [usec] 0 4647 6715 22456295 75 | getattr 14812440 samples [usec] 2 320411 1317584841 2110166912709 76 | unlink 6906208 samples [usec] 117 749323 1386719680 23443327087798 77 | mkdir 7906554 samples [usec] 104 
1529199 20996782592 1837945636486522 78 | rmdir 6939862 samples [usec] 95 646028 16617944601 635123583760591 79 | mknod 6906208 samples [usec] 119 775827 1454511094 10119157242014 80 | statfs 7 samples [usec] 147 197 1236 220284 81 | inode_permission 251887103 samples [usec] 0 14235 178199279 1102415701 82 | opencount 13812424 samples [reqs] 1 2 20718632 34531048 83 | openclosetime 6906208 samples [usec] 2225920 34405427 163169641155255 11416538743473681487 84 | "#; 85 | 86 | let result: (Vec<_>, _) = many(parse()).parse(x).unwrap(); 87 | 88 | assert_debug_snapshot!(result) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/lnet.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use crate::Family; 6 | use lustre_collector::{LNetStat, LNetStatGlobal, LNetStats}; 7 | use prometheus_client::{metrics::counter::Counter, registry::Registry}; 8 | 9 | #[derive(Debug, Default)] 10 | pub struct LNetMetrics { 11 | send_count_total: Family>, 12 | receive_count_total: Family>, 13 | drop_count_total: Family>, 14 | send_bytes_total: Family>, 15 | receive_bytes_total: Family>, 16 | drop_bytes_total: Family>, 17 | } 18 | 19 | impl LNetMetrics { 20 | pub fn register_metric(&self, registry: &mut Registry) { 21 | registry.register( 22 | "lustre_send_count", 23 | "Total number of messages that have been sent", 24 | self.send_count_total.clone(), 25 | ); 26 | 27 | registry.register( 28 | "lustre_receive_count", 29 | "Total number of messages that have been received", 30 | self.receive_count_total.clone(), 31 | ); 32 | 33 | registry.register( 34 | "lustre_drop_count", 35 | "Total number of messages that have been dropped", 36 | self.drop_count_total.clone(), 37 | ); 38 | 39 | registry.register( 40 | 
"lustre_send_bytes", 41 | "Total number of bytes that have been sent", 42 | self.send_bytes_total.clone(), 43 | ); 44 | 45 | registry.register( 46 | "lustre_receive_bytes", 47 | "Total number of bytes that have been received", 48 | self.receive_bytes_total.clone(), 49 | ); 50 | 51 | registry.register( 52 | "lustre_drop_bytes", 53 | "Total number of bytes that have been dropped", 54 | self.drop_bytes_total.clone(), 55 | ); 56 | } 57 | } 58 | 59 | fn record_lnet_stat(stat: &LNetStat, counter: &mut Family>) { 60 | let labels = vec![("nid", stat.nid.to_string())]; 61 | 62 | counter 63 | .get_or_create(&labels) 64 | .inc_by(stat.value.try_into().unwrap_or(0)); 65 | } 66 | 67 | fn record_lnet_stat_global(stat: &LNetStatGlobal, counter: &mut Family>) { 68 | let labels = vec![]; 69 | 70 | counter 71 | .get_or_create(&labels) 72 | .inc_by(stat.value.try_into().unwrap_or(0)); 73 | } 74 | 75 | pub fn build_lnet_stats(x: &LNetStats, lnet: &mut LNetMetrics) { 76 | match x { 77 | LNetStats::SendCount(stat) => { 78 | record_lnet_stat(stat, &mut lnet.send_count_total); 79 | } 80 | LNetStats::RecvCount(stat) => { 81 | record_lnet_stat(stat, &mut lnet.receive_count_total); 82 | } 83 | LNetStats::DropCount(stat) => { 84 | record_lnet_stat(stat, &mut lnet.drop_count_total); 85 | } 86 | LNetStats::SendLength(stat) => { 87 | record_lnet_stat_global(stat, &mut lnet.send_bytes_total); 88 | } 89 | LNetStats::RecvLength(stat) => { 90 | record_lnet_stat_global(stat, &mut lnet.receive_bytes_total); 91 | } 92 | LNetStats::DropLength(stat) => { 93 | record_lnet_stat_global(stat, &mut lnet.drop_bytes_total); 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/otel_snapshots/lustrefs_exporter__tests__valid_fixture_otel@lustre-2-14-0__client__llite_client.txt.otelsnap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/main.rs 3 | expression: x 4 | 
input_file: lustre-collector/src/fixtures/valid/lustre-2-14-0/client/llite_client.txt 5 | --- 6 | # HELP lustre_client_stats Lustre client interface stats. 7 | # TYPE lustre_client_stats counter 8 | lustre_client_stats{operation="close",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 35955554 9 | lustre_client_stats{operation="getattr",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 35955647 10 | lustre_client_stats{operation="inode_permission",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 629305628 11 | lustre_client_stats{operation="ioctl",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 114 12 | lustre_client_stats{operation="mkdir",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 17987059 13 | lustre_client_stats{operation="mknod",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 17977752 14 | lustre_client_stats{operation="open",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 35955554 15 | lustre_client_stats{operation="openclosetime",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 17977772 16 | lustre_client_stats{operation="opencount",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 35955576 17 | lustre_client_stats{operation="readdir",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 60 18 | lustre_client_stats{operation="rmdir",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 17987079 19 | lustre_client_stats{operation="setxattr",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 1 20 | lustre_client_stats{operation="statfs",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 17864 21 | lustre_client_stats{operation="unlink",target="ai400x2-ff47bce9ca35d800",otel_scope_name="lustre"} 17977752 22 | # HELP lustre_health_healthy Indicates whether the Lustre server is healthy or not. 1 is healthy, 0 is unhealthy. 
23 | # TYPE lustre_health_healthy gauge 24 | lustre_health_healthy{otel_scope_name="lustre"} 1 25 | # HELP lustre_ldlm_cbd_stats Gives information about LDLM Callback service. 26 | # TYPE lustre_ldlm_cbd_stats counter 27 | lustre_ldlm_cbd_stats{operation="ldlm_bl_callback",otel_scope_name="lustre"} 41083956 28 | lustre_ldlm_cbd_stats{operation="req_active",otel_scope_name="lustre"} 41083956 29 | lustre_ldlm_cbd_stats{operation="req_qdepth",otel_scope_name="lustre"} 41083956 30 | lustre_ldlm_cbd_stats{operation="req_timeout",otel_scope_name="lustre"} 41083956 31 | lustre_ldlm_cbd_stats{operation="req_waittime",otel_scope_name="lustre"} 41083956 32 | lustre_ldlm_cbd_stats{operation="reqbuf_avail",otel_scope_name="lustre"} 87745428 33 | # HELP lustre_lnet_mem_used Gives information about Lustre LNet memory usage. 34 | # TYPE lustre_lnet_mem_used gauge 35 | lustre_lnet_mem_used{otel_scope_name="lustre"} 22228358 36 | # HELP lustre_mem_used Gives information about Lustre memory usage. 37 | # TYPE lustre_mem_used gauge 38 | lustre_mem_used{otel_scope_name="lustre"} 35721854085 39 | # HELP lustre_mem_used_max Gives information about Lustre maximum memory usage. 
40 | # TYPE lustre_mem_used_max counter 41 | lustre_mem_used_max{otel_scope_name="lustre"} 41220347397 42 | # HELP target_info Target metadata 43 | # TYPE target_info gauge 44 | target_info{service_name="lustrefs-exporter",telemetry_sdk_language="rust",telemetry_sdk_name="opentelemetry",telemetry_sdk_version="0.29.0"} 1 45 | -------------------------------------------------------------------------------- /.github/workflows/quota-parsing.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | pull_request: 7 | types: [opened, reopened, edited, synchronize] 8 | 9 | jobs: 10 | benchmark_quota_parsing: 11 | name: Benchmark quota parsing 12 | permissions: 13 | checks: write 14 | pull-requests: write 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - uses: bencherdev/bencher@main 20 | 21 | - name: Install Rust 22 | uses: actions-rust-lang/setup-rust-toolchain@v1 23 | 24 | - name: Track quota parsing metrics benchmarks on main branch 25 | if: github.ref == 'refs/heads/main' && github.event_name == 'push' 26 | run: | 27 | bencher run \ 28 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 29 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 30 | --branch main \ 31 | --testbed ci-runner \ 32 | --threshold-measure Latency \ 33 | --threshold-test t_test \ 34 | --threshold-max-sample-size 64 \ 35 | --threshold-lower-boundary 0.95 \ 36 | --threshold-upper-boundary 0.95 \ 37 | --err \ 38 | --adapter rust_criterion \ 39 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 40 | "cargo bench --bench combine_performance" 41 | 42 | - name: Compare quota parsing metrics benchmarks with main branch 43 | if: github.event_name == 'pull_request' 44 | run: | 45 | bencher run \ 46 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 47 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 48 | --branch '${{ github.head_ref }}' \ 49 | --start-point main \ 50 | --testbed ci-runner \ 51 | 
--start-point-clone-thresholds \ 52 | --err \ 53 | --adapter rust_criterion \ 54 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 55 | "cargo bench --bench combine_performance" 56 | 57 | - name: Run quota parsing memory usage benchmark 58 | run: | 59 | # Run the benchmark first (suppress Criterion output) 60 | cargo bench --bench combine_memory 2>&1 > /dev/null 61 | 62 | - name: Track quota parsing memory usage benchmark on main branch 63 | if: github.ref == 'refs/heads/main' && github.event_name == 'push' 64 | run: | 65 | bencher run \ 66 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 67 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 68 | --branch main \ 69 | --testbed ci-runner \ 70 | --threshold-measure peak_rss_mib \ 71 | --threshold-test t_test \ 72 | --threshold-max-sample-size 64 \ 73 | --threshold-lower-boundary 0.95 \ 74 | --threshold-upper-boundary 0.95 \ 75 | --err \ 76 | --adapter json \ 77 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 78 | --file lustre-collector/combine_mem_usage.json 79 | 80 | - name: Compare quota parsing memory metrics with main branch 81 | if: github.event_name == 'pull_request' 82 | run: | 83 | bencher run \ 84 | --project '${{ secrets.BENCHER_PROJECT_ID }}' \ 85 | --token '${{ secrets.BENCHER_API_TOKEN }}' \ 86 | --branch '${{ github.head_ref }}' \ 87 | --start-point main \ 88 | --testbed ci-runner \ 89 | --start-point-clone-thresholds \ 90 | --err \ 91 | --adapter json \ 92 | --github-actions '${{ secrets.GITHUB_TOKEN }}' \ 93 | --file lustre-collector/combine_mem_usage.json 94 | -------------------------------------------------------------------------------- /lustrefs-exporter/testcmds/cmds_test_net_show_output_with_mock.json: -------------------------------------------------------------------------------- 1 | { 2 | "commands": { 3 | "lnetctl:net show -v 4": [ 4 | { 5 | "binary_name": "lnetctl", 6 | "args": [ 7 | "net", 8 | "show", 9 | "-v", 10 | "4" 11 | ], 12 | "stdout": "net:\n - net type: lo\n local NI(s):\n - 
nid: 0@lo\n status: up\n statistics:\n send_count: 180076\n recv_count: 180072\n drop_count: 4\n sent_stats:\n put: 180076\n get: 0\n reply: 0\n ack: 0\n hello: 0\n received_stats:\n put: 180063\n get: 0\n reply: 0\n ack: 9\n hello: 0\n dropped_stats:\n put: 4\n get: 0\n reply: 0\n ack: 0\n hello: 0\n health stats:\n fatal_error: 0\n health value: 1000\n interrupts: 0\n dropped: 0\n aborted: 0\n no route: 0\n timeouts: 0\n error: 0\n tunables:\n peer_timeout: 0\n peer_credits: 0\n peer_buffer_credits: 0\n credits: 0\n lnd tunables:\n dev cpt: 0\n CPT: \"[0,1,2,3,4]\"\n - net type: tcp\n local NI(s):\n - nid: 172.16.0.24@tcp\n status: up\n interfaces:\n 0: mlxen0\n statistics:\n send_count: 464970\n recv_count: 464963\n drop_count: 4\n sent_stats:\n put: 340418\n get: 124552\n reply: 0\n ack: 0\n hello: 0\n received_stats:\n put: 340401\n get: 63529\n reply: 61023\n ack: 10\n hello: 0\n dropped_stats:\n put: 4\n get: 0\n reply: 0\n ack: 0\n hello: 0\n health stats:\n fatal_error: 0\n health value: 1000\n interrupts: 0\n dropped: 0\n aborted: 0\n no route: 0\n timeouts: 0\n error: 0\n tunables:\n peer_timeout: 180\n peer_credits: 8\n peer_buffer_credits: 0\n credits: 256\n lnd tunables:\n conns_per_peer: 3\n dev cpt: 0\n CPT: \"[0,1,2,3,4]\"\n - nid: 172.16.0.25@tcp\n status: up\n interfaces:\n 0: mlxen1\n statistics:\n send_count: 464886\n recv_count: 464877\n drop_count: 3\n sent_stats:\n put: 340879\n get: 124007\n reply: 0\n ack: 0\n hello: 0\n received_stats:\n put: 340858\n get: 62013\n reply: 61994\n ack: 12\n hello: 0\n dropped_stats:\n put: 3\n get: 0\n reply: 0\n ack: 0\n hello: 0\n health stats:\n fatal_error: 0\n health value: 1000\n interrupts: 0\n dropped: 0\n aborted: 0\n no route: 0\n timeouts: 0\n error: 0\n tunables:\n peer_timeout: 180\n peer_credits: 8\n peer_buffer_credits: 0\n credits: 256\n lnd tunables:\n conns_per_peer: 3\n dev cpt: 0\n CPT: \"[0,1,2,3,4]\"\n", 13 | "stderr": "", 14 | "exit_code": 0 15 | } 16 | ] 17 | } 18 | } 
-------------------------------------------------------------------------------- /lustre-collector/src/oss/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | pub(crate) mod obdfilter_parser; 6 | pub(crate) mod oss_parser; 7 | 8 | use crate::types::Record; 9 | use combine::{Parser, Stream, attempt, error::ParseError}; 10 | 11 | pub(crate) fn params() -> Vec { 12 | obdfilter_parser::obd_params() 13 | .into_iter() 14 | .chain(oss_parser::params()) 15 | .collect() 16 | } 17 | 18 | pub(crate) fn parse() -> impl Parser 19 | where 20 | I: Stream, 21 | I::Error: ParseError, 22 | { 23 | attempt(obdfilter_parser::parse()).or(attempt(oss_parser::parse())) 24 | } 25 | 26 | #[cfg(test)] 27 | mod tests { 28 | use super::*; 29 | use combine::many; 30 | use insta::assert_debug_snapshot; 31 | 32 | #[test] 33 | fn test_parse() { 34 | let x = r#"obdfilter.fs-OST0000.stats= 35 | snapshot_time 1535148988.363769785 secs.nsecs 36 | write_bytes 9 samples [bytes] 98303 4194304 33554431 37 | create 4 samples [reqs] 38 | statfs 42297 samples [reqs] 39 | get_info 2 samples [reqs] 40 | connect 6 samples [reqs] 41 | reconnect 1 samples [reqs] 42 | disconnect 4 samples [reqs] 43 | statfs 46806 samples [reqs] 44 | preprw 9 samples [reqs] 45 | commitrw 9 samples [reqs] 46 | ping 8229 samples [reqs] 47 | obdfilter.fs-OST0000.num_exports=2 48 | obdfilter.fs-OST0000.tot_dirty=0 49 | obdfilter.fs-OST0000.tot_granted=8666816 50 | obdfilter.fs-OST0000.tot_pending=0 51 | ost.OSS.ost.stats= 52 | snapshot_time 1688128253.497763049 secs.nsecs 53 | req_waittime 18419628 samples [usec] 2 40983 305482965 25043535105 54 | req_qdepth 18419628 samples [reqs] 0 34 99937 130635 55 | req_active 18419628 samples [reqs] 1 36 69585063 634492353 56 | req_timeout 18419628 samples [sec] 1 15 276294334 4144414654 57 
| reqbuf_avail 38185151 samples [bufs] 60 64 2438170078 155685175822 58 | ldlm_glimpse_enqueue 9257180 samples [reqs] 1 1 9257180 9257180 59 | ldlm_extent_enqueue 19856 samples [reqs] 1 1 19856 19856 60 | ost_create 144904 samples [usec] 6 16594 98795730 85661707326 61 | ost_destroy 8988941 samples [usec] 89 173579 5160119682 8184502010174 62 | ost_get_info 8 samples [usec] 540 3603 10971 28145019 63 | ost_connect 341 samples [usec] 21 903 24182 2818080 64 | ost_disconnect 331 samples [usec] 23 524 39358 7068516 65 | ost_sync 4510 samples [usec] 3 10945 997271 2117171965 66 | ost_set_info 28 samples [usec] 9 34 606 14594 67 | obd_ping 3529 samples [usec] 3 12431 60722 155336592 68 | ost.OSS.ost_io.stats= 69 | snapshot_time 1688128269.170769339 secs.nsecs 70 | req_waittime 3398592545 samples [usec] 2 585517 95316362073 32500246129015 71 | req_qdepth 3398592545 samples [reqs] 0 53 90676247 112259319 72 | req_active 3398592545 samples [reqs] 1 82 55496806665 1427593461517 73 | req_timeout 3398592545 samples [sec] 15 15 50978888175 764683322625 74 | reqbuf_avail 7234158916 samples [bufs] 55 64 461878663298 29490702443182 75 | ost_read 2447557926 samples [usec] 23 138321 1871024223288 4497819893384848 76 | ost_write 951033049 samples [usec] 59 1247713 2749050524782 100048363896296658 77 | ost_punch 1515 samples [usec] 16 4883 63967 29511205 78 | "#; 79 | 80 | let result: (Vec<_>, _) = many(parse()).parse(x).unwrap(); 81 | 82 | assert_debug_snapshot!(result) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /lustre-collector/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | mod base_parsers; 6 | pub(crate) mod brw_stats_parser; 7 | pub mod error; 8 | pub(crate) mod exports_parser; 9 | pub(crate) mod ldlm; 10 | pub(crate) mod llite; 11 | mod lnetctl_parser; 12 | mod mdd_parser; 13 | mod mds; 14 | pub mod mgs; 15 | mod node_stats_parsers; 16 | mod nodemap; 17 | mod osd_parser; 18 | mod oss; 19 | pub mod parser; 20 | pub mod quota; 21 | pub mod recovery_status_parser; 22 | mod stats_parser; 23 | mod time; 24 | mod top_level_parser; 25 | pub mod types; 26 | 27 | pub use crate::error::LustreCollectorError; 28 | use combine::parser::EasyParser; 29 | pub use lnetctl_parser::{parse as parse_lnetctl_output, parse_lnetctl_stats}; 30 | pub use node_stats_parsers::{parse_cpustats_output, parse_meminfo_output}; 31 | use std::{io, str}; 32 | pub use types::*; 33 | 34 | fn check_output(records: Vec, state: &str) -> Result, LustreCollectorError> { 35 | let params = crate::parser::params().join(" "); 36 | 37 | if !state.is_empty() { 38 | return Err(io::Error::new( 39 | io::ErrorKind::InvalidInput, 40 | format!("Content left in input buffer. 
Please run and supply to support: `lctl get_param {params}`"), 41 | ) 42 | .into()); 43 | } 44 | 45 | Ok(records) 46 | } 47 | 48 | /// Must be called with output of `lctl get_param` for all params returned from `parser::params()` 49 | pub fn parse_lctl_output(lctl_output: &[u8]) -> Result, LustreCollectorError> { 50 | let lctl_stats = str::from_utf8(lctl_output)?; 51 | 52 | let (lctl_record, state) = parser::parse() 53 | .easy_parse(lctl_stats) 54 | .map_err(|err| err.map_position(|p| p.translate_position(lctl_stats)))?; 55 | 56 | check_output(lctl_record, state) 57 | } 58 | 59 | pub fn parse_mgs_fs_output(mgs_fs_output: &[u8]) -> Result, LustreCollectorError> { 60 | let mgs_fs = str::from_utf8(mgs_fs_output)?; 61 | 62 | let (mgs_fs_record, state) = mgs::mgs_fs_parser::parse() 63 | .easy_parse(mgs_fs) 64 | .map_err(|err| err.map_position(|p| p.translate_position(mgs_fs)))?; 65 | 66 | check_output(mgs_fs_record, state) 67 | } 68 | 69 | pub fn parse_recovery_status_output( 70 | recovery_status_output: &[u8], 71 | ) -> Result, LustreCollectorError> { 72 | let recovery_status = str::from_utf8(recovery_status_output)?; 73 | let recovery_status = recovery_status.trim(); 74 | 75 | let (recovery_statuses, state) = recovery_status_parser::parse() 76 | .easy_parse(recovery_status) 77 | .map_err(|err| err.map_position(|p| p.translate_position(recovery_status)))?; 78 | 79 | check_output(recovery_statuses, state) 80 | } 81 | 82 | #[cfg(test)] 83 | mod tests { 84 | use super::{Record, parse_lctl_output}; 85 | 86 | #[test] 87 | fn ex8761_job_stats() { 88 | let xs = include_bytes!("./fixtures/valid/ex8761-lctl.txt"); 89 | let expected = parse_lctl_output(xs).unwrap(); 90 | 91 | let y = serde_json::to_string(&expected).unwrap(); 92 | let z: Vec = serde_json::from_str(&y).unwrap(); 93 | 94 | assert_eq!(expected, z); 95 | } 96 | 97 | #[test] 98 | fn es_6_2_0_job_stats_unhealthy() { 99 | let xs = include_bytes!("./fixtures/valid/params-6.2.0-r9.txt"); 100 | let expected = 
parse_lctl_output(xs).unwrap(); 101 | 102 | let y = serde_json::to_string(&expected).unwrap(); 103 | let z: Vec = serde_json::from_str(&y).unwrap(); 104 | 105 | assert_eq!(expected, z); 106 | } 107 | 108 | #[test] 109 | fn params() { 110 | let xs = super::parser::params(); 111 | 112 | insta::assert_snapshot!(xs.join(" ")); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: rust 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - main 7 | schedule: 8 | - cron: "00 01 * * *" 9 | 10 | jobs: 11 | check: 12 | name: Check 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions-rust-lang/setup-rust-toolchain@v1 17 | - run: | 18 | cargo check --locked 19 | 20 | test: 21 | name: Test Suite 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | lfs: true 27 | - uses: actions-rust-lang/setup-rust-toolchain@v1 28 | - name: Install nextest 29 | uses: taiki-e/install-action@nextest 30 | - run: | 31 | cargo nextest run --release 32 | stale-snaphots: 33 | name: Stale Snapshots 34 | runs-on: ubuntu-latest 35 | steps: 36 | - uses: actions/checkout@v4 37 | with: 38 | lfs: true 39 | - uses: actions-rust-lang/setup-rust-toolchain@v1 40 | - name: Install cargo-insta 41 | run: cargo install cargo-insta 42 | - run: | 43 | cargo insta test --release --unreferenced=auto 44 | fmt: 45 | name: Rustfmt 46 | runs-on: ubuntu-latest 47 | steps: 48 | - uses: actions/checkout@v4 49 | - uses: actions-rust-lang/setup-rust-toolchain@v1 50 | - run: | 51 | cargo fmt --all -- --check 52 | 53 | clippy: 54 | name: Clippy 55 | runs-on: ubuntu-latest 56 | steps: 57 | - uses: actions/checkout@v4 58 | - uses: actions-rust-lang/setup-rust-toolchain@v1 59 | - run: | 60 | cargo ci_clippy 61 | 62 | coverage: 63 | name: coverage 64 | runs-on: ubuntu-latest 65 | steps: 
66 | - name: Checkout sources 67 | uses: actions/checkout@v4 68 | with: 69 | lfs: true 70 | 71 | - name: Install Rust 72 | uses: actions-rust-lang/setup-rust-toolchain@v1 73 | 74 | - name: Install `cargo-llvm-cov` 75 | uses: taiki-e/install-action@cargo-llvm-cov 76 | 77 | - name: Measure code coverage for all tests 78 | run: | 79 | # Fail if any tests fail 80 | set -e 81 | set -o pipefail 82 | cargo llvm-cov --all-features --workspace --codecov --output-path codecov.json 83 | 84 | - name: Upload coverage to Codecov 85 | uses: codecov/codecov-action@v3 86 | with: 87 | token: ${{ secrets.CODECOV_TOKEN }} 88 | files: codecov.json 89 | flags: all-tests 90 | fail_ci_if_error: true 91 | 92 | lustre-coverage: 93 | name: lustre-coverage 94 | runs-on: ubuntu-latest 95 | strategy: 96 | matrix: 97 | lustre_version: ["2_14_0_ddn133", "2_14_0_ddn145"] 98 | steps: 99 | - name: Checkout sources 100 | uses: actions/checkout@v4 101 | with: 102 | lfs: true 103 | 104 | - name: Install Rust 105 | uses: actions-rust-lang/setup-rust-toolchain@v1 106 | 107 | - name: Install `cargo-llvm-cov` 108 | uses: taiki-e/install-action@cargo-llvm-cov 109 | 110 | - name: Measure code coverage for specific lustre version ${{ matrix.lustre_version }} 111 | run: | 112 | # Fail if any tests fail 113 | set -e 114 | set -o pipefail 115 | cargo llvm-cov --codecov --output-path codecov_${{ matrix.lustre_version }}.json --package lustre_collector --lib -- parser::tests::test_lustre_${{ matrix.lustre_version }}_fixtures 116 | 117 | - name: Upload coverage for specific lustre version ${{ matrix.lustre_version }} 118 | uses: codecov/codecov-action@v3 119 | with: 120 | token: ${{ secrets.CODECOV_TOKEN }} 121 | files: codecov_${{ matrix.lustre_version }}.json 122 | flags: ${{ matrix.lustre_version }} 123 | fail_ci_if_error: true 124 | -------------------------------------------------------------------------------- /lustrefs-exporter/fixtures/lnetctl_net_show.txt: 
-------------------------------------------------------------------------------- 1 | net: 2 | - net type: lo 3 | local NI(s): 4 | - nid: 0@lo 5 | status: up 6 | statistics: 7 | send_count: 180076 8 | recv_count: 180072 9 | drop_count: 4 10 | sent_stats: 11 | put: 180076 12 | get: 0 13 | reply: 0 14 | ack: 0 15 | hello: 0 16 | received_stats: 17 | put: 180063 18 | get: 0 19 | reply: 0 20 | ack: 9 21 | hello: 0 22 | dropped_stats: 23 | put: 4 24 | get: 0 25 | reply: 0 26 | ack: 0 27 | hello: 0 28 | health stats: 29 | fatal_error: 0 30 | health value: 1000 31 | interrupts: 0 32 | dropped: 0 33 | aborted: 0 34 | no route: 0 35 | timeouts: 0 36 | error: 0 37 | tunables: 38 | peer_timeout: 0 39 | peer_credits: 0 40 | peer_buffer_credits: 0 41 | credits: 0 42 | lnd tunables: 43 | dev cpt: 0 44 | CPT: "[0,1,2,3,4]" 45 | - net type: tcp 46 | local NI(s): 47 | - nid: 172.16.0.24@tcp 48 | status: up 49 | interfaces: 50 | 0: mlxen0 51 | statistics: 52 | send_count: 464970 53 | recv_count: 464963 54 | drop_count: 4 55 | sent_stats: 56 | put: 340418 57 | get: 124552 58 | reply: 0 59 | ack: 0 60 | hello: 0 61 | received_stats: 62 | put: 340401 63 | get: 63529 64 | reply: 61023 65 | ack: 10 66 | hello: 0 67 | dropped_stats: 68 | put: 4 69 | get: 0 70 | reply: 0 71 | ack: 0 72 | hello: 0 73 | health stats: 74 | fatal_error: 0 75 | health value: 1000 76 | interrupts: 0 77 | dropped: 0 78 | aborted: 0 79 | no route: 0 80 | timeouts: 0 81 | error: 0 82 | tunables: 83 | peer_timeout: 180 84 | peer_credits: 8 85 | peer_buffer_credits: 0 86 | credits: 256 87 | lnd tunables: 88 | conns_per_peer: 3 89 | dev cpt: 0 90 | CPT: "[0,1,2,3,4]" 91 | - nid: 172.16.0.25@tcp 92 | status: up 93 | interfaces: 94 | 0: mlxen1 95 | statistics: 96 | send_count: 464886 97 | recv_count: 464877 98 | drop_count: 3 99 | sent_stats: 100 | put: 340879 101 | get: 124007 102 | reply: 0 103 | ack: 0 104 | hello: 0 105 | received_stats: 106 | put: 340858 107 | get: 62013 108 | reply: 61994 109 | ack: 12 110 | hello: 
0 111 | dropped_stats: 112 | put: 3 113 | get: 0 114 | reply: 0 115 | ack: 0 116 | hello: 0 117 | health stats: 118 | fatal_error: 0 119 | health value: 1000 120 | interrupts: 0 121 | dropped: 0 122 | aborted: 0 123 | no route: 0 124 | timeouts: 0 125 | error: 0 126 | tunables: 127 | peer_timeout: 180 128 | peer_credits: 8 129 | peer_buffer_credits: 0 130 | credits: 256 131 | lnd tunables: 132 | conns_per_peer: 3 133 | dev cpt: 0 134 | CPT: "[0,1,2,3,4]" 135 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/quota.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use crate::{Family, LabelProm}; 6 | use lustre_collector::{QuotaStats, QuotaStatsOsd, TargetQuotaStat, TargetStat}; 7 | use prometheus_client::{metrics::gauge::Gauge, registry::Registry}; 8 | use std::{ops::Deref, sync::atomic::AtomicU64}; 9 | 10 | #[derive(Debug, Default)] 11 | pub struct QuotaMetrics { 12 | quota_hard: Family>, 13 | quota_soft: Family>, 14 | quota_granted: Family>, 15 | quota_used_kbytes: Family>, 16 | quota_used_inodes: Family>, 17 | } 18 | 19 | impl QuotaMetrics { 20 | pub fn register_metric(&self, registry: &mut Registry) { 21 | registry.register( 22 | "lustre_quota_hard", 23 | "The hard quota for a given component", 24 | self.quota_hard.clone(), 25 | ); 26 | 27 | registry.register( 28 | "lustre_quota_soft", 29 | "The soft quota for a given component", 30 | self.quota_soft.clone(), 31 | ); 32 | 33 | registry.register( 34 | "lustre_quota_granted", 35 | "The granted quota for a given component", 36 | self.quota_granted.clone(), 37 | ); 38 | 39 | registry.register( 40 | "lustre_quota_used_kbytes", 41 | "The amount of kbytes used by quota", 42 | self.quota_used_kbytes.clone(), 43 | ); 44 | 45 | registry.register( 46 | 
"lustre_quota_used_inodes", 47 | "The amount of inodes used by quota", 48 | self.quota_used_inodes.clone(), 49 | ); 50 | } 51 | } 52 | 53 | pub fn build_quota_stats(x: &TargetQuotaStat, quota: &mut QuotaMetrics) { 54 | let TargetQuotaStat { 55 | target, 56 | value, 57 | pool, 58 | manager, 59 | param, 60 | .. 61 | } = x; 62 | 63 | for s in &value.stats { 64 | let pool = pool.deref().to_string(); 65 | let pool = if pool == "0x0" { String::new() } else { pool }; 66 | let accounting = match param.deref() { 67 | "usr" => "user".to_string(), 68 | "grp" => "group".to_string(), 69 | "prj" => "project".to_string(), 70 | _ => param.to_string(), 71 | }; 72 | 73 | let label = vec![ 74 | ("accounting", accounting.clone()), 75 | ("id", s.id.to_string()), 76 | ("manager", manager.to_string()), 77 | ("pool", pool.clone()), 78 | ("target", target.to_string()), 79 | ]; 80 | 81 | quota.quota_hard.get_or_create(&label).set(s.limits.hard); 82 | 83 | quota.quota_soft.get_or_create(&label).set(s.limits.soft); 84 | 85 | quota 86 | .quota_granted 87 | .get_or_create(&label) 88 | .set(s.limits.granted); 89 | } 90 | } 91 | 92 | pub fn build_ost_quota_stats(x: &TargetStat, quota: &mut QuotaMetrics) { 93 | let TargetStat { 94 | kind, 95 | target, 96 | value, 97 | .. 
98 | } = x; 99 | 100 | for s in &value.stats { 101 | let accounting = match value.kind { 102 | lustre_collector::QuotaKind::Usr => "user", 103 | lustre_collector::QuotaKind::Grp => "group", 104 | lustre_collector::QuotaKind::Prj => "project", 105 | }; 106 | 107 | let label = vec![ 108 | ("accounting", accounting.to_string()), 109 | ("component", kind.to_prom_label().to_string()), 110 | ("id", s.id.to_string()), 111 | ("target", target.to_string()), 112 | ]; 113 | 114 | quota 115 | .quota_used_inodes 116 | .get_or_create(&label) 117 | .set(s.usage.inodes); 118 | 119 | quota 120 | .quota_used_kbytes 121 | .get_or_create(&label) 122 | .set(s.usage.kbytes); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /lustrefs-exporter/src/snapshots/lustrefs_exporter__routes__tests__net_show_output_with_mock.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustrefs-exporter/src/routes.rs 3 | expression: "String::from_utf8(output.stdout)?" 
4 | --- 5 | net: 6 | - net type: lo 7 | local NI(s): 8 | - nid: 0@lo 9 | status: up 10 | statistics: 11 | send_count: 180076 12 | recv_count: 180072 13 | drop_count: 4 14 | sent_stats: 15 | put: 180076 16 | get: 0 17 | reply: 0 18 | ack: 0 19 | hello: 0 20 | received_stats: 21 | put: 180063 22 | get: 0 23 | reply: 0 24 | ack: 9 25 | hello: 0 26 | dropped_stats: 27 | put: 4 28 | get: 0 29 | reply: 0 30 | ack: 0 31 | hello: 0 32 | health stats: 33 | fatal_error: 0 34 | health value: 1000 35 | interrupts: 0 36 | dropped: 0 37 | aborted: 0 38 | no route: 0 39 | timeouts: 0 40 | error: 0 41 | tunables: 42 | peer_timeout: 0 43 | peer_credits: 0 44 | peer_buffer_credits: 0 45 | credits: 0 46 | lnd tunables: 47 | dev cpt: 0 48 | CPT: "[0,1,2,3,4]" 49 | - net type: tcp 50 | local NI(s): 51 | - nid: 172.16.0.24@tcp 52 | status: up 53 | interfaces: 54 | 0: mlxen0 55 | statistics: 56 | send_count: 464970 57 | recv_count: 464963 58 | drop_count: 4 59 | sent_stats: 60 | put: 340418 61 | get: 124552 62 | reply: 0 63 | ack: 0 64 | hello: 0 65 | received_stats: 66 | put: 340401 67 | get: 63529 68 | reply: 61023 69 | ack: 10 70 | hello: 0 71 | dropped_stats: 72 | put: 4 73 | get: 0 74 | reply: 0 75 | ack: 0 76 | hello: 0 77 | health stats: 78 | fatal_error: 0 79 | health value: 1000 80 | interrupts: 0 81 | dropped: 0 82 | aborted: 0 83 | no route: 0 84 | timeouts: 0 85 | error: 0 86 | tunables: 87 | peer_timeout: 180 88 | peer_credits: 8 89 | peer_buffer_credits: 0 90 | credits: 256 91 | lnd tunables: 92 | conns_per_peer: 3 93 | dev cpt: 0 94 | CPT: "[0,1,2,3,4]" 95 | - nid: 172.16.0.25@tcp 96 | status: up 97 | interfaces: 98 | 0: mlxen1 99 | statistics: 100 | send_count: 464886 101 | recv_count: 464877 102 | drop_count: 3 103 | sent_stats: 104 | put: 340879 105 | get: 124007 106 | reply: 0 107 | ack: 0 108 | hello: 0 109 | received_stats: 110 | put: 340858 111 | get: 62013 112 | reply: 61994 113 | ack: 12 114 | hello: 0 115 | dropped_stats: 116 | put: 3 117 | get: 0 118 | reply: 0 
119 | ack: 0 120 | hello: 0 121 | health stats: 122 | fatal_error: 0 123 | health value: 1000 124 | interrupts: 0 125 | dropped: 0 126 | aborted: 0 127 | no route: 0 128 | timeouts: 0 129 | error: 0 130 | tunables: 131 | peer_timeout: 180 132 | peer_credits: 8 133 | peer_buffer_credits: 0 134 | credits: 256 135 | lnd tunables: 136 | conns_per_peer: 3 137 | dev cpt: 0 138 | CPT: "[0,1,2,3,4]" 139 | -------------------------------------------------------------------------------- /lustre-collector/src/base_parsers.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use combine::{ 6 | Parser, attempt, 7 | error::{Format, ParseError}, 8 | many1, one_of, 9 | parser::{ 10 | char::{alpha_num, digit, newline, string}, 11 | repeat::take_until, 12 | }, 13 | stream::Stream, 14 | token, unexpected, unexpected_any, value, 15 | }; 16 | 17 | use crate::types::{Param, Target}; 18 | 19 | pub(crate) fn period() -> impl Parser 20 | where 21 | I: Stream, 22 | I::Error: ParseError, 23 | { 24 | token('.') 25 | } 26 | 27 | pub(crate) fn equals() -> impl Parser 28 | where 29 | I: Stream, 30 | I::Error: ParseError, 31 | { 32 | token('=') 33 | } 34 | 35 | pub(crate) fn word() -> impl Parser 36 | where 37 | I: Stream, 38 | I::Error: ParseError, 39 | { 40 | many1(alpha_num().or(token('_'))) 41 | } 42 | 43 | /// Parses a target name 44 | pub(crate) fn target() -> impl Parser 45 | where 46 | I: Stream, 47 | I::Error: ParseError, 48 | { 49 | many1(alpha_num().or(one_of("_-".chars()))).map(Target) 50 | } 51 | 52 | /// Takes many consecutive digits and 53 | /// returns them as u64 54 | pub(crate) fn digits() -> impl Parser 55 | where 56 | I: Stream, 57 | I::Error: ParseError, 58 | { 59 | many1(digit()).then(|x: String| match x.parse::() { 60 | Ok(n) => value(n).left(), 61 | Err(e) => 
unexpected_any(Format(e)).right(), 62 | }) 63 | } 64 | 65 | pub(crate) fn till_newline() -> impl Parser 66 | where 67 | I: Stream, 68 | I::Error: ParseError, 69 | { 70 | take_until(newline()) 71 | } 72 | 73 | pub(crate) fn till_period() -> impl Parser 74 | where 75 | I: Stream, 76 | I::Error: ParseError, 77 | { 78 | take_until(period()) 79 | } 80 | 81 | pub(crate) fn string_to(x: &'static str, y: &'static str) -> impl Parser 82 | where 83 | I: Stream, 84 | I::Error: ParseError, 85 | { 86 | string(x).map(move |_| String::from(y)) 87 | } 88 | 89 | pub(crate) fn not_words(xs: &'static [&'static str]) -> impl Parser 90 | where 91 | I: Stream, 92 | I::Error: ParseError, 93 | { 94 | attempt(word().then(move |y| { 95 | for &x in xs { 96 | if x == y { 97 | return unexpected(x).map(|_| "".to_string()).right(); 98 | } 99 | } 100 | 101 | value(y).left() 102 | })) 103 | } 104 | 105 | pub(crate) fn param(x: &'static str) -> impl Parser 106 | where 107 | I: Stream, 108 | I::Error: ParseError, 109 | { 110 | attempt(string(x).skip(equals())) 111 | .map(|x| Param(x.to_string())) 112 | .message("while getting param") 113 | } 114 | 115 | pub(crate) fn param_period(x: &'static str) -> impl Parser 116 | where 117 | I: Stream, 118 | I::Error: ParseError, 119 | { 120 | attempt(string(x).skip(token('.'))) 121 | .map(|x| Param(x.to_string())) 122 | .message("while getting param") 123 | } 124 | 125 | #[cfg(test)] 126 | mod tests { 127 | use super::*; 128 | use crate::types::Param; 129 | 130 | #[test] 131 | fn test_param() { 132 | let result = param("memused").parse("memused=77991501\n"); 133 | 134 | assert_eq!(result, Ok((Param("memused".to_string()), "77991501\n"))) 135 | } 136 | 137 | #[test] 138 | fn test_param_period() { 139 | let result = param_period("exports").parse("exports.1.2.3.130@o2ib.stats=Y\n"); 140 | 141 | assert_eq!( 142 | result, 143 | Ok((Param("exports".to_string()), "1.2.3.130@o2ib.stats=Y\n")) 144 | ) 145 | } 146 | } 147 | 
-------------------------------------------------------------------------------- /lustrefs-exporter/benches/jobstats.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use const_format::{formatcp, str_repeat}; 6 | use criterion::{Criterion, criterion_group, criterion_main}; 7 | use lustrefs_exporter::jobstats::JobstatMetrics; 8 | use prometheus_client::{encoding::text::encode, registry::Registry}; 9 | use std::{hint, io::BufReader}; 10 | 11 | const JOBSTAT_JOB: &str = r#" 12 | - job_id: "FAKE_JOB" 13 | snapshot_time: 1720516680 14 | read_bytes: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0, sumsq: 0 } 15 | write_bytes: { samples: 52, unit: bytes, min: 4096, max: 475136, sum: 5468160, sumsq: 1071040692224 } 16 | read: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 17 | write: { samples: 52, unit: usecs, min: 12, max: 40081, sum: 692342, sumsq: 17432258604 } 18 | getattr: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 19 | setattr: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 20 | punch: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 21 | sync: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 22 | destroy: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 23 | create: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 24 | statfs: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 25 | get_info: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 26 | set_info: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 27 | quotactl: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 } 28 | prealloc: { samples: 0, unit: usecs, min: 0, max: 0, sum: 0, sumsq: 0 }"#; 29 | 30 | #[allow(long_running_const_eval)] 31 | const 
INPUT_100_JOBS: &str = formatcp!( 32 | r#"obdfilter.ds002-OST0000.job_stats= 33 | job_stats:{}"#, 34 | str_repeat!(JOBSTAT_JOB, 100) 35 | ); 36 | 37 | #[allow(long_running_const_eval)] 38 | const INPUT_1000_JOBS: &str = formatcp!( 39 | r#"obdfilter.ds002-OST0000.job_stats= 40 | job_stats:{}"#, 41 | str_repeat!(JOBSTAT_JOB, 1000) 42 | ); 43 | 44 | async fn parse_synthetic_yaml(input: &'static str) -> String { 45 | // Setup jobstats metrics 46 | let registry = Registry::default(); 47 | let jobstats_metrics = JobstatMetrics::default(); 48 | 49 | let f = BufReader::with_capacity(128 * 1_024, input.as_bytes()); 50 | 51 | lustrefs_exporter::jobstats::jobstats_stream(f, jobstats_metrics) 52 | .await 53 | .expect("Failed to parse jobstats"); 54 | 55 | let mut buffer = String::new(); 56 | 57 | encode(&mut buffer, &registry).expect("Failed to encode metrics"); 58 | 59 | buffer 60 | } 61 | 62 | fn criterion_benchmark_fast(c: &mut Criterion) { 63 | c.bench_function("jobstats 100", |b| { 64 | b.to_async( 65 | tokio::runtime::Builder::new_multi_thread() 66 | .build() 67 | .expect("Failed to build tokio runtime"), 68 | ) 69 | .iter(|| hint::black_box(parse_synthetic_yaml(INPUT_100_JOBS))) 70 | }); 71 | 72 | c.bench_function("jobstats 1000", |b| { 73 | b.to_async( 74 | tokio::runtime::Builder::new_multi_thread() 75 | .build() 76 | .expect("Failed to build tokio runtime"), 77 | ) 78 | .iter(|| hint::black_box(parse_synthetic_yaml(INPUT_1000_JOBS))) 79 | }); 80 | } 81 | criterion_group! 
{ 82 | name = benches; 83 | config = Criterion::default(); 84 | targets = criterion_benchmark_fast 85 | } 86 | criterion_main!(benches); 87 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__recovery_status_parser__tests__multiple_recovering.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: src/recovery_status_parser.rs 3 | expression: records 4 | --- 5 | [ 6 | Target( 7 | RecoveryStatus( 8 | TargetStat { 9 | kind: Ost, 10 | param: Param( 11 | "recovery_status", 12 | ), 13 | target: Target( 14 | "fs-OST0000", 15 | ), 16 | value: Complete, 17 | }, 18 | ), 19 | ), 20 | Target( 21 | RecoveryCompletedClients( 22 | TargetStat { 23 | kind: Ost, 24 | param: Param( 25 | "recovery_status", 26 | ), 27 | target: Target( 28 | "fs-OST0000", 29 | ), 30 | value: 4, 31 | }, 32 | ), 33 | ), 34 | Target( 35 | RecoveryStatus( 36 | TargetStat { 37 | kind: Ost, 38 | param: Param( 39 | "recovery_status", 40 | ), 41 | target: Target( 42 | "fs-OST0001", 43 | ), 44 | value: Complete, 45 | }, 46 | ), 47 | ), 48 | Target( 49 | RecoveryCompletedClients( 50 | TargetStat { 51 | kind: Ost, 52 | param: Param( 53 | "recovery_status", 54 | ), 55 | target: Target( 56 | "fs-OST0001", 57 | ), 58 | value: 4, 59 | }, 60 | ), 61 | ), 62 | Target( 63 | RecoveryStatus( 64 | TargetStat { 65 | kind: Ost, 66 | param: Param( 67 | "recovery_status", 68 | ), 69 | target: Target( 70 | "fs-OST0004", 71 | ), 72 | value: Complete, 73 | }, 74 | ), 75 | ), 76 | Target( 77 | RecoveryCompletedClients( 78 | TargetStat { 79 | kind: Ost, 80 | param: Param( 81 | "recovery_status", 82 | ), 83 | target: Target( 84 | "fs-OST0004", 85 | ), 86 | value: 8, 87 | }, 88 | ), 89 | ), 90 | Target( 91 | RecoveryStatus( 92 | TargetStat { 93 | kind: Mdt, 94 | param: Param( 95 | "recovery_status", 96 | ), 97 | target: Target( 98 | "fs-MDT0000", 99 | ), 100 | value: Complete, 101 | }, 102 | ), 103 | ), 
104 | Target( 105 | RecoveryCompletedClients( 106 | TargetStat { 107 | kind: Mdt, 108 | param: Param( 109 | "recovery_status", 110 | ), 111 | target: Target( 112 | "fs-MDT0000", 113 | ), 114 | value: 3, 115 | }, 116 | ), 117 | ), 118 | Target( 119 | RecoveryStatus( 120 | TargetStat { 121 | kind: Mdt, 122 | param: Param( 123 | "recovery_status", 124 | ), 125 | target: Target( 126 | "fs-MDT0002", 127 | ), 128 | value: Recovering, 129 | }, 130 | ), 131 | ), 132 | Target( 133 | RecoveryConnectedClients( 134 | TargetStat { 135 | kind: Mdt, 136 | param: Param( 137 | "recovery_status", 138 | ), 139 | target: Target( 140 | "fs-MDT0002", 141 | ), 142 | value: 3, 143 | }, 144 | ), 145 | ), 146 | Target( 147 | RecoveryCompletedClients( 148 | TargetStat { 149 | kind: Mdt, 150 | param: Param( 151 | "recovery_status", 152 | ), 153 | target: Target( 154 | "fs-MDT0002", 155 | ), 156 | value: 3, 157 | }, 158 | ), 159 | ), 160 | ] 161 | -------------------------------------------------------------------------------- /lustre-collector/src/mdd_parser.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{ 6 | ChangeLogUser, ChangelogStat, 7 | base_parsers::{digits, param, period, target, till_newline, till_period}, 8 | types::{Param, Record, Target, TargetStat, TargetStats, TargetVariant}, 9 | }; 10 | use combine::{ 11 | Parser, attempt, choice, 12 | error::{ParseError, StreamError}, 13 | many, 14 | parser::char::{newline, spaces, string}, 15 | stream::{Stream, StreamErrorFor}, 16 | token, 17 | }; 18 | 19 | pub(crate) const MDD: &str = "mdd"; 20 | pub(crate) const CHANGELOG_USERS: &str = "changelog_users"; 21 | pub(crate) fn params() -> Vec { 22 | vec![format!("{MDD}.*.{CHANGELOG_USERS}")] 23 | } 24 | 25 | #[derive(Debug)] 26 | enum MddStat { 27 | /// Changelog stat 28 | ChangeLog(ChangelogStat), 29 | } 30 | 31 | fn target_and_variant() -> impl Parser 32 | where 33 | I: Stream, 34 | I::Error: ParseError, 35 | { 36 | ( 37 | attempt(string("mdd").skip(till_period())).skip(period()), 38 | target().skip(period()), 39 | ) 40 | .and_then(move |(_, x)| -> Result<_, _> { 41 | let variant = match (&x).try_into() { 42 | Ok(x) => x, 43 | Err(e) => return Err(StreamErrorFor::::other(e)), 44 | }; 45 | 46 | Ok((x, variant)) 47 | }) 48 | .message("while parsing target_and_variant") 49 | } 50 | 51 | fn table_headers() -> impl Parser 52 | where 53 | I: Stream, 54 | I::Error: ParseError, 55 | { 56 | (string("ID"), till_newline()).map(|_| ()) 57 | } 58 | 59 | fn table_rows() -> impl Parser> 60 | where 61 | I: Stream, 62 | I::Error: ParseError, 63 | { 64 | many(attempt(( 65 | target(), 66 | spaces(), 67 | digits(), 68 | spaces(), 69 | token('('), 70 | digits(), 71 | token(')'), 72 | till_newline().skip(newline()), 73 | ))) 74 | .map(|x: Vec<_>| { 75 | x.iter() 76 | .map(|x| ChangeLogUser { 77 | user: x.0.to_string(), 78 | index: x.2, 79 | idle_secs: x.5, 80 | }) 81 | .collect() 82 | }) 83 | } 84 | 85 | fn mdd_stat() -> impl Parser 86 | where 87 | I: Stream, 88 | I::Error: ParseError, 89 | { 90 | choice((( 91 | param(CHANGELOG_USERS), 92 | ( 93 | newline(), 94 | 
string("current_index: "), 95 | digits(), 96 | newline(), 97 | table_headers(), 98 | newline(), 99 | table_rows(), 100 | ) 101 | .map(|(_, _, x, _, _, _, y)| { 102 | MddStat::ChangeLog(ChangelogStat { 103 | current_index: x, 104 | users: y, 105 | }) 106 | }), 107 | ) 108 | .message("while parsing changelog"),)) 109 | } 110 | 111 | pub(crate) fn parse() -> impl Parser 112 | where 113 | I: Stream, 114 | I::Error: ParseError, 115 | { 116 | (target_and_variant(), mdd_stat()) 117 | .map(|((target, kind), (param, stat))| match stat { 118 | MddStat::ChangeLog(value) => TargetStats::Changelog(TargetStat { 119 | kind, 120 | target, 121 | param, 122 | value, 123 | }), 124 | }) 125 | .map(Record::Target) 126 | .message("while parsing mdd") 127 | } 128 | 129 | #[cfg(test)] 130 | mod tests { 131 | use combine::{EasyParser, many}; 132 | use insta::assert_debug_snapshot; 133 | 134 | use super::*; 135 | 136 | #[test] 137 | fn test_mdd_stats() { 138 | static FIXTURE: &str = include_str!("fixtures/mdd.txt"); 139 | 140 | let result = many::, _, _>(parse()) 141 | .easy_parse(FIXTURE) 142 | .map_err(|err| err.map_position(|p| p.translate_position(FIXTURE))) 143 | .unwrap(); 144 | 145 | assert_debug_snapshot!(result); 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /lustre-collector/src/snapshots/lustre_collector__stats_parser__tests__stats.snap: -------------------------------------------------------------------------------- 1 | --- 2 | source: lustre-collector/src/stats_parser.rs 3 | expression: result 4 | --- 5 | ( 6 | [ 7 | Stat { 8 | name: "write_bytes", 9 | units: "bytes", 10 | samples: 9, 11 | min: Some( 12 | 98303, 13 | ), 14 | max: Some( 15 | 4194304, 16 | ), 17 | sum: Some( 18 | 33554431, 19 | ), 20 | sumsquare: None, 21 | }, 22 | Stat { 23 | name: "create", 24 | units: "reqs", 25 | samples: 4, 26 | min: None, 27 | max: None, 28 | sum: None, 29 | sumsquare: None, 30 | }, 31 | Stat { 32 | name: "statfs", 33 | units: "reqs", 
34 | samples: 5634, 35 | min: None, 36 | max: None, 37 | sum: None, 38 | sumsquare: None, 39 | }, 40 | Stat { 41 | name: "get_info", 42 | units: "reqs", 43 | samples: 2, 44 | min: None, 45 | max: None, 46 | sum: None, 47 | sumsquare: None, 48 | }, 49 | Stat { 50 | name: "connect", 51 | units: "reqs", 52 | samples: 4, 53 | min: None, 54 | max: None, 55 | sum: None, 56 | sumsquare: None, 57 | }, 58 | Stat { 59 | name: "reconnect", 60 | units: "reqs", 61 | samples: 1, 62 | min: None, 63 | max: None, 64 | sum: None, 65 | sumsquare: None, 66 | }, 67 | Stat { 68 | name: "disconnect", 69 | units: "reqs", 70 | samples: 3, 71 | min: None, 72 | max: None, 73 | sum: None, 74 | sumsquare: None, 75 | }, 76 | Stat { 77 | name: "statfs", 78 | units: "reqs", 79 | samples: 18, 80 | min: None, 81 | max: None, 82 | sum: None, 83 | sumsquare: None, 84 | }, 85 | Stat { 86 | name: "preprw", 87 | units: "reqs", 88 | samples: 9, 89 | min: None, 90 | max: None, 91 | sum: None, 92 | sumsquare: None, 93 | }, 94 | Stat { 95 | name: "commitrw", 96 | units: "reqs", 97 | samples: 9, 98 | min: None, 99 | max: None, 100 | sum: None, 101 | sumsquare: None, 102 | }, 103 | Stat { 104 | name: "ping", 105 | units: "reqs", 106 | samples: 1075, 107 | min: None, 108 | max: None, 109 | sum: None, 110 | sumsquare: None, 111 | }, 112 | Stat { 113 | name: "get_page", 114 | units: "usecs", 115 | samples: 13, 116 | min: Some( 117 | 0, 118 | ), 119 | max: Some( 120 | 3, 121 | ), 122 | sum: Some( 123 | 6, 124 | ), 125 | sumsquare: Some( 126 | 18, 127 | ), 128 | }, 129 | Stat { 130 | name: "cache_access", 131 | units: "pages", 132 | samples: 4, 133 | min: Some( 134 | 1, 135 | ), 136 | max: Some( 137 | 25, 138 | ), 139 | sum: Some( 140 | 52, 141 | ), 142 | sumsquare: None, 143 | }, 144 | Stat { 145 | name: "cache_hit", 146 | units: "pages", 147 | samples: 4, 148 | min: Some( 149 | 1, 150 | ), 151 | max: Some( 152 | 25, 153 | ), 154 | sum: Some( 155 | 52, 156 | ), 157 | sumsquare: None, 158 | }, 159 | Stat { 160 | 
name: "many_credits", 161 | units: "reqs", 162 | samples: 1, 163 | min: Some( 164 | 1, 165 | ), 166 | max: Some( 167 | 1, 168 | ), 169 | sum: Some( 170 | 1, 171 | ), 172 | sumsquare: None, 173 | }, 174 | ], 175 | "", 176 | ) 177 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - main 7 | workflow_dispatch: 8 | inputs: 9 | versionName: 10 | description: "Release version" 11 | required: true 12 | isDraft: 13 | description: "Draft release" 14 | required: false 15 | default: "true" 16 | 17 | defaults: 18 | run: 19 | shell: bash -eux {0} 20 | 21 | jobs: 22 | build_rocky8_rpms: 23 | name: Build Rockylinux 8 RPMs 24 | runs-on: ubuntu-latest 25 | container: 26 | image: rockylinux:8 27 | steps: 28 | - name: Cancel Workflow Action 29 | uses: styfle/cancel-workflow-action@0.6.0 30 | with: 31 | access_token: ${{ github.token }} 32 | 33 | - name: Checkout sources 34 | uses: actions/checkout@v3 35 | 36 | - name: Install dependencies 37 | run: dnf install -y gcc make rpm-build 38 | 39 | - name: Install latest rust toolchain 40 | uses: actions-rs/toolchain@v1 41 | with: 42 | toolchain: stable 43 | default: true 44 | override: true 45 | 46 | - name: Build 47 | run: make rpm 48 | 49 | - name: Summary 50 | run: | 51 | find lustrefs-exporter/_rpm -type f -name \*.rpm -print -exec rpm -qivlp {} \; 52 | 53 | - name: Upload RPMs 54 | uses: actions/upload-artifact@v4 55 | with: 56 | name: rocky8_rpm 57 | retention-days: 1 58 | path: lustrefs-exporter/_rpm/RPMS/**/*.rpm 59 | 60 | build_rocky9_rpms: 61 | name: Build Rockylinux 9 RPMs 62 | runs-on: ubuntu-latest 63 | container: 64 | image: rockylinux/rockylinux:9.2-ubi 65 | steps: 66 | - name: Cancel Workflow Action 67 | uses: styfle/cancel-workflow-action@0.6.0 68 | with: 69 | access_token: ${{ github.token }} 70 | 71 | - name: 
Checkout sources 72 | uses: actions/checkout@v3 73 | 74 | - name: Install dependencies 75 | run: dnf install -y gcc make rpm-build 76 | 77 | - name: Install latest rust toolchain 78 | uses: actions-rs/toolchain@v1 79 | with: 80 | toolchain: stable 81 | default: true 82 | override: true 83 | 84 | - name: Build 85 | run: make rpm 86 | 87 | - name: Summary 88 | run: | 89 | find lustrefs-exporter/_rpm -type f -name \*.rpm -print -exec rpm -qivlp {} \; 90 | 91 | - name: Upload RPMs 92 | uses: actions/upload-artifact@v4 93 | with: 94 | name: rocky9_rpm 95 | retention-days: 1 96 | path: lustrefs-exporter/_rpm/RPMS/**/*.rpm 97 | 98 | build_ubuntu20_debs: 99 | name: Build Ubuntu 20.04 DEBs 100 | runs-on: ubuntu-latest 101 | container: 102 | image: ubuntu:20.04 103 | steps: 104 | - name: Cancel Workflow Action 105 | uses: styfle/cancel-workflow-action@0.6.0 106 | with: 107 | access_token: ${{ github.token }} 108 | 109 | - name: Checkout sources 110 | uses: actions/checkout@v3 111 | 112 | - name: Install dependencies 113 | run: | 114 | export DEBIAN_FRONTEND=noninteractive 115 | apt-get update 116 | apt-get install -y curl fakeroot 117 | cd lustrefs-exporter 118 | apt-get build-dep -y . 
119 | 120 | - name: Install latest rust toolchain 121 | uses: actions-rs/toolchain@v1 122 | with: 123 | toolchain: stable 124 | default: true 125 | override: true 126 | 127 | - name: Build 128 | run: make deb 129 | 130 | - name: Summary 131 | run: | 132 | find lustrefs-exporter/_deb -type f -name \*.deb -print -exec dpkg -I {} \; -exec dpkg -c {} \; 133 | 134 | - name: Upload DEBs 135 | uses: actions/upload-artifact@v4 136 | with: 137 | name: ubuntu20_debs 138 | retention-days: 1 139 | path: lustrefs-exporter/_deb/*.deb 140 | 141 | release: 142 | if: github.event_name == 'workflow_dispatch' 143 | name: Release 144 | needs: [build_rocky8_rpms, build_rocky9_rpms, build_ubuntu20_debs] 145 | runs-on: ubuntu-latest 146 | steps: 147 | - name: Download Rockylinux 8 RPMs 148 | uses: actions/download-artifact@v4 149 | with: 150 | name: rocky8_rpm 151 | 152 | - name: Download Rockylinux 9 RPMs 153 | uses: actions/download-artifact@v4 154 | with: 155 | name: rocky9_rpm 156 | 157 | - name: Download Ubuntu 20.04 DEBs 158 | uses: actions/download-artifact@v4 159 | with: 160 | name: ubuntu20_debs 161 | 162 | - name: Summary 163 | run: find -ls 164 | 165 | - name: Publish 166 | uses: softprops/action-gh-release@v1 167 | with: 168 | tag_name: ${{ github.event.inputs.versionName }} 169 | draft: ${{ github.event.inputs.isDraft }} 170 | files: | 171 | **/*.rpm 172 | **/*.deb 173 | -------------------------------------------------------------------------------- /lustre-collector/src/oss/oss_parser.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{ 6 | OssStat, 7 | base_parsers::{equals, period}, 8 | stats_parser::stats, 9 | types::{Param, Record, Stat, TargetStats}, 10 | }; 11 | use combine::{Parser, attempt, choice, error::ParseError, parser::char::string, stream::Stream}; 12 | 13 | const OSS: &str = "OSS"; 14 | const STATS: &str = "stats"; 15 | pub(crate) const OST: &str = "ost"; 16 | 17 | pub(crate) const OST_IO: &str = "ost_io"; 18 | pub(crate) const OST_CREATE: &str = "ost_create"; 19 | pub(crate) const OST_OUT: &str = "ost_out"; 20 | pub(crate) const OST_SEQ: &str = "ost_seq"; 21 | 22 | pub(crate) const OST_STATS: [&str; 5] = [OST, OST_IO, OST_CREATE, OST_OUT, OST_SEQ]; 23 | 24 | /// Takes [`OST_STATS`] and produces a list of params for 25 | /// consumption in proper ltcl get_param format. 26 | pub(crate) fn params() -> Vec { 27 | OST_STATS 28 | .iter() 29 | .map(|x| format!("{OST}.{OSS}.{x}.{STATS}")) 30 | .collect() 31 | } 32 | 33 | fn oss_prefix() -> impl Parser 34 | where 35 | I: Stream, 36 | I::Error: ParseError, 37 | { 38 | (string(OST).skip(period())) 39 | .with(string(OSS).skip(period())) 40 | .map(|_| ()) 41 | .message("while parsing `oss_prefix`") 42 | } 43 | 44 | fn param_non_final(x: &'static str) -> impl Parser 45 | where 46 | I: Stream, 47 | I::Error: ParseError, 48 | { 49 | attempt(string(x).skip(period())) 50 | .skip(string(STATS).skip(equals())) 51 | .map(|x| Param(x.to_string())) 52 | .message("while parsing `oss_suffix`") 53 | } 54 | 55 | fn oss_stat() -> impl Parser)> 56 | where 57 | I: Stream, 58 | I::Error: ParseError, 59 | { 60 | ( 61 | choice(( 62 | param_non_final(OST), 63 | param_non_final(OST_IO), 64 | param_non_final(OST_CREATE), 65 | param_non_final(OST_OUT), 66 | param_non_final(OST_SEQ), 67 | )), 68 | stats(), 69 | ) 70 | .message("while parsing `oss_stat`") 71 | } 72 | 73 | pub(crate) fn parse() -> impl Parser 74 | where 75 | I: Stream, 76 | I::Error: ParseError, 77 | { 78 | oss_prefix() 79 | .with(oss_stat()) 80 | .map(|(param, stats)| 
TargetStats::Oss(OssStat { param, stats })) 81 | .map(Record::Target) 82 | .message("while parsing oss") 83 | } 84 | 85 | #[cfg(test)] 86 | mod tests { 87 | use super::*; 88 | use combine::{many, parser::EasyParser}; 89 | use insta::assert_debug_snapshot; 90 | 91 | #[test] 92 | fn test_parse() { 93 | let x = r#"ost.OSS.ost.stats= 94 | snapshot_time 1688128253.497763049 secs.nsecs 95 | req_waittime 18419628 samples [usec] 2 40983 305482965 25043535105 96 | req_qdepth 18419628 samples [reqs] 0 34 99937 130635 97 | req_active 18419628 samples [reqs] 1 36 69585063 634492353 98 | req_timeout 18419628 samples [sec] 1 15 276294334 4144414654 99 | reqbuf_avail 38185151 samples [bufs] 60 64 2438170078 155685175822 100 | ldlm_glimpse_enqueue 9257180 samples [reqs] 1 1 9257180 9257180 101 | ldlm_extent_enqueue 19856 samples [reqs] 1 1 19856 19856 102 | ost_create 144904 samples [usec] 6 16594 98795730 85661707326 103 | ost_destroy 8988941 samples [usec] 89 173579 5160119682 8184502010174 104 | ost_get_info 8 samples [usec] 540 3603 10971 28145019 105 | ost_connect 341 samples [usec] 21 903 24182 2818080 106 | ost_disconnect 331 samples [usec] 23 524 39358 7068516 107 | ost_sync 4510 samples [usec] 3 10945 997271 2117171965 108 | ost_set_info 28 samples [usec] 9 34 606 14594 109 | obd_ping 3529 samples [usec] 3 12431 60722 155336592 110 | ost.OSS.ost_io.stats= 111 | snapshot_time 1688128269.170769339 secs.nsecs 112 | req_waittime 3398592545 samples [usec] 2 585517 95316362073 32500246129015 113 | req_qdepth 3398592545 samples [reqs] 0 53 90676247 112259319 114 | req_active 3398592545 samples [reqs] 1 82 55496806665 1427593461517 115 | req_timeout 3398592545 samples [sec] 15 15 50978888175 764683322625 116 | reqbuf_avail 7234158916 samples [bufs] 55 64 461878663298 29490702443182 117 | ost_read 2447557926 samples [usec] 23 138321 1871024223288 4497819893384848 118 | ost_write 951033049 samples [usec] 59 1247713 2749050524782 100048363896296658 119 | ost_punch 1515 samples 
[usec] 16 4883 63967 29511205 120 | "#; 121 | 122 | let result: (Vec<_>, _) = many(parse()).easy_parse(x).unwrap(); 123 | 124 | assert_debug_snapshot!(result) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /lustre-collector/src/oss/obdfilter_parser.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | use crate::{ 6 | ExportStats, 7 | base_parsers::{digits, param, param_period, period, target}, 8 | exports_parser::exports_stats, 9 | stats_parser::stats, 10 | types::{Param, Record, Stat, Target, TargetStat, TargetStats, TargetVariant}, 11 | }; 12 | use combine::{ 13 | Parser, choice, 14 | error::ParseError, 15 | parser::char::{newline, string}, 16 | stream::Stream, 17 | }; 18 | 19 | pub(crate) const STATS: &str = "stats"; 20 | 21 | pub(crate) const NUM_EXPORTS: &str = "num_exports"; 22 | pub(crate) const TOT_DIRTY: &str = "tot_dirty"; 23 | pub(crate) const TOT_GRANTED: &str = "tot_granted"; 24 | pub(crate) const TOT_PENDING: &str = "tot_pending"; 25 | 26 | pub(crate) const EXPORTS: &str = "exports"; 27 | pub(crate) const EXPORTS_PARAMS: &str = "exports.*.stats"; 28 | 29 | pub(crate) const OBD_STATS: [&str; 6] = [ 30 | STATS, 31 | NUM_EXPORTS, 32 | TOT_DIRTY, 33 | TOT_GRANTED, 34 | TOT_PENDING, 35 | EXPORTS_PARAMS, 36 | ]; 37 | 38 | /// Takes OBD_STATS and produces a list of params for 39 | /// consumption in proper ltcl get_param format. 
40 | pub(crate) fn obd_params() -> Vec { 41 | OBD_STATS 42 | .iter() 43 | .map(|x| format!("obdfilter.*OST*.{x}")) 44 | .collect() 45 | } 46 | 47 | /// Parses the name of a target 48 | fn target_name() -> impl Parser 49 | where 50 | I: Stream, 51 | I::Error: ParseError, 52 | { 53 | (string("obdfilter").skip(period()), target().skip(period())) 54 | .map(|(_, x)| x) 55 | .message("while parsing target_name") 56 | } 57 | 58 | #[derive(Debug)] 59 | enum ObdfilterStat { 60 | Stats(Vec), 61 | ExportStats(Vec), 62 | NumExports(u64), 63 | TotDirty(u64), 64 | TotGranted(u64), 65 | TotPending(u64), 66 | } 67 | 68 | fn obdfilter_stat() -> impl Parser 69 | where 70 | I: Stream, 71 | I::Error: ParseError, 72 | { 73 | choice(( 74 | (param(STATS), stats().map(ObdfilterStat::Stats)), 75 | ( 76 | param(NUM_EXPORTS), 77 | digits().skip(newline()).map(ObdfilterStat::NumExports), 78 | ), 79 | ( 80 | param(TOT_DIRTY), 81 | digits().skip(newline()).map(ObdfilterStat::TotDirty), 82 | ), 83 | ( 84 | param(TOT_GRANTED), 85 | digits().skip(newline()).map(ObdfilterStat::TotGranted), 86 | ), 87 | ( 88 | param(TOT_PENDING), 89 | digits().skip(newline()).map(ObdfilterStat::TotPending), 90 | ), 91 | ( 92 | param_period(EXPORTS), 93 | exports_stats().map(ObdfilterStat::ExportStats), 94 | ), 95 | )) 96 | .message("while parsing obdfilter") 97 | } 98 | 99 | pub(crate) fn parse() -> impl Parser 100 | where 101 | I: Stream, 102 | I::Error: ParseError, 103 | { 104 | (target_name(), obdfilter_stat()) 105 | .map(|(target, (param, value))| match value { 106 | ObdfilterStat::Stats(value) => TargetStats::Stats(TargetStat { 107 | kind: TargetVariant::Ost, 108 | target, 109 | param, 110 | value, 111 | }), 112 | ObdfilterStat::NumExports(value) => TargetStats::NumExports(TargetStat { 113 | kind: TargetVariant::Ost, 114 | target, 115 | param, 116 | value, 117 | }), 118 | ObdfilterStat::TotDirty(value) => TargetStats::TotDirty(TargetStat { 119 | kind: TargetVariant::Ost, 120 | target, 121 | param, 122 | value, 
123 | }), 124 | ObdfilterStat::TotGranted(value) => TargetStats::TotGranted(TargetStat { 125 | kind: TargetVariant::Ost, 126 | target, 127 | param, 128 | value, 129 | }), 130 | ObdfilterStat::TotPending(value) => TargetStats::TotPending(TargetStat { 131 | kind: TargetVariant::Ost, 132 | target, 133 | param, 134 | value, 135 | }), 136 | ObdfilterStat::ExportStats(value) => TargetStats::ExportStats(TargetStat { 137 | kind: TargetVariant::Ost, 138 | target, 139 | param, 140 | value, 141 | }), 142 | }) 143 | .map(Record::Target) 144 | .message("while parsing obdfilter") 145 | } 146 | 147 | #[cfg(test)] 148 | mod tests { 149 | use super::*; 150 | 151 | #[test] 152 | fn test_target_name() { 153 | let result = target_name().parse("obdfilter.fs-OST0000.num_exports="); 154 | 155 | assert_eq!( 156 | result, 157 | Ok((Target("fs-OST0000".to_string()), "num_exports=")) 158 | ); 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /lustrefs-exporter/testcmds/cmds_test_app_routes.json: -------------------------------------------------------------------------------- 1 | { 2 | "commands": { 3 | "lctl:get_param memused memused_max lnet_memused health_check mdt.*.exports.*.uuid osd-*.*.stats osd-*.*.filesfree osd-*.*.filestotal osd-*.*.fstype osd-*.*.kbytesavail osd-*.*.kbytesfree osd-*.*.kbytestotal osd-*.*.brw_stats osd-*.*.quota_slave.acct_group osd-*.*.quota_slave.acct_user osd-*.*.quota_slave.acct_project mgs.*.mgs.stats mgs.*.mgs.threads_max mgs.*.mgs.threads_min mgs.*.mgs.threads_started mgs.*.num_exports obdfilter.*OST*.stats obdfilter.*OST*.num_exports obdfilter.*OST*.tot_dirty obdfilter.*OST*.tot_granted obdfilter.*OST*.tot_pending obdfilter.*OST*.exports.*.stats ost.OSS.ost.stats ost.OSS.ost_io.stats ost.OSS.ost_create.stats ost.OSS.ost_out.stats ost.OSS.ost_seq.stats mds.MDS.mdt.stats mds.MDS.mdt_fld.stats mds.MDS.mdt_io.stats mds.MDS.mdt_out.stats mds.MDS.mdt_readpage.stats mds.MDS.mdt_seqm.stats mds.MDS.mdt_seqs.stats 
mds.MDS.mdt_setattr.stats mdt.*.md_stats mdt.*MDT*.num_exports mdt.*MDT*.exports.*.stats ldlm.namespaces.{mdt-,filter-}*.contended_locks ldlm.namespaces.{mdt-,filter-}*.contention_seconds ldlm.namespaces.{mdt-,filter-}*.ctime_age_limit ldlm.namespaces.{mdt-,filter-}*.early_lock_cancel ldlm.namespaces.{mdt-,filter-}*.lock_count ldlm.namespaces.{mdt-,filter-}*.lock_timeouts ldlm.namespaces.{mdt-,filter-}*.lock_unused_count ldlm.namespaces.{mdt-,filter-}*.lru_max_age ldlm.namespaces.{mdt-,filter-}*.lru_size ldlm.namespaces.{mdt-,filter-}*.max_nolock_bytes ldlm.namespaces.{mdt-,filter-}*.max_parallel_ast ldlm.namespaces.{mdt-,filter-}*.resource_count ldlm.services.ldlm_canceld.stats ldlm.services.ldlm_cbd.stats llite.*.stats mdd.*.changelog_users qmt.*.{dt,md}-*.glb-usr qmt.*.{dt,md}-*.glb-prj qmt.*.{dt,md}-*.glb-grp": [ 4 | { 5 | "binary_name": "lctl", 6 | "args": [ 7 | "get_param", 8 | "memused", 9 | "memused_max", 10 | "lnet_memused", 11 | "health_check", 12 | "mdt.*.exports.*.uuid", 13 | "osd-*.*.stats", 14 | "osd-*.*.filesfree", 15 | "osd-*.*.filestotal", 16 | "osd-*.*.fstype", 17 | "osd-*.*.kbytesavail", 18 | "osd-*.*.kbytesfree", 19 | "osd-*.*.kbytestotal", 20 | "osd-*.*.brw_stats", 21 | "osd-*.*.quota_slave.acct_group", 22 | "osd-*.*.quota_slave.acct_user", 23 | "osd-*.*.quota_slave.acct_project", 24 | "mgs.*.mgs.stats", 25 | "mgs.*.mgs.threads_max", 26 | "mgs.*.mgs.threads_min", 27 | "mgs.*.mgs.threads_started", 28 | "mgs.*.num_exports", 29 | "obdfilter.*OST*.stats", 30 | "obdfilter.*OST*.num_exports", 31 | "obdfilter.*OST*.tot_dirty", 32 | "obdfilter.*OST*.tot_granted", 33 | "obdfilter.*OST*.tot_pending", 34 | "obdfilter.*OST*.exports.*.stats", 35 | "ost.OSS.ost.stats", 36 | "ost.OSS.ost_io.stats", 37 | "ost.OSS.ost_create.stats", 38 | "ost.OSS.ost_out.stats", 39 | "ost.OSS.ost_seq.stats", 40 | "mds.MDS.mdt.stats", 41 | "mds.MDS.mdt_fld.stats", 42 | "mds.MDS.mdt_io.stats", 43 | "mds.MDS.mdt_out.stats", 44 | "mds.MDS.mdt_readpage.stats", 45 | 
"mds.MDS.mdt_seqm.stats", 46 | "mds.MDS.mdt_seqs.stats", 47 | "mds.MDS.mdt_setattr.stats", 48 | "mdt.*.md_stats", 49 | "mdt.*MDT*.num_exports", 50 | "mdt.*MDT*.exports.*.stats", 51 | "ldlm.namespaces.{mdt-,filter-}*.contended_locks", 52 | "ldlm.namespaces.{mdt-,filter-}*.contention_seconds", 53 | "ldlm.namespaces.{mdt-,filter-}*.ctime_age_limit", 54 | "ldlm.namespaces.{mdt-,filter-}*.early_lock_cancel", 55 | "ldlm.namespaces.{mdt-,filter-}*.lock_count", 56 | "ldlm.namespaces.{mdt-,filter-}*.lock_timeouts", 57 | "ldlm.namespaces.{mdt-,filter-}*.lock_unused_count", 58 | "ldlm.namespaces.{mdt-,filter-}*.lru_max_age", 59 | "ldlm.namespaces.{mdt-,filter-}*.lru_size", 60 | "ldlm.namespaces.{mdt-,filter-}*.max_nolock_bytes", 61 | "ldlm.namespaces.{mdt-,filter-}*.max_parallel_ast", 62 | "ldlm.namespaces.{mdt-,filter-}*.resource_count", 63 | "ldlm.services.ldlm_canceld.stats", 64 | "ldlm.services.ldlm_cbd.stats", 65 | "llite.*.stats", 66 | "mdd.*.changelog_users", 67 | "qmt.*.{dt,md}-*.glb-usr", 68 | "qmt.*.{dt,md}-*.glb-prj", 69 | "qmt.*.{dt,md}-*.glb-grp" 70 | ], 71 | "stdout": "", 72 | "stderr": "cat: /usr/local/bin/../../lustre-collector/src/fixtures/valid/lustre-2.14.0_ddn133/2.14.0_ddn133_quota.txt: No such file or directory\n", 73 | "exit_code": 1 74 | } 75 | ], 76 | "lnetctl:stats show": [ 77 | { 78 | "binary_name": "lnetctl", 79 | "args": [ 80 | "stats", 81 | "show" 82 | ], 83 | "stdout": "", 84 | "stderr": "cat: /usr/local/bin/../fixtures/lnetctl_stats.txt: No such file or directory\n", 85 | "exit_code": 1 86 | } 87 | ], 88 | "lnetctl:net show -v 4": [ 89 | { 90 | "binary_name": "lnetctl", 91 | "args": [ 92 | "net", 93 | "show", 94 | "-v", 95 | "4" 96 | ], 97 | "stdout": "", 98 | "stderr": "cat: /usr/local/bin/../fixtures/lnetctl_net_show.txt: No such file or directory\n", 99 | "exit_code": 1 100 | } 101 | ] 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /lustre-collector/src/mds/mod.rs: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 4 | 5 | pub(crate) mod client_count_parser; 6 | pub(crate) mod mds_parser; 7 | pub(crate) mod mdt_parser; 8 | 9 | use crate::types::Record; 10 | use combine::{Parser, Stream, attempt, error::ParseError}; 11 | 12 | pub(crate) fn params() -> Vec { 13 | mds_parser::params() 14 | .into_iter() 15 | .chain(mdt_parser::params()) 16 | .collect() 17 | } 18 | 19 | pub(crate) fn parse() -> impl Parser 20 | where 21 | I: Stream, 22 | I::Error: ParseError, 23 | { 24 | attempt(mds_parser::parse()).or(attempt(mdt_parser::parse())) 25 | } 26 | 27 | #[cfg(test)] 28 | mod tests { 29 | use super::*; 30 | use combine::{many, parser::EasyParser}; 31 | use insta::assert_debug_snapshot; 32 | 33 | #[test] 34 | fn test_params() { 35 | let x = r#"mdt.fs-MDT0000.md_stats= 36 | snapshot_time 1566017453.009677077 secs.nsecs 37 | statfs 20318 samples [reqs] 38 | mdt.fs-MDT0001.md_stats= 39 | snapshot_time 1566017453.009825550 secs.nsecs 40 | statfs 20805 samples [reqs] 41 | mdt.fs-MDT0002.md_stats= 42 | snapshot_time 1566017453.009857366 secs.nsecs 43 | statfs 20805 samples [reqs] 44 | mdt.fs-MDT0000.num_exports=16 45 | mdt.fs-MDT0001.num_exports=13 46 | mdt.fs-MDT0002.num_exports=13 47 | mds.MDS.mdt.stats= 48 | snapshot_time 1689062826.416705941 secs.nsecs 49 | req_waittime 96931 samples [usec] 4 62710 5997491 90147428825 50 | req_qdepth 96931 samples [reqs] 0 2 433 455 51 | req_active 96931 samples [reqs] 1 4 127024 195224 52 | req_timeout 96931 samples [sec] 1 15 1453215 21794505 53 | reqbuf_avail 214247 samples [bufs] 63 64 13711216 877480528 54 | ldlm_ibits_enqueue 14567 samples [reqs] 1 1 14567 14567 55 | mds_reint_setattr 257 samples [reqs] 1 1 257 257 56 | mds_reint_create 2 samples [reqs] 1 1 2 2 57 | mds_reint_open 5505 samples [reqs] 1 1 5505 
5505 58 | ost_set_info 3 samples [usec] 11 19 47 771 59 | mds_connect 88 samples [usec] 13 4222 15363 40886015 60 | mds_get_root 1 samples [usec] 5 5 5 25 61 | mds_statfs 4 samples [usec] 14 35 100 2726 62 | mds_sync 256 samples [usec] 8 45 5212 119940 63 | obd_ping 81753 samples [usec] 2 63010 2811336 56636492420 64 | mds.MDS.mdt_fld.stats= 65 | snapshot_time 1689062826.416782077 secs.nsecs 66 | req_waittime 65 samples [usec] 6 42 1212 25042 67 | req_qdepth 65 samples [reqs] 0 0 0 0 68 | req_active 65 samples [reqs] 1 1 65 65 69 | req_timeout 65 samples [sec] 1 15 186 1956 70 | reqbuf_avail 141 samples [bufs] 63 64 9012 576012 71 | fld_query 57 samples [usec] 3 23 510 6280 72 | fld_read 8 samples [usec] 11 42 220 6736 73 | mds.MDS.mdt_io.stats=snapshot_time 1689062826.416807892 secs.nsecs 74 | mds.MDS.mdt_out.stats= 75 | snapshot_time 1689062826.416820124 secs.nsecs 76 | req_waittime 42447 samples [usec] 12 22802 1589380 2854834950 77 | req_qdepth 42447 samples [reqs] 0 0 0 0 78 | req_active 42447 samples [reqs] 1 2 42451 42459 79 | req_timeout 42447 samples [sec] 15 15 636705 9550575 80 | reqbuf_avail 85306 samples [bufs] 63 64 5458793 349312919 81 | mds_statfs 42437 samples [usec] 5 11264 1188972 162527406 82 | out_update 10 samples [usec] 9 24 146 2296 83 | mds.MDS.mdt_readpage.stats= 84 | snapshot_time 1689062826.416854039 secs.nsecs 85 | req_waittime 5506 samples [usec] 3 641 120123 4566199 86 | req_qdepth 5506 samples [reqs] 0 1 12 12 87 | req_active 5506 samples [reqs] 1 3 6103 7421 88 | req_timeout 5506 samples [sec] 15 15 82590 1238850 89 | reqbuf_avail 11604 samples [bufs] 63 64 740345 47236487 90 | mds_getattr 1 samples [usec] 40 40 40 1600 91 | mds_close 5505 samples [usec] 11 245 178562 7560868 92 | mds.MDS.mdt_seqm.stats= 93 | snapshot_time 1689062826.416885077 secs.nsecs 94 | req_waittime 1 samples [usec] 28 28 28 784 95 | req_qdepth 1 samples [reqs] 0 0 0 0 96 | req_active 1 samples [reqs] 1 1 1 1 97 | req_timeout 1 samples [sec] 15 15 15 225 98 | 
reqbuf_avail 3 samples [bufs] 64 64 192 12288 99 | seq_query 1 samples [usec] 14 14 14 196 100 | mds.MDS.mdt_seqs.stats= 101 | snapshot_time 1689062826.416927653 secs.nsecs 102 | req_waittime 16 samples [usec] 17 3399 7042 21343934 103 | req_qdepth 16 samples [reqs] 0 0 0 0 104 | req_active 16 samples [reqs] 1 3 26 52 105 | req_timeout 16 samples [sec] 1 10 25 115 106 | reqbuf_avail 37 samples [bufs] 63 64 2364 151044 107 | seq_query 16 samples [usec] 119 3577 17742 46177518 108 | mds.MDS.mdt_setattr.stats= 109 | snapshot_time 1689062826.416952373 secs.nsecs 110 | "#; 111 | 112 | let result: (Vec<_>, _) = many(parse()).easy_parse(x).unwrap(); 113 | 114 | assert_debug_snapshot!(result) 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /lustre-collector/src/mgs/mgs_parser.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use crate::{ 6 | base_parsers::{digits, param, period, target}, 7 | stats_parser::stats, 8 | types::{Param, Record, Stat, Target, TargetStat, TargetStats, TargetVariant}, 9 | }; 10 | use combine::{ 11 | Parser, attempt, choice, 12 | error::ParseError, 13 | parser::char::{newline, string}, 14 | stream::Stream, 15 | }; 16 | 17 | pub const STATS: &str = "stats"; 18 | pub const THREADS_MIN: &str = "threads_min"; 19 | pub const THREADS_MAX: &str = "threads_max"; 20 | pub const THREADS_STARTED: &str = "threads_started"; 21 | pub const NUM_EXPORTS: &str = "num_exports"; 22 | 23 | pub fn params() -> Vec { 24 | [ 25 | format!("mgs.*.mgs.{STATS}"), 26 | format!("mgs.*.mgs.{THREADS_MAX}"), 27 | format!("mgs.*.mgs.{THREADS_MIN}"), 28 | format!("mgs.*.mgs.{THREADS_STARTED}"), 29 | format!("mgs.*.{NUM_EXPORTS}"), 30 | ] 31 | .iter() 32 | .map(|x| x.to_owned()) 33 | .collect::>() 34 | } 35 | 36 | #[derive(Debug)] 37 | enum MgsStat { 38 | Stats(Vec), 39 | ThreadsMin(u64), 40 | ThreadsMax(u64), 41 | ThreadsStarted(u64), 42 | NumExports(u64), 43 | } 44 | 45 | /// Parses the name of a target 46 | fn target_name() -> impl Parser 47 | where 48 | I: Stream, 49 | I::Error: ParseError, 50 | { 51 | ( 52 | attempt(string("mgs")).skip(period()), 53 | target().skip(period()), 54 | ) 55 | .map(|(_, x)| x) 56 | .message("while parsing target_name") 57 | } 58 | 59 | fn mgs_stat() -> impl Parser 60 | where 61 | I: Stream, 62 | I::Error: ParseError, 63 | { 64 | choice(( 65 | ( 66 | param(NUM_EXPORTS), 67 | digits().skip(newline()).map(MgsStat::NumExports), 68 | ), 69 | ( 70 | string("mgs").skip(period()), 71 | choice(( 72 | (param(STATS), stats().map(MgsStat::Stats)), 73 | ( 74 | param(THREADS_MIN), 75 | digits().skip(newline()).map(MgsStat::ThreadsMin), 76 | ), 77 | ( 78 | param(THREADS_MAX), 79 | digits().skip(newline()).map(MgsStat::ThreadsMax), 80 | ), 81 | ( 82 | param(THREADS_STARTED), 83 | digits().skip(newline()).map(MgsStat::ThreadsStarted), 84 | ), 85 | )), 86 | ) 87 | .map(|(_, 
(y, z))| (y, z)), 88 | )) 89 | .message("while parsing mgs stats") 90 | } 91 | 92 | pub fn parse() -> impl Parser 93 | where 94 | I: Stream, 95 | I::Error: ParseError, 96 | { 97 | (target_name(), mgs_stat()) 98 | .map(|(target, (param, value))| match value { 99 | MgsStat::Stats(value) => TargetStats::Stats(TargetStat { 100 | kind: TargetVariant::Mgt, 101 | target, 102 | param, 103 | value, 104 | }), 105 | MgsStat::NumExports(value) => TargetStats::NumExports(TargetStat { 106 | kind: TargetVariant::Mgt, 107 | target, 108 | param, 109 | value, 110 | }), 111 | MgsStat::ThreadsMin(value) => TargetStats::ThreadsMin(TargetStat { 112 | kind: TargetVariant::Mgt, 113 | target, 114 | param, 115 | value, 116 | }), 117 | MgsStat::ThreadsMax(value) => TargetStats::ThreadsMax(TargetStat { 118 | kind: TargetVariant::Mgt, 119 | target, 120 | param, 121 | value, 122 | }), 123 | MgsStat::ThreadsStarted(value) => TargetStats::ThreadsStarted(TargetStat { 124 | kind: TargetVariant::Mgt, 125 | target, 126 | param, 127 | value, 128 | }), 129 | }) 130 | .map(Record::Target) 131 | .message("while parsing mgs params") 132 | } 133 | 134 | #[cfg(test)] 135 | mod tests { 136 | use super::*; 137 | use combine::{many, parser::EasyParser}; 138 | use insta::assert_debug_snapshot; 139 | 140 | #[test] 141 | fn test_parse() { 142 | let x = r#"mgs.MGS.mgs.stats= 143 | snapshot_time 1596728874.484750908 secs.nsecs 144 | req_waittime 31280 samples [usec] 11 2695 5020274 1032267156 145 | req_qdepth 31280 samples [reqs] 0 1 56 56 146 | req_active 31280 samples [reqs] 1 2 36625 47315 147 | req_timeout 31280 samples [sec] 1 10 31289 31379 148 | reqbuf_avail 85192 samples [bufs] 62 64 5364658 337866142 149 | ldlm_plain_enqueue 201 samples [reqs] 1 1 201 201 150 | mgs_connect 9 samples [usec] 52 5165 19362 66639088 151 | mgs_disconnect 4 samples [usec] 50 92 265 18709 152 | mgs_target_reg 90 samples [usec] 874 163383 1262544 91852108168 153 | mgs_config_read 41 samples [usec] 41 2203 26823 32448779 154 | 
obd_ping 30339 samples [usec] 3 4398 1552005 134387261 155 | llog_origin_handle_open 153 samples [usec] 29 16443 25516 270992222 156 | llog_origin_handle_next_block 298 samples [usec] 24 31952 141030 2788155300 157 | llog_origin_handle_read_header 145 samples [usec] 25 44125 192095 4905765639 158 | mgs.MGS.mgs.threads_max=32 159 | mgs.MGS.mgs.threads_min=3 160 | mgs.MGS.mgs.threads_started=4 161 | mgs.MGS.num_exports=5 162 | "#; 163 | 164 | let result: (Vec<_>, _) = many(parse()).easy_parse(x).unwrap(); 165 | 166 | assert_debug_snapshot!(result) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /lustre-collector/src/fixtures/osd.txt: -------------------------------------------------------------------------------- 1 | osd-ldiskfs.fs-OST0003.stats= 2 | snapshot_time 1750226041.107594239 secs.nsecs 3 | start_time 1750176720.831807230 secs.nsecs 4 | elapsed_time 49320.275787009 secs.nsecs 5 | osd-ldiskfs.MGS.filesfree=32531 6 | osd-ldiskfs.fs-MDT0000.filesfree=1885343 7 | osd-ldiskfs.fs-OST0000.filesfree=39110 8 | osd-ldiskfs.fs-OST0010.filesfree=39110 9 | osd-ldiskfs.MGS.filestotal=32768 10 | osd-ldiskfs.fs-MDT0000.filestotal=1885696 11 | osd-ldiskfs.fs-OST0000.filestotal=40960 12 | osd-ldiskfs.fs-OST0010.filestotal=40960 13 | osd-ldiskfs.MGS.fstype=ldiskfs 14 | osd-ldiskfs.fs-MDT0000.fstype=ldiskfs 15 | osd-ldiskfs.fs-OST0000.fstype=ldiskfs 16 | osd-ldiskfs.fs-OST0010.fstype=ldiskfs 17 | osd-ldiskfs.MGS.kbytesavail=462528 18 | osd-ldiskfs.fs-MDT0000.kbytesavail=2365872 19 | osd-ldiskfs.fs-OST0000.kbytesavail=4037288 20 | osd-ldiskfs.fs-OST0010.kbytesavail=4037288 21 | osd-ldiskfs.MGS.kbytesfree=488740 22 | osd-ldiskfs.fs-MDT0000.kbytesfree=2599980 23 | osd-ldiskfs.fs-OST0000.kbytesfree=4106100 24 | osd-ldiskfs.fs-OST0010.kbytesfree=4106100 25 | osd-ldiskfs.MGS.kbytestotal=491092 26 | osd-ldiskfs.fs-MDT0000.kbytestotal=2602832 27 | osd-ldiskfs.fs-OST0000.kbytestotal=4108388 28 | 
osd-ldiskfs.fs-OST0010.kbytestotal=4108388 29 | osd-ldiskfs.MGS.brw_stats= 30 | snapshot_time: 1648754081.716383375 (secs.nsecs) 31 | 32 | read | write 33 | pages per bulk r/w rpcs % cum % | rpcs % cum % 34 | 35 | read | write 36 | discontiguous pages rpcs % cum % | rpcs % cum % 37 | 38 | read | write 39 | discontiguous blocks rpcs % cum % | rpcs % cum % 40 | 41 | read | write 42 | disk fragmented I/Os ios % cum % | ios % cum % 43 | 44 | read | write 45 | disk I/Os in flight ios % cum % | ios % cum % 46 | 47 | read | write 48 | I/O time (1/1000s) ios % cum % | ios % cum % 49 | 50 | read | write 51 | disk I/O size ios % cum % | ios % cum % 52 | osd-ldiskfs.fs-MDT0000.brw_stats= 53 | snapshot_time: 1648754081.716523297 (secs.nsecs) 54 | 55 | read | write 56 | pages per bulk r/w rpcs % cum % | rpcs % cum % 57 | 58 | read | write 59 | discontiguous pages rpcs % cum % | rpcs % cum % 60 | 61 | read | write 62 | discontiguous blocks rpcs % cum % | rpcs % cum % 63 | 64 | read | write 65 | disk fragmented I/Os ios % cum % | ios % cum % 66 | 67 | read | write 68 | disk I/Os in flight ios % cum % | ios % cum % 69 | 70 | read | write 71 | I/O time (1/1000s) ios % cum % | ios % cum % 72 | 73 | read | write 74 | disk I/O size ios % cum % | ios % cum % 75 | osd-ldiskfs.fs-OST0000.brw_stats= 76 | snapshot_time: 1648754081.716654344 (secs.nsecs) 77 | 78 | read | write 79 | pages per bulk r/w rpcs % cum % | rpcs % cum % 80 | 81 | read | write 82 | discontiguous pages rpcs % cum % | rpcs % cum % 83 | 84 | read | write 85 | discontiguous blocks rpcs % cum % | rpcs % cum % 86 | 87 | read | write 88 | disk fragmented I/Os ios % cum % | ios % cum % 89 | 90 | read | write 91 | disk I/Os in flight ios % cum % | ios % cum % 92 | 93 | read | write 94 | I/O time (1/1000s) ios % cum % | ios % cum % 95 | 96 | read | write 97 | disk I/O size ios % cum % | ios % cum % 98 | osd-ldiskfs.fs-OST0010.brw_stats= 99 | snapshot_time: 1648754081.716778676 (secs.nsecs) 100 | 101 | read | write 102 | pages 
per bulk r/w rpcs % cum % | rpcs % cum % 103 | 104 | read | write 105 | discontiguous pages rpcs % cum % | rpcs % cum % 106 | 107 | read | write 108 | discontiguous blocks rpcs % cum % | rpcs % cum % 109 | 110 | read | write 111 | disk fragmented I/Os ios % cum % | ios % cum % 112 | 113 | read | write 114 | disk I/Os in flight ios % cum % | ios % cum % 115 | 116 | read | write 117 | I/O time (1/1000s) ios % cum % | ios % cum % 118 | 119 | read | write 120 | disk I/O size ios % cum % | ios % cum % 121 | osd-ldiskfs.exa01-OST0013.brw_stats= 122 | snapshot_time: 1698140569.773721492 secs.nsecs 123 | 124 | read | write 125 | pages per bulk r/w rpcs % cum % | rpcs % cum % 126 | 127 | read | write 128 | discontiguous pages rpcs % cum % | rpcs % cum % 129 | 130 | read | write 131 | discontiguous blocks rpcs % cum % | rpcs % cum % 132 | 133 | read | write 134 | disk fragmented I/Os ios % cum % | ios % cum % 135 | 136 | read | write 137 | disk I/Os in flight ios % cum % | ios % cum % 138 | 139 | read | write 140 | I/O time (1/1000s) ios % cum % | ios % cum % 141 | 142 | read | write 143 | disk I/O size ios % cum % | ios % cum % 144 | 145 | read | write 146 | block maps msec maps % cum % | maps % cum % 147 | -------------------------------------------------------------------------------- /lustre-collector/src/main.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 DDN. All rights reserved. 2 | // Use of this source code is governed by a MIT-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | use clap::{Arg, ValueEnum, value_parser}; 6 | use lustre_collector::{ 7 | error::LustreCollectorError, mgs::mgs_fs_parser, parse_lctl_output, parse_lnetctl_output, 8 | parse_lnetctl_stats, parse_mgs_fs_output, parse_recovery_status_output, parser, 9 | recovery_status_parser, types::Record, 10 | }; 11 | use std::{ 12 | fmt, panic, 13 | process::{Command, ExitCode}, 14 | str::{self, FromStr}, 15 | thread, 16 | }; 17 | use tracing::debug; 18 | 19 | #[derive(ValueEnum, PartialEq, Debug, Clone, Copy)] 20 | enum Format { 21 | Json, 22 | Yaml, 23 | } 24 | 25 | impl FromStr for Format { 26 | type Err = String; 27 | 28 | fn from_str(s: &str) -> Result { 29 | match s.to_lowercase().trim() { 30 | "json" => Ok(Format::Json), 31 | "yaml" => Ok(Format::Yaml), 32 | _ => Err(format!("Could not convert {s} to format type")), 33 | } 34 | } 35 | } 36 | 37 | impl fmt::Display for Format { 38 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 39 | match self { 40 | Self::Json => write!(f, "json"), 41 | Self::Yaml => write!(f, "yaml"), 42 | } 43 | } 44 | } 45 | 46 | fn get_lctl_output() -> Result, LustreCollectorError> { 47 | let lctl_params = parser::params(); 48 | 49 | debug!(lctl_params = lctl_params.join(" ")); 50 | 51 | let r = Command::new("lctl") 52 | .arg("get_param") 53 | .args(lctl_params) 54 | .output()?; 55 | 56 | Ok(r.stdout) 57 | } 58 | 59 | fn get_lctl_mgs_fs_output() -> Result, LustreCollectorError> { 60 | let r = Command::new("lctl") 61 | .arg("get_param") 62 | .arg("-N") 63 | .args(mgs_fs_parser::params()) 64 | .output()?; 65 | 66 | Ok(r.stdout) 67 | } 68 | 69 | fn get_recovery_status_output() -> Result, LustreCollectorError> { 70 | let r = Command::new("lctl") 71 | .arg("get_param") 72 | .args(recovery_status_parser::params()) 73 | .output()?; 74 | 75 | Ok(r.stdout) 76 | } 77 | 78 | fn get_lnetctl_stats_output() -> Result, LustreCollectorError> { 79 | let r = Command::new("lnetctl").arg("stats").arg("show").output()?; 80 | 81 | Ok(r.stdout) 82 | } 83 
| 84 | fn main() -> ExitCode { 85 | match run() { 86 | Ok(()) => ExitCode::SUCCESS, 87 | Err(e) => { 88 | eprintln!("{e}"); 89 | 90 | ExitCode::FAILURE 91 | } 92 | } 93 | } 94 | 95 | fn run() -> Result<(), LustreCollectorError> { 96 | tracing_subscriber::fmt::init(); 97 | 98 | let matches = clap::Command::new(env!("CARGO_PKG_NAME")) 99 | .version(env!("CARGO_PKG_VERSION")) 100 | .author("Whamcloud") 101 | .about("Grabs various Lustre statistics for display in JSON or YAML") 102 | .arg( 103 | Arg::new("format") 104 | .short('f') 105 | .long("format") 106 | .value_parser(value_parser!(Format)) 107 | .default_value("json") 108 | .help("Sets the output formatting"), 109 | ) 110 | .get_matches(); 111 | 112 | let format = matches 113 | .get_one::("format") 114 | .expect("Required argument `format` missing"); 115 | 116 | let handle = thread::spawn(move || -> Result, LustreCollectorError> { 117 | let lctl_output = get_lctl_output()?; 118 | 119 | let lctl_record = parse_lctl_output(&lctl_output)?; 120 | 121 | Ok(lctl_record) 122 | }); 123 | 124 | let mgs_fs_handle = thread::spawn(move || -> Result, LustreCollectorError> { 125 | let lctl_output = get_lctl_mgs_fs_output()?; 126 | let lctl_record = parse_mgs_fs_output(&lctl_output)?; 127 | 128 | Ok(lctl_record) 129 | }); 130 | 131 | let lnetctl_stats_handle = 132 | thread::spawn(move || -> Result, LustreCollectorError> { 133 | let lnetctl_stats_output = get_lnetctl_stats_output()?; 134 | let lnetctl_stats_record = parse_lnetctl_stats(&lnetctl_stats_output)?; 135 | 136 | Ok(lnetctl_stats_record) 137 | }); 138 | 139 | let recovery_status_handle = 140 | thread::spawn(move || -> Result, LustreCollectorError> { 141 | let recovery_status_output = get_recovery_status_output()?; 142 | let recovery_statuses = parse_recovery_status_output(&recovery_status_output)?; 143 | 144 | Ok(recovery_statuses) 145 | }); 146 | 147 | let lnetctl_net_show_output = Command::new("lnetctl") 148 | .args(["net", "show", "-v", "4"]) 149 | .output() 150 | 
.expect("failed to get lnetctl stats"); 151 | 152 | let mut lnet_record = parse_lnetctl_output(&lnetctl_net_show_output.stdout) 153 | .expect("while parsing 'lnetctl net show -v 4' stats"); 154 | 155 | let mut lctl_record = match handle.join() { 156 | Ok(r) => r?, 157 | Err(e) => panic::resume_unwind(e), 158 | }; 159 | 160 | let mut mgs_fs_record = match mgs_fs_handle.join() { 161 | Ok(r) => r.unwrap_or_default(), 162 | Err(e) => panic::resume_unwind(e), 163 | }; 164 | 165 | let mut recovery_status_records = match recovery_status_handle.join() { 166 | Ok(r) => r.unwrap_or_default(), 167 | Err(e) => panic::resume_unwind(e), 168 | }; 169 | 170 | let mut lnetctl_stats_record = match lnetctl_stats_handle.join() { 171 | Ok(r) => r.unwrap_or_default(), 172 | Err(e) => panic::resume_unwind(e), 173 | }; 174 | 175 | lctl_record.append(&mut lnet_record); 176 | lctl_record.append(&mut mgs_fs_record); 177 | lctl_record.append(&mut recovery_status_records); 178 | lctl_record.append(&mut lnetctl_stats_record); 179 | 180 | let x = match format { 181 | Format::Json => serde_json::to_string(&lctl_record)?, 182 | Format::Yaml => serde_yaml::to_string(&lctl_record)?, 183 | }; 184 | 185 | println!("{x}"); 186 | 187 | Ok(()) 188 | } 189 | --------------------------------------------------------------------------------