├── Debugging ├── drgn-tools │ ├── fid_dump │ ├── res_dump │ ├── fid_dump_ub0 │ ├── hdroom_dump │ ├── mdb_table_dump │ ├── pgt_table_dump │ ├── port_range_dump │ ├── drgn_tool_run │ ├── res_dump.py │ ├── port_range_dump.py │ ├── res_dump.txt │ ├── pgt_table_dump.py │ ├── hdroom_dump.py │ ├── mdb_table_dump.py │ ├── port_range_dump.txt │ ├── pgt_table_dump.txt │ ├── mlxsw_drgn.py │ ├── fid_dump_ub0.py │ ├── mdb_table_dump.txt │ ├── hdroom_dump.txt │ ├── fid_dump.py │ ├── fid_dump_ub0.txt │ └── fid_dump.txt ├── libbpf-tools │ ├── .gitignore │ ├── common │ │ ├── vmlinux.h │ │ ├── .gitignore │ │ ├── trace_helpers.h │ │ ├── map_helpers.h │ │ ├── Makefile │ │ ├── trace_helpers.c │ │ └── map_helpers.c │ ├── resmon │ │ ├── .gitignore │ │ ├── config.h.in │ │ ├── resmon.service.in │ │ ├── resmon-exporter.service.in │ │ ├── resmon-stop.8.md │ │ ├── resmon-ping.8.md │ │ ├── test-fdb.sh │ │ ├── resmon-stats.8.md │ │ ├── resmon-exporter.8.md │ │ ├── README.md │ │ ├── resmon-emad.8.md │ │ ├── resmon.bpf.c │ │ ├── resmon.c │ │ ├── resmon-start.8.md │ │ ├── Makefile │ │ ├── resmon.8.md │ │ ├── resmon-sock.c │ │ ├── resmon-exporter.in │ │ ├── resmon-dump.8.md │ │ ├── mlxsw.h │ │ ├── resmon-dl.c │ │ └── resmon-back.c │ ├── src │ │ ├── .gitignore │ │ ├── emadump.h │ │ ├── emadlatency.h │ │ ├── trapagg.h │ │ ├── bits.bpf.h │ │ ├── Makefile │ │ ├── emadump_example.txt │ │ ├── emadump.bpf.c │ │ ├── trapagg-exporter.py │ │ ├── trapagg_example.txt │ │ ├── emadump.c │ │ ├── emadlatency.bpf.c │ │ ├── trapagg.bpf.c │ │ └── trapagg.c │ ├── tools │ │ └── bpftool │ ├── Makefile │ ├── common.mk │ └── README.md ├── EMADs │ ├── README.md │ ├── common.py │ ├── devlink-hwmsg.py │ └── bwz.py ├── fw_dump.py └── hdroom_sz ├── README.md └── .gitmodules /Debugging/drgn-tools/fid_dump: -------------------------------------------------------------------------------- 1 | drgn_tool_run -------------------------------------------------------------------------------- /Debugging/drgn-tools/res_dump: 
-------------------------------------------------------------------------------- 1 | drgn_tool_run -------------------------------------------------------------------------------- /Debugging/drgn-tools/fid_dump_ub0: -------------------------------------------------------------------------------- 1 | drgn_tool_run -------------------------------------------------------------------------------- /Debugging/drgn-tools/hdroom_dump: -------------------------------------------------------------------------------- 1 | drgn_tool_run -------------------------------------------------------------------------------- /Debugging/drgn-tools/mdb_table_dump: -------------------------------------------------------------------------------- 1 | drgn_tool_run -------------------------------------------------------------------------------- /Debugging/drgn-tools/pgt_table_dump: -------------------------------------------------------------------------------- 1 | drgn_tool_run -------------------------------------------------------------------------------- /Debugging/drgn-tools/port_range_dump: -------------------------------------------------------------------------------- 1 | drgn_tool_run -------------------------------------------------------------------------------- /Debugging/libbpf-tools/.gitignore: -------------------------------------------------------------------------------- 1 | /.libbpf-output 2 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/common/vmlinux.h: -------------------------------------------------------------------------------- 1 | vmlinux_5010.h -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/.gitignore: -------------------------------------------------------------------------------- 1 | /.output 2 | /resmon 3 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/common/.gitignore: 
-------------------------------------------------------------------------------- 1 | /.output 2 | /libcommon.a 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mlxsw # 2 | 3 | * Project Wiki: https://github.com/Mellanox/mlxsw/wiki 4 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/config.h.in: -------------------------------------------------------------------------------- 1 | #define RESMON_DEFAULT_SOCKDIR "@RUNSTATEDIR@" 2 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/.gitignore: -------------------------------------------------------------------------------- 1 | /.output 2 | /emadlatency 3 | /emadump 4 | /trapagg 5 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/tools/bpftool: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mellanox/mlxsw/HEAD/Debugging/libbpf-tools/tools/bpftool -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Debugging/libbpf-tools/libbpf"] 2 | path = Debugging/libbpf-tools/libbpf 3 | url = https://github.com/libbpf/libbpf.git 4 | -------------------------------------------------------------------------------- /Debugging/EMADs/README.md: -------------------------------------------------------------------------------- 1 | For documentation please refer to the [Hardware Messages Monitoring][1] 2 | Wiki page 3 | 4 | [1]: https://github.com/Mellanox/mlxsw/wiki/Hardware-Messages-Monitoring 5 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon.service.in: 
-------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=mlxsw resource monitor 3 | 4 | [Service] 5 | Type=notify 6 | ExecStart=@BINDIR@/resmon start 7 | ExecStop=@BINDIR@/resmon stop 8 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-exporter.service.in: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=mlxsw resource monitor exporter 3 | Requisite=resmon.service 4 | After=resmon.service 5 | 6 | [Service] 7 | ExecStart=@BINDIR@/resmon-exporter -l 0.0.0.0:9417 8 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/emadump.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __EMADUMP_H 3 | #define __EMADUMP_H 4 | 5 | #define EMAD_MAX_LEN 2048 6 | 7 | struct emad_event { 8 | char buf[EMAD_MAX_LEN]; 9 | size_t len; 10 | __u64 ts; 11 | }; 12 | 13 | #endif /* __EMADUMP_H */ 14 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/common/trace_helpers.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __TRACE_HELPERS_H 3 | #define __TRACE_HELPERS_H 4 | 5 | void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type); 6 | 7 | int bump_memlock_rlimit(void); 8 | 9 | #endif /* __TRACE_HELPERS_H */ 10 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/emadlatency.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __EMADLATENCY_H 3 | #define __EMADLATENCY_H 4 | 5 | #define MAX_SLOTS 27 6 | 7 | struct 
#!/bin/sh

# Python 3 does not allow import from the same directory for plain
# programs. Creating a full-blown package for the few helpers that we use
# is an overkill. Instead, let's have this launcher that exports to
# PYTHONPATH the directory that the tool is run from.
#
# The launcher is meant to be invoked through a symlink named after the
# tool (e.g. fid_dump -> drgn_tool_run); it then runs "<symlink path>.py".

# Quote "$0" so that installation paths containing whitespace keep working.
dir=$(dirname -- "$0")

# Only add the ":" separator when PYTHONPATH already has content, so that no
# empty search-path entry is created when it is unset or empty. Forward any
# command-line arguments to the tool, and exec so that the tool's exit status
# and signals are not filtered through an extra shell process.
PYTHONPATH="$dir${PYTHONPATH:+:$PYTHONPATH}" exec python3 "$0.py" "$@"
trap_name[TRAP_NAME_LEN]; 20 | }; 21 | 22 | #endif /* __TRAPAGG_H */ 23 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/res_dump.py: -------------------------------------------------------------------------------- 1 | # res_dump Dump device resources in JSON format 2 | 3 | from socket import ntohl 4 | from mlxsw_drgn import * 5 | import json 6 | import sys 7 | 8 | mlxsw_sp = MlxswSp.find() 9 | dump = {} 10 | 11 | for res_idx, res_id in enumerate(prog['mlxsw_res_ids']): 12 | res_enum = prog.type('enum mlxsw_res_id').enumerators[res_idx] 13 | res_name = res_enum.name[len("MLXSW_RES_ID_"):] 14 | 15 | res = {} 16 | res["id"] = hex(res_id) 17 | res["valid"] = mlxsw_sp.core.res.valid[res_idx].value_() 18 | res["value"] = mlxsw_sp.core.res.values[res_idx].value_() 19 | dump[res_name] = res 20 | 21 | sys.stdout.write(json.dumps(dump)) 22 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/bits.bpf.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 | #ifndef __BITS_BPF_H 3 | #define __BITS_BPF_H 4 | 5 | static __always_inline u64 log2(u32 v) 6 | { 7 | u32 shift, r; 8 | 9 | r = (v > 0xFFFF) << 4; v >>= r; 10 | shift = (v > 0xFF) << 3; v >>= shift; r |= shift; 11 | shift = (v > 0xF) << 2; v >>= shift; r |= shift; 12 | shift = (v > 0x3) << 1; v >>= shift; r |= shift; 13 | r |= (v >> 1); 14 | 15 | return r; 16 | } 17 | 18 | static __always_inline u64 log2l(u64 v) 19 | { 20 | u32 hi = v >> 32; 21 | 22 | if (hi) 23 | return log2(hi) + 32; 24 | else 25 | return log2(v); 26 | } 27 | 28 | #endif /* __BITS_BPF_H */ 29 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | 
REL_SRCDIR := . 3 | include $(REL_SRCDIR)/common.mk 4 | 5 | DIRS = common src resmon 6 | 7 | .PHONY: all 8 | all: $(DIRS:%=build-%) 9 | 10 | build-%: 11 | $(Q)$(MAKE) -C $* 12 | 13 | .PHONY: install 14 | install: install-resmon 15 | 16 | install-%: 17 | $(Q)$(MAKE) -C $* install 18 | 19 | .PHONY: test 20 | test: test-resmon 21 | 22 | test-%: 23 | $(Q)$(MAKE) -C $* test 24 | 25 | .PHONY: doc 26 | doc: doc-resmon 27 | 28 | doc-%: 29 | $(Q)$(MAKE) -C $* doc 30 | 31 | .PHONY: clean 32 | clean: $(DIRS:%=clean-%) 33 | $(call msg,CLEAN) 34 | $(Q)rm -rf $(LIBBPF_OUTPUT) 35 | 36 | clean-%: 37 | $(Q)$(MAKE) -C $* clean 38 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/common/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | REL_SRCDIR := .. 3 | include $(REL_SRCDIR)/common.mk 4 | 5 | OUTPUT := .output 6 | INCLUDES := $(LIBBPF_INCLUDE) 7 | LIBS := libcommon.a 8 | libcommon.a-OBJECTS := \ 9 | $(OUTPUT)/trace_helpers.o \ 10 | $(OUTPUT)/map_helpers.o \ 11 | # 12 | 13 | all: $(OUTPUT)/libcommon.a 14 | 15 | $(OUTPUT)/libcommon.a: $(libcommon.a-OBJECTS) 16 | $(call msg,AR,$@) 17 | $(Q)ar cr $@ $^ 18 | 19 | $(OUTPUT): 20 | $(call msg,MKDIR,$@) 21 | $(Q)mkdir -p $@ 22 | 23 | $(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) 24 | $(call msg,CC,$@) 25 | $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ 26 | $(OUTPUT)/map_helpers.o: $(LIBBPF_OBJ) 27 | 28 | .PHONY: clean 29 | clean: 30 | $(call msg,CLEAN) 31 | $(Q)rm -rf $(foreach tgt,$(LIBS),$($(tgt)-OBJECTS)) 32 | $(Q)rm -f $(LIBS:%=$(OUTPUT)/%) $(EXTRA_CLEAN) 33 | $(Q)if test -d $(OUTPUT); then rmdir $(OUTPUT); fi 34 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/port_range_dump.py: -------------------------------------------------------------------------------- 1 | # port_range_dump Dump port range registers in 
JSON format 2 | 3 | from drgn.helpers.linux.xarray import xa_for_each 4 | from mlxsw_drgn import * 5 | import json 6 | import sys 7 | 8 | mlxsw_sp = MlxswSp.find() 9 | dump = {} 10 | 11 | dump_prrs = {} 12 | dump["port_range_registers"] = dump_prrs 13 | 14 | for index, entry in xa_for_each(mlxsw_sp.pr_core.prr_xa): 15 | dump_prr = {} 16 | dump_prrs[index] = dump_prr 17 | 18 | mlxsw_sp_port_range_reg = drgn.cast("struct mlxsw_sp_port_range_reg *", 19 | entry) 20 | dump_prr["min_port"] = mlxsw_sp_port_range_reg.range.min.value_() 21 | dump_prr["max_port"] = mlxsw_sp_port_range_reg.range.max.value_() 22 | dump_prr["is_source"] = bool(mlxsw_sp_port_range_reg.range.source.value_()) 23 | dump_prr["refcount"] = mlxsw_sp_port_range_reg.refcount.refs.counter.value_() 24 | dump_prr["index"] = mlxsw_sp_port_range_reg.index.value_() 25 | 26 | sys.stdout.write(json.dumps(dump)) 27 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-stop.8.md: -------------------------------------------------------------------------------- 1 | % resmon-stop(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon stop` - stop the `resmon` daemon 7 | 8 | SYNOPSIS 9 | ======== 10 | 11 | `resmon stop` 12 | 13 | DESCRIPTION 14 | =========== 15 | 16 | The `stop` RPC method and the associated command-line wrapper can be used 17 | to stop the `resmon` daemon. 18 | 19 | RPC REQUEST 20 | =========== 21 | 22 | The `stop` method takes no parameters. 23 | 24 | ``` 25 | { 26 | "jsonrpc": "2.0", 27 | "id": $ID, 28 | "method": "stop" 29 | } 30 | ``` 31 | 32 | RPC RESPONSE 33 | ============ 34 | 35 | ``` 36 | { 37 | "jsonrpc": "2.0", 38 | "id": $ID, 39 | "result": null 40 | } 41 | ``` 42 | 43 | This response indicates that the daemon intends to stop soon. A refusal to 44 | stop would be expressed through an error response. 
45 | 46 | 47 | SEE ALSO 48 | ======== 49 | 50 | resmon(8), resmon-start(8), resmon-ping(8) 51 | 52 | [JSON RPC specification][JSON RPC]. 53 | 54 | REPORTING ISSUES 55 | ================ 56 | 57 | To report issues please send an email to: mlxsw@nvidia.com. 58 | 59 | [JSON RPC]: https://www.jsonrpc.org/specification 60 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-ping.8.md: -------------------------------------------------------------------------------- 1 | % resmon-ping(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon ping` - probe the liveness of the `resmon` daemon 7 | 8 | SYNOPSIS 9 | ======== 10 | 11 | `resmon ping` 12 | 13 | DESCRIPTION 14 | =========== 15 | 16 | The `ping` RPC method and the associated command-line wrapper can be used 17 | to check whether the daemon is alive and capable of servicing RPC requests. 18 | 19 | Note that the `resmon` daemon is a single-threaded program. If an 20 | outstanding `stats` request is blocked in `devlink` (such as would be the 21 | case during the driver reset), the daemon will not service `ping` requests 22 | either, until the block clears. 23 | 24 | RPC REQUEST 25 | =========== 26 | 27 | The `ping` method takes any object for `params`. The daemon will simply 28 | return the passed-in object through the `result` in response. 29 | 30 | ``` 31 | { 32 | "jsonrpc": "2.0", 33 | "id": $ID, 34 | "method": "ping", 35 | "params": $OBJECT 36 | } 37 | ``` 38 | 39 | RPC RESPONSE 40 | ============ 41 | 42 | ``` 43 | { 44 | "jsonrpc": "2.0", 45 | "id": $ID, 46 | "result": $OBJECT 47 | } 48 | ``` 49 | 50 | SEE ALSO 51 | ======== 52 | 53 | resmon(8), resmon-start(8), resmon-stop(8) 54 | 55 | [JSON RPC specification][JSON RPC]. 56 | 57 | REPORTING ISSUES 58 | ================ 59 | 60 | To report issues please send an email to: mlxsw@nvidia.com. 
61 | 62 | [JSON RPC]: https://www.jsonrpc.org/specification 63 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/res_dump.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | 3 | res_dump - Dump device resources 4 | 5 | SYNOPSIS: 6 | 7 | res_dump 8 | 9 | DESCRIPTION: 10 | 11 | res_dump is a tool written on top of drgn, for dumping device 12 | resources. That way it is possible to directly find out device resources 13 | without relying on tools such as devlink(8) that only provide an 14 | abstraction. 15 | 16 | OUTPUT 17 | 18 | As an output, res_dump emits a JSON object with the following 19 | attributes: 20 | 21 | ..id 22 | The identifier of the device resource. 23 | 24 | ..valid 25 | Whether the resource is valid or not. Invalid resources are 26 | ignored by mlxsw. 27 | 28 | ..value 29 | The value of the device resource, as queried from the 30 | device. 31 | 32 | res_dump always outputs the complete information. Filtering and 33 | querying can be done e.g. through `jq`. 34 | 35 | EXAMPLE: 36 | 37 | # res_dump | jq 38 | { 39 | "KVD_SIZE": { 40 | "id": "0x1001", 41 | "valid": true, 42 | "value": 524288 43 | }, 44 | [...] 
45 | "MAX_NVE_MC_ENTRIES_IPV6": { 46 | "id": "0x2e03", 47 | "valid": true, 48 | "value": 4 49 | } 50 | } 51 | 52 | SEE ALSO: 53 | 54 | https://github.com/Mellanox/mlxsw/wiki 55 | https://drgn.readthedocs.io 56 | https://man7.org/linux/man-pages/man8/devlink-resource.8.html 57 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/pgt_table_dump.py: -------------------------------------------------------------------------------- 1 | # pgt table dump - Dump PGT table in JSON format 2 | 3 | from drgn.helpers.linux.idr import idr_for_each 4 | from mlxsw_drgn import * 5 | import json 6 | import sys 7 | 8 | mlxsw_sp = MlxswSp.find() 9 | ports_mapping = mlxsw_sp.ports_mapping() 10 | 11 | dump = {} 12 | dump["end_index"] = mlxsw_sp.pgt.end_index.value_() 13 | 14 | smpe_index_valid = mlxsw_sp.pgt.smpe_index_valid.value_() 15 | dump["smpe_index_valid"] = smpe_index_valid 16 | 17 | dump_pgt_entries = {} 18 | dump["pgt_entries"] = dump_pgt_entries 19 | 20 | for index, entry in idr_for_each(mlxsw_sp.pgt.pgt_idr): 21 | dump_pgt_entry = {} 22 | dump_pgt_entries[index] = dump_pgt_entry 23 | 24 | mlxsw_sp_pgt_entry = drgn.cast("struct mlxsw_sp_pgt_entry *", entry) 25 | dump_pgt_entry["mid_index"] = mlxsw_sp_pgt_entry.index.value_() 26 | 27 | if smpe_index_valid: 28 | dump_pgt_entry["smpe_index"] = mlxsw_sp_pgt_entry.smpe_index.value_() 29 | 30 | dump_pgt_entry_ports = {} 31 | dump_pgt_entry["ports"] = dump_pgt_entry_ports 32 | 33 | for entry_port in \ 34 | helpers.list_for_each_entry("struct mlxsw_sp_pgt_entry_port", 35 | mlxsw_sp_pgt_entry.ports_list.address_of_(), 36 | "list"): 37 | swp = ports_mapping[entry_port.local_port.value_()] 38 | 39 | dump_entry_port = {} 40 | dump_pgt_entry_ports[swp] = dump_entry_port 41 | 42 | dump_entry_port["local_port"] = entry_port.local_port.value_() 43 | 44 | sys.stdout.write(json.dumps(dump)) 45 | -------------------------------------------------------------------------------- 
/Debugging/libbpf-tools/src/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | REL_SRCDIR := .. 3 | include $(REL_SRCDIR)/common.mk 4 | 5 | OUTPUT := .output 6 | INCLUDES := -I$(OUTPUT) $(LIBBPF_INCLUDE) $(COMMON_INCLUDE) 7 | CFLAGS := -g -Wall -Wunused 8 | 9 | APPS = emadlatency emadump trapagg 10 | 11 | .PHONY: all 12 | all: $(APPS) 13 | 14 | .PHONY: clean 15 | clean: 16 | $(call msg,CLEAN) 17 | $(Q)rm -rf $(OUTPUT) $(APPS) 18 | 19 | $(OUTPUT): 20 | $(call msg,MKDIR,$@) 21 | $(Q)mkdir -p $@ 22 | 23 | # Build BPF code 24 | $(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) ../common/vmlinux.h | $(OUTPUT) 25 | $(call msg,BPF,$@) 26 | $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) -c $(filter %.c,$^) -o $@ 27 | $(Q)$(LLVM_STRIP) -g $@ # strip useless DWARF info 28 | 29 | # Generate BPF skeletons 30 | $(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) 31 | $(call msg,GEN-SKEL,$@) 32 | $(Q)$(BPFTOOL) gen skeleton $< > $@ 33 | 34 | # Build user-space code 35 | $(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h 36 | 37 | $(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) 38 | $(call msg,CC,$@) 39 | $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ 40 | 41 | # Build application binary 42 | emadlatency trapagg: %: $(OUTPUT)/%.o $(LIBBPF_OBJ) $(COMMON_OBJ) | $(OUTPUT) 43 | $(call msg,BINARY,$@) 44 | $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -o $@ 45 | 46 | emadump: %: $(OUTPUT)/%.o $(LIBBPF_OBJ) $(COMMON_OBJ) | $(OUTPUT) 47 | $(call msg,BINARY,$@) 48 | $(Q)$(CC) $(CFLAGS) $^ -lelf -lz -lpcap -o $@ 49 | 50 | resmon-test: resmon/resmon 51 | resmon-test: 52 | ./resmon/resmon-test.sh 53 | 54 | # delete failed targets 55 | .DELETE_ON_ERROR: 56 | 57 | # keep intermediate (.skel.h, .bpf.o, etc) targets 58 | .SECONDARY: 59 | 60 | -------------------------------------------------------------------------------- 
"""
Copyright 2016, 2018 Mellanox Technologies. All rights reserved.
Licensed under the GNU General Public License, version 2 as
published by the Free Software Foundation; see COPYING for details.
"""

import struct
import sys

def pcap_header_out(f, link_type = 162):
    """Write a pcap global header to the binary stream f and flush it.

    The header is packed in native byte order with version 2.4, zero
    timezone offset and accuracy, a snap length of 0xffff and the given
    link type.
    """
    header_fields = (0xa1b2c3d4, 2, 4, 0, 0, 0xffff, link_type)
    f.write(struct.pack("IHHiIII", *header_fields))
    f.flush()

def pcap_packet_header(secs, usecs, pktlen):
    """Return a packed pcap per-packet header.

    pktlen is stored twice: once as the captured length and once as the
    original length.
    """
    captured_len = original_len = pktlen
    return struct.pack("IIII", secs, usecs, captured_len, original_len)

def nulstr_to_str(s):
    """Strip the trailing NUL byte from s; s must end with one."""
    assert s.endswith(b'\0')
    return s[:len(s) - 1]

def normalize_ba(ba):
    """Turn a str into a NUL-terminated UTF-8 bytearray.

    Anything that is not a str is handed back untouched.
    """
    if not isinstance(ba, str):
        return ba
    return bytearray(ba + "\0", 'utf-8')

class Tag:
    """A TLV tag number bundled with the functions that decode and encode
    its payload."""

    def __init__(self, tag, decoder, encoder):
        self._tag = tag
        self._decoder = decoder
        self._encoder = encoder

    def tag(self):
        """Return the numeric tag."""
        return self._tag

    def decode(self, s):
        """Decode the raw payload s with this tag's decoder."""
        return self._decoder(s)

    def encode(self, v):
        """Encode the value v with this tag's encoder."""
        return self._encoder(v)

def _scalar_codec(fmt):
    # Build a (decoder, encoder) pair for a payload holding a single value
    # described by the struct format fmt.
    def decode(s):
        return struct.unpack(fmt, s)[0]

    def encode(v):
        return struct.pack(fmt, v)

    return decode, encode

def _passthrough(x):
    # Payload that is used as-is in both directions.
    return x

tlv_bus_name = Tag(0, nulstr_to_str, normalize_ba)
tlv_dev_name = Tag(1, nulstr_to_str, normalize_ba)
tlv_driver_name = Tag(2, nulstr_to_str, normalize_ba)
tlv_incoming = Tag(3, *_scalar_codec("?"))
tlv_type = Tag(4, *_scalar_codec("H"))
tlv_buf = Tag(5, _passthrough, _passthrough)

# Lookup table from the numeric tag to its Tag object.
tag_dict = dict((known.tag(), known)
                for known in (tlv_bus_name, tlv_dev_name, tlv_driver_name,
                              tlv_incoming, tlv_type, tlv_buf))
/bin/bash 2 | # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 3 | 4 | sfd_reg_tlv_get() 5 | { 6 | local op=$1; shift 7 | local num_rec=$1; shift 8 | 9 | local type_len="19050000" 10 | local swid_rec_type="00000000" 11 | local record_locator="0000000" 12 | local resv1="000000" 13 | local resv2="00000000" 14 | 15 | echo $type_len$swid_rec_type$op$record_locator$resv1$num_rec$resv2 16 | } 17 | 18 | sfd_record_get() 19 | { 20 | local type=$1; shift 21 | local mac_31_0=$1; shift 22 | local mac_47_32=$1; shift 23 | local fid_vid=$1; shift 24 | # Depends on record type: 25 | # For type=0x0: param is system_port 26 | # For type=0x1: param is lag_id 27 | # For type=0x2: param is mid 28 | # For type=0xc: param is tunnel_port 29 | local param=$1; shift 30 | 31 | local swid="00" 32 | local policy_a="0" 33 | local resv1="0000" 34 | local resv2="0000" 35 | local resv3=$(printf '%*s' 32 | tr ' ' "0") 36 | 37 | echo $swid$type$policy_a$mac_47_32$mac_31_0$resv1$fid_vid$resv2$param$resv3 38 | } 39 | 40 | sfd_reg_payload_get() 41 | { 42 | local op=$1; shift 43 | local record_type=${1:-"0"}; shift 44 | local fid=${1:-"1000"}; shift 45 | local mac_47_32=${1:-"aabb"}; shift 46 | 47 | local num_rec="01" 48 | local reg_tlv_part_1=$(sfd_reg_tlv_get $op $num_rec) 49 | 50 | local mac_31_0="ccddeeff" 51 | local system_port="0069" 52 | 53 | local sfd_record=$(sfd_record_get $record_type $mac_31_0 $mac_47_32 $fid $system_port) 54 | 55 | local rec_len=32 56 | local num_empty_rec=63 57 | local empty_records_len=$(( $rec_len * $num_empty_rec )) 58 | local empty_records=$(printf '%*s' $empty_records_len | tr ' ' "0") 59 | 60 | echo $reg_tlv_part_1$sfd_record$empty_records 61 | } 62 | 63 | sfdf_reg_payload_get() 64 | { 65 | local flush_type=$1; shift 66 | local param=$1; shift 67 | 68 | local type_len="18060000" 69 | local resv1="00000000" 70 | local resv2="1000000" 71 | local resv_param="000000000000" 72 | local resv3="00000000" 73 | 74 | echo 
$type_len$resv1$flush_type$resv2$resv_param$param$resv3 75 | } 76 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/hdroom_dump.py: -------------------------------------------------------------------------------- 1 | # hdroom_dump Dump mlxsw port headroom configuration in JSON format 2 | 3 | from mlxsw_drgn import * 4 | import json 5 | import sys 6 | 7 | mlxsw_sp = MlxswSp.find() 8 | dump = {} 9 | dump_ports = {} 10 | dump["ports"] = dump_ports 11 | dump["max_headroom_cells"] = mlxsw_sp.sb.max_headroom_cells.value_() 12 | dump["cell_size"] = mlxsw_sp.sb.cell_size.value_() 13 | for mlxsw_sp_port in mlxsw_sp.ports(): 14 | hdroom = mlxsw_sp_port.hdroom 15 | if hdroom.value_() == 0: 16 | continue 17 | 18 | dump_port = {} 19 | dump_ports[mlxsw_sp_port.name()] = dump_port 20 | 21 | dump_port["max_mtu"] = mlxsw_sp_port.max_mtu.value_() 22 | dump_port["max_speed"] = mlxsw_sp_port.max_speed.value_() 23 | 24 | mode_n = enum_name(hdroom.mode) 25 | 26 | dump_port["mode"] = mode_n 27 | dump_port["mtu"] = hdroom.mtu.value_() 28 | dump_port["delay_bytes"] = hdroom.delay_bytes.value_() 29 | 30 | dump_prios = {} 31 | dump_port["prios"] = dump_prios 32 | for mlxsw_sp_hdroom_prio in hdroom.prios.prio: 33 | dump_prio = { 34 | "buf_idx": mlxsw_sp_hdroom_prio.buf_idx.value_(), 35 | "ets_buf_idx": mlxsw_sp_hdroom_prio.ets_buf_idx.value_(), 36 | "set_buf_idx": mlxsw_sp_hdroom_prio.set_buf_idx.value_(), 37 | "lossy": mlxsw_sp_hdroom_prio.lossy.value_(), 38 | } 39 | dump_prios[len(dump_prios)] = dump_prio 40 | 41 | dump_bufs = {} 42 | dump_port["bufs"] = dump_bufs 43 | for mlxsw_sp_hdroom_buf in hdroom.bufs.buf: 44 | dump_buf = { 45 | "thres_cells": mlxsw_sp_hdroom_buf.thres_cells.value_(), 46 | "size_cells": mlxsw_sp_hdroom_buf.size_cells.value_(), 47 | "lossy": mlxsw_sp_hdroom_buf.lossy.value_(), 48 | } 49 | dump_bufs[len(dump_bufs)] = dump_buf 50 | 51 | dump_int_buf = { 52 | "enable": hdroom.int_buf.enable.value_(), 53 | 
/* Return the smaller of two unsigned values. */
static inline unsigned int min_uint(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/*
 * Number of stars, out of @width, that represents @val relative to
 * @val_max. Values above @val_max are clamped to a full bar.
 *
 * The product is computed in 64 bits: in unsigned int arithmetic
 * "val * width" wraps around for large counters and the bar comes out far
 * too short. A zero @val_max or non-positive @width yields an empty bar
 * rather than a division by zero.
 */
static int stars_count(unsigned int val, unsigned int val_max, int width)
{
	unsigned long long scaled;

	if (val_max == 0 || width <= 0)
		return 0;

	scaled = (unsigned long long)min_uint(val, val_max) *
		 (unsigned int)width;

	return (int)(scaled / val_max);
}

/*
 * Print a bar of @width characters for @val: stars for the filled part,
 * spaces for the rest, followed by a '+' if @val exceeds @val_max.
 */
static void print_stars(unsigned int val, unsigned int val_max, int width)
{
	int num_stars = stars_count(val, val_max, width);
	int num_spaces = width - num_stars;
	bool need_plus = val > val_max;
	int i;

	for (i = 0; i < num_stars; i++)
		printf("*");
	for (i = 0; i < num_spaces; i++)
		printf(" ");
	if (need_plus)
		printf("+");
}

/*
 * Print a power-of-two histogram to stdout.
 *
 * @vals:      array of bucket counters; bucket 0 covers 0..1 and bucket i
 *             (i > 0) covers 2^i .. 2^(i+1)-1
 * @vals_size: number of elements in @vals
 * @val_type:  label of the measured quantity, printed in the header
 *
 * Only buckets up to the last non-zero one are printed. Nothing is printed
 * when all the counters are zero.
 */
void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type)
{
	int stars_max = 40, idx_max = -1;
	unsigned int val, val_max = 0;
	unsigned long long low, high;
	int stars, width, i;

	for (i = 0; i < vals_size; i++) {
		val = vals[i];
		if (val > 0)
			idx_max = i;
		if (val > val_max)
			val_max = val;
	}

	if (idx_max < 0)
		return;

	/* Wide histograms get wider number columns and a narrower bar. */
	printf("%*s%-*s : count    distribution\n", idx_max <= 32 ? 5 : 15, "",
	       idx_max <= 32 ? 19 : 29, val_type);

	if (idx_max <= 32)
		stars = stars_max;
	else
		stars = stars_max / 2;

	for (i = 0; i <= idx_max; i++) {
		/*
		 * Derive the upper bound from the lower one. "low << 1" wraps
		 * to 0 for bucket 63, which gives the maximum value as the
		 * upper bound, whereas "1ULL << 64" would be undefined.
		 */
		low = 1ULL << i;
		high = (low << 1) - 1;
		if (low == high)
			low -= 1;
		val = vals[i];
		width = idx_max <= 32 ? 10 : 20;
		/* Conversions match the unsigned argument types. */
		printf("%*llu -> %-*llu : %-8u |", width, low, width, high, val);
		print_stars(val, val_max, stars);
		printf("|\n");
	}
}

/*
 * Raise both the soft and the hard RLIMIT_MEMLOCK limit to RLIM_INFINITY.
 * Returns the result of setrlimit().
 */
int bump_memlock_rlimit(void)
{
	struct rlimit rlim_new = {
		.rlim_cur	= RLIM_INFINITY,
		.rlim_max	= RLIM_INFINITY,
	};

	return setrlimit(RLIMIT_MEMLOCK, &rlim_new);
}
bridge_dev.dev.name.string_().decode("utf-8") 34 | 35 | dump_bridge_dev = {} 36 | dump[br_dev_name] = dump_bridge_dev 37 | 38 | mdb_list = bridge_dev.mdb_list 39 | for mdb_entry in helpers.list_for_each_entry("struct mlxsw_sp_mdb_entry", 40 | mdb_list.address_of_(), 41 | "list"): 42 | dump_mdb_entry = {} 43 | dump_bridge_dev[mdb_entry.mid.value_()] = dump_mdb_entry 44 | 45 | dump_mdb_entry["mac_address"] = mac_addr_get(mdb_entry.key.addr.value_()) 46 | dump_mdb_entry["fid_index"] = mdb_entry.key.fid.value_() 47 | dump_mdb_entry["mid_index"] = mdb_entry.mid.value_() 48 | 49 | dump_ports_list = {} 50 | dump_mdb_entry["ports_list"] = dump_ports_list 51 | port_list_dump(mdb_entry, dump_ports_list) 52 | 53 | sys.stdout.write(json.dumps(dump)) 54 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/port_range_dump.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | 3 | port_range_dump - Dump mlxsw port range registers 4 | 5 | SYNOPSIS: 6 | 7 | port_range_dump 8 | 9 | DESCRIPTION: 10 | 11 | port_range_dump is a tool written on top of drgn, for dumping 12 | internal structures of mlxsw driver related to port range 13 | registers. See kernel commit b3eb04be7299 ("mlxsw: 14 | spectrum_port_range: Add port range core") for more details. 15 | 16 | OUTPUT 17 | 18 | As an output, port_range_dump emits a JSON object with the 19 | following attributes: 20 | 21 | .port_range_registers..min_port 22 | The minimum port number of the port range register. 23 | Inclusive. 24 | 25 | .port_range_registers..max_port 26 | The maximum port number of the port range register. 27 | Inclusive. 28 | 29 | .port_range_registers..is_source 30 | Whether the port range register is configured to match on a 31 | source port range or a destination port range. 32 | 33 | .port_range_registers..refcount 34 | The port range register's reference count. 
In other words, 35 | the number of filters using the register. 36 | 37 | .port_range_registers..index 38 | The port range register's index. 39 | 40 | EXAMPLE: 41 | 42 | # tc qdisc add dev swp1 clsact 43 | # tc filter add dev swp1 ingress pref 1 proto ip flower skip_sw ip_proto udp src_port 100-200 action pass 44 | # tc filter add dev swp1 ingress pref 1 proto ip flower skip_sw ip_proto tcp src_port 100-200 action pass 45 | # tc filter add dev swp1 ingress pref 1 proto ip flower skip_sw ip_proto tcp dst_port 300-400 action pass 46 | 47 | # port_range_dump | jq 48 | { 49 | "port_range_registers": { 50 | "0": { 51 | "min_port": 100, 52 | "max_port": 200, 53 | "is_source": true, 54 | "refcount": 2, 55 | "index": 0 56 | }, 57 | "1": { 58 | "min_port": 300, 59 | "max_port": 400, 60 | "is_source": false, 61 | "refcount": 1, 62 | "index": 1 63 | } 64 | } 65 | } 66 | 67 | SEE ALSO: 68 | 69 | https://github.com/Mellanox/mlxsw/wiki 70 | https://drgn.readthedocs.io 71 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-stats.8.md: -------------------------------------------------------------------------------- 1 | % resmon-stats(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon stats` - dump statistics about objects allocated in the device 7 | 8 | SYNOPSIS 9 | ======== 10 | 11 | `resmon stats` 12 | 13 | DESCRIPTION 14 | =========== 15 | 16 | The daemon collects allocation and deallocation requests for a number of 17 | resource types. 
It can report the number of entries allocated for individual 18 | resource types through the `stats` command: 19 | 20 | ```shell 21 | $ resmon stats 22 | Resource Usage 23 | IPv4 LPM 29 / 524288 (0%) 24 | IPv6 LPM 35 / 524288 (0%) 25 | ATCAM 12 / 524288 (0%) 26 | ACL Action Set 1008 / 524288 (0%) 27 | IPv4 Host Table 6 / 524288 (0%) 28 | IPv6 Host Table 0 / 524288 (0%) 29 | Adjacency Table 0 / 524288 (0%) 30 | FDB Entry 74 / 524288 (0%) 31 | Total 1164 / 524288 (0%) 32 | ``` 33 | 34 | The value after the slash is the capacity of on-chip memory. This is reported 35 | separately at each resource for convenience, but actually all resources consume 36 | the same memory. This can be seen in the "Total" line, where individual resource 37 | usages are summed, but the capacity is still the same. 38 | 39 | RPC REQUEST 40 | =========== 41 | 42 | The RPC method takes no parameters: 43 | 44 | ``` 45 | { 46 | "jsonrpc": "2.0", 47 | "id": $ID, 48 | "method": "stats" 49 | } 50 | ``` 51 | 52 | RPC RESPONSE 53 | ============ 54 | 55 | The `stats` method returns a list of resources with their consumptions and 56 | capacities: 57 | 58 | ``` 59 | { 60 | "jsonrpc": "2.0", 61 | "id": $ID, 62 | "result": { 63 | "gauges": [ 64 | { 65 | "name": "unique_name", 66 | "descr": "Human-readable description", 67 | "value": $V, 68 | "capacity": $C 69 | }, 70 | ... 71 | ] } 72 | } 73 | ``` 74 | 75 | The result object has one element, "gauges", whose value is an array of objects. 76 | Each object has a unique symbolic name of the resource for which this gauge is; 77 | a human-readable description of what the resource is; the current occupancy under 78 | "value"; and the current capacity. 79 | 80 | SEE ALSO 81 | ======== 82 | 83 | resmon(8) 84 | 85 | [JSON RPC specification][JSON RPC]. 86 | 87 | REPORTING ISSUES 88 | ================ 89 | 90 | To report issues please send an email to: mlxsw@nvidia.com. 
91 | 92 | [JSON RPC]: https://www.jsonrpc.org/specification 93 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/emadump_example.txt: -------------------------------------------------------------------------------- 1 | Demonstration of emadump. 2 | 3 | EMADs (Ethernet Management Datagrams) are configuration packets 4 | exchanged between the mlxsw driver and the underlying device/firmware 5 | over a bus such as PCI or I2C. These packets are similar in nature to 6 | the netlink packets exchanged between user space and kernel. 7 | 8 | Each EMAD transaction initiated by the driver encodes a single register 9 | and is either a request to write to the register or a request to query 10 | from it. 11 | 12 | emadump traces EMADs and dumps them to a PCAP file. By default, EMADs 13 | are dumped to stdout, making it easy to pipe EMADs to Wireshark. For 14 | example: 15 | 16 | # ./emadump | tshark -X lua_script:../../EMADs/emad_dissector.lua -r - 17 | 1 0.000000 Mellanox_01:02:03 _ Mellanox_00:00:01 EMAD 1212 Ethernet II 18 | 2 0.000513 Mellanox_01:02:03 _ Mellanox_00:00:01 EMAD 1212 Ethernet II 19 | 20 | Alternatively, EMADs can be dumped to a file for later inspection. For 21 | example: 22 | 23 | # ./emadump -f emads.pcap 24 | 25 | It is also possible to filter only EMADs (request and response) that 26 | took longer than a specified threshold in microseconds. For example: 27 | 28 | # ./emadump -l 1000 -f emads.pcap 29 | 30 | Finally, it is also possible to filter only EMADs that encountered some error 31 | during processing. For example: 32 | 33 | # ./emadump -e -f emads.pcap 34 | 35 | USAGE message: 36 | 37 | # ./emadump --help 38 | Usage: emadump [OPTION...] 39 | Dump EMADs to a PCAP file. 
40 | 41 | USAGE: emadump [--help] [-e] [-l] [-f] [-v] 42 | 43 | EXAMPLES: 44 | emadump # dump all EMADs to stdout 45 | emadump -e # only dump EMADs (request & response) with errors 46 | emadump -l 1000 # only dump EMADs that took longer than 1000 usecs 47 | emadump -f emads.pcap # dump EMADs to emads.pcap instead of stdout 48 | 49 | -e, --errors Only dump EMADs with errors 50 | -f, --file=FILE Dump EMADs to this file 51 | -l, --latency=LAT Only dump EMADs that took longer than specified 52 | threshold in microseconds 53 | -v, --verbose Verbose debug output 54 | -?, --help Give this help list 55 | --usage Give a short usage message 56 | -V, --version Print program version 57 | 58 | Mandatory or optional arguments to long options are also mandatory or optional 59 | for any corresponding short options. 60 | 61 | Report bugs to <mlxsw@nvidia.com>. 62 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-exporter.8.md: -------------------------------------------------------------------------------- 1 | % resmon-exporter(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon-exporter` - Prometheus node exporter for `resmon` 7 | 8 | SYNOPSIS 9 | ======== 10 | 11 | `resmon-exporter [--resmon-sockdir <dir>] -f <file> [-i <interval> | -1]` 12 | 13 | `resmon-exporter [--resmon-sockdir <dir>] -l <address>:<port>` 14 | 15 | DESCRIPTION 16 | =========== 17 | 18 | `resmon-exporter` is a Prometheus node exporter, a component that exports 19 | the statistics collected by `resmon` in a format that the Prometheus 20 | time-series database understands. The exporter can run in one of two modes: 21 | a file mode, in which it fetches the statistics and dumps them in a file, 22 | either once or in regular intervals; or a listening mode, in which the 23 | exporter accepts TCP connections on a designated port, and serves the 24 | result to its clients. 25 | 26 | OPTIONS 27 | ======= 28 | 29 | `-f <file>` 30 | 31 | : Run the exporter in file mode. 
`<file>` is the name of the file 32 | where the statistics are stored. 33 | 34 | `-i <interval>` 35 | 36 | : When in file mode, how often should the statistics be scraped and the 37 | file updated. 38 | 39 | `-1` 40 | 41 | : When in file mode, scrape the statistics once, store them in the file, 42 | and exit. 43 | 44 | `-l <address>:<port>` 45 | 46 | : Open a TCP socket on a given address and port. Every time a connection is 47 | made to this port, scrape the statistics and send them as a response. 48 | 49 | `--resmon-sockdir <dir>` 50 | 51 | : The directory in which `resmon` opens the RPC socket. This should only be 52 | necessary to customize if the `resmon` daemon itself is running with 53 | non-default `--sockdir`. 54 | 55 | EXAMPLE 56 | ======= 57 | 58 | ```shell 59 | $ resmon-exporter -l 0.0.0.0:9417 60 | $ curl http://localhost:9417 61 | # HELP node_net_resmon_stats Resmon stats 62 | # TYPE node_net_resmon_stats gauge 63 | node_net_resmon_stats{descr="IPv4 LPM",name="LPM_IPV4"} 0.0 64 | node_net_resmon_stats{descr="IPv6 LPM",name="LPM_IPV6"} 0.0 65 | ... etc ... 66 | node_net_resmon_stats{descr="Total",name="TOTAL"} 0.0 67 | # HELP node_net_resmon_stats_capacity Resmon stats capacity 68 | # TYPE node_net_resmon_stats_capacity gauge 69 | node_net_resmon_stats_capacity{descr="IPv4 LPM",name="LPM_IPV4"} 10000.0 70 | node_net_resmon_stats_capacity{descr="IPv6 LPM",name="LPM_IPV6"} 10000.0 71 | ... etc ... 72 | node_net_resmon_stats_capacity{descr="Total",name="TOTAL"} 10000.0 73 | ``` 74 | 75 | SEE ALSO 76 | ======== 77 | 78 | resmon(8) 79 | 80 | REPORTING ISSUES 81 | ================ 82 | 83 | To report issues please send an email to: mlxsw@nvidia.com. 
84 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/README.md: -------------------------------------------------------------------------------- 1 | # mlxsw `resmon` 2 | 3 | `resmon` is a daemon for monitoring ASIC resource consumption in NVIDIA Spectrum 4 | Ethernet switches with Linux `mlxsw` driver. 5 | 6 | ## Building & Installation 7 | 8 | To build `resmon`, first make sure that development packages for the following 9 | libraries are installed on the system: 10 | 11 | - [json-c](https://github.com/json-c/json-c) 12 | - [systemd](https://systemd.io/) 13 | - [libnl-genl](http://www.infradead.org/~tgr/libnl/) 14 | 15 | Additionally, to build `resmon` manual pages, make sure the following is 16 | installed: 17 | 18 | - [pandoc](https://pandoc.org/) 19 | 20 | To run the Prometheus exporter described below, make sure the following 21 | is installed: 22 | 23 | - [prometheus-client](https://github.com/prometheus/client_python) 24 | 25 | On top of that, some requirements arise from the fact that `resmon` uses a BPF 26 | component. These requirements are covered in the [top-level 27 | README.md](../README.md). 28 | 29 | Please refer to the [top-level README.md](../README.md) for detailed building and installation instructions as well. 30 | 31 | ## Daemon Control 32 | 33 | To start and stop the daemon, use the commands `start` and `stop`: 34 | 35 | ```shell 36 | $ resmon start 37 | $ resmon stop 38 | ``` 39 | 40 | Under usual circumstances, it is expected that `systemctl` will be used to 41 | manage the daemon. To start, inspect status of, and stop the `resmon` daemon: 42 | 43 | ```shell 44 | $ systemctl start resmon 45 | $ systemctl status resmon 46 | $ systemctl stop resmon 47 | ``` 48 | 49 | For further details, please consult man pages [`resmon(8)`](resmon.8.md), 50 | [`resmon-start(8)`](resmon-start.8.md) and 51 | [`resmon-stop(8)`](resmon-stop.8.md). 
52 | 53 | ## Prometheus Exporter 54 | 55 | Shipped together with `resmon` is a Prometheus node exporter, a component that 56 | exports the statistics collected by `resmon` in a format that the Prometheus 57 | time-series database understands. This can be started using `systemctl`: 58 | 59 | ```shell 60 | $ systemctl start resmon-exporter 61 | $ systemctl status resmon-exporter 62 | $ systemctl stop resmon-exporter 63 | ``` 64 | 65 | By default, the resmon-exporter service opens a TCP socket at 0.0.0.0:9417. 66 | Requests at that port are responded to with a resmon stats scrape in 67 | Prometheus node exporter format: 68 | 69 | ```shell 70 | $ curl http://localhost:9417 71 | # HELP node_net_resmon_stats Resmon stats 72 | # TYPE node_net_resmon_stats gauge 73 | node_net_resmon_stats{descr="IPv4 LPM",name="LPM_IPV4"} 0.0 74 | node_net_resmon_stats{descr="IPv6 LPM",name="LPM_IPV6"} 0.0 75 | ... etc ... 76 | ``` 77 | 78 | For further details, please consult the 79 | [`resmon-exporter(8)`](resmon-exporter.8.md) man page. 80 | -------------------------------------------------------------------------------- /Debugging/EMADs/devlink-hwmsg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | """ 4 | Copyright 2016, 2018 Mellanox Technologies. All rights reserved. 5 | Licensed under the GNU General Public License, version 2 as 6 | published by the Free Software Foundation; see COPYING for details. 
7 | """ 8 | 9 | __author__ = """ 10 | jiri@mellanox.com (Jiri Pirko) 11 | """ 12 | 13 | import perf 14 | import sys 15 | import struct 16 | from common import pcap_header_out, pcap_packet_header, \ 17 | tlv_bus_name, tlv_dev_name, tlv_driver_name, tlv_incoming, tlv_type, tlv_buf 18 | 19 | from signal import signal, SIGPIPE, SIG_DFL 20 | signal(SIGPIPE,SIG_DFL) 21 | 22 | class tracepoint(perf.evsel): 23 | def __init__(self, sys, name): 24 | config = perf.tracepoint(sys, name) 25 | perf.evsel.__init__(self, type = perf.TYPE_TRACEPOINT, config = config, 26 | freq = 0, sample_period = 1, wakeup_events = 1, 27 | sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | 28 | perf.SAMPLE_CPU | perf.SAMPLE_RAW | 29 | perf.SAMPLE_TIME) 30 | 31 | def tlv_data(data_type, data): 32 | enc = data_type.encode(data) 33 | tlv_header = struct.pack("HH", data_type.tag(), len(enc)) 34 | return tlv_header + enc 35 | 36 | def event_out(event): 37 | data = bytearray() 38 | data += tlv_data(tlv_bus_name, event.bus_name) 39 | data += tlv_data(tlv_dev_name, event.dev_name) 40 | data += tlv_data(tlv_driver_name, event.driver_name) 41 | data += tlv_data(tlv_incoming, event.incoming) 42 | data += tlv_data(tlv_type, event.type) 43 | data += tlv_data(tlv_buf, event.buf) 44 | 45 | secs = event.sample_time // 1000000000 46 | usecs = (event.sample_time % 1000000000) // 1000 47 | sys.stdout.write(pcap_packet_header(secs, usecs, len(data))) 48 | sys.stdout.write(data) 49 | sys.stdout.flush() 50 | 51 | def main(): 52 | sys.stdout = open('/dev/stdout', 'wb') 53 | 54 | tp = tracepoint("devlink", "devlink_hwmsg") 55 | cpus = perf.cpu_map() 56 | threads = perf.thread_map(-1) 57 | 58 | evlist = perf.evlist(cpus, threads) 59 | evlist.add(tp) 60 | evlist.open() 61 | evlist.mmap() 62 | 63 | pcap_header_out(sys.stdout) 64 | 65 | while True: 66 | try: 67 | evlist.poll(timeout = -1) 68 | except KeyboardInterrupt: 69 | break 70 | for cpu in cpus: 71 | while True: 72 | event = evlist.read_on_cpu(cpu) 73 | if not event: 
74 | break 75 | if not isinstance(event, perf.sample_event): 76 | continue 77 | event_out(event) 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/pgt_table_dump.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | 3 | pgt_table_dump - Dump mlxsw PGT (port group table) configuration 4 | 5 | SYNOPSIS: 6 | 7 | pgt_table_dump 8 | 9 | DESCRIPTION: 10 | 11 | pgt_table_dump is a tool written on top of drgn, for dumping 12 | an internal structure in mlxsw driver which represents the PGT 13 | table in hardware. That way it is possible to find out how PGT 14 | entries are configured in the ASIC. 15 | 16 | OUTPUT 17 | 18 | As an output, pgt_table_dump emits a JSON object with the following 19 | attributes: 20 | 21 | .end_index 22 | The last index (exclusive) of the PGT table, this value is read 23 | from hardware and depends on ASIC generation. 24 | 25 | .smpe_index_valid 26 | Indicates if the SMPE index is valid in the PGT table. 27 | In Spectrum-1, the SMPE index needs to be programmed as part of 28 | the PGT entry, while it is reserved for Spectrum-2 and later 29 | ASICs. See more information in kernel commit a1697d11c945d 30 | ("mlxsw: Add an indication of SMPE index validity for PGT table") 31 | 32 | .pgt_entries..mid_index 33 | Multicast identifier, the index of the entry. 34 | 35 | .pgt_entries..smpe_index 36 | SMPE (switch multicast to port egress VID) index, the index 37 | into the MPE table. This field appears only in case that 38 | .pgt_table.smpe_index_valid is true. 39 | 40 | .pgt_entries..ports..local_port 41 | A local port to transmit the packet to. 42 | 43 | pgt_table_dump always outputs the complete information. Filtering 44 | and querying can be done e.g. through `jq`. 
45 | 46 | EXAMPLE: 47 | 48 | # ./pgt_table_dump | jq 49 | { 50 | "end_index": 31744, 51 | "smpe_index_valid": true, 52 | "pgt_entries": { 53 | "0": { 54 | "mid_index": 0, 55 | "smpe_index": 1, 56 | "ports": { 57 | "swp14": { 58 | "local_port": 50 59 | }, 60 | "swp15": { 61 | "local_port": 51 62 | } 63 | } 64 | }, 65 | "4094": { 66 | "mid_index": 4094, 67 | "smpe_index": 1, 68 | "ports": { 69 | "router_port": { 70 | "local_port": 66 71 | }, 72 | "swp15": { 73 | "local_port": 51 74 | }, 75 | "swp14": { 76 | "local_port": 50 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | SEE ALSO: 84 | 85 | https://github.com/Mellanox/mlxsw/wiki 86 | https://drgn.readthedocs.io 87 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/common/map_helpers.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | // Copyright (c) 2020 Anton Protopopov 3 | #include 4 | #include 5 | #include 6 | 7 | #include "map_helpers.h" 8 | 9 | #define warn(...) 
fprintf(stderr, __VA_ARGS__) 10 | 11 | static bool batch_map_ops = true; /* hope for the best */ 12 | 13 | static int 14 | dump_hash_iter(int map_fd, void *keys, __u32 key_size, 15 | void *values, __u32 value_size, __u32 *count, 16 | void *invalid_key) 17 | { 18 | __u8 key[key_size], next_key[key_size]; 19 | __u32 n = 0; 20 | int err; 21 | 22 | /* First get keys */ 23 | __builtin_memcpy(key, invalid_key, key_size); 24 | while (n < *count) { 25 | err = bpf_map_get_next_key(map_fd, key, next_key); 26 | if (err && errno != ENOENT) { 27 | return -1; 28 | } else if (err) { 29 | break; 30 | } 31 | __builtin_memcpy(key, next_key, key_size); 32 | __builtin_memcpy(keys + key_size * n, next_key, key_size); 33 | n++; 34 | } 35 | 36 | /* Now read values */ 37 | for (int i = 0; i < n; i++) { 38 | err = bpf_map_lookup_elem(map_fd, keys + key_size * i, 39 | values + value_size * i); 40 | if (err) 41 | return -1; 42 | } 43 | 44 | *count = n; 45 | return 0; 46 | } 47 | 48 | static int 49 | dump_hash_batch(int map_fd, void *keys, __u32 key_size, 50 | void *values, __u32 value_size, __u32 *count) 51 | { 52 | void *in = NULL, *out; 53 | __u32 n, n_read = 0; 54 | int err = 0; 55 | 56 | while (n_read < *count && !err) { 57 | n = *count - n_read; 58 | err = bpf_map_lookup_batch(map_fd, &in, &out, 59 | keys + n_read * key_size, 60 | values + n_read * value_size, 61 | &n, NULL); 62 | if (err && errno != ENOENT) { 63 | return -1; 64 | } 65 | n_read += n; 66 | in = out; 67 | } 68 | 69 | *count = n_read; 70 | return 0; 71 | } 72 | 73 | int dump_hash(int map_fd, 74 | void *keys, __u32 key_size, 75 | void *values, __u32 value_size, 76 | __u32 *count, void *invalid_key) 77 | { 78 | int err; 79 | 80 | if (!keys || !values || !count || !key_size || !value_size) { 81 | errno = EINVAL; 82 | return -1; 83 | } 84 | 85 | if (batch_map_ops) { 86 | err = dump_hash_batch(map_fd, keys, key_size, 87 | values, value_size, count); 88 | if (err) { 89 | if (errno != EINVAL) { 90 | return -1; 91 | 92 | /* assume 
that batch operations are not 93 | * supported and try non-batch mode */ 94 | batch_map_ops = false; 95 | } 96 | } 97 | } 98 | 99 | if (!invalid_key) { 100 | errno = EINVAL; 101 | return -1; 102 | } 103 | 104 | return dump_hash_iter(map_fd, keys, key_size, 105 | values, value_size, count, invalid_key); 106 | } 107 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-emad.8.md: -------------------------------------------------------------------------------- 1 | % resmon-emad(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon emad` - inject an EMAD into the `resmon` daemon in mock mode 7 | 8 | SYNOPSIS 9 | ======== 10 | 11 | `resmon emad string ` 12 | 13 | DESCRIPTION 14 | =========== 15 | 16 | When `resmon` is run in the mock mode (and only then), it understands the RPC 17 | method `emad`. This is the only way to inject EMADs into `resmon` in the mock 18 | mode. 19 | 20 | See `resmon-start(8)` for details on the mock mode. 21 | 22 | PARAMETERS 23 | ========== 24 | 25 | `string ` 26 | 27 | : Payload of the EMAD to inject into the daemon. Payload only, i.e. without 28 | the Ethernet header. 29 | 30 | The payload is given in a hex-coded EMAD payload. Hex-coding means that 31 | each byte is represented by two hexadecimal digits describing the byte 32 | value. Thus e.g. the string "Hello!" (ASCII 0x48, 0x65, 0x6c, 0x6c, 33 | 0x6f, 0x21) would be hex-coded as 48656c6c6f21. 34 | 35 | A way to obtain the EMADs in question in the first place is through the tool 36 | `emadump` in `src` directory of the repository, e.g. by feeding the resulting 37 | pcap file to wireshark. 38 | 39 | EXAMPLE 40 | ======= 41 | 42 | ``` 43 | $ resmon --sockdir . start mode mock 44 | $ resmon --sockdir . 
-v emad string \ 45 | 08040000801382012d68bbc20004cbd3102100000000000000000000000000` 46 | `00000000000000000000000000000000000000000000000000000000000000` 47 | `00000000000000000000000000000000000000000000000000000000000000` 48 | `00000000000000000000000000000000000000000000000000000000000000` 49 | `000000000000000000000000000000000000000000000000180f0000000100` 50 | `000000000000000020000000000000000000000000c6010203802000020000` 51 | `0000000000000000000000000000000000000000000000010000 52 | resmond took the EMAD 53 | $ resmon --sockdir . stats 54 | Resource Usage 55 | IPv4 LPM 1 / 10000 (0%) <-- newly-allocated resource 56 | [...] 57 | Total 1 / 10000 (0%) 58 | ``` 59 | 60 | RPC REQUEST 61 | =========== 62 | 63 | ``` 64 | { 65 | "jsonrpc": "2.0", 66 | "id": $ID, 67 | "method": "emad", 68 | "params": { 69 | "payload": "" 70 | } 71 | } 72 | ``` 73 | 74 | RPC RESPONSE 75 | ============ 76 | 77 | ``` 78 | { 79 | "jsonrpc": "2.0", 80 | "id": $ID, 81 | "result": null 82 | } 83 | ``` 84 | 85 | This response indicates that the daemon accepted the EMAD. A failure of any 86 | kind would be communicated through an error response. 87 | 88 | SEE ALSO 89 | ======== 90 | 91 | resmon(8), resmon-start(8) 92 | 93 | [JSON RPC specification][JSON RPC]. 94 | 95 | REPORTING ISSUES 96 | ================ 97 | 98 | To report issues please send an email to: mlxsw@nvidia.com. 99 | 100 | [JSON RPC]: https://www.jsonrpc.org/specification 101 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/mlxsw_drgn.py: -------------------------------------------------------------------------------- 1 | # This module collects helpers related to working with the mlxsw Spectrum 2 | # driver. But since it's a module, the drgn `prog' variable is not available. So 3 | # we need to construct it anew. Then once we have it, we can export it to the 4 | # tool, and use drgn as just a library, and run the tool through python. 
5 | 6 | import sys 7 | import drgn 8 | import drgn.helpers.linux as helpers 9 | 10 | prog = drgn.Program() 11 | prog.set_kernel() 12 | 13 | try: 14 | prog.load_debug_info(None, default=True, main=True) 15 | except drgn.MissingDebugInfoError as e: 16 | print(str(e), file=sys.stderr) 17 | 18 | class MlxswSpPort: 19 | def __init__(self, _mlxsw_sp_port): 20 | self._mlxsw_sp_port = _mlxsw_sp_port 21 | def __getattr__(self, key): 22 | return getattr(self._mlxsw_sp_port, key) 23 | 24 | def name(self): 25 | dev = self._mlxsw_sp_port.dev 26 | if dev.value_() == 0: 27 | raise RuntimeError("No netdev associated with the port") 28 | return dev.name.string_().decode("utf-8") 29 | 30 | class MlxswSp: 31 | def __init__(self, _mlxsw_sp): 32 | self._mlxsw_sp = _mlxsw_sp 33 | def __getattr__(self, key): 34 | return getattr(self._mlxsw_sp, key) 35 | 36 | def ports(self): 37 | mlxsw_core = self._mlxsw_sp.core 38 | max_ports = mlxsw_core.max_ports.value_() - 1 39 | for i in range(0, max_ports): 40 | if self._mlxsw_sp.ports[i].value_() == 0: 41 | continue 42 | yield MlxswSpPort(self._mlxsw_sp.ports[i]) 43 | 44 | def ports_mapping(self): 45 | ports_mapping = {} 46 | for mlxsw_sp_port in self.ports(): 47 | if mlxsw_sp_port.dev.value_() == 0: 48 | continue 49 | 50 | local_port = mlxsw_sp_port.local_port.value_() 51 | ports_mapping[local_port] = mlxsw_sp_port.name() 52 | 53 | router_port = self._mlxsw_sp.core.max_ports.value_() + 1 54 | ports_mapping[router_port] = "router_port" 55 | 56 | return ports_mapping 57 | 58 | @staticmethod 59 | def find(): 60 | devlinks = prog['devlinks'].address_of_() 61 | mlxsw_devlink_ops = prog['mlxsw_devlink_ops'].address_of_() 62 | for _, entry in helpers.xarray.xa_for_each(devlinks): 63 | devlink = drgn.reinterpret(prog.type("struct devlink *"), entry) 64 | if devlink.ops == mlxsw_devlink_ops: 65 | mlxsw_core = drgn.reinterpret(prog.type("struct mlxsw_core"), 66 | devlink.priv) 67 | mlxsw_sp = drgn.reinterpret(prog.type("struct mlxsw_sp"), 68 | 
mlxsw_core.driver_priv) 69 | return MlxswSp(mlxsw_sp) 70 | raise RuntimeError("mlxsw devlink instance not found") 71 | 72 | def netns(self): 73 | mlxsw_core = self._mlxsw_sp.core 74 | devlink = drgn.container_of(mlxsw_core, "struct devlink", "priv") 75 | 76 | return devlink._net.net 77 | 78 | def enum_name(in_enum): 79 | enum_n, = list(enum.name 80 | for enum in in_enum.type_.enumerators 81 | if enum.value == in_enum.value_()) 82 | 83 | return enum_n.split("_")[-1] 84 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/fid_dump_ub0.py: -------------------------------------------------------------------------------- 1 | # fid_dump_ub0 Dump FID configuration in JSON format 2 | 3 | from socket import ntohl 4 | from mlxsw_drgn import * 5 | import json 6 | import sys 7 | 8 | mlxsw_sp = MlxswSp.find() 9 | dump = {} 10 | 11 | dump_ports = {} 12 | dump["ports"] = dump_ports 13 | 14 | for mlxsw_sp_port in mlxsw_sp.ports(): 15 | if mlxsw_sp_port.dev.value_() == 0: 16 | continue 17 | 18 | dump_port = {} 19 | dump_ports[mlxsw_sp_port.name()] = dump_port 20 | 21 | local_port = mlxsw_sp_port.local_port.value_() 22 | virtual = mlxsw_sp.fid_core.port_fid_mappings[local_port] != 0 23 | 24 | dump_port["local_port"] = local_port 25 | dump_port["virtual"] = virtual 26 | 27 | dump_fid_families = {} 28 | dump["fid_families"] = dump_fid_families 29 | 30 | for family in mlxsw_sp.fid_core.fid_family_arr: 31 | dump_fid_family = {} 32 | family_type_n = enum_name(family.type) 33 | dump_fid_families[family_type_n] = dump_fid_family 34 | 35 | dump_fid_family["start_index"] = family.start_index.value_() 36 | dump_fid_family["end_index"] = family.end_index.value_() 37 | dump_fid_family["rif_type"] = enum_name(family.rif_type) 38 | 39 | dump_fids = {} 40 | dump_fid_family["fids"] = dump_fids 41 | 42 | for fid in helpers.list_for_each_entry("struct mlxsw_sp_fid", 43 | family.fids_list.address_of_(), 44 | "list"): 45 | dump_fid = {} 46 | 
dump_fids[fid.fid_index.value_()] = dump_fid 47 | 48 | if family_type_n == "8021Q": 49 | fid_8021q = drgn.container_of(fid, "struct mlxsw_sp_fid_8021q", 50 | "common") 51 | dump_fid["vid"] = fid_8021q.vid.value_() 52 | 53 | if family_type_n == "8021D": 54 | fid_8021d = drgn.container_of(fid, "struct mlxsw_sp_fid_8021d", 55 | "common") 56 | br_ifindex = fid_8021d.br_ifindex.value_() 57 | br_dev = helpers.net.netdev_get_by_index(mlxsw_sp.netns(), 58 | br_ifindex) 59 | 60 | dump_fid["br_ifindex"] = br_ifindex 61 | dump_fid["br_ifname"] = br_dev.name.string_().decode("utf-8") 62 | 63 | dump_fid["ref_count"] = fid.ref_count.refs.counter.value_() 64 | 65 | if fid.rif.value_(): 66 | rif_dump = {} 67 | dump_fid["rif"] = rif_dump 68 | 69 | rif_dump["index"] = fid.rif.rif_index.value_() 70 | rif_dump["ifindex"] = fid.rif.dev.ifindex.value_() 71 | rif_dump["ifname"] = fid.rif.dev.name.string_().decode("utf-8") 72 | 73 | if fid.vni_valid.value_(): 74 | nve_ifindex = fid.nve_ifindex.value_() 75 | nve_dev = helpers.net.netdev_get_by_index(mlxsw_sp.netns(), 76 | nve_ifindex) 77 | 78 | dump_fid["vni"] = ntohl(fid.vni.value_()) 79 | dump_fid["nve_ifindex"] = nve_ifindex 80 | dump_fid["nve_ifname"] = nve_dev.name.string_().decode("utf-8") 81 | 82 | if fid.nve_flood_index_valid.value_(): 83 | dump_fid["nve_flood_index"] = fid.nve_flood_index.value_() 84 | 85 | sys.stdout.write(json.dumps(dump)) 86 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 2 | #include "vmlinux.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define EMAD_ETH_HDR_LEN 0x10 10 | #define EMAD_OP_TLV_LEN 0x10 11 | #define EMAD_OP_TLV_METHOD_MASK 0x7F 12 | #define EMAD_OP_TLV_STATUS_MASK 0x7F 13 | 14 | enum { 15 | EMAD_OP_TLV_METHOD_QUERY = 1, 16 | EMAD_OP_TLV_METHOD_WRITE = 2, 17 | 
EMAD_OP_TLV_METHOD_EVENT = 5, 18 | }; 19 | 20 | struct emad_tlv_head { 21 | int type; 22 | int length; 23 | }; 24 | 25 | struct emad_op_tlv { 26 | __be16 type_len_be; 27 | u8 status; 28 | u8 resv2; 29 | u16 reg_id; 30 | u8 r_method; 31 | u8 resv3; 32 | u64 tid; 33 | }; 34 | 35 | static struct emad_tlv_head emad_tlv_decode_header(__be16 type_len_be) 36 | { 37 | u16 type_len = bpf_ntohs(type_len_be); 38 | 39 | return (struct emad_tlv_head){ 40 | .type = type_len >> 11, 41 | .length = type_len & 0x7ff, 42 | }; 43 | } 44 | 45 | struct { 46 | __uint(type, BPF_MAP_TYPE_RINGBUF); 47 | __uint(max_entries, 8 * 1024 * 1024 /* 8 MB */); 48 | } ringbuf SEC(".maps"); 49 | 50 | static int push_to_ringbuf(const u8 *buf, size_t len) 51 | { 52 | u8 *space; 53 | 54 | if (len > 2048) 55 | return 0; 56 | 57 | else if (len > 1024) 58 | space = bpf_ringbuf_reserve(&ringbuf, 2048, 0); 59 | else if (len > 512) 60 | space = bpf_ringbuf_reserve(&ringbuf, 1024, 0); 61 | else if (len > 256) 62 | space = bpf_ringbuf_reserve(&ringbuf, 512, 0); 63 | else 64 | space = bpf_ringbuf_reserve(&ringbuf, 256, 0); 65 | 66 | if (!space) { 67 | bpf_printk("Unable to reserve %lu bytes for an EMAD on ring buffer\n", 68 | len); 69 | return 0; 70 | } 71 | 72 | bpf_core_read(space, len, buf); 73 | bpf_ringbuf_submit(space, 0); 74 | 75 | return 0; 76 | } 77 | 78 | SEC("fentry/mlxsw_emad_rx_listener_func") 79 | int BPF_PROG(mlxsw_emad_rx_listener_func, struct sk_buff *skb) 80 | { 81 | struct emad_tlv_head tlv_head; 82 | struct emad_op_tlv op_tlv; 83 | void *buf = skb->data; 84 | unsigned int len; 85 | 86 | buf += EMAD_ETH_HDR_LEN; 87 | 88 | bpf_core_read(&op_tlv, sizeof(op_tlv), buf); 89 | tlv_head = emad_tlv_decode_header(op_tlv.type_len_be); 90 | 91 | /* Filter out queries and events. Later on we can assume `op' 92 | * fields in a register refer to a write. 93 | */ 94 | if ((op_tlv.r_method & EMAD_OP_TLV_METHOD_MASK) 95 | != EMAD_OP_TLV_METHOD_WRITE) 96 | return 0; 97 | 98 | /* Filter out errors. 
*/ 99 | if (op_tlv.status & EMAD_OP_TLV_STATUS_MASK) 100 | return 0; 101 | 102 | switch (bpf_ntohs(op_tlv.reg_id)) { 103 | case 0x8013: /* MLXSW_REG_RALUE_ID */ 104 | case 0x3006: /* MLXSW_REG_PTAR_ID */ 105 | case 0x3027: /* MLXSW_REG_PTCE3_ID */ 106 | case 0x300F: /* MLXSW_REG_PEFA_ID */ 107 | case 0x3804: /* MLXSW_REG_IEDR_ID */ 108 | case 0x8014: /* MLXSW_REG_RAUHT_ID */ 109 | case 0x8008: /* MLXSW_REG_RATR_ID */ 110 | case 0x200A: /* MLXSW_REG_SFD_ID */ 111 | case 0x2013: /* MLXSW_REG_SFDF_ID */ 112 | case 0x201C: /* MLXSW_REG_SVFA_ID */ 113 | case 0x8021: /* MLXSW_REG_RIPS_ID */ 114 | case 0x201F: /* MLXSW_REG_SFMR_ID */ 115 | return push_to_ringbuf(buf, skb->len - EMAD_ETH_HDR_LEN); 116 | }; 117 | return 0; 118 | } 119 | 120 | char LICENSE[] SEC("license") = "GPL"; 121 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 2 | #define _GNU_SOURCE 3 | #include 4 | #include 5 | #include 6 | 7 | #include "resmon.h" 8 | #include "config.h" 9 | 10 | struct resmon_env env = { 11 | .verbosity = 0, 12 | }; 13 | const char *program_version = "resmon 1.0"; 14 | const char *program_bug_address = ""; 15 | 16 | static int resmon_help(void) 17 | { 18 | puts("Monitor resource usage in a Spectrum switch.\n" 19 | "\n" 20 | "Usage: resmon [OPTIONS] { COMMAND | help }\n" 21 | "where OPTIONS := [ -h | --help | -q | --quiet | -v | --verbose |\n" 22 | " -V | --version | --sockdir | --json ]\n" 23 | " COMMAND := { start | stop | ping | emad | stats | dump }\n" 24 | ); 25 | return 0; 26 | } 27 | 28 | static int resmon_cmd(int argc, char **argv) 29 | { 30 | if (!argc || strcmp(*argv, "help") == 0) { 31 | return resmon_help(); 32 | } else if (strcmp(*argv, "start") == 0) { 33 | NEXT_ARG_FWD(); 34 | return resmon_d_start(argc, argv); 35 | } else if (strcmp(*argv, "stop") == 0) { 36 
| NEXT_ARG_FWD(); 37 | return resmon_c_stop(argc, argv); 38 | } else if (strcmp(*argv, "ping") == 0) { 39 | NEXT_ARG_FWD(); 40 | return resmon_c_ping(argc, argv); 41 | } else if (strcmp(*argv, "emad") == 0) { 42 | NEXT_ARG_FWD(); 43 | return resmon_c_emad(argc, argv); 44 | } else if (strcmp(*argv, "stats") == 0) { 45 | NEXT_ARG_FWD(); 46 | return resmon_c_stats(argc, argv); 47 | } else if (strcmp(*argv, "dump") == 0) { 48 | NEXT_ARG_FWD(); 49 | return resmon_c_dump(argc, argv); 50 | } 51 | 52 | fprintf(stderr, "Unknown command \"%s\"\n", *argv); 53 | return -EINVAL; 54 | } 55 | 56 | int main(int argc, char **argv) 57 | { 58 | enum { 59 | opt_sockaddr = 257, 60 | opt_json, 61 | }; 62 | static const struct option long_options[] = { 63 | { "help", no_argument, NULL, 'h' }, 64 | { "json", no_argument, NULL, opt_json }, 65 | { "quiet", no_argument, NULL, 'q' }, 66 | { "verbose", no_argument, NULL, 'v' }, 67 | { "version", no_argument, NULL, 'V' }, 68 | { "sockdir", required_argument, NULL, opt_sockaddr }, 69 | { NULL, 0, NULL, 0 } 70 | }; 71 | int opt; 72 | 73 | env.sockdir = RESMON_DEFAULT_SOCKDIR; 74 | while ((opt = getopt_long(argc, argv, "hqvV", 75 | long_options, NULL)) >= 0) { 76 | switch (opt) { 77 | case 'V': 78 | printf("mlxsw resource monitoring tool, %s\n", 79 | program_version); 80 | return 0; 81 | case 'h': 82 | resmon_help(); 83 | return 0; 84 | case 'v': 85 | env.verbosity++; 86 | break; 87 | case 'q': 88 | env.verbosity--; 89 | break; 90 | case opt_sockaddr: 91 | env.sockdir = optarg; 92 | break; 93 | case opt_json: 94 | env.show_json = true; 95 | break; 96 | default: 97 | fprintf(stderr, "Unknown option.\n"); 98 | resmon_help(); 99 | return 1; 100 | } 101 | } 102 | 103 | argc -= optind; 104 | argv += optind; 105 | 106 | return resmon_cmd(argc, argv); 107 | } 108 | 109 | __attribute__((format(printf, 2, 3))) 110 | int resmon_fmterr(char **strp, const char *fmt, ...) 
111 | { 112 | va_list ap; 113 | int rc; 114 | 115 | va_start(ap, fmt); 116 | rc = vasprintf(strp, fmt, ap); 117 | va_end(ap); 118 | 119 | if (rc < 0) 120 | *strp = NULL; 121 | return rc; 122 | } 123 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-start.8.md: -------------------------------------------------------------------------------- 1 | % resmon-start(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon start` - start `resmon`, the resource monitor for NVIDIA Spectrum 7 | switches 8 | 9 | SYNOPSIS 10 | ======== 11 | 12 | `resmon start [mode {hw | mock}] [[include | exclude] resources RES1 RES2 ...]` 13 | 14 | DESCRIPTION 15 | =========== 16 | 17 | After the daemon is up and running, it starts collecting allocation and 18 | deallocation requests. Any allocations done prior to the daemon start will 19 | not have been recorded and will thus be invisible to the daemon. Therefore 20 | to have a fully accurate view of the state of resources, it is necessary to 21 | issue a devlink reload, e.g.: 22 | 23 | ```shell 24 | $ devlink reload pci/0000:06:00 25 | ``` 26 | 27 | This way even resources allocated during driver init are recorded. 28 | 29 | ## Communication With the Daemon 30 | 31 | The daemon opens a Unix socket through which it communicates with the 32 | client. The communication is according to the [JSON RPC][] protocol. Please 33 | refer to the specification to understand the details of the message format. 34 | 35 | Besides the JSON RPC interface, the suite also provides command-line 36 | wrappers for individual JSON-RPC methods. Please refer to man pages for the 37 | individual commands for further details. 38 | 39 | OPTIONS 40 | ======= 41 | 42 | Please refer to `resmon(8)` for details about command-line options. 43 | 44 | When the daemon is started in a verbose mode (`-v`), the messages can be 45 | seen in `journalctl -t resmon`.
46 | 47 | PARAMETERS 48 | ========== 49 | 50 | `[include | exclude] resources <...>` 51 | 52 | : It is possible to configure a subset of resources that `resmon` is 53 | supposed to monitor. This will save the memory and some processing time 54 | that would be necessary for bookkeeping of resources that are of no interest. The 55 | list of resources to monitor needs to be selected when the daemon is 56 | started: 57 | 58 | ```shell 59 | $ resmon start resources lpm_ipv4 lpm_ipv6 60 | ``` 61 | 62 | It is also possible to request monitoring of a group of resources. E.g. 63 | `lpm_ipv4` and `lpm_ipv6` are grouped together in a group named `lpm`: 64 | 65 | ```shell 66 | $ resmon start resources lpm 67 | ``` 68 | 69 | It is also possible to monitor all resources except for a few excluded ones. 70 | E.g. to exclude LPM resources: 71 | 72 | ```shell 73 | $ resmon start exclude resources lpm 74 | ``` 75 | 76 | `mode {mock | hw}` 77 | 78 | : By default, `resmon` starts in hardware mode, which means that it 79 | installs probes necessary to capture the EMAD messages exchanged 80 | between the `mlxsw` driver and the FW running on a device. 81 | 82 | For testing purposes, it is possible to start `resmon` in mock mode. In 83 | that situation `resmon` may run unprivileged, and the EMAD messages are 84 | injected from user space. 85 | 86 | To start `resmon` in mock mode, pass `mode mock` to the start command 87 | line: 88 | 89 | ```shell 90 | $ resmon start mode mock 91 | ``` 92 | 93 | It is also possible to `systemctl edit resmon.service` to create an 94 | override with adjusted start-up parameters. 95 | 96 | 97 | SEE ALSO 98 | ======== 99 | 100 | resmon(8), resmon-stop(8), resmon-ping(8) 101 | 102 | [JSON RPC specification][JSON RPC]. 103 | 104 | REPORTING ISSUES 105 | ================ 106 | 107 | To report issues please send an email to: mlxsw@nvidia.com.
108 | 109 | [JSON RPC]: https://www.jsonrpc.org/specification 110 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/common.mk: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | SAVE_DEFAULT_GOAL := $(.DEFAULT_GOAL) 3 | TOP_SRCDIR := $(abspath $(REL_SRCDIR)) 4 | 5 | CLANG ?= clang 6 | LLVM_STRIP ?= llvm-strip 7 | BPFTOOL ?= $(TOP_SRCDIR)/tools/bpftool 8 | ARCH := $(shell uname -m | sed 's/x86_64/x86/') 9 | 10 | INSTALL = install 11 | INSTALL_PROGRAM = $(INSTALL) 12 | INSTALL_DATA = $(INSTALL) -m 644 13 | 14 | PREFIX = /usr/local 15 | EXEC_PREFIX = $(PREFIX) 16 | BINDIR = $(EXEC_PREFIX)/bin 17 | DATAROOTDIR = $(PREFIX)/share 18 | DATADIR = $(DATAROOTDIR) 19 | SYSCONFDIR = $(PREFIX)/etc 20 | LOCALSTATEDIR = $(PREFIX)/var 21 | RUNSTATEDIR = $(LOCALSTATEDIR)/run 22 | DOCDIR = $(DATAROOTDIR)/doc/$(PACKAGE) 23 | MANDIR = $(DATAROOTDIR)/man 24 | MAN8DIR = $(MANDIR)/man8 25 | SYSTEMDSYSTEMUNITDIR = $(shell pkgconf --variable=systemdsystemunitdir systemd) 26 | DESTDIR = 27 | 28 | VAR_SUBSTITUTIONS = \ 29 | s|@BINDIR@|$(BINDIR)|g; \ 30 | s|@SYSCONFDIR@|$(SYSCONFDIR)|g; \ 31 | s|@RUNSTATEDIR@|$(RUNSTATEDIR)|g; \ 32 | # 33 | 34 | # Get Clang's default includes on this system. We'll explicitly add these dirs 35 | # to the includes list when compiling with `-target bpf` because otherwise some 36 | # architecture-specific dirs will be "missing" on some architectures/distros - 37 | # headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, 38 | # sys/cdefs.h etc. might be missing. 39 | # 40 | # Use '-idirafter': Don't interfere with include mechanics except where the 41 | # build would have failed anyways. 
42 | CLANG_BPF_SYS_INCLUDES = $(shell $(CLANG) -v -E - &1 \ 43 | | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') 44 | 45 | ifeq ($(V),1) 46 | Q = 47 | NQ = @ 48 | msg = 49 | else 50 | Q = @ 51 | NQ = 52 | msg = @printf ' %-8s %s%s\n' \ 53 | "$(1)" \ 54 | "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ 55 | "$(if $(3), $(3))"; 56 | MAKEFLAGS += --no-print-directory 57 | endif 58 | 59 | LIBBPF_OUTPUT := $(TOP_SRCDIR)/.libbpf-output 60 | LIBBPF_INCLUDE := -I$(LIBBPF_OUTPUT) 61 | LIBBPF_OBJ := $(abspath $(LIBBPF_OUTPUT)/libbpf.a) 62 | LIBBPF_SRC := $(TOP_SRCDIR)/libbpf/src 63 | 64 | # Build libbpf 65 | $(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) \ 66 | | $(LIBBPF_OUTPUT)/libbpf 67 | $(call msg,LIB,$@) 68 | $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ 69 | OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ 70 | INCLUDEDIR= LIBDIR= UAPIDIR= \ 71 | install 72 | 73 | $(LIBBPF_OUTPUT)/libbpf: 74 | $(call msg,MKDIR,$@) 75 | $(Q)mkdir -p $@ 76 | 77 | COMMON_OUTPUT := $(TOP_SRCDIR)/common/.output 78 | COMMON_INCLUDE := -I$(TOP_SRCDIR)/common 79 | COMMON_OBJ := $(COMMON_OUTPUT)/libcommon.a 80 | COMMON_SRC := $(TOP_SRCDIR)/common 81 | 82 | $(COMMON_OBJ): $(LIBBPF_OBJ) 83 | $(Q)$(MAKE) -C $(COMMON_SRC) 84 | 85 | # delete failed targets 86 | .DELETE_ON_ERROR: 87 | 88 | # keep the libbpf intermediate target 89 | .SECONDARY: 90 | 91 | define __do_install 92 | $(call msg,INSTALL,$1) 93 | $(Q)if [ ! -d '$(DESTDIR)$2' ]; then \ 94 | $4 -d -m 755 '$(DESTDIR)$2'; \ 95 | fi; 96 | $(Q)$4 $(if $3,-m $3,) $1 '$(DESTDIR)$2' 97 | endef 98 | 99 | define do_install_program 100 | $(call __do_install,$1,$2,$3,$(INSTALL_PROGRAM)) 101 | endef 102 | 103 | define do_install_data 104 | $(call __do_install,$1,$2,$3,$(INSTALL_DATA)) 105 | endef 106 | 107 | # To avoid picking up the above rules as default goal, revert back to the 108 | # goal that we've had at the top of the file. 
109 | .DEFAULT_GOAL := $(SAVE_DEFAULT_GOAL) 110 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | PACKAGE = resmon 3 | REL_SRCDIR := .. 4 | include $(REL_SRCDIR)/common.mk 5 | 6 | OUTPUT := .output 7 | INCLUDES := $(COMMON_INCLUDE) 8 | CFLAGS := -g -Wall -Wunused 9 | 10 | APPS := resmon resmon-exporter 11 | resmon-OBJECTS := \ 12 | $(OUTPUT)/resmon.o \ 13 | $(OUTPUT)/resmon-back.o \ 14 | $(OUTPUT)/resmon-c.o \ 15 | $(OUTPUT)/resmon-d.o \ 16 | $(OUTPUT)/resmon-dl.o \ 17 | $(OUTPUT)/resmon-jrpc.o \ 18 | $(OUTPUT)/resmon-reg.o \ 19 | $(OUTPUT)/resmon-sock.o \ 20 | $(OUTPUT)/resmon-stat.o \ 21 | # 22 | SYSTEMD_UNITS := \ 23 | $(OUTPUT)/resmon.service \ 24 | $(OUTPUT)/resmon-exporter.service \ 25 | # 26 | EXTRA_CLEAN := \ 27 | $(OUTPUT)/config.h \ 28 | $(OUTPUT)/resmon.bpf.o \ 29 | $(OUTPUT)/resmon.skel.h \ 30 | # 31 | TESTS := \ 32 | test.sh \ 33 | # 34 | MAN_PAGES := \ 35 | $(OUTPUT)/resmon.8 \ 36 | $(OUTPUT)/resmon-exporter.8 \ 37 | $(OUTPUT)/resmon-start.8 \ 38 | $(OUTPUT)/resmon-stop.8 \ 39 | $(OUTPUT)/resmon-ping.8 \ 40 | $(OUTPUT)/resmon-stats.8 \ 41 | $(OUTPUT)/resmon-emad.8 \ 42 | $(OUTPUT)/resmon-dump.8 \ 43 | # 44 | 45 | BUILT := $(APPS) $(SYSTEMD_UNITS) $(MAN_PAGES) 46 | 47 | .PHONY: all 48 | all: $(BUILT) 49 | 50 | .PHONY: doc 51 | doc: $(MAN_PAGES) 52 | 53 | .PHONY: $(APPS) 54 | $(APPS): %: $(OUTPUT)/% 55 | 56 | .PHONY: install 57 | install: $(BUILT) 58 | $(call msg,MKDIR,$(RUNSTATEDIR)) 59 | $(Q)$(INSTALL_DATA) -d -m 755 '$(DESTDIR)$(RUNSTATEDIR)' 60 | $(call do_install_program,$(APPS:%=$(OUTPUT)/%),$(BINDIR)) 61 | $(call do_install_data,$(SYSTEMD_UNITS),$(SYSTEMDSYSTEMUNITDIR)) 62 | $(call do_install_data,$(MAN_PAGES),$(MAN8DIR)) 63 | $(call do_install_data,README.md,$(DOCDIR)) 64 | 65 | .PHONY: test 66 | test: $(TESTS:%=run-%) 67 | 68 | 
run-%: % $(APPS) 69 | $(call msg,RUN,$*) 70 | $(Q)./$* 71 | 72 | $(OUTPUT): 73 | $(call msg,MKDIR,$@) 74 | $(Q)mkdir -p $@ 75 | 76 | $(OUTPUT)/resmon: CFLAGS += $(shell pkgconf --libs libelf json-c libsystemd \ 77 | libnl-3.0 libnl-genl-3.0) 78 | $(OUTPUT)/resmon: $(resmon-OBJECTS) $(LIBBPF_OBJ) $(COMMON_OBJ) 79 | $(call msg,BINARY,$@) 80 | $(Q)$(CC) $^ $(CFLAGS) -lz -o $@ 81 | 82 | $(OUTPUT)/%.o: %.c resmon.h mlxsw.h $(COMMON_OBJ) | $(OUTPUT) 83 | $(call msg,CC,$@) 84 | $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ 85 | 86 | $(OUTPUT)/resmon.o: $(OUTPUT)/config.h 87 | $(OUTPUT)/resmon.o: INCLUDES += -I$(OUTPUT) 88 | 89 | $(OUTPUT)/resmon-dl.o: INCLUDES += $(shell pkgconf --cflags libnl-3.0 \ 90 | libnl-genl-3.0) 91 | 92 | $(OUTPUT)/resmon.bpf.o: INCLUDES += $(LIBBPF_INCLUDE) 93 | $(OUTPUT)/resmon.bpf.o: resmon.bpf.c resmon.h ../common/vmlinux.h 94 | $(call msg,BPF,$@) 95 | $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) -c $< -o $@ 96 | $(Q)$(LLVM_STRIP) -g $@ # strip useless DWARF info 97 | 98 | $(OUTPUT)/resmon-back.o: INCLUDES += $(LIBBPF_INCLUDE) -I$(OUTPUT) 99 | $(OUTPUT)/resmon-back.o: $(OUTPUT)/resmon.skel.h 100 | 101 | %.skel.h: %.bpf.o 102 | $(call msg,GEN-SKEL,$@) 103 | $(Q)$(BPFTOOL) gen skeleton $< > $@ 104 | 105 | $(OUTPUT)/%: %.in | $(OUTPUT) 106 | $(call msg,SED,$*) 107 | $(Q)sed -e '$(VAR_SUBSTITUTIONS)' $< > $@ 108 | $(Q)chmod --reference=$< $@ 109 | 110 | $(MAN_PAGES): $(OUTPUT)/%: %.md | $(OUTPUT) 111 | pandoc --standalone --to man $< -o $@ 112 | 113 | .PHONY: clean 114 | clean: 115 | $(call msg,CLEAN) 116 | $(Q)rm -f $(foreach tgt,$(APPS),$($(tgt)-OBJECTS)) 117 | $(Q)rm -f $(APPS:%=$(OUTPUT)/%) $(SYSTEMD_UNITS) $(MAN_PAGES) $(EXTRA_CLEAN) 118 | $(Q)if test -d $(OUTPUT); then rmdir $(OUTPUT); fi 119 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/mdb_table_dump.txt: -------------------------------------------------------------------------------- 
1 | NAME 2 | 3 | mdb_table_dump - Dump mlxsw MDB (multicast group database) 4 | configuration 5 | 6 | SYNOPSIS: 7 | 8 | mdb_table_dump 9 | 10 | DESCRIPTION: 11 | 12 | mdb_table_dump is a tool written on top of drgn, for dumping internal 13 | structures of the mlxsw driver related to the MDB table. That way it is 14 | possible to find out how MDB entries are configured in the ASIC. 15 | In Linux, the MDB table is stored per bridge device, so the tool 16 | dumps the table per bridge device. 17 | 18 | OUTPUT 19 | 20 | As an output, mdb_table_dump emits a JSON object with the following 21 | attributes: 22 | 23 | ...mac_address 24 | MAC address, the pair {MAC, FID} is used as an index to the 25 | multicast FDB table. 26 | 27 | ...fid_index 28 | FID index, the pair {MAC, FID} is used as an index to the 29 | multicast FDB table. 30 | 31 | ...mid_index 32 | MID index. This is the index to the PGT table which 33 | maintains a list of ports to transmit packets which point to 34 | the entry. 35 | 36 | ...ports_list 37 | List of ports which are configured in the PGT table in the 38 | relevant MID index. Packets which point to the MID index 39 | will be transmitted to all the ports in the list. 40 | 41 | ...ports_list..local_port 42 | A local port to transmit the packet to. 43 | 44 | ...ports_list..refcount 45 | Reference counter of the port. MDB entries in the Linux bridge 46 | are keyed according to their multicast IP; when these entries 47 | are notified to device drivers via switchdev, the multicast IP 48 | is converted to a multicast MAC. This conversion might cause 49 | collisions, so a port can be used more than once in an MDB entry, 50 | which is keyed by MAC. 51 | 52 | ...ports_list..mrouter 53 | Indicates if the port is a multicast router or not. See more 54 | information in kernel commit d2994e1305858 ("mlxsw: 55 | spectrum_switchdev: Add support for maintaining list of ports per 56 | MDB entry") 57 | 58 | mdb_table_dump always outputs the complete information.
Filtering 59 | and querying can be done e.g. through `jq`. 60 | 61 | EXAMPLE: 62 | 63 | # ./mdb_table_dump | jq 64 | { 65 | "br0": { 66 | "15354": { 67 | "mac_address": "33:33:ff:82:bf:af", 68 | "fid_index": 1, 69 | "mid_index": 15354, 70 | "ports_list": { 71 | "swp28": { 72 | "local_port": 57, 73 | "refcount": 1, 74 | "mrouter": false 75 | } 76 | } 77 | }, 78 | "15355": { 79 | "mac_address": "33:33:ff:f5:ee:1c", 80 | "fid_index": 1, 81 | "mid_index": 15355, 82 | "ports_list": { 83 | "swp26": { 84 | "local_port": 49, 85 | "refcount": 1, 86 | "mrouter": false 87 | }, 88 | "swp25": { 89 | "local_port": 53, 90 | "refcount": 1, 91 | "mrouter": false 92 | } 93 | } 94 | } 95 | } 96 | } 97 | 98 | SEE ALSO: 99 | 100 | https://github.com/Mellanox/mlxsw/wiki 101 | https://drgn.readthedocs.io 102 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/README.md: -------------------------------------------------------------------------------- 1 | # mlxsw libbpf-tools 2 | 3 | This directory includes various BPF-based tools aimed at improving the 4 | observability and debuggability of mlxsw. The tools are written using 5 | [libbpf][1], so that they could be compiled once and run everywhere 6 | [(CO-RE)][2]. 7 | 8 | ## List of tools 9 | 10 | * src/emadlatency: Summarize EMAD latency as a histogram. [Examples](src/emadlatency_example.txt) 11 | * src/emadump: Dump EMADs to a PCAP file. [Examples](src/emadump_example.txt) 12 | * src/trapagg: Dump aggregated per-{trap, flow} statistics. [Examples](src/trapagg_example.txt) 13 | * resmon/resmon: Monitor resource consumption in Spectrum switches. 
14 | 15 | ## Building 16 | 17 | Before building any of libbpf-tools, the system needs to have the following 18 | tools installed: 19 | 20 | - clang 21 | - llvm-strip 22 | 23 | Besides this, BPF-based tools need the kernel that they are running on to 24 | be configured with BTF (BPF Type Format) annotations: 25 | 26 | - `CONFIG_DEBUG_INFO_BTF=y` 27 | 28 | Then to prepare the source tree for building, first check out the libbpf 29 | submodule: 30 | 31 | ```shell 32 | $ git submodule update --init --recursive # check out libbpf 33 | ``` 34 | 35 | Then either just build everything: 36 | 37 | ```shell 38 | $ make 39 | ``` 40 | 41 | Or cherry-pick what should be built: 42 | 43 | ```shell 44 | $ make -C src 45 | $ make -C src emadump 46 | $ make -C resmon 47 | $ make -C resmon resmon 48 | ``` 49 | 50 | Some tools support installation. For those that do, the build system can be 51 | configured by passing variables describing the directory layout of the system 52 | where tools will be installed. 53 | 54 | ```shell 55 | $ make PREFIX=/usr LOCALSTATEDIR=/var 56 | ``` 57 | 58 | ### Building Documentation 59 | 60 | It is possible to build only the documentation, for 61 | the tools that have documentation to build: 62 | 63 | ```shell 64 | $ make doc 65 | $ make -C resmon doc 66 | ``` 67 | 68 | ## Testing 69 | 70 | Some tools support testing. To perform all available tests, run: 71 | 72 | ```shell 73 | $ make test 74 | ``` 75 | 76 | It is also possible to run only a certain suite of tests: 77 | 78 | ```shell 79 | $ make -C resmon test # all resmon tests 80 | $ make -C resmon run-test.sh # specifically this one suite 81 | ``` 82 | 83 | ## Installation 84 | 85 | Some tools support installation.
To install all that do, run: 86 | 87 | ```shell 88 | $ make install 89 | ``` 90 | 91 | It is also possible to cherry-pick installation of a certain tool: 92 | 93 | ```shell 94 | $ make -C resmon install 95 | ``` 96 | 97 | Remember to pass the directory-layout variables to install as well: 98 | 99 | ```shell 100 | $ make PREFIX=/usr LOCALSTATEDIR=/var install 101 | ``` 102 | 103 | The build system also supports staged installations, e.g.: 104 | 105 | ```shell 106 | $ make PREFIX=/usr LOCALSTATEDIR=/var DESTDIR=${HOME}/tmp/ install 107 | ``` 108 | 109 | ## Further resources 110 | 111 | 1. [BPF portability and CO-RE][3] 112 | 2. [BCC to libbpf conversion guide][4] 113 | 3. [Building BPF applications with libbpf-bootstrap][5] 114 | 4. [BPF ring buffer][6] 115 | 5. [Why We Switched from BCC to libbpf for Linux BPF Performance Analysis][7] 116 | 6. [Tips and Tricks for Writing Linux BPF Applications with libbpf][8] 117 | 118 | [1]: https://github.com/libbpf/libbpf 119 | [2]: https://github.com/libbpf/libbpf#bpf-co-re-compile-once--run-everywhere 120 | [3]: https://nakryiko.com/posts/bpf-portability-and-co-re/ 121 | [4]: https://nakryiko.com/posts/bcc-to-libbpf-howto-guide/ 122 | [5]: https://nakryiko.com/posts/libbpf-bootstrap/ 123 | [6]: https://nakryiko.com/posts/bpf-ringbuf/ 124 | [7]: https://en.pingcap.com/blog/why-we-switched-from-bcc-to-libbpf-for-linux-bpf-performance-analysis 125 | [8]: https://en.pingcap.com/blog/tips-and-tricks-for-writing-linux-bpf-applications-with-libbpf 126 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon.8.md: -------------------------------------------------------------------------------- 1 | % resmon(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon` - resource monitor for NVIDIA Spectrum switches 7 | 8 | SYNOPSIS 9 | ======== 10 | 11 | `resmon -V` 12 | 13 | `resmon [-v | -q] [--json] [--sockdir <dir>] 14 | {start | stop | ping | emad | stats | dump}` 15 | 16 | DESCRIPTION
17 | =========== 18 | 19 | `resmon` is a daemon for monitoring ASIC resource consumption in NVIDIA 20 | Spectrum Ethernet switches with the Linux `mlxsw` driver. 21 | 22 | The daemon opens a Unix socket through which it communicates with the 23 | client using the [JSON RPC][] protocol. 24 | 25 | ## Supported Platforms 26 | 27 | `resmon` supports NVIDIA Spectrum-2 and later switches. 28 | 29 | Partial support is available on NVIDIA Spectrum-1 switches: KVDL-based 30 | resources will not be tracked properly, because KVDL in Spectrum-1 switches 31 | is managed by software, not firmware, and therefore the release events will 32 | not be seen by `resmon`. When running on Spectrum-1, it is therefore 33 | reasonable to exclude KVDL resources from being monitored. See 34 | `resmon-start(8)` for details on how to do that. 35 | 36 | ## Method of Operation 37 | 38 | The driver communicates with the device using Ethernet packets called 39 | Ethernet Management Datagrams (EMADs). The tool works by hooking up to a 40 | function in the `mlxsw` driver that processes EMAD responses, picking 41 | out register EMADs that might indicate resource allocation or 42 | deallocation. These it sends through a ring buffer to the user-space 43 | daemon that dissects the registers and keeps track of the state of 44 | individual resources. 45 | 46 | OPTIONS 47 | ======= 48 | 49 | `-v, --verbose, -q, --quiet` 50 | 51 | : Be more or less verbose, respectively. It makes sense to repeat the `-v` 52 | option up to three times, which increases verbosity every time (mostly 53 | for the benefit of messages from the BPF tooling). 54 | 55 | `--sockdir <dir>` 56 | 57 | : Directory holding the Unix socket to use for communication with the daemon. 58 | 59 | By default, `resmon` opens the socket in a directory specified by the 60 | build-time variable `RUNSTATEDIR`. The default value of the variable is 61 | `/usr/local/var/run`.
It is common to override the build-time variable 62 | `LOCALSTATEDIR` to `/var`, in which case `RUNSTATEDIR` is `/var/run`, 63 | and that's where the socket will be placed. 64 | 65 | The command-line argument `--sockdir` allows overriding of this default 66 | location. 67 | 68 | `--json` 69 | 70 | : When displaying the response sent by the daemon back to the client, 71 | instead of interpreting it and formatting in a human-readable way, dump 72 | the JSON result object instead, if any. Errors, including interpreted 73 | valid JSON RPC error response, are still dumped to standard error as 74 | usual. 75 | 76 | Please see manual pages for the individual client commands to 77 | understand the result object format. The man pages document full 78 | response object, the result is returned under the `result` key. 79 | 80 | COMMANDS 81 | ======== 82 | 83 | `start` 84 | 85 | : Starts the daemon. 86 | 87 | `stop` 88 | 89 | : Stops the daemon. 90 | 91 | `stats` 92 | 93 | : Scrapes collected resource allocation statistics. 94 | 95 | `ping` 96 | 97 | : Probes the liveness of the daemon. 98 | 99 | `emad` 100 | 101 | : Injects a hardware configuration message to the daemon. Only available in 102 | mock mode. 103 | 104 | `dump` 105 | 106 | : Show the contents of the tables that the daemon uses to keep track of 107 | resource allocation. 108 | 109 | SEE ALSO 110 | ======== 111 | 112 | resmon-start(8), resmon-stop(8), resmon-stats(8), resmon-ping(8), 113 | resmon-emad(8), resmon-dump(8) 114 | 115 | [JSON RPC specification][JSON RPC]. 116 | 117 | REPORTING ISSUES 118 | ================ 119 | 120 | To report issues please send an email to: mlxsw@nvidia.com. 
121 | 122 | [JSON RPC]: https://www.jsonrpc.org/specification 123 | -------------------------------------------------------------------------------- /Debugging/fw_dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from datetime import datetime 4 | import subprocess 5 | import argparse 6 | import sys 7 | 8 | def make_parser(): 9 | parser = argparse.ArgumentParser(description='A script that triggers mstflint when fw_fatal events occur') 10 | parser.add_argument('--output-path', 11 | default="/tmp", 12 | help='Path to output tar file, default is /tmp') 13 | return parser 14 | 15 | def read_line(p): 16 | line = p.stdout.readline() 17 | if not isinstance(line, (str)): 18 | line = line.decode('utf-8').rstrip() 19 | return line 20 | 21 | def verify_dependencies(): 22 | # Verify that there is a single Mellanox PCI device 23 | cmd = 'lspci | grep Mellanox | wc -l' 24 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 25 | mellanox_pci_count = read_line(p) 26 | if int(mellanox_pci_count) != 1: 27 | print("There is no single Mellanox PCI device") 28 | return 1 29 | 30 | # Verify that devlink is installed 31 | cmd = "which devlink" 32 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 33 | rc = p.wait() 34 | if rc: 35 | print("devlink is not installed") 36 | return rc 37 | 38 | # Verify that mstflint is installed 39 | cmd = "which mstflint" 40 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 41 | rc = p.wait() 42 | if rc: 43 | print("mstflint is not installed") 44 | return rc 45 | 46 | return 0 47 | 48 | def send_command(p, cmd): 49 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 50 | # Return out, err 51 | return p.communicate() 52 | 53 | def devlink_health_dump(p, pci_addr): 54 | cmd = "echo 'devlink health show' > /tmp/devlink_health_dump.txt" 55 | send_command(p, cmd) 56 | 57 | cmd = "devlink health show >> /tmp/devlink_health_dump.txt" 58 | 
send_command(p, cmd) 59 | 60 | cmd = "echo '' >> /tmp/devlink_health_dump.txt" 61 | send_command(p, cmd) 62 | 63 | cmd = "echo 'devlink health dump show pci/0000:%s reporter fw_fatal' >> \ 64 | /tmp/devlink_health_dump.txt" % pci_addr 65 | send_command(p, cmd) 66 | 67 | cmd = "devlink health dump show pci/0000:%s reporter fw_fatal >> \ 68 | /tmp/devlink_health_dump.txt" % pci_addr 69 | send_command(p, cmd) 70 | 71 | def dump_fw(p, pci_addr, tar_path): 72 | if p.poll() is not None: 73 | return 74 | 75 | line = read_line(p) 76 | if "state error" not in line: 77 | return 78 | 79 | date_time_str = datetime.now().strftime("%d.%m.%Y,%H:%M:%S") 80 | tar_name = date_time_str + "-mstregdump.tar.xz" 81 | 82 | cmd = "for i in {1..3}; do mstregdump %s > /tmp/mstregdump$i; done" % pci_addr 83 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 84 | out, err = p.communicate() 85 | 86 | devlink_health_dump(p, pci_addr) 87 | 88 | cmd = "cd /tmp && tar cvJf %s/%s mstregdump[123] devlink_health_dump.txt" % (tar_path, tar_name) 89 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 90 | out, err = p.communicate() 91 | 92 | cmd = "rm -rf /tmp/mstregdump* /tmp/devlink_health_dump.txt" 93 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 94 | 95 | def main(cmdline=None): 96 | parser = make_parser() 97 | args = parser.parse_args() 98 | 99 | rc = verify_dependencies() 100 | if rc: 101 | return rc 102 | 103 | cmd = 'lspci | grep Mellanox | cut -d " " -f1' 104 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 105 | pci_addr = read_line(p) 106 | 107 | cmd = "devlink monitor health" 108 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 109 | 110 | while p.poll() is None: 111 | line = read_line(p) 112 | if "fw_fatal" in line: 113 | dump_fw(p, pci_addr, args.output_path) 114 | 115 | if __name__ == "__main__": 116 | sys.exit(main(sys.argv[1:])) 117 | -------------------------------------------------------------------------------- 
/Debugging/libbpf-tools/src/emadump.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | #include "vmlinux.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "emadump.h" 8 | #include "bits.bpf.h" 9 | 10 | #define EMAD_ETH_HDR_LEN 0x10 11 | #define EMAD_OP_TLV_LEN 0x10 12 | #define EMAD_OP_TLV_STATUS_MASK 0x7F 13 | 14 | struct emad_op_tlv { 15 | u16 resv1; 16 | u8 status; 17 | u8 resv2; 18 | u16 reg_id; 19 | u8 r_method; 20 | u8 resv3; 21 | u64 tid; 22 | }; 23 | 24 | #define MAX_ENTRIES 10240 25 | 26 | const volatile bool targ_errors = false; 27 | const volatile __u64 targ_thresh_us = 0; 28 | 29 | struct { 30 | __uint(type, BPF_MAP_TYPE_RINGBUF); 31 | __uint(max_entries, 256 * 1024); 32 | } rb SEC(".maps"); 33 | 34 | struct { 35 | __uint(type, BPF_MAP_TYPE_HASH); 36 | __uint(max_entries, MAX_ENTRIES); 37 | __type(key, u64); 38 | __type(value, struct emad_event); 39 | } start SEC(".maps"); 40 | 41 | struct { 42 | __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 43 | __uint(max_entries, 1); 44 | __type(key, int); 45 | __type(value, struct emad_event); 46 | } heap SEC(".maps"); 47 | 48 | SEC("fentry/mlxsw_emad_transmit") 49 | int BPF_PROG(mlxsw_emad_transmit, struct mlxsw_core *mlxsw_core, 50 | struct mlxsw_reg_trans *trans) 51 | { 52 | u8 emad[EMAD_ETH_HDR_LEN + EMAD_OP_TLV_LEN]; 53 | u64 ts = bpf_ktime_get_ns() / 1000U; 54 | struct emad_op_tlv *op_tlv; 55 | struct emad_event *e; 56 | struct sk_buff *skb; 57 | size_t emad_len; 58 | int zero = 0; 59 | void *buf; 60 | 61 | skb = trans->tx_skb; 62 | emad_len = skb->len; 63 | buf = skb->data; 64 | 65 | /* This should never happen. */ 66 | if (emad_len > EMAD_MAX_LEN) 67 | return 0; 68 | 69 | /* Allocate EMAD event from our "heap". */ 70 | e = bpf_map_lookup_elem(&heap, &zero); 71 | if (!e) /* Cannot happen. */ 72 | return 0; 73 | 74 | /* Initialize EMAD event. 
*/ 75 | bpf_probe_read(&e->buf, emad_len, buf); 76 | e->len = emad_len; 77 | e->ts = ts; 78 | 79 | /* If no filtering, then output the event to BPF ringbuf. */ 80 | if (!targ_errors && !targ_thresh_us) { 81 | bpf_ringbuf_output(&rb, e, sizeof(*e), 0); 82 | return 0; 83 | } 84 | 85 | bpf_probe_read(emad, EMAD_ETH_HDR_LEN + EMAD_OP_TLV_LEN, buf); 86 | op_tlv = (struct emad_op_tlv *)(emad + EMAD_ETH_HDR_LEN); 87 | 88 | /* Store EMAD request in a hash table for retrieval upon response. */ 89 | bpf_map_update_elem(&start, &op_tlv->tid, e, BPF_ANY); 90 | return 0; 91 | } 92 | 93 | SEC("fentry/mlxsw_emad_rx_listener_func") 94 | int BPF_PROG(mlxsw_emad_rx_listener_func, struct sk_buff *skb) 95 | { 96 | u8 emad[EMAD_ETH_HDR_LEN + EMAD_OP_TLV_LEN]; 97 | u64 ts = bpf_ktime_get_ns() / 1000U; 98 | unsigned int emad_len = skb->len; 99 | struct emad_event *e, *req_e; 100 | struct emad_op_tlv *op_tlv; 101 | void *buf = skb->data; 102 | int zero = 0; 103 | bool error; 104 | s64 delta; 105 | 106 | /* This should never happen. */ 107 | if (emad_len > EMAD_MAX_LEN) 108 | return 0; 109 | 110 | /* Allocate EMAD event from our "heap". */ 111 | e = bpf_map_lookup_elem(&heap, &zero); 112 | if (!e) /* Cannot happen. */ 113 | return 0; 114 | 115 | /* Initialize EMAD event. */ 116 | bpf_probe_read(&e->buf, emad_len, buf); 117 | e->len = emad_len; 118 | e->ts = ts; 119 | 120 | /* If no filtering, then output the event to BPF ringbuf. */ 121 | if (!targ_errors && !targ_thresh_us) { 122 | bpf_ringbuf_output(&rb, e, sizeof(*e), 0); 123 | return 0; 124 | } 125 | 126 | bpf_probe_read(emad, EMAD_ETH_HDR_LEN + EMAD_OP_TLV_LEN, buf); 127 | op_tlv = (struct emad_op_tlv *)(emad + EMAD_ETH_HDR_LEN); 128 | 129 | /* Retrieve the request from the response. 
*/ 130 | req_e = bpf_map_lookup_elem(&start, &op_tlv->tid); 131 | if (!req_e) 132 | return 0; 133 | 134 | delta = (s64)(ts - req_e->ts); 135 | if (delta < 0) 136 | goto out; 137 | error = (op_tlv->status & EMAD_OP_TLV_STATUS_MASK) != 0; 138 | 139 | /* Submit request and response if match filters. */ 140 | if ((targ_errors && error) || 141 | (targ_thresh_us && delta > targ_thresh_us)) { 142 | bpf_ringbuf_output(&rb, req_e, sizeof(*req_e), 0); 143 | bpf_ringbuf_output(&rb, e, sizeof(*e), 0); 144 | } 145 | 146 | out: 147 | bpf_map_delete_elem(&start, &op_tlv->tid); 148 | return 0; 149 | } 150 | 151 | char LICENSE[] SEC("license") = "GPL"; 152 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-sock.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 2 | #define _GNU_SOURCE 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "resmon.h" 13 | 14 | static int resmon_sock_sockaddr(const char *sockdir, const char *sockname, 15 | struct sockaddr_un *sa) 16 | { 17 | const char *maybe_slash = "/"; 18 | int len; 19 | 20 | if (sockdir[strlen(sockdir) - 1] == '/') 21 | maybe_slash++; 22 | 23 | sa->sun_family = AF_LOCAL; 24 | len = snprintf(sa->sun_path, sizeof(sa->sun_path), "%s%s%s", 25 | sockdir, maybe_slash, sockname); 26 | if (len < 0) 27 | return len; 28 | if (len >= sizeof(sa->sun_path)) 29 | return -ENOBUFS; 30 | 31 | return 0; 32 | } 33 | 34 | static int resmon_ctl_sockaddr(const char *sockdir, struct sockaddr_un *ctl_sa) 35 | { 36 | return resmon_sock_sockaddr(sockdir, "resmon.ctl", ctl_sa); 37 | } 38 | 39 | static int resmon_cli_sockaddr(const char *sockdir, struct sockaddr_un *cli_sa) 40 | { 41 | char *sockname; 42 | int rc; 43 | 44 | rc = asprintf(&sockname, "resmon.cli.%d", getpid()); 45 | if (rc < 0) 46 | return rc; 47 | 48 | rc = 
resmon_sock_sockaddr(sockdir, sockname, cli_sa); 49 | free(sockname); 50 | return rc; 51 | } 52 | 53 | static int resmon_sock_open(struct sockaddr_un sa, struct resmon_sock *sock) 54 | { 55 | int fd; 56 | int rc; 57 | 58 | *sock = (struct resmon_sock) { .fd = -1 }; 59 | 60 | fd = socket(AF_LOCAL, SOCK_DGRAM, 0); 61 | if (fd < 0) { 62 | fprintf(stderr, "Failed to create control socket: %m\n"); 63 | return -1; 64 | } 65 | 66 | unlink(sa.sun_path); 67 | 68 | rc = bind(fd, (struct sockaddr *) &sa, sizeof(sa)); 69 | if (rc < 0) { 70 | fprintf(stderr, "Failed to bind control socket `%s': %m\n", 71 | sa.sun_path); 72 | goto close_fd; 73 | } 74 | 75 | *sock = (struct resmon_sock) { 76 | .fd = fd, 77 | .sa = sa, 78 | .len = sizeof(sa), 79 | }; 80 | return 0; 81 | 82 | close_fd: 83 | close(fd); 84 | return rc; 85 | } 86 | 87 | static void resmon_sock_close(struct resmon_sock *sock) 88 | { 89 | close(sock->fd); 90 | unlink(sock->sa.sun_path); 91 | } 92 | 93 | int resmon_sock_open_d(struct resmon_sock *ctl, const char *sockdir) 94 | { 95 | struct sockaddr_un sa; 96 | int rc; 97 | 98 | rc = resmon_ctl_sockaddr(sockdir, &sa); 99 | if (rc != 0) 100 | return rc; 101 | 102 | return resmon_sock_open(sa, ctl); 103 | } 104 | 105 | void resmon_sock_close_d(struct resmon_sock *ctl) 106 | { 107 | resmon_sock_close(ctl); 108 | } 109 | 110 | int resmon_sock_open_c(struct resmon_sock *cli, 111 | struct resmon_sock *peer, 112 | const char *sockdir) 113 | { 114 | struct sockaddr_un ctl_sa; 115 | struct sockaddr_un cli_sa; 116 | int rc; 117 | 118 | rc = resmon_ctl_sockaddr(sockdir, &ctl_sa); 119 | if (rc != 0) 120 | return rc; 121 | 122 | rc = resmon_cli_sockaddr(sockdir, &cli_sa); 123 | if (rc != 0) 124 | return rc; 125 | 126 | rc = resmon_sock_open(cli_sa, cli); 127 | if (rc != 0) 128 | return rc; 129 | 130 | *peer = (struct resmon_sock) { 131 | .fd = cli->fd, 132 | .sa = ctl_sa, 133 | .len = sizeof(peer->sa), 134 | }; 135 | rc = connect(cli->fd, (struct sockaddr *) &peer->sa, peer->len); 
136 | if (rc != 0) { 137 | fprintf(stderr, "Failed to connect to %s: %m\n", 138 | peer->sa.sun_path); 139 | goto close_cli; 140 | } 141 | 142 | return 0; 143 | 144 | close_cli: 145 | resmon_sock_close_c(cli); 146 | return -1; 147 | 148 | } 149 | 150 | void resmon_sock_close_c(struct resmon_sock *cli) 151 | { 152 | resmon_sock_close(cli); 153 | } 154 | 155 | int resmon_sock_recv(struct resmon_sock *sock, struct resmon_sock *peer, 156 | char **bufp) 157 | { 158 | ssize_t msgsz; 159 | char *buf; 160 | ssize_t n; 161 | int rc; 162 | 163 | *bufp = NULL; 164 | *peer = (struct resmon_sock) { 165 | .fd = sock->fd, 166 | .len = sizeof(peer->sa), 167 | }; 168 | msgsz = recvfrom(sock->fd, NULL, 0, MSG_PEEK | MSG_TRUNC, 169 | (struct sockaddr *) &peer->sa, &peer->len); 170 | if (msgsz < 0) { 171 | fprintf(stderr, "Failed to receive data on control socket: %m\n"); 172 | return -1; 173 | } 174 | 175 | buf = calloc(1, msgsz + 1); 176 | if (buf == NULL) { 177 | fprintf(stderr, "Failed to allocate control message buffer: %m\n"); 178 | return -1; 179 | } 180 | 181 | n = recv(sock->fd, buf, msgsz, 0); 182 | if (n < 0) { 183 | fprintf(stderr, "Failed to receive data on control socket: %m\n"); 184 | rc = -1; 185 | goto out; 186 | } 187 | buf[n] = '\0'; 188 | 189 | *bufp = buf; 190 | buf = NULL; 191 | rc = 0; 192 | 193 | out: 194 | free(buf); 195 | return rc; 196 | } 197 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/hdroom_dump.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | 3 | hdroom_dump - Dump mlxsw headroom configuration 4 | 5 | SYNOPSIS: 6 | 7 | hdroom_dump 8 | 9 | DESCRIPTION: 10 | 11 | On Spectrum, port buffers, also called port headroom, is where packets 12 | are stored while they are parsed and the forwarding decision is being 13 | made. 
For lossless traffic flows, in case shared buffer admission is not 14 | allowed, headroom is also where to put the extra traffic received before 15 | the sent PAUSE takes effect. Another aspect of the port headroom is the 16 | so called internal buffer, which is used for packets that are mirrored 17 | due to triggers that the Spectrum ASIC considers "egress". Besides ACL 18 | mirroring on port egress this includes also packets mirrored due to ECN 19 | marking. 20 | 21 | hdroom_dump is a tool written on top of drgn, for dumping internal 22 | structures of mlxsw driver related to headroom management. That way it 23 | is possible to find out how headroom is configured in the ASIC. 24 | 25 | OUTPUT 26 | 27 | As an output, hdroom_dump emits a JSON object with the following 28 | attributes: 29 | 30 | .cell_size 31 | Granularity of the chip memory. The same cell size is reported also 32 | through "devlink sb pool show". 33 | 34 | .max_headroom_cells 35 | Maximum number of cells that FW permits to configure for headroom 36 | on one port. 37 | 38 | .ports.<port>.mode 39 | "DCB" or "TC". In DCB mode, headroom is autoconfigured from ETS 40 | configuration. In "TC" mode, it is configured through the DCB 41 | buffer commands. 42 | 43 | .ports.<port>.mtu 44 | Current port MTU value in bytes. 45 | 46 | .ports.<port>.max_mtu 47 | Maximum MTU permitted on this port. 48 | 49 | .ports.<port>.max_speed 50 | Maximum speed of this port in Mbps. 51 | 52 | .ports.<port>.delay_bytes 53 | Number of bytes of delay configured through DCB PFC interface. 54 | 55 | .ports.<port>.prios.<prio>.ets_buf_idx 56 | .ports.<port>.prios.<prio>.set_buf_idx 57 | .ports.<port>.prios.<prio>.buf_idx 58 | The index of the buffer that should take traffic with priority 59 | <prio>, as configured through, respectively, DCB ETS, DCB buffer 60 | and that was actually configured in the ASIC. 61 | 62 | .ports.<port>.prios.<prio>.lossy 63 | Whether this priority is marked as lossy by the DCB PFC 64 | configuration. 65 | 66 | .ports.<port>.bufs.<buf>.size_cells 67 | Buffer size in cells.
68 | 69 | .ports.<port>.bufs.<buf>.thres_cells 70 | Buffer Xon/Xoff threshold in cells. 71 | 72 | .ports.<port>.bufs.<buf>.lossy 73 | Whether the given buffer is lossy. In Spectrum, lossiness is an 74 | attribute of a buffer, not of a priority, so this shows how the 75 | buffer was actually configured. 76 | 77 | .ports.<port>.int_buf.enable 78 | Whether the internal mirroring buffer is enabled, i.e. whether 79 | there are any mirroring sessions that are considered "egress". 80 | 81 | .ports.<port>.int_buf.size_cells 82 | Internal mirroring buffer size in cells. 83 | 84 | .ports.<port>.int_buf.reserve_cells 85 | Amount of space in cells that is always reserved in the headroom 86 | for the internal mirroring buffer. 87 | 88 | hdroom_dump always outputs the complete information. Filtering and 89 | querying can be done e.g. through `jq'. 90 | 91 | EXAMPLE: 92 | 93 | # hdroom_dump | jq .ports.swp1 94 | { 95 | "max_mtu": 10240, 96 | "max_speed": 25000, 97 | "mode": "TC", 98 | "mtu": 1500, 99 | "delay_bytes": 0, 100 | "prios": { 101 | "0": { 102 | "buf_idx": 0, 103 | "ets_buf_idx": 0, 104 | "set_buf_idx": 0, 105 | "lossy": true 106 | }, 107 | [...] 108 | }, 109 | "bufs": { 110 | "0": { 111 | "thres_cells": 32, 112 | "size_cells": 4627, 113 | "lossy": true 114 | }, 115 | [...]
116 | "9": { 117 | "thres_cells": 0, 118 | "size_cells": 107, 119 | "lossy": false 120 | } 121 | }, 122 | "int_buf": { 123 | "enable": true, 124 | "size_cells": 268, 125 | "reserve_cells": 268 126 | } 127 | } 128 | 129 | SEE ALSO: 130 | 131 | https://github.com/Mellanox/mlxsw/wiki 132 | https://drgn.readthedocs.io 133 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/trapagg-exporter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Collect trap metrics and publish them via http or save them to a file.""" 3 | import argparse 4 | import logging 5 | import os 6 | import prometheus_client 7 | import re 8 | import subprocess 9 | import sys 10 | import time 11 | 12 | from prometheus_client.core import CounterMetricFamily 13 | 14 | 15 | class TrapaggCollector(object): 16 | """Collect aggregated per-{trap, flow} metrics and publish them via http 17 | or save them to a file.""" 18 | 19 | def __init__(self, args=None): 20 | """Construct the object and parse the arguments.""" 21 | self.args = None 22 | if not args: 23 | args = sys.argv[1:] 24 | self._parse_args(args) 25 | 26 | def _parse_args(self, args): 27 | """Parse CLI args and set them to self.args.""" 28 | parser = argparse.ArgumentParser() 29 | group = parser.add_mutually_exclusive_group(required=True) 30 | group.add_argument( 31 | '-f', 32 | '--textfile-name', 33 | dest='textfile_name', 34 | help=('Full file path where to store data for node ' 35 | 'collector to pick up') 36 | ) 37 | group.add_argument( 38 | '-l', 39 | '--listen', 40 | dest='listen', 41 | help='Listen host:port, i.e. 0.0.0.0:9417' 42 | ) 43 | parser.add_argument( 44 | '-i', 45 | '--interval', 46 | dest='interval', 47 | type=int, 48 | help=('Number of seconds between updates of the textfile. 
' 49 | 'Default is 5 seconds') 50 | ) 51 | parser.add_argument( 52 | '-1', 53 | '--oneshot', 54 | dest='oneshot', 55 | action='store_true', 56 | default=False, 57 | help='Run only once and exit. Useful for running in a cronjob' 58 | ) 59 | arguments = parser.parse_args(args) 60 | if arguments.oneshot and not arguments.textfile_name: 61 | logging.error('Oneshot has to be used with textfile mode') 62 | parser.print_help() 63 | sys.exit(1) 64 | if arguments.interval and not arguments.textfile_name: 65 | logging.error('Interval has to be used with textfile mode') 66 | parser.print_help() 67 | sys.exit(1) 68 | if not arguments.interval: 69 | arguments.interval = 5 70 | self.args = vars(arguments) 71 | 72 | def trapagg_output_get(self, command): 73 | """Execute command and return output.""" 74 | try: 75 | proc = subprocess.Popen(command, stdout=subprocess.PIPE) 76 | except OSError as e: 77 | logging.critical(e.strerror) 78 | sys.exit(1) 79 | return proc.stdout.readlines() 80 | 81 | def update_trapagg_stats(self, counter): 82 | """Update counter with statistics from trapagg.""" 83 | command = ['./trapagg', '-s'] 84 | output = self.trapagg_output_get(command) 85 | for line in output[2:]: 86 | columns = line.decode('utf-8').split() 87 | counter.add_metric(columns[:-1], columns[-1]) 88 | 89 | def collect(self): 90 | """ 91 | Collect the metrics. 92 | 93 | Collect the metrics and yield them. Prometheus client library 94 | uses this method to respond to http queries or save them to disk. 
95 | """ 96 | output = self.trapagg_output_get(['./trapagg', '-s']) 97 | 98 | labels = [l.lower() for l in output[1].decode('utf-8').split()] 99 | counter = CounterMetricFamily('node_net_trapagg', 100 | 'Aggregated trap data', labels=labels) 101 | 102 | for line in output[2:]: 103 | columns = line.decode('utf-8').split() 104 | counter.add_metric(columns[:-1], columns[-1]) 105 | 106 | yield counter 107 | 108 | if __name__ == '__main__': 109 | collector = TrapaggCollector() 110 | registry = prometheus_client.CollectorRegistry() 111 | registry.register(collector) 112 | args = collector.args 113 | if args['listen']: 114 | (ip, port) = args['listen'].split(':') 115 | prometheus_client.start_http_server(port=int(port), 116 | addr=ip, registry=registry) 117 | while True: 118 | time.sleep(3600) 119 | if args['textfile_name']: 120 | while True: 121 | collector.collect() 122 | prometheus_client.write_to_textfile(args['textfile_name'], 123 | registry) 124 | if collector.args['oneshot']: 125 | sys.exit(0) 126 | time.sleep(args['interval']) 127 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/fid_dump.py: -------------------------------------------------------------------------------- 1 | # fid_dump Dump FID configuration in JSON format 2 | 3 | from socket import ntohl 4 | from mlxsw_drgn import * 5 | import json 6 | import sys 7 | 8 | mlxsw_sp = MlxswSp.find() 9 | dump = {} 10 | 11 | def mid_indexes_dump(fid, dump_fid): 12 | dump_mid_indexes = {} 13 | dump_fid["flood_mid_indexes"] = dump_mid_indexes 14 | 15 | fid_family = fid.fid_family 16 | flood_tables = fid_family.flood_tables 17 | 18 | num_fids_in_family = fid_family.end_index.value_() - \ 19 | fid_family.start_index.value_() + 1 20 | 21 | for i in range(fid_family.nr_flood_tables.value_()): 22 | flood_table = flood_tables[i] 23 | packet_type_n = enum_name(flood_table.packet_type) 24 | 25 | # Same to the calculation in mlxsw_sp_fid_flood_table_mid(). 
26 | mid_index = fid_family.pgt_base.value_() + \ 27 | num_fids_in_family * flood_table.table_index.value_() + \ 28 | fid.fid_offset.value_() 29 | dump_mid_indexes[packet_type_n] = mid_index 30 | 31 | dump_ports = {} 32 | dump["ports"] = dump_ports 33 | 34 | for mlxsw_sp_port in mlxsw_sp.ports(): 35 | if mlxsw_sp_port.dev.value_() == 0: 36 | continue 37 | 38 | dump_port = {} 39 | dump_ports[mlxsw_sp_port.name()] = dump_port 40 | 41 | local_port = mlxsw_sp_port.local_port.value_() 42 | virtual = mlxsw_sp.fid_core.port_fid_mappings[local_port] != 0 43 | 44 | dump_port["local_port"] = local_port 45 | dump_port["virtual"] = virtual 46 | 47 | dump_fid_families = {} 48 | dump["fid_families"] = dump_fid_families 49 | 50 | for family in mlxsw_sp.fid_core.fid_family_arr: 51 | dump_fid_family = {} 52 | family_type_n = enum_name(family.type) 53 | dump_fid_families[family_type_n] = dump_fid_family 54 | 55 | dump_fid_family["start_index"] = family.start_index.value_() 56 | dump_fid_family["end_index"] = family.end_index.value_() 57 | dump_fid_family["rif_type"] = enum_name(family.rif_type) 58 | 59 | dump_fids = {} 60 | dump_fid_family["fids"] = dump_fids 61 | 62 | for fid in helpers.list_for_each_entry("struct mlxsw_sp_fid", 63 | family.fids_list.address_of_(), 64 | "list"): 65 | dump_fid = {} 66 | dump_fids[fid.fid_index.value_()] = dump_fid 67 | dump_fid["fid_offset"] = fid.fid_offset.value_() 68 | 69 | if family_type_n == "8021Q": 70 | fid_8021q = drgn.container_of(fid, "struct mlxsw_sp_fid_8021q", 71 | "common") 72 | dump_fid["vid"] = fid_8021q.vid.value_() 73 | 74 | if family_type_n == "8021D": 75 | fid_8021d = drgn.container_of(fid, "struct mlxsw_sp_fid_8021d", 76 | "common") 77 | br_ifindex = fid_8021d.br_ifindex.value_() 78 | br_dev = helpers.net.netdev_get_by_index(mlxsw_sp.netns(), 79 | br_ifindex) 80 | 81 | dump_fid["br_ifindex"] = br_ifindex 82 | dump_fid["br_ifname"] = br_dev.name.string_().decode("utf-8") 83 | 84 | dump_fid["ref_count"] = 
fid.ref_count.refs.counter.value_() 85 | 86 | if fid.rif.value_(): 87 | rif_dump = {} 88 | dump_fid["rif"] = rif_dump 89 | 90 | rif_dump["index"] = fid.rif.rif_index.value_() 91 | rif_dump["ifindex"] = fid.rif.dev.ifindex.value_() 92 | rif_dump["ifname"] = fid.rif.dev.name.string_().decode("utf-8") 93 | 94 | if fid.vni_valid.value_(): 95 | nve_ifindex = fid.nve_ifindex.value_() 96 | nve_dev = helpers.net.netdev_get_by_index(mlxsw_sp.netns(), 97 | nve_ifindex) 98 | 99 | dump_fid["vni"] = ntohl(fid.vni.value_()) 100 | dump_fid["nve_ifindex"] = nve_ifindex 101 | dump_fid["nve_ifname"] = nve_dev.name.string_().decode("utf-8") 102 | 103 | if fid.nve_flood_index_valid.value_(): 104 | dump_fid["nve_flood_index"] = fid.nve_flood_index.value_() 105 | 106 | if family_type_n == "8021Q" or family_type_n == "8021D": 107 | mid_indexes_dump(fid, dump_fid) 108 | 109 | dump_port_vid_list = [] 110 | dump_fid["port_vid_list"] = dump_port_vid_list 111 | for port_vid in \ 112 | helpers.list_for_each_entry("struct mlxsw_sp_fid_port_vid", 113 | fid.port_vid_list.address_of_(), 114 | "list"): 115 | dump_port_vid = {} 116 | dump_port_vid_list.append(dump_port_vid) 117 | dump_port_vid["local_port"] = port_vid.local_port.value_() 118 | dump_port_vid["vid"] = port_vid.vid.value_() 119 | 120 | sys.stdout.write(json.dumps(dump)) 121 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-exporter.in: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Collect resmon stats and publish them via http or save them to a file.""" 3 | import argparse 4 | import json 5 | import logging 6 | import os 7 | import prometheus_client 8 | import sys 9 | import time 10 | import socket 11 | import tempfile 12 | 13 | from prometheus_client.core import GaugeMetricFamily 14 | 15 | RESMON_DEFAULT_SOCKDIR = "@RUNSTATEDIR@" 16 | 17 | 18 | class bind_socket: 19 | def __init__(self, sock): 20 | 
self._sock = sock 21 | self._name = None 22 | 23 | def __enter__(self): 24 | while True: 25 | self._name = tempfile.mktemp(dir="/var/run/", 26 | prefix="resmon.exporter.") 27 | try: 28 | self._sock.bind(self._name) 29 | break 30 | except OSError: 31 | pass 32 | 33 | def __exit__(self, exc_type, exc_value, exc_traceback): 34 | os.unlink(self._name) 35 | 36 | class ResmonCollector(object): 37 | """Collect resmon stats and publish them via http or save them to a 38 | file.""" 39 | 40 | def __init__(self, args): 41 | """Construct the object and parse the arguments.""" 42 | self.args = self._parse_args(args) 43 | 44 | @staticmethod 45 | def _parse_args(args): 46 | parser = argparse.ArgumentParser() 47 | group = parser.add_mutually_exclusive_group(required=True) 48 | group.add_argument( 49 | '-f', 50 | '--textfile-name', 51 | dest='textfile_name', 52 | help=('Full file path where to store data for node ' 53 | 'collector to pick up') 54 | ) 55 | group.add_argument( 56 | '-l', 57 | '--listen', 58 | dest='listen', 59 | help='Listen host:port, i.e. 0.0.0.0:9417' 60 | ) 61 | parser.add_argument( 62 | '-i', 63 | '--interval', 64 | dest='interval', 65 | type=int, 66 | help=('Number of seconds between updates of the textfile. ' 67 | 'Default is 5 seconds') 68 | ) 69 | parser.add_argument( 70 | '-1', 71 | '--oneshot', 72 | dest='oneshot', 73 | action='store_true', 74 | default=False, 75 | help='Run only once and exit. 
Useful for running in a cronjob' 76 | ) 77 | parser.add_argument( 78 | '--resmon-sockdir', 79 | dest='resmon_sockdir', 80 | default=RESMON_DEFAULT_SOCKDIR, 81 | help='The directory where resmon socket is located' 82 | ) 83 | arguments = parser.parse_args(args) 84 | if arguments.oneshot and not arguments.textfile_name: 85 | logging.error('Oneshot has to be used with textfile mode') 86 | parser.print_help() 87 | sys.exit(1) 88 | if arguments.interval and not arguments.textfile_name: 89 | logging.error('Interval has to be used with textfile mode') 90 | parser.print_help() 91 | sys.exit(1) 92 | if not arguments.interval: 93 | arguments.interval = 5 94 | return vars(arguments) 95 | 96 | def resmon_jsonout_get(self): 97 | """Open socket, execute stats command and return JSON output.""" 98 | with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as sock: 99 | with bind_socket(sock): 100 | sock.sendto(b"""{ "jsonrpc": "2.0", "id": 1, "method": "stats" }""", 101 | "%s/resmon.ctl" % self.args['resmon_sockdir']) 102 | data = sock.recv(2048) 103 | return json.loads(data) 104 | 105 | def update_resmon_stats(self, gauge_val, gauge_cap): 106 | """Update gauge with statistics from resmon.""" 107 | jsonout = self.resmon_jsonout_get() 108 | 109 | try: 110 | for count in jsonout["result"]["gauges"]: 111 | labels = [count["name"], count["descr"]] 112 | gauge_val.add_metric(labels, count["value"]) 113 | gauge_cap.add_metric(labels, count["capacity"]) 114 | except KeyError: 115 | print("Failed to get stats") 116 | sys.exit(1) 117 | 118 | def collect(self): 119 | """ 120 | Collect the stats. 121 | 122 | Collect the stats and yield them. Prometheus client library 123 | uses this method to respond to http queries or save them to disk. 
124 | """ 125 | gauge_val = GaugeMetricFamily('node_net_resmon_stats', 126 | 'Resmon stats', 127 | labels=['name', 'descr']) 128 | 129 | gauge_cap = GaugeMetricFamily('node_net_resmon_stats_capacity', 130 | 'Resmon stats capacity', 131 | labels=['name', 'descr']) 132 | 133 | self.update_resmon_stats(gauge_val, gauge_cap) 134 | yield gauge_val 135 | yield gauge_cap 136 | 137 | if __name__ == '__main__': 138 | collector = ResmonCollector(sys.argv[1:]) 139 | 140 | registry = prometheus_client.CollectorRegistry() 141 | registry.register(collector) 142 | args = collector.args 143 | if args['listen']: 144 | (ip, port) = args['listen'].split(':') 145 | prometheus_client.start_http_server(port=int(port), 146 | addr=ip, registry=registry) 147 | while True: 148 | time.sleep(3600) 149 | if args['textfile_name']: 150 | while True: 151 | collector.collect() 152 | prometheus_client.write_to_textfile(args['textfile_name'], 153 | registry) 154 | if collector.args['oneshot']: 155 | sys.exit(0) 156 | time.sleep(args['interval']) 157 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/trapagg_example.txt: -------------------------------------------------------------------------------- 1 | Demonstration of trapagg. 2 | 3 | Devices capable of offloading the kernel’s datapath and perform 4 | functions such as bridging and routing must also be able to send 5 | specific packets to the kernel (i.e., the CPU) for processing. The 6 | fundamental ability of sending certain packets to the kernel for 7 | processing is called “packet trapping”. 8 | 9 | The devlink-trap [1] mechanism allows capable device drivers to register 10 | their supported packet traps with devlink and report trapped packets to 11 | devlink for further analysis. 12 | 13 | trapagg traces trapped packets and stores per-{trap, flow} statistics in 14 | a BPF map. 
This is achieved by attaching a BPF program to the 15 | 'devlink_trap_report' tracepoint, which is triggered whenever a packet 16 | trap is received. The aggregated statistics can then be read from user 17 | space. For example: 18 | 19 | # ./trapagg 20 | Tracing packet traps... Hit Ctrl-C to end. 21 | ^C 22 | 23 | TRAP SIP DIP SPORT DPORT IP_PROTO IS_ENCAP 24 | ingress_vlan_filter 192.0.2.1 192.0.2.2 12345 3131 17 0 1247 25 | ingress_vlan_filter 192.0.2.1 192.0.2.2 12345 44 17 0 1440 26 | 27 | It is possible to only trace packet traps of a specific type. For 28 | example, to only trace control traps: 29 | 30 | # ./trapagg -c 31 | Tracing packet traps... Hit Ctrl-C to end. 32 | ^C 33 | 34 | TRAP SIP DIP SPORT DPORT IP_PROTO IS_ENCAP 35 | ipv6_neigh_solicit :: ff02::1:ff00:2 0 0 58 0 1 36 | mld_v2_report :: ff02::16 0 0 0 0 2 37 | ipv6_neigh_solicit 2001:db8:1::2 ff02::1:ff00:1 0 0 58 0 1 38 | local_route 2001:db8:1::2 2001:db8:1::1 0 0 58 0 4 39 | local_route 192.0.2.2 192.0.2.1 0 0 1 0 186 40 | 41 | Upon termination of the trapagg process, the file descriptor associated 42 | with the trace event will be closed and the BPF program will be detached 43 | from the tracepoint. This is inconvenient in case user space wants 44 | to have the BPF program aggregating statistics for a long time and to 45 | only read them periodically. The solution is to pin both the BPF map and 46 | the BPF link [2] to the BPF file system [4]. 
For example: 47 | 48 | # ./trapagg -p 49 | # ./trapagg -s 50 | 51 | TRAP SIP DIP SPORT DPORT IP_PROTO IS_ENCAP 52 | ipv6_neigh_advert 2001:db8:1::2 fe80::7efe:90ff:feff:27d1 0 0 58 0 1 53 | local_route 2001:db8:1::2 2001:db8:1::1 0 0 58 0 11 54 | ingress_vlan_filter 192.0.2.1 192.0.2.2 12345 3131 17 0 1595 55 | 56 | These aggregated statistics can then be exported to Prometheus [5] using 57 | trapagg-exporter.py: 58 | 59 | # ./trapagg-exporter.py -l 0.0.0.0:9432 60 | 61 | The aggregated statistics are then available over HTTP for the 62 | Prometheus to periodically query: 63 | 64 | # HELP node_net_trapagg_total Aggregated trap data 65 | # TYPE node_net_trapagg_total counter 66 | node_net_trapagg_total{dip="fe80::7efe:90ff:feff:27d1",dport="0",ip_proto="58",is_encap="0",sip="2001:db8:1::2",sport="0",trap="ipv6_neigh_advert"} 6.0 67 | node_net_trapagg_total{dip="2001:db8:1::1",dport="0",ip_proto="58",is_encap="0",sip="2001:db8:1::2",sport="0",trap="local_route"} 11.0 68 | node_net_trapagg_total{dip="192.0.2.2",dport="3131",ip_proto="17",is_encap="0",sip="192.0.2.1",sport="12345",trap="ingress_vlan_filter"} 6817.0 69 | 70 | USAGE message: 71 | 72 | # ./trapagg --help 73 | Usage: trapagg [OPTION...] 74 | Dump aggregated per-{trap, flow} statistics. 
75 | 76 | USAGE: trapagg [--help] [-d] [-e] [-c] [-p] [-u] [-s] [-T] [-v] [interval] 77 | [count] 78 | 79 | EXAMPLES: 80 | trapagg # dump aggregated per-{trap, flow} statistics 81 | trapagg -d # dump aggregated statistics of drop traps only 82 | trapagg -p # pin BPF objects and exit 83 | trapagg -u # unpin BPF objects and exit 84 | trapagg -s # dump statistics from pinned objects and exit 85 | trapagg 1 10 # print 1 second summaries, 10 times 86 | trapagg -T 1 # 1s summaries with timestamps 87 | 88 | -c, --control Trace control traps only 89 | -d, --drop Trace drop traps only 90 | -e, --exception Trace exception traps only 91 | -p, --pin Pin BPF objects and exit 92 | -s, --stats Dump aggregated statistics from pinned objects and 93 | exit 94 | -T, --timestamp Include timestamp on output 95 | -u, --unpin Unpin BPF objects and exit 96 | -v, --verbose Verbose debug output 97 | -?, --help Give this help list 98 | --usage Give a short usage message 99 | -V, --version Print program version 100 | 101 | Report bugs to . 102 | 103 | [1] https://www.kernel.org/doc/html/latest/networking/devlink/devlink-trap.html 104 | [2] https://lore.kernel.org/netdev/20200228223948.360936-1-andriin@fb.com/ 105 | [3] https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html#bpffs 106 | [4] https://docs.cilium.io/en/v1.6/kubernetes/requirements/#mounted-bpf-filesystem 107 | [5] https://prometheus.io/ 108 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/fid_dump_ub0.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | 3 | fid_dump_ub0 - Dump mlxsw filtering identifier (FID) configuration 4 | 5 | SYNOPSIS: 6 | 7 | fid_dump_ub0 8 | 9 | DESCRIPTION: 10 | 11 | fid_dump_ub0 is a tool written on top of drgn, for dumping internal 12 | structures of mlxsw driver related to FID management. That way it is 13 | possible to find out how FIDs are configured in the ASIC. 
This 14 | is the old version of the tool, which can be used to debug 15 | kernels which use the legacy model. Starting from kernel commit 16 | 798661c73672 ("Merge branch 'mlxsw-unified-bridge-conversion-part-6'"), 17 | the 'fid_dump' tool should be used. 18 | 19 | OUTPUT 20 | 21 | As an output, fid_dump_ub0 emits a JSON object with the following 22 | attributes: 23 | 24 | .ports.<port>.local_port 25 | The local port number of the port. 26 | 27 | .ports.<port>.virtual 28 | Whether the port is in virtual mode or not. When in virtual 29 | mode, ingress FID classification is performed based on 30 | {Port, VID}. Otherwise, based on VID alone. Virtual mode is 31 | less scalable as it requires more FID classification 32 | entries to be present in the ASIC. 33 | 34 | .fid_families.<family> 35 | The FID family. See kernel commit a11074872545 ("mlxsw: 36 | spectrum: Implement common FID core") for more details. 37 | 38 | .fid_families.<family>.start_index 39 | Start index (inclusive) for FID allocation from given FID 40 | family. 41 | 42 | .fid_families.<family>.end_index 43 | End index (inclusive) for FID allocation for given FID 44 | family. 45 | 46 | .fid_families.<family>.rif_type 47 | The type of router interface (RIF) used for RIFs constructed 48 | on top of FIDs from the given FID family. 49 | 50 | .fid_families.<family>.fids.<fid>.vid 51 | VLAN identifier (VID) mapped to the FID. Only valid for 52 | "8021Q" FIDs. 53 | 54 | .fid_families.<family>.fids.<fid>.br_ifindex 55 | VLAN-unaware bridge interface index mapped to the FID. Only 56 | valid for "8021D" FIDs. 57 | 58 | .fid_families.<family>.fids.<fid>.br_ifname 59 | VLAN-unaware bridge interface name mapped to the FID. Only 60 | valid for "8021D" FIDs. 61 | 62 | .fid_families.<family>.fids.<fid>.ref_count 63 | FID reference count. 64 | 65 | .fid_families.<family>.fids.<fid>.rif.index 66 | Index of the RIF constructed on top of the FID. 67 | 68 | .fid_families.<family>.fids.<fid>.rif.ifindex 69 | Interface index of the net device represented by the RIF.
70 | 71 | .fid_families.<family>.fids.<fid>.rif.ifname 72 | Interface name of the net device represented by the RIF. 73 | 74 | .fid_families.<family>.fids.<fid>.vni 75 | Virtual Network Identifier (VNI) mapped to the FID. 76 | 77 | .fid_families.<family>.fids.<fid>.nve_ifindex 78 | Interface index of the Network Virtualization Endpoint (NVE) 79 | whose VNI is mapped to the FID. 80 | 81 | .fid_families.<family>.fids.<fid>.nve_ifname 82 | Interface name of the Network Virtualization Endpoint (NVE) 83 | whose VNI is mapped to the FID. 84 | 85 | .fid_families.<family>.fids.<fid>.nve_flood_index 86 | Index to a linked list of underlay IP addresses in the KVDL 87 | to which BUM packets should be replicated. See kernel 88 | commit 90ea0bb55115 ("mlxsw: spectrum: Add a new type of KVD 89 | linear record") for more details. 90 | 91 | fid_dump_ub0 always outputs the complete information. Filtering and 92 | querying can be done e.g. through `jq`. 93 | 94 | EXAMPLE: 95 | 96 | # fid_dump_ub0 | jq 97 | { 98 | "ports": { 99 | "swp18": { 100 | "local_port": 1, 101 | "virtual": false 102 | }, 103 | [...]
104 | }, 105 | "fid_families": { 106 | "8021Q": { 107 | "start_index": 5120, 108 | "end_index": 9213, 109 | "rif_type": "VLAN", 110 | "fids": {} 111 | }, 112 | "8021D": { 113 | "start_index": 4096, 114 | "end_index": 5119, 115 | "rif_type": "FID", 116 | "fids": { 117 | "4096": { 118 | "br_ifindex": 96, 119 | "br_ifname": "br0", 120 | "ref_count": 3, 121 | "rif": { 122 | "index": 2, 123 | "ifindex": 96, 124 | "ifname": "br0" 125 | }, 126 | "vni": 10, 127 | "nve_ifindex": 97, 128 | "nve_ifname": "vxlan0", 129 | "nve_flood_index": 0 130 | } 131 | } 132 | }, 133 | "RFID": { 134 | "start_index": 15360, 135 | "end_index": 16383, 136 | "rif_type": "SUBPORT", 137 | "fids": { 138 | "15364": { 139 | "ref_count": 2, 140 | "rif": { 141 | "index": 4, 142 | "ifindex": 72, 143 | "ifname": "swp4" 144 | } 145 | }, 146 | "15363": { 147 | "ref_count": 2, 148 | "rif": { 149 | "index": 3, 150 | "ifindex": 71, 151 | "ifname": "swp3" 152 | } 153 | }, 154 | "15361": { 155 | "ref_count": 2, 156 | "rif": { 157 | "index": 1, 158 | "ifindex": 73, 159 | "ifname": "swp1" 160 | } 161 | } 162 | } 163 | }, 164 | "DUMMY": { 165 | "start_index": 4095, 166 | "end_index": 4095, 167 | "rif_type": "SUBPORT", 168 | "fids": { 169 | "4095": { 170 | "ref_count": 1 171 | } 172 | } 173 | } 174 | } 175 | } 176 | 177 | SEE ALSO: 178 | 179 | https://github.com/Mellanox/mlxsw/wiki 180 | https://drgn.readthedocs.io 181 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/emadump.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) 2 | #include 3 | #define PCAP_DONT_INCLUDE_PCAP_BPF_H 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "emadump.h" 14 | #include "emadump.skel.h" 15 | #include "trace_helpers.h" 16 | 17 | static struct env { 18 | bool errors; 19 | uint64_t thresh_us; 20 | 
const char *dumpfile; 21 | bool verbose; 22 | pcap_t *pcap_handle; 23 | pcap_dumper_t *pcap_dumper; 24 | struct timeval boot_tv; /* Boot time relative to the Epoch. */ 25 | } env = { 26 | .dumpfile = "/dev/stdout", 27 | }; 28 | 29 | static volatile bool exiting; 30 | 31 | const char *argp_program_version = "emadump 0.0"; 32 | const char *argp_program_bug_address = ""; 33 | const char argp_program_doc[] = 34 | "Dump EMADs to a PCAP file.\n" 35 | "\n" 36 | "USAGE: emadump [--help] [-e] [-l] [-f] [-v]\n" 37 | "\n" 38 | "EXAMPLES:\n" 39 | " emadump # dump all EMADs to stdout\n" 40 | " emadump -e # only dump EMADs (request & response) with errors\n" 41 | " emadump -l 1000 # only dump EMADs that took longer than 1000 usecs\n" 42 | " emadump -f emads.pcap # dump EMADs to emads.pcap instead of stdout\n"; 43 | 44 | static const struct argp_option opts[] = { 45 | { "errors", 'e', NULL, 0, "Only dump EMADs with errors" }, 46 | { "latency", 'l', "LAT", 0, 47 | "Only dump EMADs that took longer than specified threshold in microseconds" }, 48 | { "file", 'f', "FILE", 0, "Dump EMADs to this file" }, 49 | { "verbose", 'v', NULL, 0, "Verbose debug output" }, 50 | {}, 51 | }; 52 | 53 | static error_t parse_arg(int key, char *arg, struct argp_state *state) 54 | { 55 | switch (key) { 56 | case 'e': 57 | env.errors = true; 58 | break; 59 | case 'l': 60 | errno = 0; 61 | env.thresh_us = strtoull(arg, NULL, 0); 62 | if (errno) { 63 | fprintf(stderr, "Invaild threshold specified\n"); 64 | argp_usage(state); 65 | } 66 | break; 67 | case 'f': 68 | env.dumpfile = arg; 69 | break; 70 | case 'v': 71 | env.verbose = true; 72 | break; 73 | case ARGP_KEY_ARG: 74 | argp_usage(state); 75 | break; 76 | default: 77 | return ARGP_ERR_UNKNOWN; 78 | } 79 | return 0; 80 | } 81 | 82 | static int emadump_pcap_init(void) 83 | { 84 | struct timespec tp_real, tp_mono; 85 | int err; 86 | 87 | env.pcap_handle = pcap_open_dead(DLT_EN10MB, EMAD_MAX_LEN); 88 | if (!env.pcap_handle) { 89 | perror("pcap_open_dead"); 
90 | return -1; 91 | } 92 | 93 | env.pcap_dumper = pcap_dump_open(env.pcap_handle, env.dumpfile); 94 | if (!env.pcap_dumper) { 95 | pcap_perror(env.pcap_handle, "pcap_dump_open"); 96 | goto err_pcap_close; 97 | } 98 | 99 | /* Each event contains a timestamp which is recorded as number of 100 | * microseconds since boot (CLOCK_MONOTONIC), but in the packet header 101 | * we need to record a timestamp which is relative to the Epoch. We 102 | * therefore need to calculate the boot time relative to the Epoch. 103 | */ 104 | err = clock_gettime(CLOCK_REALTIME, &tp_real); 105 | if (err) 106 | goto err_pcap_dump_close; 107 | 108 | err = clock_gettime(CLOCK_MONOTONIC, &tp_mono); 109 | if (err) 110 | goto err_pcap_dump_close; 111 | 112 | env.boot_tv.tv_sec = tp_real.tv_sec - tp_mono.tv_sec; 113 | env.boot_tv.tv_usec = 0; 114 | 115 | return 0; 116 | 117 | err_pcap_dump_close: 118 | pcap_dump_close(env.pcap_dumper); 119 | err_pcap_close: 120 | pcap_close(env.pcap_handle); 121 | return -1; 122 | } 123 | 124 | static void emadump_pcap_fini(void) 125 | { 126 | pcap_dump_close(env.pcap_dumper); 127 | pcap_close(env.pcap_handle); 128 | } 129 | 130 | int libbpf_print_fn(enum libbpf_print_level level, const char *format, 131 | va_list args) 132 | { 133 | if (level == LIBBPF_DEBUG && !env.verbose) 134 | return 0; 135 | return vfprintf(stderr, format, args); 136 | } 137 | 138 | static void sig_handler(int sig) 139 | { 140 | exiting = true; 141 | } 142 | 143 | static int handle_event(void *ctx, void *data, size_t data_sz) 144 | { 145 | const struct emad_event *e = data; 146 | struct pcap_pkthdr hdr; 147 | 148 | hdr.caplen = e->len; 149 | hdr.len = e->len; 150 | hdr.ts.tv_sec = env.boot_tv.tv_sec + (e->ts / 1000000); 151 | hdr.ts.tv_usec = env.boot_tv.tv_usec + (e->ts % 1000000); 152 | 153 | pcap_dump((unsigned char *) env.pcap_dumper, &hdr, 154 | (const unsigned char *) e->buf); 155 | /* In case packets are written to stdout, make sure each packet is 156 | * immediately written and not 
buffered. 157 | */ 158 | fflush(NULL); 159 | 160 | return 0; 161 | } 162 | 163 | int main(int argc, char **argv) 164 | { 165 | static const struct argp argp = { 166 | .options = opts, 167 | .parser = parse_arg, 168 | .doc = argp_program_doc, 169 | }; 170 | struct ring_buffer *rb = NULL; 171 | struct emadump_bpf *obj; 172 | int err; 173 | 174 | err = argp_parse(&argp, argc, argv, 0, NULL, NULL); 175 | if (err) 176 | return err; 177 | 178 | libbpf_set_print(libbpf_print_fn); 179 | 180 | err = bump_memlock_rlimit(); 181 | if (err) { 182 | fprintf(stderr, "Failed to increase rlimit: %d\n", err); 183 | return 1; 184 | } 185 | 186 | obj = emadump_bpf__open(); 187 | if (!obj) { 188 | fprintf(stderr, "Failed to open BPF object\n"); 189 | return 1; 190 | } 191 | 192 | /* Initialize global data (filtering options). */ 193 | obj->rodata->targ_errors = env.errors; 194 | obj->rodata->targ_thresh_us = env.thresh_us; 195 | 196 | err = emadump_bpf__load(obj); 197 | if (err) { 198 | fprintf(stderr, "Failed to load BPF object: %d\n", err); 199 | goto cleanup; 200 | } 201 | 202 | err = emadump_bpf__attach(obj); 203 | if (err) { 204 | fprintf(stderr, "Failed to attach BPF program\n"); 205 | goto cleanup; 206 | } 207 | 208 | signal(SIGINT, sig_handler); 209 | signal(SIGTERM, sig_handler); 210 | 211 | /* Set up ring buffer polling. */ 212 | rb = ring_buffer__new(bpf_map__fd(obj->maps.rb), handle_event, NULL, 213 | NULL); 214 | if (!rb) { 215 | err = -1; 216 | fprintf(stderr, "Failed to create ring buffer\n"); 217 | goto cleanup; 218 | } 219 | 220 | err = emadump_pcap_init(); 221 | if (err) { 222 | fprintf(stderr, "Failed to initialize PCAP\n"); 223 | goto rb_free; 224 | } 225 | 226 | while (!exiting) { 227 | err = ring_buffer__poll(rb, 100 /* Timeout, ms */); 228 | /* Ctrl-C will cause -EINTR. 
*/ 229 | if (err == -EINTR) { 230 | err = 0; 231 | break; 232 | } 233 | if (err < 0) { 234 | printf("Error polling ring buffer: %d\n", err); 235 | break; 236 | } 237 | } 238 | 239 | emadump_pcap_fini(); 240 | rb_free: 241 | ring_buffer__free(rb); 242 | cleanup: 243 | emadump_bpf__destroy(obj); 244 | 245 | return err != 0; 246 | } 247 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-dump.8.md: -------------------------------------------------------------------------------- 1 | % resmon-dump(8) | Linux 2 | 3 | NAME 4 | ==== 5 | 6 | `resmon dump` - Show the contents of the tables that the daemon uses to 7 | keep track of resource allocation. 8 | 9 | SYNOPSIS 10 | ======== 11 | 12 | `resmon dump list tables` 13 | 14 | `resmon dump table ` 15 | 16 | DESCRIPTION 17 | =========== 18 | 19 | The `dump` family of commands serves for introspecting state of the daemon. 20 | It allows querying which internal tables the daemon has, and dumping the 21 | contents of individual tables. 22 | 23 | Note that the `dump` tool introspects tables, not resources. Some tables 24 | hold more than one resource, and some resources are backed by more than one 25 | table. A resource reference in some form is to be expected in individual 26 | dumps, but is not required to be -- a helper table that is wholly 27 | resource-agnostic is theoretically possible. 28 | 29 | The command `resmon dump list tables` returns the list of names of tables 30 | that the daemon knows about. These names can be plugged into the `resmon 31 | dump table` command to request a dump of that table. 32 | 33 | In verbose mode, `resmon dump list tables` lists for each table also the 34 | number of rows, and a sequence number, a 32-bit integer value that gets 35 | incremented every time a row is added to or removed from the table. 36 | 37 | PARAMETERS 38 | ========== 39 | 40 | `table
` 41 | 42 | : Name of the table to dump. The list of valid table names can be obtained 43 | through the `dump list tables` command. 44 | 45 | EXAMPLE 46 | ======= 47 | 48 | ``` 49 | $ resmon emad string 08040000801382012d68bbc20004cbd3102100000000000000000` 50 | `00000000000000000000000000000000000000000000000000000` 51 | `00000000000000000000000000000000000000000000000000000` 52 | `00000000000000000000000000000000000000000000000000000` 53 | `00000000000000000000000000000000000000000000000000000` 54 | `0000000000000000000000000000000180f000000010000000000` 55 | `0000000020000000000000000000000000c601020380200002000` 56 | `00000000000000000000000000000000000000000000000010000 57 | $ resmon dump table ralue 58 | || dip | vr || resource | slots || 59 | || 198.1.2.3/32 | 0 || LPM_IPV4 | 1 || 60 | ``` 61 | 62 | RPC REQUEST: get_tables 63 | ======================= 64 | 65 | For the `dump list tables` command: 66 | 67 | ``` 68 | { 69 | "jsonrpc": "2.0", 70 | "id": $ID, 71 | "method": "get_tables" 72 | } 73 | ``` 74 | 75 | RPC RESPONSE: get_tables 76 | ======================== 77 | 78 | ``` 79 | { 80 | "jsonrpc": "2.0", 81 | "id": $ID, 82 | "result": { 83 | "tables": [ 84 | { 85 | "name": $NAME, 86 | "seqnn": $SEQNN, 87 | "nrows": $NROWS 88 | }, 89 | ... 90 | ] 91 | } 92 | } 93 | ``` 94 | 95 | `$NAME` is a string representing the name of the table, `$SEQNN` is a 96 | sequence number, a 32-bit quantity that gets incremented every time a row 97 | is added to or deleted from the table, and `$NROWS` is number of rows in 98 | the table. 99 | 100 | Caveats 101 | ------- 102 | 103 | This is a provisional API. It can change or go away in the future. 104 | 105 | RPC REQUEST: next_row 106 | ======================= 107 | 108 | ``` 109 | { 110 | "jsonrpc": "2.0", 111 | "id": $ID, 112 | "method": "next_row", 113 | "params": { 114 | "table": $NAME 115 | } 116 | } 117 | ``` 118 | 119 | See below for the usage details. 
120 | 121 | Caveats 122 | ------- 123 | 124 | This is a provisional API. It can change or go away in the future. 125 | 126 | RPC RESPONSE: next_row 127 | ====================== 128 | 129 | Either the request returns one row from the table: 130 | 131 | ``` 132 | { 133 | "jsonrpc": "2.0", 134 | "id": $ID, 135 | "result": { 136 | "row": { 137 | "key": { 138 | $FIELD1: $VALUE1, 139 | $FIELD2: $VALUE2, 140 | ... 141 | }, 142 | "value": { 143 | $FIELD3: $VALUE3, 144 | $FIELD4: $VALUE4, 145 | ... 146 | } 147 | } 148 | } 149 | } 150 | ``` 151 | 152 | In this case, the result contains a JSON object with fields "key" and 153 | "value". The "key" object contains fields that form the unique key of this 154 | table record. The "value" object contains other information, typically what 155 | resource is being allocated by this row. 156 | 157 | Or it returns a null, indicating the iteration is over: 158 | 159 | ``` 160 | { 161 | "jsonrpc": "2.0", 162 | "id": $ID, 163 | "result": { 164 | "row": null 165 | } 166 | } 167 | ``` 168 | 169 | The `next_row` RPC is meant to be called repeatedly, while it keeps 170 | yielding rows. The iteration stops after a null row is returned. 171 | 172 | Caveats 173 | ------- 174 | 175 | The RPC is not designed to handle concurrent iteration (each table can be 176 | iterated only by one client at a time), and does not attempt to achieve 177 | atomicity. 178 | 179 | The iteration will be stopped prematurely if there were changes to the 180 | table that make maintenance of a consistent cursor too difficult. Such 181 | changes will be reflected in a seqnn bump. 182 | 183 | The RPC is best-effort in that it is unreasonably difficult to make sure 184 | that the dump is consistent.
A way to maximize the likelihood of detecting 185 | an inconsistent dump is by: 186 | 187 | - observing table seqnn before and after the iteration: if it does not 188 | change, that is an indication that no rows were added or removed 189 | (though there is also a remote possibility that exactly 2^32 changes, 190 | or a multiple thereof, took place and the 32-bit counter wrapped around). 191 | 192 | - checking the number of rows actually dumped. If it is fewer than the number 193 | of rows reported for the table before the dump, then the dump certainly is 194 | incomplete. However, even if the number of rows matches, what could have 195 | happened is that concurrent access to the cursor caused this client to 196 | inadvertently restart the iteration. Then further concurrent accesses 197 | caused it to get exactly the right number of rows, but some are missing 198 | and some are duplicated. 199 | 200 | - looking for duplicates among the reported keys. 201 | 202 | The first two measures are implemented in the resmon command-line client. 203 | Users that rely on the interface in production (which they should not) and 204 | want to ensure consistent dumps, should implement a mutual exclusion scheme 205 | on the layer above the RPC. Provided a locking scheme, checking seqnn is 206 | realistically the only thing that is necessary to be quite sure about dump 207 | consistency. 208 | 209 | This is a provisional API. It can change or go away in the future. 210 | 211 | SEE ALSO 212 | ======== 213 | 214 | resmon(8) 215 | 216 | [JSON RPC specification][JSON RPC]. 217 | 218 | REPORTING ISSUES 219 | ================ 220 | 221 | To report issues please send an email to: mlxsw@nvidia.com.
222 | 223 | [JSON RPC]: https://www.jsonrpc.org/specification 224 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/emadlatency.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | #include "vmlinux.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "emadlatency.h" 8 | #include "bits.bpf.h" 9 | 10 | #define EMAD_ETH_HDR_LEN 0x10 11 | #define EMAD_OP_TLV_LEN 0x10 12 | #define EMAD_STRING_TLV_LEN 0x84 13 | #define EMAD_LATENCY_TLV_LEN 0x1C 14 | #define EMAD_OP_TLV_METHOD_MASK 0x7F 15 | 16 | #define EMAD_HDR_LEN EMAD_ETH_HDR_LEN + \ 17 | EMAD_OP_TLV_LEN + \ 18 | EMAD_STRING_TLV_LEN + \ 19 | EMAD_LATENCY_TLV_LEN 20 | 21 | #define EMAD_STRING_TLV_TYPE 2 22 | #define EMAD_LATENCY_TLV_TYPE 4 23 | 24 | enum { 25 | EMAD_OP_TLV_METHOD_QUERY = 1, 26 | EMAD_OP_TLV_METHOD_WRITE = 2, 27 | EMAD_OP_TLV_METHOD_EVENT = 5, 28 | }; 29 | 30 | struct emad_op_tlv { 31 | u16 resv1; 32 | u8 status; 33 | u8 resv2; 34 | u16 reg_id; 35 | u8 r_method; 36 | u8 resv3; 37 | u64 tid; 38 | }; 39 | 40 | struct emad_latency_tlv { 41 | u16 type_len; 42 | u16 resv1; 43 | u32 latency_time; 44 | u32 resv2; 45 | u32 resv3; 46 | u32 resv4; 47 | u32 resv5; 48 | u32 resv6; 49 | }; 50 | 51 | struct emad_type_len_tlv { 52 | u16 type_len; 53 | u16 pad; 54 | }; 55 | 56 | struct emad_type_len { 57 | u8 type; 58 | u16 len; 59 | }; 60 | 61 | static struct emad_type_len emad_decode_tl(u16 type_len_be) 62 | { 63 | u16 type_len = bpf_ntohs(type_len_be); 64 | 65 | return (struct emad_type_len) { 66 | .type = type_len >> 11, 67 | .len = type_len & 0x7ff, 68 | }; 69 | } 70 | 71 | #define MAX_ENTRIES 10240 72 | 73 | const volatile bool targ_ms = false; 74 | const volatile __u16 targ_reg_id = 0; 75 | 76 | struct { 77 | __uint(type, BPF_MAP_TYPE_HASH); 78 | __uint(max_entries, MAX_ENTRIES); 79 | __type(key, u64); 80 | __type(value, u64); 81 | } start SEC(".maps"); 82 | 
83 | static struct hist initial_hist; 84 | 85 | struct { 86 | __uint(type, BPF_MAP_TYPE_HASH); 87 | __uint(max_entries, MAX_ENTRIES); 88 | __type(key, struct hist_key); 89 | __type(value, struct hist); 90 | } hists_e2e SEC(".maps"); 91 | 92 | struct { 93 | __uint(type, BPF_MAP_TYPE_HASH); 94 | __uint(max_entries, MAX_ENTRIES); 95 | __type(key, struct hist_key); 96 | __type(value, struct hist); 97 | } hists_fw SEC(".maps"); 98 | 99 | SEC("fentry/mlxsw_emad_transmit") 100 | int BPF_PROG(mlxsw_emad_transmit, struct mlxsw_core *mlxsw_core, 101 | struct mlxsw_reg_trans *trans) 102 | { 103 | struct emad_latency_tlv *latency_tlv = NULL; 104 | struct emad_type_len_tlv *tmp_tlv; 105 | struct emad_type_len type_len; 106 | u64 ts = bpf_ktime_get_ns(); 107 | struct emad_op_tlv *op_tlv; 108 | u8 emad[EMAD_HDR_LEN]; 109 | struct sk_buff *skb; 110 | u32 latency_time; 111 | u32 next_tlv_off; 112 | void *buf; 113 | 114 | skb = trans->tx_skb; 115 | buf = skb->data; 116 | 117 | bpf_probe_read(emad, EMAD_HDR_LEN, buf); 118 | 119 | next_tlv_off = EMAD_ETH_HDR_LEN; 120 | op_tlv = (struct emad_op_tlv *)(emad + next_tlv_off); 121 | 122 | /* Check if there is STRING_TLV. */ 123 | next_tlv_off += EMAD_OP_TLV_LEN; 124 | tmp_tlv = (struct emad_type_len_tlv *)(emad + next_tlv_off); 125 | type_len = emad_decode_tl(tmp_tlv->type_len); 126 | if (type_len.type == EMAD_STRING_TLV_TYPE) 127 | next_tlv_off += EMAD_STRING_TLV_LEN; 128 | 129 | /* Check if there is LATENCY_TLV. 
*/ 130 | tmp_tlv = (struct emad_type_len_tlv *)(emad + next_tlv_off); 131 | type_len = emad_decode_tl(tmp_tlv->type_len); 132 | if (type_len.type == EMAD_LATENCY_TLV_TYPE) { 133 | latency_tlv = (struct emad_latency_tlv *)(emad + next_tlv_off); 134 | latency_time = bpf_ntohl(latency_tlv->latency_time); 135 | } 136 | 137 | if (targ_reg_id && bpf_ntohs(op_tlv->reg_id) != targ_reg_id) 138 | return 0; 139 | 140 | bpf_map_update_elem(&start, &op_tlv->tid, &ts, BPF_ANY); 141 | return 0; 142 | } 143 | 144 | SEC("fentry/mlxsw_emad_rx_listener_func") 145 | int BPF_PROG(mlxsw_emad_rx_listener_func, struct sk_buff *skb) 146 | { 147 | struct emad_latency_tlv *latency_tlv = NULL; 148 | u64 slot, *tsp, ts = bpf_ktime_get_ns(); 149 | struct hist *histp_e2e, *histp_fw; 150 | struct emad_type_len_tlv *tmp_tlv; 151 | struct emad_type_len type_len; 152 | struct emad_op_tlv *op_tlv; 153 | void *buf = skb->data; 154 | u8 emad[EMAD_HDR_LEN]; 155 | struct hist_key hkey; 156 | u32 next_tlv_off; 157 | u32 latency_time; 158 | s64 delta; 159 | 160 | bpf_probe_read(emad, EMAD_HDR_LEN, buf); 161 | 162 | next_tlv_off = EMAD_ETH_HDR_LEN; 163 | op_tlv = (struct emad_op_tlv *)(emad + next_tlv_off); 164 | 165 | /* Check if there is STRING_TLV. */ 166 | next_tlv_off += EMAD_OP_TLV_LEN; 167 | tmp_tlv = (struct emad_type_len_tlv *)(emad + next_tlv_off); 168 | type_len = emad_decode_tl(tmp_tlv->type_len); 169 | if (type_len.type == EMAD_STRING_TLV_TYPE) 170 | next_tlv_off += EMAD_STRING_TLV_LEN; 171 | 172 | /* Check if there is LATENCY_TLV. 
*/ 173 | tmp_tlv = (struct emad_type_len_tlv *)(emad + next_tlv_off); 174 | type_len = emad_decode_tl(tmp_tlv->type_len); 175 | if (type_len.type == EMAD_LATENCY_TLV_TYPE) { 176 | latency_tlv = (struct emad_latency_tlv *)(emad + next_tlv_off); 177 | latency_time = bpf_ntohl(latency_tlv->latency_time); 178 | } 179 | 180 | if (targ_reg_id && bpf_ntohs(op_tlv->reg_id) != targ_reg_id) 181 | return 0; 182 | 183 | tsp = bpf_map_lookup_elem(&start, &op_tlv->tid); 184 | if (!tsp) 185 | return 0; 186 | 187 | delta = (s64)(ts - *tsp); 188 | if (delta < 0) 189 | goto cleanup; 190 | 191 | __builtin_memset(&hkey, 0, sizeof(hkey)); 192 | hkey.reg_id = bpf_ntohs(op_tlv->reg_id); 193 | hkey.write = ((op_tlv->r_method & EMAD_OP_TLV_METHOD_MASK) == 194 | EMAD_OP_TLV_METHOD_WRITE); 195 | 196 | /* Lookup at hists_e2e */ 197 | histp_e2e = bpf_map_lookup_elem(&hists_e2e, &hkey); 198 | if (!histp_e2e) { 199 | bpf_map_update_elem(&hists_e2e, &hkey, &initial_hist, BPF_ANY); 200 | histp_e2e = bpf_map_lookup_elem(&hists_e2e, &hkey); 201 | if (!histp_e2e) 202 | goto cleanup; 203 | } 204 | 205 | /* Insert to histp_e2e */ 206 | if (targ_ms) 207 | delta /= 1000000U; 208 | else 209 | delta /= 1000U; 210 | 211 | slot = log2l(delta); 212 | if (slot >= MAX_SLOTS) 213 | slot = MAX_SLOTS - 1; 214 | __sync_fetch_and_add(&histp_e2e->slots[slot], 1); 215 | __sync_fetch_and_add(&histp_e2e->latency, delta); 216 | __sync_fetch_and_add(&histp_e2e->count, 1); 217 | 218 | if (!latency_tlv) 219 | goto cleanup; 220 | 221 | /* Lookup at hists_fw */ 222 | histp_fw = bpf_map_lookup_elem(&hists_fw, &hkey); 223 | if (!histp_fw) { 224 | bpf_map_update_elem(&hists_fw, &hkey, &initial_hist, BPF_ANY); 225 | histp_fw = bpf_map_lookup_elem(&hists_fw, &hkey); 226 | if (!histp_fw) 227 | goto cleanup; 228 | } 229 | 230 | /* Insert to histp_fw */ 231 | slot = log2l(latency_time); 232 | if (slot >= MAX_SLOTS) 233 | slot = MAX_SLOTS - 1; 234 | __sync_fetch_and_add(&histp_fw->slots[slot], 1); 235 | 
__sync_fetch_and_add(&histp_fw->latency, latency_time); 236 | __sync_fetch_and_add(&histp_fw->count, 1); 237 | 238 | cleanup: 239 | bpf_map_delete_elem(&start, &op_tlv->tid); 240 | return 0; 241 | } 242 | 243 | char LICENSE[] SEC("license") = "GPL"; 244 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/mlxsw.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ 2 | 3 | /* EMAD TLV Types */ 4 | enum { 5 | MLXSW_EMAD_TLV_TYPE_END, 6 | MLXSW_EMAD_TLV_TYPE_OP, 7 | MLXSW_EMAD_TLV_TYPE_STRING, 8 | MLXSW_EMAD_TLV_TYPE_REG, 9 | }; 10 | 11 | enum mlxsw_reg_ralxx_protocol { 12 | MLXSW_REG_RALXX_PROTOCOL_IPV4, 13 | MLXSW_REG_RALXX_PROTOCOL_IPV6, 14 | }; 15 | 16 | #define MLXSW_REG_RALUE_ID 0x8013 17 | 18 | enum mlxsw_reg_ralue_op { 19 | /* Read operation. If entry doesn't exist, the operation fails. */ 20 | MLXSW_REG_RALUE_OP_QUERY_READ = 0, 21 | /* Clear on read operation. Used to read entry and 22 | * clear Activity bit. 23 | */ 24 | MLXSW_REG_RALUE_OP_QUERY_CLEAR = 1, 25 | /* Write operation. Used to write a new entry to the table. All RW 26 | * fields are written for new entry. Activity bit is set 27 | * for new entries. 28 | */ 29 | MLXSW_REG_RALUE_OP_WRITE_WRITE = 0, 30 | /* Update operation. Used to update an existing route entry and 31 | * only update the RW fields that are detailed in the field 32 | * op_u_mask. If entry doesn't exist, the operation fails. 33 | */ 34 | MLXSW_REG_RALUE_OP_WRITE_UPDATE = 1, 35 | /* Clear activity. The Activity bit (the field a) is cleared 36 | * for the entry. 37 | */ 38 | MLXSW_REG_RALUE_OP_WRITE_CLEAR = 2, 39 | /* Delete operation. Used to delete an existing entry. If entry 40 | * doesn't exist, the operation fails. 
/* Every enumerator below carries an explicit value, so that adding or
 * reordering entries cannot silently renumber the others. Query and write
 * operations of one register share an enum and may reuse the same value.
 */

#define MLXSW_REG_PTAR_ID 0x3006

enum mlxsw_reg_ptar_op {
	/* allocate a TCAM region */
	MLXSW_REG_PTAR_OP_ALLOC = 0,
	/* resize a TCAM region */
	MLXSW_REG_PTAR_OP_RESIZE = 1,
	/* deallocate TCAM region */
	MLXSW_REG_PTAR_OP_FREE = 2,
	/* test allocation */
	MLXSW_REG_PTAR_OP_TEST = 3,
};

enum mlxsw_reg_ptar_key_type {
	MLXSW_REG_PTAR_KEY_TYPE_FLEX = 0x50, /* Spectrum */
	MLXSW_REG_PTAR_KEY_TYPE_FLEX2 = 0x51, /* Spectrum-2 */
};

#define MLXSW_REG_PTCE3_ID 0x3027

enum mlxsw_reg_ptce3_op {
	/* Write operation. Used to write a new entry to the table.
	 * All R/W fields are relevant for new entry. Activity bit is set
	 * for new entries. Write with v = 0 will delete the entry. Must
	 * not be used if an entry exists.
	 */
	MLXSW_REG_PTCE3_OP_WRITE_WRITE = 0,
	/* Update operation */
	MLXSW_REG_PTCE3_OP_WRITE_UPDATE = 1,
	/* Read operation */
	MLXSW_REG_PTCE3_OP_QUERY_READ = 0,
};

#define MLXSW_REG_PEFA_ID 0x300F
#define MLXSW_REG_IEDR_ID 0x3804

#define MLXSW_REG_RAUHT_ID 0x8014

enum mlxsw_reg_rauht_op {
	/* Read operation */
	MLXSW_REG_RAUHT_OP_QUERY_READ = 0,
	/* Clear on read operation. Used to read entry and clear
	 * activity bit.
	 */
	MLXSW_REG_RAUHT_OP_QUERY_CLEAR_ON_READ = 1,
	/* Add. Used to write a new entry to the table. All R/W fields are
	 * relevant for new entry. Activity bit is set for new entries.
	 */
	MLXSW_REG_RAUHT_OP_WRITE_ADD = 0,
	/* Update action. Used to update an existing route entry and
	 * only update the following fields:
	 * trap_action, trap_id, mac, counter_set_type, counter_index
	 */
	MLXSW_REG_RAUHT_OP_WRITE_UPDATE = 1,
	/* Clear activity. A bit is cleared for the entry. */
	MLXSW_REG_RAUHT_OP_WRITE_CLEAR_ACTIVITY = 2,
	/* Delete entry */
	MLXSW_REG_RAUHT_OP_WRITE_DELETE = 3,
	/* Delete all host entries on a RIF. In this command, dip
	 * field is reserved.
	 */
	MLXSW_REG_RAUHT_OP_WRITE_DELETE_ALL = 4,
};

#define MLXSW_REG_RATR_ID 0x8008

enum mlxsw_reg_ratr_op {
	/* Read */
	MLXSW_REG_RATR_OP_QUERY_READ = 0,
	/* Read and clear activity */
	MLXSW_REG_RATR_OP_QUERY_READ_CLEAR = 2,
	/* Write Adjacency entry */
	MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY = 1,
	/* Write Adjacency entry only if the activity is cleared.
	 * The write may not succeed if the activity is set. There is no
	 * direct feedback if the write has succeeded or not, however
	 * the get will reveal the actual entry (SW can compare the get
	 * response to the set command).
	 */
	MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY_ON_ACTIVITY = 3,
};

#define MLXSW_REG_SFD_ID 0x200A

enum mlxsw_reg_sfd_op {
	/* Dump entire FDB a (process according to record_locator) */
	MLXSW_REG_SFD_OP_QUERY_DUMP = 0,
	/* Query records by {MAC, VID/FID} value */
	MLXSW_REG_SFD_OP_QUERY_QUERY = 1,
	/* Query and clear activity. Query records by {MAC, VID/FID} value */
	MLXSW_REG_SFD_OP_QUERY_QUERY_AND_CLEAR_ACTIVITY = 2,
	/* Test. Response indicates if each of the records could be
	 * added to the FDB.
	 */
	MLXSW_REG_SFD_OP_WRITE_TEST = 0,
	/* Add/modify. Aged-out records cannot be added. This command removes
	 * the learning notification of the {MAC, VID/FID}. Response includes
	 * the entries that were added to the FDB.
	 */
	MLXSW_REG_SFD_OP_WRITE_EDIT = 1,
	/* Remove record by {MAC, VID/FID}. This command also removes
	 * the learning notification and aged-out notifications
	 * of the {MAC, VID/FID}. The response provides current (pre-removal)
	 * entries as non-aged-out.
	 */
	MLXSW_REG_SFD_OP_WRITE_REMOVE = 2,
	/* Remove learned notification by {MAC, VID/FID}. The response provides
	 * the removed learning notification.
	 */
	MLXSW_REG_SFD_OP_WRITE_REMOVE_NOTIFICATION = 2,
};

enum mlxsw_reg_sfd_rec_type {
	MLXSW_REG_SFD_REC_TYPE_UNICAST = 0x0,
	MLXSW_REG_SFD_REC_TYPE_UNICAST_LAG = 0x1,
	MLXSW_REG_SFD_REC_TYPE_MULTICAST = 0x2,
	MLXSW_REG_SFD_REC_TYPE_UNICAST_TUNNEL = 0xC,
};

#define MLXSW_REG_SFDF_ID 0x2013

enum mlxsw_reg_sfdf_flush_type {
	/* All SWID dynamic entries are flushed. */
	MLXSW_REG_SFDF_FLUSH_PER_SWID = 0,
	/* All FID dynamic entries are flushed. */
	MLXSW_REG_SFDF_FLUSH_PER_FID = 1,
	/* All dynamic entries pointing to port are flushed. */
	MLXSW_REG_SFDF_FLUSH_PER_PORT = 2,
	/* All FID dynamic entries pointing to port are flushed. */
	MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID = 3,
	/* All dynamic entries pointing to LAG are flushed. */
	MLXSW_REG_SFDF_FLUSH_PER_LAG = 4,
	/* All FID dynamic entries pointing to LAG are flushed. */
	MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID = 5,
	/* All entries of type "Unicast Tunnel" or "Multicast Tunnel" are
	 * flushed.
	 */
	MLXSW_REG_SFDF_FLUSH_PER_NVE = 6,
	/* All entries of type "Unicast Tunnel" or "Multicast Tunnel" are
	 * flushed, per FID.
	 */
	MLXSW_REG_SFDF_FLUSH_PER_NVE_AND_FID = 7,
};

#define MLXSW_REG_SVFA_ID 0x201C

enum mlxsw_reg_svfa_mt {
	MLXSW_REG_SVFA_MT_VID_TO_FID = 0,
	MLXSW_REG_SVFA_MT_PORT_VID_TO_FID = 1,
	MLXSW_REG_SVFA_MT_VNI_TO_FID = 2,
};

#define MLXSW_REG_RIPS_ID 0x8021

#define MLXSW_REG_SFMR_ID 0x201F

enum mlxsw_reg_sfmr_op {
	MLXSW_REG_SFMR_OP_CREATE_FID = 0,
	MLXSW_REG_SFMR_OP_DESTROY_FID = 1,
};
def parse_percent(s):
    """Parse a percentage given as "N" or "N%" and return N as an int.

    Used as an argparse ``type=`` callable. Raises
    argparse.ArgumentTypeError when the number is outside 0..100 and
    ValueError when the text is not an integer; argparse reports both as
    a regular usage error.
    """
    # endswith() instead of s[-1]: an empty string must fall through to
    # int() and raise ValueError, not escape argparse as an IndexError.
    if s.endswith('%'):
        s = s[:-1]

    n = int(s)
    if n < 0 or n > 100:
        raise argparse.ArgumentTypeError("Expected value 0..100, got %d" % n)
    return n
args = parser.parse_args()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# All divisions below use a float operand so that the results are the same
# whether "python" resolves to Python 2 or Python 3; with two int operands
# Python 2 would silently truncate (e.g. an MTU of 1500 B to 1 KiB).

asic = asics[args.asic]
linerate = linerates[args.linerate]
gearbox = gearboxes[args.gearbox]
MTU_KiB = args.MTU_B / 1024.0
cable_m = args.cable_m
use_macsec = args.use_macsec
mac_phy_delay_KiB = args.mac_phy_B / 1024.0
peer_MTU_KiB = MTU_KiB if args.peer_MTU_B is None else (args.peer_MTU_B / 1024.0)

# Spectrum allocates memory in cells. Cells are indivisible, each cell
# contains data of at most one packet. Some memory is therefore wasted. On
# systems with small cell size, a packet with the size of (cell size + 1)
# will consume almost twice as much memory as its size suggest. On systems
# with a large cell size, a 64-byte packet can consume memory many
# multiples of its size.
#
# max_waste_prob is the probability that a maximally-wasting packet
# arrives.
#
max_waste_prob = args.waste_prob_pct / 100.0

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def b_to_KiB(n):
    """Convert a number of bits to KiB."""
    return n / 8.0 / 1024

# IEEE 802.3 par 44.3 "Delay constraints"
signal_speed_mps = 3e8 * 0.66 # speed of light in vacuum * 0.66
cable_ns = cable_m / signal_speed_mps * 1e9
# ns * Gbps gives bits, here and below.
cable_KiB = b_to_KiB(cable_ns * linerate.Gbps)

# This is the amount of traffic that is still sent after the peer receives
# the PAUSE / PFC frame.
#
peer_response_KiB = b_to_KiB(linerate.peer_resp * 512)

gearbox_KiB = b_to_KiB(gearbox.ns * linerate.Gbps)
tile_KiB = b_to_KiB(asic.tile_ns * linerate.Gbps)

MACsec_ns = 120 if use_macsec else 0
MACsec_KiB = b_to_KiB(linerate.Gbps * MACsec_ns)

# Worst-case waste factor. See max_waste_prob for explanation.
#
max_waste_factor = max((2.0 * asic.cell_size_B) / (asic.cell_size_B + 1),
                       asic.cell_size_B / 64.0)

# The actual waste factor is determined on a sliding scale from 1.0 (no
# waste) to max_waste_factor, depending on the probability that a
# maximally-wasting packet appears (as determined by max_waste_prob).
#
waste_factor = 1.0 * (1 - max_waste_prob) + max_waste_factor * max_waste_prob

# At the point in time that the ASIC determines that a PAUSE / PFC frame is
# to be sent, some amount of traffic that has already been sent by the peer
# is yet to arrive; more traffic will be sent by the peer before the PAUSE
# / PFC frame arrives at the peer. This is called propagation delay and is
# composed as follows:
#
# - traffic already on the cable
# - traffic sent by the peer during latencies in MAC / PHY, gearbox,
#   MACsec, and ASIC tiles
# - traffic sent by the peer while the local switch waits to finish its own
#   transmission of an MTU-sized packet so that it can send the PAUSE / PFC
#   frame
# - traffic sent by the peer during the time that the PAUSE / PFC frame
#   travels through the cable
# - traffic sent by the peer after it has received the PAUSE / PFC frame,
#   but before it reacts to it
#
prop_delay_KiB = (2 * cable_KiB + 2 * gearbox_KiB + 2 * MACsec_KiB + tile_KiB
                  + mac_phy_delay_KiB + MTU_KiB + peer_response_KiB)

# - an extra peer-MTU-sized packet that the peer has already started
#   sending. Since this is not a maximally-wasting packet, this size is not
#   multiplied by waste_factor.
#
buffer_size_KiB = peer_MTU_KiB + prop_delay_KiB * waste_factor

xon_thresh_KiB = linerate.xon_size_KiB
xoff_thresh_KiB = xon_thresh_KiB + (MTU_KiB if linerate.hysteresis else 0)
headroom_size_KiB = xoff_thresh_KiB + buffer_size_KiB

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def show(what, value):
    """Print one tab-separated "name<TAB>value" result line."""
    print("%s\t%s" % (what, value))

# The results are printed in bytes, truncated to whole numbers.
show("xon_thresh", int(xon_thresh_KiB * 1024))
show("xoff_thresh", int(xoff_thresh_KiB * 1024))
show("headroom_size", int(headroom_size_KiB * 1024))
genlmsg_attrdata(gnlh, 0), 43 | genlmsg_attrlen(gnlh, 0), NULL); 44 | if (err < 0) 45 | goto nl_skip; 46 | 47 | attr_driver_name = nla_get_string(attrs[DEVLINK_ATTR_INFO_DRIVER_NAME]); 48 | attr_bus = nla_get_string(attrs[DEVLINK_ATTR_BUS_NAME]); 49 | attr_dev = nla_get_string(attrs[DEVLINK_ATTR_DEV_NAME]); 50 | 51 | if (attr_driver_name == NULL || attr_bus == NULL || attr_dev == NULL) 52 | goto nl_skip; 53 | 54 | if (strstr(attr_driver_name, "mlxsw_spectrum") == NULL) 55 | goto nl_skip; 56 | 57 | *args->busname = strdup(attr_bus); 58 | if (args->busname == NULL) 59 | goto nl_skip; 60 | 61 | *args->devname = strdup(attr_dev); 62 | if (args->devname == NULL) 63 | goto free_busname; 64 | 65 | args->err = 0; 66 | return 0; 67 | 68 | free_busname: 69 | free(*args->busname); 70 | nl_skip: 71 | return NL_SKIP; 72 | } 73 | 74 | struct resmon_dl { 75 | struct nl_sock *sk; 76 | int family; 77 | }; 78 | 79 | struct resmon_dl *resmon_dl_create(void) 80 | { 81 | struct resmon_dl *dl; 82 | int err; 83 | 84 | dl = malloc(sizeof(*dl)); 85 | if (dl == NULL) 86 | return NULL; 87 | 88 | dl->sk = nl_socket_alloc(); 89 | if (!dl->sk) { 90 | fprintf(stderr, "Failed to allocate data socket\n"); 91 | goto free_dl; 92 | } 93 | 94 | err = genl_connect(dl->sk); 95 | if (err) { 96 | fprintf(stderr, "Failed to connect socket\n"); 97 | goto free_sock; 98 | } 99 | 100 | err = nl_socket_set_nonblocking(dl->sk); 101 | if (err) { 102 | fprintf(stderr, "Failed to set socket nonblocking\n"); 103 | goto free_sock; 104 | } 105 | 106 | dl->family = genl_ctrl_resolve(dl->sk, "devlink"); 107 | if (dl->family < 0) { 108 | fprintf(stderr, "Failed to resolve ID of \"devlink\" family\n"); 109 | goto free_sock; 110 | } 111 | 112 | return dl; 113 | 114 | free_sock: 115 | nl_socket_free(dl->sk); 116 | free_dl: 117 | free(dl); 118 | return NULL; 119 | } 120 | 121 | void resmon_dl_destroy(struct resmon_dl *dl) 122 | { 123 | nl_socket_free(dl->sk); 124 | free(dl); 125 | } 126 | 127 | static int 
resmon_dl_netlink_get_dev(struct nl_sock *sk, int family, 128 | char **busname, char **devname, 129 | char **error) 130 | { 131 | struct cb_args args; 132 | struct nl_cb *cb; 133 | int err; 134 | 135 | err = genl_send_simple(sk, family, DEVLINK_CMD_INFO_GET, 0, NLM_F_DUMP); 136 | if (err < 0) { 137 | resmon_fmterr(error, "Failed to send devlink get command"); 138 | return err; 139 | } 140 | 141 | args.devname = devname; 142 | args.busname = busname; 143 | args.err = -1; 144 | 145 | cb = nl_cb_alloc(NL_CB_DEFAULT); 146 | if (cb == NULL) 147 | return -NLE_NOMEM; 148 | 149 | err = nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, resmon_dl_dev_info_parser, &args); 150 | if (err < 0) { 151 | resmon_fmterr(error, "Failed to set devlink info parser"); 152 | goto err_out; 153 | } 154 | 155 | err = nl_recvmsgs(sk, cb); 156 | if (err < 0 || args.err < 0) { 157 | err = -1; 158 | resmon_fmterr(error, "Failed to receive devlink get messages from netlink"); 159 | goto err_out; 160 | } 161 | err = 0; 162 | 163 | err_out: 164 | nl_cb_put(cb); 165 | return err; 166 | } 167 | 168 | static int resmon_dl_netlink_resources_get(struct nlattr **attrs, 169 | struct nlattr *nla_resources, 170 | uint64_t *size) 171 | { 172 | struct nlattr *nla_resource[DEVLINK_ATTR_MAX + 1]; 173 | struct nlattr *attr_name, *attr_size; 174 | struct nlattr *resource; 175 | int rem, err; 176 | char *name; 177 | 178 | err = nla_parse_nested(nla_resource, DEVLINK_ATTR_MAX, nla_resources, 179 | devlink_nl_policy); 180 | if (err < 0) 181 | return err; 182 | 183 | attr_name = nla_resource[DEVLINK_ATTR_RESOURCE_NAME]; 184 | attr_size = nla_resource[DEVLINK_ATTR_RESOURCE_SIZE]; 185 | 186 | if (attr_name && attr_size) { 187 | name = nla_get_string(attr_name); 188 | if (strcmp(name, "kvd") == 0) { 189 | *size = nla_get_u64(attr_size); 190 | return 0; 191 | } 192 | } 193 | 194 | nla_for_each_nested(resource, nla_resources, rem) { 195 | if (nla_resource[DEVLINK_ATTR_RESOURCE] || 196 | nla_resource[DEVLINK_ATTR_RESOURCE_LIST]) 
197 | resmon_dl_netlink_resources_get(nla_resource, resource, 198 | size); 199 | } 200 | if (!(*size)) 201 | return -1; 202 | 203 | return 0; 204 | } 205 | 206 | static int resmon_dl_netlink_get_kvd_size(struct nl_sock *sk, int family, 207 | char *busname, char *devname, 208 | uint64_t *size, char **error) 209 | { 210 | struct nlattr *attrs[DEVLINK_ATTR_MAX + 1]; 211 | struct sockaddr_nl nla; 212 | unsigned char *buf; 213 | struct nl_msg *msg; 214 | int err, len; 215 | 216 | msg = nlmsg_alloc(); 217 | if (!msg) { 218 | resmon_fmterr(error, "Failed to allocate netlink message"); 219 | return -1; 220 | } 221 | 222 | if (!genlmsg_put(msg, 0, NL_AUTO_SEQ, family, 0, 223 | NLM_F_REQUEST, DEVLINK_CMD_RESOURCE_DUMP, 0)) 224 | goto genlmsg_put_failure; 225 | 226 | if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, busname)) 227 | goto nla_put_failure; 228 | 229 | if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, devname)) 230 | goto nla_put_failure; 231 | 232 | err = nl_send_sync(sk, msg); 233 | if (err < 0) { 234 | resmon_fmterr(error, "Failed to send devlink resource get command"); 235 | return err; 236 | } 237 | 238 | len = nl_recv(sk, &nla, &buf, NULL); 239 | if (len < 0) { 240 | resmon_fmterr(error, "Failed to receive message"); 241 | return -1; 242 | } 243 | 244 | err = genlmsg_parse((void *) buf, 0, attrs, DEVLINK_ATTR_MAX, 245 | devlink_nl_policy); 246 | if (err < 0) 247 | return err; 248 | 249 | err = resmon_dl_netlink_resources_get(attrs, 250 | attrs[DEVLINK_ATTR_RESOURCE_LIST], 251 | size); 252 | if (err < 0) 253 | return err; 254 | 255 | free(buf); 256 | return 0; 257 | 258 | nla_put_failure: 259 | genlmsg_put_failure: 260 | nlmsg_free(msg); 261 | return -EMSGSIZE; 262 | } 263 | 264 | int resmon_dl_get_kvd_size(struct resmon_dl *dl, uint64_t *size, char **error) 265 | { 266 | char *busname, *devname = NULL; 267 | int err; 268 | 269 | nl_socket_disable_auto_ack(dl->sk); 270 | 271 | err = resmon_dl_netlink_get_dev(dl->sk, dl->family, &busname, &devname, 272 | error); 
273 | if (err < 0) 274 | return -1; 275 | 276 | err = resmon_dl_netlink_get_kvd_size(dl->sk, dl->family, busname, 277 | devname, size, error); 278 | if (err < 0) 279 | resmon_fmterr(error, "Failed to get devlink resource size from netlink"); 280 | 281 | free(busname); 282 | free(devname); 283 | return err; 284 | } 285 | -------------------------------------------------------------------------------- /Debugging/drgn-tools/fid_dump.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | 3 | fid_dump - Dump mlxsw filtering identifier (FID) configuration 4 | 5 | SYNOPSIS: 6 | 7 | fid_dump 8 | 9 | DESCRIPTION: 10 | 11 | fid_dump is a tool written on top of drgn, for dumping internal 12 | structures of mlxsw driver related to FID management. That way it is 13 | possible to find out how FIDs are configured in the ASIC. 14 | 15 | OUTPUT 16 | 17 | As an output, fid_dump emits a JSON object with the following 18 | attributes: 19 | 20 | .ports..local_port 21 | The local port number of the port. 22 | 23 | .ports..virtual 24 | Whether the port is in virtual mode or not. When in virtual 25 | mode, ingress FID classification is performed based on 26 | {Port, VID}. Otherwise, based on VID alone. Virtual mode is 27 | less scalable as it requires more FID classification 28 | entries to be present in the ASIC. 29 | 30 | .fid_families. 31 | The FID family. See kernel commit a11074872545 ("mlxsw: 32 | spectrum: Implement common FID core") for more details. 33 | 34 | .fid_families..start_index 35 | Start index (inclusive) for FID allocation from given FID 36 | family. 37 | 38 | .fid_families..end_index 39 | End index (inclusive) for FID allocation for given FID 40 | family. 41 | 42 | .fid_families..rif_type 43 | The type of router interface (RIF) used for RIFs constructed 44 | on top of FIDs from the given FID family. 45 | 46 | .fid_families..fids..fid_offset 47 | FID offset, used to point into the flooding table. 
48 | 49 | .fid_families..fids..vid 50 | VLAN identifier (VID) mapped to the FID. Only valid for 51 | "8021Q" FIDs. 52 | 53 | .fid_families..fids..br_ifindex 54 | VLAN-unaware bridge interface index mapped to the FID. Only 55 | valid for "8021D" FIDs. 56 | 57 | .fid_families..fids..br_ifname 58 | VLAN-unaware bridge interface name mapped to the FID. Only 59 | valid for "8021D" FIDs. 60 | 61 | .fid_families..fids..ref_count 62 | FID reference count. 63 | 64 | .fid_families..fids..rif.index 65 | Index of the RIF constructed on top of the FID. 66 | 67 | .fid_families..fids..rif.ifindex 68 | Interface index of the net device represented by the RIF. 69 | 70 | .fid_families..fids..rif.ifname 71 | Interface name of the net device represented by the RIF. 72 | 73 | .fid_families..fids..vni 74 | Virtual Network Identifier (VNI) mapped to the FID. 75 | 76 | .fid_families..fids..nve_ifindex 77 | Interface index of the Network Virtualization Endpoint (NVE) 78 | whose VNI is mapped to the FID. 79 | 80 | .fid_families..fids..nve_ifname 81 | Interface name of the Network Virtualization Endpoint (NVE) 82 | whose VNI is mapped to the FID. 83 | 84 | .fid_families..fids..nve_flood_index 85 | Index to a linked list of underlay IP addresses in the KVDL 86 | to which BUM packets should be replicated. See kernel 87 | commit 90ea0bb55115 ("mlxsw: spectrum: Add a new type of KVD 88 | linear record") for more details. 89 | 90 | .fid_families..fids..flood_mid_indexes.UC 91 | MID index for flooding unicast packets. The value is the 92 | result of 'mid_base' + 'fid_offset', where 'mid_base' is per 93 | FID family and packet type (UC/MC/BC). See kernel commit 94 | 9f6f467a3cdb1 ("mlxsw: spectrum_fid: Set 'mid_base' as part 95 | of flood tables initialization") for more details. 96 | 97 | .fid_families..fids..flood_mid_indexes.MC 98 | MID index for flooding multicast packets. See more details 99 | in the description of 'flood_mid_indexes.UC'.
100 | 101 | .fid_families..fids..flood_mid_indexes.BC 102 | MID index for flooding broadcast packets, see more details 103 | in the description of 'flood_mid_indexes.UC'. 104 | 105 | .fid_families..fids..port_vid_list 106 | {Port, VID} pairs which are mapped to the FID. 107 | 108 | fid_dump always outputs the complete information. Filtering and 109 | querying can be done e.g. through `jq`. 110 | 111 | EXAMPLE: 112 | 113 | # fid_dump | jq 114 | { 115 | "ports": { 116 | "swp18": { 117 | "local_port": 1, 118 | "virtual": false 119 | }, 120 | [...] 121 | }, 122 | "fid_families": { 123 | "8021Q": { 124 | "start_index": 1, 125 | "end_index": 4094, 126 | "rif_type": "VLAN", 127 | "fids": {} 128 | }, 129 | "8021D": { 130 | "start_index": 4095, 131 | "end_index": 5118, 132 | "rif_type": "FID", 133 | "fids": { 134 | "4095": { 135 | "fid_offset": 0, 136 | "br_ifindex": 96, 137 | "br_ifname": "br0", 138 | "ref_count": 3, 139 | "rif": { 140 | "index": 2, 141 | "ifindex": 96, 142 | "ifname": "br0" 143 | }, 144 | "vni": 10, 145 | "nve_ifindex": 97, 146 | "nve_ifname": "vxlan0", 147 | "nve_flood_index": 0, 148 | "flood_mid_indexes": { 149 | "UC": 12282, 150 | "MC": 13306, 151 | "BC": 14330 152 | }, 153 | "port_vid_list": [ 154 | { 155 | "local_port": 97, 156 | "vid": 4095 157 | }, 158 | { 159 | "local_port": 109, 160 | "vid": 4095 161 | } 162 | ] 163 | } 164 | } 165 | }, 166 | "RFID": { 167 | "start_index": 5120, 168 | "end_index": 16383, 169 | "rif_type": "SUBPORT", 170 | "fids": { 171 | "5123": { 172 | "fid_offset": 0, 173 | "ref_count": 2, 174 | "rif": { 175 | "index": 4, 176 | "ifindex": 72, 177 | "ifname": "swp4" 178 | }, 179 | "port_vid_list": [ 180 | { 181 | "local_port": 101, 182 | "vid": 4095 183 | } 184 | ] 185 | }, 186 | "5122": { 187 | "fid_offset": 0, 188 | "ref_count": 2, 189 | "rif": { 190 | "index": 3, 191 | "ifindex": 71, 192 | "ifname": "swp3" 193 | }, 194 | "port_vid_list": [ 195 | { 196 | "local_port": 97, 197 | "vid": 4095 198 | } 199 | ] 200 | }, 201 | 
"5121": { 202 | "fid_offset": 0, 203 | "ref_count": 2, 204 | "rif": { 205 | "index": 1, 206 | "ifindex": 73, 207 | "ifname": "swp1" 208 | }, 209 | "port_vid_list": [ 210 | { 211 | "local_port": 105, 212 | "vid": 4095 213 | } 214 | ] 215 | } 216 | } 217 | }, 218 | "DUMMY": { 219 | "start_index": 5119, 220 | "end_index": 5119, 221 | "rif_type": "SUBPORT", 222 | "fids": { 223 | "5119": { 224 | "fid_offset": 0, 225 | "ref_count": 1, 226 | "port_vid_list": [] 227 | } 228 | } 229 | } 230 | } 231 | } 232 | 233 | SEE ALSO: 234 | 235 | https://github.com/Mellanox/mlxsw/wiki 236 | https://drgn.readthedocs.io 237 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/src/trapagg.bpf.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: GPL-2.0 2 | #include "vmlinux.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "trapagg.h" 8 | 9 | /* Define here to avoid conflicts with include files. */ 10 | #define ETH_HLEN 14 /* Total octets in header. */ 11 | #define ETH_P_IP 0x0800 /* Internet Protocol packet. */ 12 | #define ETH_P_IPV6 0x86DD /* IPv6 over bluebook. */ 13 | #define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header. */ 14 | #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN. 
*/ 15 | #define USHRT_MAX 65535 16 | #define IP_OFFSET 0x1FFF 17 | #define GRE_VERSION bpf_htons(0x0007) 18 | #define GRE_CSUM bpf_htons(0x8000) 19 | #define GRE_KEY bpf_htons(0x2000) 20 | #define GRE_SEQ bpf_htons(0x1000) 21 | #define GRE_IS_CSUM(f) ((f) & GRE_CSUM) 22 | #define GRE_IS_KEY(f) ((f) & GRE_KEY) 23 | #define GRE_IS_SEQ(f) ((f) & GRE_SEQ) 24 | 25 | const volatile bool targ_drop = false; 26 | const volatile bool targ_exception = false; 27 | const volatile bool targ_control = false; 28 | const volatile bool targ_all = false; 29 | 30 | struct { 31 | __uint(type, BPF_MAP_TYPE_LRU_HASH); 32 | __uint(max_entries, MAX_ENTRIES); 33 | __type(key, struct trap_flow_key); 34 | __type(value, u64); 35 | } trap_flows SEC(".maps"); 36 | 37 | static __always_inline bool flow_dissector_valid_access(struct sk_buff *skb, 38 | u16 offset, 39 | u16 hdr_size) 40 | { 41 | unsigned int len, data_len; 42 | 43 | bpf_probe_read(&len, sizeof(len), &skb->len); 44 | bpf_probe_read(&data_len, sizeof(data_len), &skb->data_len); 45 | 46 | /* Verify this variable offset does not overflow. */ 47 | if (offset > (USHRT_MAX - hdr_size)) 48 | return false; 49 | 50 | /* Make sure we only access data in linear area. */ 51 | return offset + hdr_size < len - data_len; 52 | } 53 | 54 | static __always_inline bool 55 | flow_dissector_ipv4_dissect(struct sk_buff *skb, struct trap_flow_key *flow, 56 | u16 *p_offset) 57 | { 58 | void *skb_data = skb->head + skb->mac_header; 59 | struct iphdr iph; 60 | 61 | if (!flow_dissector_valid_access(skb, *p_offset, sizeof(iph))) 62 | return false; 63 | 64 | bpf_probe_read(&iph, sizeof(iph), skb_data + *p_offset); 65 | 66 | if (iph.ihl < 5) 67 | return false; 68 | 69 | flow->addr_proto = ETH_P_IP; 70 | flow->saddrv4 = iph.saddr; 71 | flow->daddrv4 = iph.daddr; 72 | flow->ip_proto = iph.protocol; 73 | 74 | /* After the first frag, packets do not have headers to parse, so 75 | * return false to stop the dissection. 
76 | */ 77 | if (iph.frag_off & bpf_htons(IP_OFFSET)) 78 | return false; 79 | 80 | *p_offset += iph.ihl << 2; 81 | 82 | return true; 83 | } 84 | 85 | static __always_inline bool 86 | flow_dissector_ipv6_dissect(struct sk_buff *skb, struct trap_flow_key *flow, 87 | u16 *p_offset) 88 | { 89 | void *skb_data = skb->head + skb->mac_header; 90 | struct ipv6hdr ip6h; 91 | 92 | if (!flow_dissector_valid_access(skb, *p_offset, sizeof(ip6h))) 93 | return false; 94 | 95 | bpf_probe_read(&ip6h, sizeof(ip6h), skb_data + *p_offset); 96 | 97 | flow->addr_proto = ETH_P_IPV6; 98 | __builtin_memcpy(flow->saddrv6, &ip6h.saddr, sizeof(flow->saddrv6)); 99 | __builtin_memcpy(flow->daddrv6, &ip6h.daddr, sizeof(flow->daddrv6)); 100 | flow->ip_proto = ip6h.nexthdr; 101 | 102 | *p_offset += sizeof(ip6h); 103 | 104 | return true; 105 | } 106 | 107 | static __always_inline bool 108 | flow_dissector_gre_dissect(struct sk_buff *skb, struct trap_flow_key *flow, 109 | u16 *p_offset) 110 | { 111 | void *skb_data = skb->head + skb->mac_header; 112 | struct gre_base_hdr gre; 113 | 114 | if (!flow_dissector_valid_access(skb, *p_offset, sizeof(gre))) 115 | return false; 116 | 117 | bpf_probe_read(&gre, sizeof(gre), skb_data + *p_offset); 118 | 119 | if (gre.flags & GRE_VERSION) 120 | return false; 121 | 122 | *p_offset += sizeof(gre); 123 | if (GRE_IS_CSUM(gre.flags)) 124 | *p_offset += 4; 125 | if (GRE_IS_KEY(gre.flags)) 126 | *p_offset += 4; 127 | if (GRE_IS_SEQ(gre.flags)) 128 | *p_offset += 4; 129 | 130 | if (gre.protocol == bpf_htons(ETH_P_IP)) 131 | return flow_dissector_ipv4_dissect(skb, flow, p_offset); 132 | else if (gre.protocol == bpf_htons(ETH_P_IPV6)) 133 | return flow_dissector_ipv6_dissect(skb, flow, p_offset); 134 | 135 | return false; 136 | } 137 | 138 | static __always_inline bool 139 | flow_dissector_udp_dissect(struct sk_buff *skb, struct trap_flow_key *flow, 140 | u16 *p_offset) 141 | { 142 | void *skb_data = skb->head + skb->mac_header; 143 | struct udphdr udp; 144 | 145 | if 
(!flow_dissector_valid_access(skb, *p_offset, sizeof(udp))) 146 | return false; 147 | 148 | bpf_probe_read(&udp, sizeof(udp), skb_data + *p_offset); 149 | 150 | flow->sport = bpf_ntohs(udp.source); 151 | flow->dport = bpf_ntohs(udp.dest); 152 | 153 | *p_offset += bpf_ntohs(udp.len); 154 | 155 | return true; 156 | } 157 | 158 | static __always_inline bool 159 | flow_dissector_tcp_dissect(struct sk_buff *skb, struct trap_flow_key *flow, 160 | u16 *p_offset) 161 | { 162 | void *skb_data = skb->head + skb->mac_header; 163 | struct tcphdr tcp; 164 | 165 | if (!flow_dissector_valid_access(skb, *p_offset, sizeof(tcp))) 166 | return false; 167 | 168 | bpf_probe_read(&tcp, sizeof(tcp), skb_data + *p_offset); 169 | 170 | if (tcp.doff < 5 || tcp.doff > 15) 171 | return false; 172 | 173 | flow->sport = bpf_ntohs(tcp.source); 174 | flow->dport = bpf_ntohs(tcp.dest); 175 | 176 | *p_offset += tcp.doff << 2; 177 | 178 | return true; 179 | } 180 | 181 | static __always_inline void flow_dissector(struct sk_buff *skb, 182 | struct trap_flow_key *flow) 183 | { 184 | void *skb_data = skb->head + skb->mac_header; 185 | struct vlan_hdr vlan_hdr; 186 | u16 offset, eth_proto; 187 | struct ethhdr eth; 188 | 189 | /* Skip if MAC header was not set. 
*/ 190 | if (skb->mac_header == 0xffff) 191 | return; 192 | 193 | if (!flow_dissector_valid_access(skb, 0, sizeof(eth))) 194 | return; 195 | 196 | bpf_probe_read(ð, sizeof(eth), skb_data); 197 | 198 | offset = ETH_HLEN; 199 | eth_proto = bpf_ntohs(eth.h_proto); 200 | 201 | if (eth_proto == ETH_P_8021AD) { 202 | bpf_probe_read(&vlan_hdr, sizeof(vlan_hdr), skb_data + offset); 203 | offset += sizeof(struct vlan_hdr); 204 | eth_proto = bpf_ntohs(vlan_hdr.h_vlan_encapsulated_proto); 205 | } 206 | 207 | if (eth_proto == ETH_P_8021Q) { 208 | bpf_probe_read(&vlan_hdr, sizeof(vlan_hdr), skb_data + offset); 209 | offset += sizeof(struct vlan_hdr); 210 | eth_proto = bpf_ntohs(vlan_hdr.h_vlan_encapsulated_proto); 211 | } 212 | 213 | switch (eth_proto) { 214 | case ETH_P_IP: 215 | if (!flow_dissector_ipv4_dissect(skb, flow, &offset)) 216 | return; 217 | break; 218 | case ETH_P_IPV6: 219 | if (!flow_dissector_ipv6_dissect(skb, flow, &offset)) 220 | return; 221 | break; 222 | default: 223 | return; 224 | } 225 | 226 | switch (flow->ip_proto) { 227 | case IPPROTO_IPIP: 228 | flow->is_encap = true; 229 | if (!flow_dissector_ipv4_dissect(skb, flow, &offset)) 230 | return; 231 | break; 232 | case IPPROTO_IPV6: 233 | flow->is_encap = true; 234 | if (!flow_dissector_ipv6_dissect(skb, flow, &offset)) 235 | return; 236 | break; 237 | case IPPROTO_GRE: 238 | flow->is_encap = true; 239 | if (!flow_dissector_gre_dissect(skb, flow, &offset)) 240 | return; 241 | break; 242 | default: 243 | break; 244 | } 245 | 246 | switch (flow->ip_proto) { 247 | case IPPROTO_UDP: 248 | case IPPROTO_UDPLITE: 249 | if (!flow_dissector_udp_dissect(skb, flow, &offset)) 250 | return; 251 | break; 252 | case IPPROTO_TCP: 253 | if (!flow_dissector_tcp_dissect(skb, flow, &offset)) 254 | return; 255 | break; 256 | default: 257 | return; 258 | } 259 | } 260 | 261 | SEC("tp_btf/devlink_trap_report") 262 | int BPF_PROG(devlink_trap_report, const struct devlink *devlink, 263 | struct sk_buff *skb, const struct 
/* Run-time settings, filled in by parse_arg() from the command line. */
static struct env {
	bool drop;		/* -d: drop traps requested */
	bool exception;		/* -e: exception traps requested */
	bool control;		/* -c: control traps requested */
	bool all;		/* cleared by -d, -e and -c */
	bool pin;		/* -p: pin BPF objects and exit */
	bool unpin;		/* -u: unpin BPF objects and exit */
	bool stats;		/* -s: dump statistics from pinned objects */
	bool timestamp;		/* -T: print a timestamp with each summary */
	bool verbose;		/* -v: let libbpf debug messages through */
	time_t interval;	/* argument to sleep() between summaries */
	int times;		/* number of summaries to print */
} env = {
	.all = true,
	.interval = 99999999,
	.times = 99999999,
};
volatile bool exiting; 38 | 39 | const char *argp_program_version = "trapagg 0.0"; 40 | const char *argp_program_bug_address = ""; 41 | const char argp_program_doc[] = 42 | "Dump aggregated per-{trap, flow} statistics.\n" 43 | "\n" 44 | "USAGE: trapagg [--help] [-d] [-e] [-c] [-p] [-u] [-s] [-T] [-v] [interval] [count]\n" 45 | "\n" 46 | "EXAMPLES:\n" 47 | " trapagg # dump aggregated per-{trap, flow} statistics\n" 48 | " trapagg -d # dump aggregated statistics of drop traps only\n" 49 | " trapagg -p # pin BPF objects and exit\n" 50 | " trapagg -u # unpin BPF objects and exit\n" 51 | " trapagg -s # dump statistics from pinned objects and exit\n" 52 | " trapagg 1 10 # print 1 second summaries, 10 times\n" 53 | " trapagg -T 1 # 1s summaries with timestamps\n"; 54 | 55 | static const struct argp_option opts[] = { 56 | { "drop", 'd', NULL, 0, "Trace drop traps only" }, 57 | { "exception", 'e', NULL, 0, "Trace exception traps only" }, 58 | { "control", 'c', NULL, 0, "Trace control traps only" }, 59 | { "pin", 'p', NULL, 0, "Pin BPF objects and exit" }, 60 | { "unpin", 'u', NULL, 0, "Unpin BPF objects and exit" }, 61 | { "stats", 's', NULL, 0, "Dump aggregated statistics from pinned objects and exit" }, 62 | { "timestamp", 'T', NULL, 0, "Include timestamp on output" }, 63 | { "verbose", 'v', NULL, 0, "Verbose debug output" }, 64 | {}, 65 | }; 66 | 67 | static error_t parse_arg(int key, char *arg, struct argp_state *state) 68 | { 69 | static int pos_args; 70 | 71 | switch (key) { 72 | case 'd': 73 | env.drop = true; 74 | env.all = false; 75 | break; 76 | case 'e': 77 | env.exception = true; 78 | env.all = false; 79 | break; 80 | case 'c': 81 | env.control = true; 82 | env.all = false; 83 | break; 84 | case 'p': 85 | env.pin = true; 86 | break; 87 | case 'u': 88 | env.unpin = true; 89 | break; 90 | case 's': 91 | env.stats = true; 92 | break; 93 | case 'T': 94 | env.timestamp = true; 95 | break; 96 | case 'v': 97 | env.verbose = true; 98 | break; 99 | case ARGP_KEY_ARG: 100 
| errno = 0; 101 | if (pos_args == 0) { 102 | env.interval = strtol(arg, NULL, 10); 103 | if (errno) { 104 | fprintf(stderr, "Invalid interval\n"); 105 | argp_usage(state); 106 | } 107 | } else if (pos_args == 1) { 108 | env.times = strtol(arg, NULL, 10); 109 | if (errno) { 110 | fprintf(stderr, "Invalid times\n"); 111 | argp_usage(state); 112 | } 113 | } else { 114 | fprintf(stderr, 115 | "Unrecognized positional argument: %s\n", arg); 116 | argp_usage(state); 117 | } 118 | pos_args++; 119 | break; 120 | default: 121 | return ARGP_ERR_UNKNOWN; 122 | } 123 | return 0; 124 | } 125 | 126 | int libbpf_print_fn(enum libbpf_print_level level, const char *format, 127 | va_list args) 128 | { 129 | if (level == LIBBPF_DEBUG && !env.verbose) 130 | return 0; 131 | return vfprintf(stderr, format, args); 132 | } 133 | 134 | static void sig_handler(int sig) 135 | { 136 | exiting = true; 137 | } 138 | 139 | static int pin_objects(struct trapagg_bpf *obj) 140 | { 141 | int err; 142 | 143 | err = bpf_link__pin(obj->links.devlink_trap_report, link_pin_path); 144 | if (err) { 145 | fprintf(stderr, "Failed to pin BPF link: %d\n", err); 146 | return err; 147 | } 148 | 149 | err = bpf_map__pin(obj->maps.trap_flows, map_pin_path); 150 | if (err) { 151 | fprintf(stderr, "Failed to pin BPF map: %d\n", err); 152 | goto err_link_unpin; 153 | } 154 | 155 | return 0; 156 | 157 | err_link_unpin: 158 | bpf_link__unpin(obj->links.devlink_trap_report); 159 | return err; 160 | } 161 | 162 | static void unpin_objects() 163 | { 164 | struct bpf_link *link; 165 | int err; 166 | 167 | if (unlink(map_pin_path)) { 168 | fprintf(stderr, "Failed to unpin BPF map: %s\n", 169 | strerror(errno)); 170 | return; 171 | } 172 | 173 | link = bpf_link__open(link_pin_path); 174 | err = libbpf_get_error(link); 175 | if (err) { 176 | fprintf(stderr, "Failed to open pinned BPF link: %d\n", err); 177 | return; 178 | } 179 | bpf_link__unpin(link); 180 | bpf_link__destroy(link); 181 | } 182 | 183 | static void 
/* Print one table row for a flow whose addresses are IPv6: trap name,
 * source and destination address in text form, ports, IP protocol, the
 * encapsulation flag and the hit count.
 */
static void print_trap_v6(const struct trap_flow_key *tfk, __u64 count)
{
	char s[INET6_ADDRSTRLEN];
	char d[INET6_ADDRSTRLEN];
	struct in6_addr src;
	struct in6_addr dst;

	/* Size each copy by its own destination buffer. */
	memcpy(src.s6_addr, tfk->saddrv6, sizeof(src.s6_addr));
	memcpy(dst.s6_addr, tfk->daddrv6, sizeof(dst.s6_addr));

	printf("%-40s %-25s %-25s %-10d %-10d %-10u %-10u %-10llu\n",
	       tfk->trap_name, inet_ntop(AF_INET6, &src, s, sizeof(s)),
	       inet_ntop(AF_INET6, &dst, d, sizeof(d)), tfk->sport, tfk->dport,
	       tfk->ip_proto, tfk->is_encap, count);
}
| return -1; 238 | } 239 | 240 | for (i = 0; i < n; i++) { 241 | switch (tfks[i].addr_proto) { 242 | case ETH_P_IP: 243 | print_trap_v4(&tfks[i], counts[i]); 244 | break; 245 | case ETH_P_IPV6: 246 | print_trap_v6(&tfks[i], counts[i]); 247 | break; 248 | default: 249 | print_trap_non_ip(&tfks[i], counts[i]); 250 | break; 251 | } 252 | } 253 | 254 | return 0; 255 | } 256 | 257 | static int print_stats() 258 | { 259 | int map_fd; 260 | 261 | map_fd = bpf_obj_get(map_pin_path); 262 | if (map_fd < 0) { 263 | fprintf(stderr, "Failed to get pinned BPF map: %d\n", map_fd); 264 | return map_fd; 265 | } 266 | 267 | return print_traps(map_fd); 268 | } 269 | 270 | int main(int argc, char **argv) 271 | { 272 | static const struct argp argp = { 273 | .options = opts, 274 | .parser = parse_arg, 275 | .doc = argp_program_doc, 276 | }; 277 | struct trapagg_bpf *obj; 278 | struct tm *tm; 279 | char ts[32]; 280 | time_t t; 281 | int err; 282 | 283 | err = argp_parse(&argp, argc, argv, 0, NULL, NULL); 284 | if (err) 285 | return err; 286 | 287 | libbpf_set_print(libbpf_print_fn); 288 | 289 | err = bump_memlock_rlimit(); 290 | if (err) { 291 | fprintf(stderr, "Failed to increase rlimit: %d\n", err); 292 | return 1; 293 | } 294 | 295 | if (env.stats) 296 | return print_stats(); 297 | 298 | if (env.unpin) { 299 | unpin_objects(); 300 | return 0; 301 | } 302 | 303 | obj = trapagg_bpf__open(); 304 | if (!obj) { 305 | fprintf(stderr, "Failed to open BPF object\n"); 306 | return 1; 307 | } 308 | 309 | /* Initialize global data (filtering options). 
*/ 310 | obj->rodata->targ_drop = env.drop; 311 | obj->rodata->targ_exception = env.exception; 312 | obj->rodata->targ_control = env.control; 313 | obj->rodata->targ_all = env.all; 314 | 315 | err = trapagg_bpf__load(obj); 316 | if (err) { 317 | fprintf(stderr, "Failed to load BPF object: %d\n", err); 318 | goto cleanup; 319 | } 320 | 321 | err = trapagg_bpf__attach(obj); 322 | if (err) { 323 | fprintf(stderr, "Failed to attach BPF program\n"); 324 | goto cleanup; 325 | } 326 | 327 | if (env.pin) { 328 | err = pin_objects(obj); 329 | if (err) 330 | fprintf(stderr, "Failed to pin BPF objects: %d\n", err); 331 | goto cleanup; 332 | } 333 | 334 | signal(SIGINT, sig_handler); 335 | signal(SIGTERM, sig_handler); 336 | 337 | printf("Tracing packet traps... Hit Ctrl-C to end.\n"); 338 | 339 | /* main: poll */ 340 | while (1) { 341 | sleep(env.interval); 342 | printf("\n"); 343 | 344 | if (env.timestamp) { 345 | time(&t); 346 | tm = localtime(&t); 347 | strftime(ts, sizeof(ts), "%H:%M:%S", tm); 348 | printf("%-8s\n", ts); 349 | } 350 | 351 | err = print_traps(bpf_map__fd(obj->maps.trap_flows)); 352 | if (err) 353 | break; 354 | 355 | if (exiting || --env.times == 0) 356 | break; 357 | } 358 | 359 | cleanup: 360 | trapagg_bpf__destroy(obj); 361 | 362 | return err != 0; 363 | } 364 | -------------------------------------------------------------------------------- /Debugging/EMADs/bwz.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | """ 3 | Copyright 2018 Mellanox Technologies. All rights reserved. 4 | Licensed under the GNU General Public License, version 2 as 5 | published by the Free Software Foundation; see COPYING for details. 
6 | """ 7 | 8 | __author__ = """ 9 | petrm@mellanox.com (Petr Machata) 10 | """ 11 | 12 | import os 13 | import sys 14 | import struct 15 | import pcapy 16 | import errno 17 | import getopt 18 | from common import pcap_header_out, pcap_packet_header, tag_dict, \ 19 | tlv_bus_name, tlv_dev_name, tlv_driver_name, tlv_incoming, tlv_type, tlv_buf 20 | 21 | def usage(): 22 | sys.stdout.write( 23 | "{argv[0]} [OPTIONS...] [FILE]\n" 24 | "\n" 25 | "Bearbeitungswerkzeug is a tool for processing pcap files captured by\n" 26 | "devlink-hwmsg.py. It's a shell filter that reads packets from one stream\n" 27 | "and produces a stream of packets filtered and transformed from the input.\n" 28 | "There's a simple DSL for selecting which packets to filter out and which\n" 29 | "to keep, and a way to describe what information and in which format should\n" 30 | "be included in the output stream.\n" 31 | "\n" 32 | "Options:\n" 33 | " -f EXPR Packet filter expression (which packets to include)\n" 34 | " -s EXPR Packet slicer expression (what to include in a packet)\n" 35 | ' -r FILE Read input from FILE ("-" means stdin, the default)\n' 36 | ' -w FILE Write output to FILE ("-" means stdout, the default)\n' 37 | ' -t TYPE LINKTYPE to use in pcap header (the default is 162)\n' 38 | " --help Show this help and exit\n" 39 | " --show Show filter and slicer expressions and exit\n" 40 | "\n" 41 | "Filter expressions:\n" 42 | " Keywords:\n" 43 | " bus Name of the bus where the message was captured\n" 44 | " dev Name of the device where the message was captured\n" 45 | " driver Name of the driver managing the device\n" 46 | " incoming Whether the message was send from the device to the kernel\n" 47 | " outgoing The opposite of incoming\n" 48 | " type Type of the message\n" 49 | " buf The message itself\n" 50 | "\n" 51 | " Literals:\n" 52 | ' "X" Literal string \n' 53 | " 123 Literal numbers\n" 54 | " True,False Booleans\n" 55 | "\n" 56 | " Compound expressions:\n" 57 | " X == Y Expressions X and 
Y evaluate to the same value\n" 58 | " X != Y The opposite\n" 59 | " X & Y Boolean conjunction\n" 60 | " X | Y Boolean disjunction\n" 61 | " ~X Boolean negation ('outgoing' is the same as '~incoming')\n" 62 | " X[Y] Sequence (string) access\n" 63 | " X[Y:Z] Sequence (string) slicing\n" 64 | "\n" 65 | " Examples:\n" 66 | ' driver == "mlxsw_spectrum" # Just messages to this driver\n' 67 | ' driver[:5] == "mlxsw" # Messages to any mlxsw driver\n' 68 | ' incoming & (driver == "X") # Only incoming messages to this driver\n' 69 | "\n" 70 | "Slicer expressions:\n" 71 | " Keywords:\n" 72 | " The same suite of keywords is supported for slicing as well.\n" 73 | "\n" 74 | " Combiners:\n" 75 | " X, Y, Z Dump values of these keywords one after another\n" 76 | " tlv(X, Y, Z) Dump the values in the same TLV format\n" 77 | "\n" 78 | " Examples:\n" 79 | " buf # Just the message payload without TLV marking\n" 80 | " tlv(incoming, buf) # These two pieces of data in TLV format\n" 81 | .format(argv=sys.argv) 82 | ) 83 | 84 | try: 85 | optlist, args = getopt.gnu_getopt(sys.argv[1:], 'f:r:s:t:vw:', 86 | ["help", "show"]) 87 | except(getopt.GetoptError, e): 88 | print(e) 89 | sys.exit(1) 90 | 91 | query_string = "True" 92 | slicer_string = "tlv(*all)" 93 | show_exprs_and_exit = False 94 | read_file = "-" 95 | write_file = "-" 96 | link_type = 162 97 | 98 | opts = dict(optlist) 99 | if "--help" in opts: 100 | usage() 101 | sys.exit(0) 102 | if "--show" in opts: 103 | show_exprs_and_exit = True 104 | if "-f" in opts: 105 | query_string = opts["-f"] 106 | if "-s" in opts: 107 | slicer_string = opts["-s"] 108 | if "-r" in opts: 109 | read_file = opts["-r"] 110 | if "-w" in opts: 111 | write_file = opts["-w"] 112 | if "-v" in opts: 113 | verbose = True 114 | if "-t" in opts: 115 | link_type = int(opts["-t"]) 116 | 117 | class Q(object): 118 | def __eq__(self, other): 119 | return Binary(self, other, "(%s == %s)", lambda a, b: a == b) 120 | 121 | def __ne__(self, other): 122 | return 
Binary(self, other, "(%s != %s)", lambda a, b: a != b) 123 | 124 | def __getitem__(self, key): 125 | return Binary(self, key, "(%s[%s])", lambda a, b: a[b]) 126 | 127 | def __and__(self, other): 128 | return Binary(self, other, "(%s & %s)", lambda a, b: a and b) 129 | 130 | def __or__(self, other): 131 | return Binary(self, other, "(%s | %s)", lambda a, b: a or b) 132 | 133 | def __invert__(self): 134 | return Unary(self, "(~%s)", lambda a: not a) 135 | 136 | class Immediate(Q): 137 | def __init__(self, value, tag=None): 138 | self._tag = tag 139 | self._value = value 140 | 141 | def value(self): 142 | return self._value 143 | 144 | def tag(self): 145 | return self._tag 146 | 147 | def evaluate(self, tlv): 148 | return self 149 | 150 | def __str__(self): 151 | return repr(self._value) 152 | 153 | class Unary(Q): 154 | def __init__(self, a, fmt, f): 155 | self._a = a 156 | self._fmt = fmt 157 | self._f = f 158 | 159 | def evaluate(self, tlv): 160 | a = evaluate(self._a, tlv) 161 | return Immediate(self._f(a.value()), None) 162 | 163 | def __str__(self): 164 | return self._fmt % self._a 165 | 166 | class Binary(Q): 167 | def __init__(self, a, b, fmt, f): 168 | self._a = a 169 | self._b = b 170 | self._fmt = fmt 171 | self._f = f 172 | 173 | def evaluate(self, tlv): 174 | a = evaluate(self._a, tlv) 175 | b = evaluate(self._b, tlv) 176 | return Immediate(self._f(a.value(), b.value()), None) 177 | 178 | def __str__(self): 179 | b = self._b if isinstance(self._b, Q) else Immediate(self._b) 180 | return self._fmt % (self._a, b) 181 | 182 | class Select(Q): 183 | def __init__(self, tag, name): 184 | self._tag = tag 185 | self._name = name 186 | 187 | def evaluate(self, tlv): 188 | return Immediate(tlv[self._tag], self._tag) 189 | 190 | def __str__(self): 191 | return self._name 192 | 193 | class Slicer(object): 194 | def __init__(self, gen): 195 | self._items = list(gen) 196 | 197 | def slice_data(self, tlv): 198 | ret = bytearray() 199 | for item in self._items: 200 | a = 
evaluate(item, tlv) 201 | tag = a.tag() 202 | if tag is None: 203 | raise RuntimeError("%s has indeterminate tag" % str(item)) 204 | 205 | v = tag_dict[tag].encode(a.value()) 206 | ret += self.pack(tag, v) 207 | return ret 208 | 209 | class IterableSlicer(Slicer): 210 | def pack(self, tag, data): 211 | return data 212 | 213 | def __str__(self): 214 | return ", ".join(str(item) for item in self._items) 215 | 216 | class TLVSlicer(Slicer): 217 | def pack(self, tag, data): 218 | return struct.pack("HH", tag, len(data)) + data 219 | 220 | def __str__(self): 221 | return "tlv(%s)" % ", ".join(str(item) for item in self._items) 222 | 223 | def evaluate(obj, tlv): 224 | if isinstance(obj, Q): 225 | return obj.evaluate(tlv) 226 | else: 227 | return Immediate(obj) 228 | 229 | def slice_data(obj, tlv): 230 | if isinstance(obj, Slicer): 231 | return obj.slice_data(tlv) 232 | 233 | if isinstance(obj, (tuple, list)): 234 | gen = iter(obj) 235 | else: 236 | gen = iter((obj, )) 237 | return IterableSlicer(gen).slice_data(tlv) 238 | 239 | class Query: 240 | bus = Select(tlv_bus_name.tag(), "bus") 241 | dev = Select(tlv_dev_name.tag(), "dev") 242 | driver = Select(tlv_driver_name.tag(), "driver") 243 | incoming = Select(tlv_incoming.tag(), "incoming") 244 | outgoing = ~incoming 245 | type = Select(tlv_type.tag(), "type") 246 | buf = Select(tlv_buf.tag(), "buf") 247 | v = Immediate 248 | 249 | query = eval(query_string, dict(Query.__dict__)) 250 | slicer = eval(slicer_string, dict(Query.__dict__), 251 | {"tlv": lambda *args: TLVSlicer(iter(args)), 252 | "all": (Query.bus, Query.dev, Query.driver, Query.incoming, 253 | Query.type, Query.buf)}) 254 | 255 | if show_exprs_and_exit: 256 | sys.stderr.write("filter=%s\n" % str(query)) 257 | sys.stderr.write("slice=%s\n" % str(slicer)) 258 | sys.exit(0) 259 | 260 | def read_tlv(data): 261 | ret = {} 262 | while len(data) != 0: 263 | tag, length = struct.unpack("HH", data[:4]) 264 | data = data[4:] 265 | value = 
tag_dict[tag].decode(data[:length]) 266 | data = data[length:] 267 | ret[tag] = value 268 | return ret 269 | 270 | def main(): 271 | out = os.fdopen(1, "wb") if write_file == "-" else open(write_file, "wb") 272 | pcap_header_out(out, link_type) 273 | 274 | r = pcapy.open_offline(read_file) 275 | while True: 276 | try: 277 | hdr, payload = r.next() 278 | except pcapy.PcapError: 279 | break 280 | 281 | if hdr == None: 282 | break 283 | secs, usecs = hdr.getts() 284 | tlv = read_tlv(payload) 285 | if evaluate(query, tlv).value(): 286 | data = slice_data(slicer, tlv) 287 | 288 | try: 289 | out.write(pcap_packet_header(secs, usecs, len(data))) 290 | out.write(data) 291 | out.flush() 292 | except(IOError, e): 293 | if e.errno == errno.EPIPE: 294 | return 295 | raise 296 | 297 | if __name__ == '__main__': 298 | try: 299 | main() 300 | except KeyboardInterrupt: 301 | sys.stderr.write("Interrupted.\n") 302 | -------------------------------------------------------------------------------- /Debugging/libbpf-tools/resmon/resmon-back.c: -------------------------------------------------------------------------------- 1 | // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "resmon.h" 10 | #include "resmon.skel.h" 11 | #include "trace_helpers.h" 12 | 13 | struct resmon_back { 14 | const struct resmon_back_cls *cls; 15 | }; 16 | 17 | struct resmon_back_cls { 18 | struct resmon_back *(*init)(void); 19 | void (*fini)(struct resmon_back *back); 20 | 21 | int (*get_capacity)(struct resmon_back *back, uint64_t *capacity, 22 | char **error); 23 | bool (*handle_method)(struct resmon_back *back, 24 | struct resmon_stat *stat, 25 | struct resmon_reg *rreg, 26 | const char *method, 27 | struct resmon_sock *peer, 28 | struct json_object *params_obj, 29 | struct json_object *id); 30 | int (*pollfd)(struct resmon_back *back); 31 | int (*activity)(struct resmon_back *back, struct resmon_stat *stat, 32 
| struct resmon_reg *rreg); 33 | }; 34 | 35 | struct resmon_back * 36 | resmon_back_init(const struct resmon_back_cls *cls) 37 | { 38 | return cls->init(); 39 | } 40 | 41 | void resmon_back_fini(struct resmon_back *back) 42 | { 43 | return back->cls->fini(back); 44 | } 45 | 46 | int resmon_back_get_capacity(struct resmon_back *back, uint64_t *capacity, 47 | char **error) 48 | { 49 | return back->cls->get_capacity(back, capacity, error); 50 | } 51 | 52 | bool resmon_back_handle_method(struct resmon_back *back, 53 | struct resmon_stat *stat, 54 | struct resmon_reg *rreg, 55 | const char *method, 56 | struct resmon_sock *peer, 57 | struct json_object *params_obj, 58 | struct json_object *id) 59 | { 60 | if (back->cls->handle_method == NULL) 61 | return false; 62 | return back->cls->handle_method(back, stat, rreg, method, peer, 63 | params_obj, id); 64 | } 65 | 66 | int resmon_back_pollfd(struct resmon_back *back) 67 | { 68 | return back->cls->pollfd(back); 69 | } 70 | 71 | int resmon_back_activity(struct resmon_back *back, struct resmon_stat *stat, 72 | struct resmon_reg *rreg) 73 | { 74 | return back->cls->activity(back, stat, rreg); 75 | } 76 | 77 | struct resmon_back_hw { 78 | struct resmon_back base; 79 | struct resmon_dl *dl; 80 | struct resmon_bpf *bpf_obj; 81 | struct ring_buffer *ringbuf; 82 | struct resmon_stat *stat; 83 | struct resmon_reg *rreg; 84 | }; 85 | 86 | static int resmon_back_libbpf_print_fn(enum libbpf_print_level level, 87 | const char *format, 88 | va_list args) 89 | { 90 | int prio = 0; 91 | 92 | if ((int)level > env.verbosity) 93 | return 0; 94 | 95 | switch (level) { 96 | case LIBBPF_WARN: 97 | prio = LOG_WARNING; 98 | break; 99 | case LIBBPF_INFO: 100 | prio = LOG_INFO; 101 | break; 102 | case LIBBPF_DEBUG: 103 | prio = LOG_DEBUG; 104 | break; 105 | } 106 | 107 | vsyslog(prio, format, args); 108 | return 0; 109 | } 110 | 111 | static int resmon_back_hw_rb_sample_cb(void *ctx, void *data, size_t len) 112 | { 113 | struct resmon_back_hw *back 
= ctx; 114 | char *error; 115 | int rc; 116 | 117 | rc = resmon_reg_process_emad(back->rreg, back->stat, data, len, &error); 118 | if (rc != 0) { 119 | syslog(LOG_ERR, "EMAD processing error: %s", error); 120 | free(error); 121 | } 122 | return 0; 123 | } 124 | 125 | static struct resmon_back * 126 | resmon_back_hw_init(void) 127 | { 128 | struct resmon_back_hw *back; 129 | struct ring_buffer *ringbuf; 130 | struct resmon_bpf *bpf_obj; 131 | struct resmon_dl *dl; 132 | int rc; 133 | 134 | back = malloc(sizeof(*back)); 135 | if (back == NULL) 136 | return NULL; 137 | 138 | dl = resmon_dl_create(); 139 | if (dl == NULL) { 140 | fprintf(stderr, "Failed to open netlink socket\n"); 141 | goto free_back; 142 | } 143 | 144 | libbpf_set_print(resmon_back_libbpf_print_fn); 145 | 146 | rc = bump_memlock_rlimit(); 147 | if (rc != 0) { 148 | fprintf(stderr, "Failed to increase rlimit: %d\n", rc); 149 | goto destroy_dl; 150 | } 151 | 152 | bpf_obj = resmon_bpf__open(); 153 | if (bpf_obj == NULL) { 154 | fprintf(stderr, "Failed to open the resmon BPF object\n"); 155 | goto destroy_dl; 156 | } 157 | 158 | rc = resmon_bpf__load(bpf_obj); 159 | if (rc != 0) { 160 | fprintf(stderr, "Failed to load the resmon BPF object\n"); 161 | goto destroy_bpf; 162 | } 163 | 164 | ringbuf = ring_buffer__new(bpf_map__fd(bpf_obj->maps.ringbuf), 165 | resmon_back_hw_rb_sample_cb, back, NULL); 166 | if (ringbuf == NULL) 167 | goto destroy_bpf; 168 | 169 | rc = resmon_bpf__attach(bpf_obj); 170 | if (rc != 0) { 171 | fprintf(stderr, "Failed to attach BPF program\n"); 172 | goto free_ringbuf; 173 | } 174 | 175 | *back = (struct resmon_back_hw) { 176 | .base.cls = &resmon_back_cls_hw, 177 | .bpf_obj = bpf_obj, 178 | .ringbuf = ringbuf, 179 | .dl = dl, 180 | }; 181 | 182 | return &back->base; 183 | 184 | free_ringbuf: 185 | ring_buffer__free(ringbuf); 186 | destroy_bpf: 187 | resmon_bpf__destroy(bpf_obj); 188 | destroy_dl: 189 | resmon_dl_destroy(dl); 190 | free_back: 191 | free(back); 192 | return NULL; 
193 | } 194 | 195 | static void resmon_back_hw_fini(struct resmon_back *base) 196 | { 197 | struct resmon_back_hw *back = 198 | container_of(base, struct resmon_back_hw, base); 199 | 200 | resmon_bpf__detach(back->bpf_obj); 201 | ring_buffer__free(back->ringbuf); 202 | resmon_bpf__destroy(back->bpf_obj); 203 | resmon_dl_destroy(back->dl); 204 | free(back); 205 | } 206 | 207 | static int resmon_back_hw_get_capacity(struct resmon_back *base, 208 | uint64_t *capacity, 209 | char **error) 210 | { 211 | struct resmon_back_hw *back = 212 | container_of(base, struct resmon_back_hw, base); 213 | 214 | return resmon_dl_get_kvd_size(back->dl, capacity, error); 215 | } 216 | 217 | static int resmon_back_hw_pollfd(struct resmon_back *base) 218 | { 219 | struct resmon_back_hw *back = 220 | container_of(base, struct resmon_back_hw, base); 221 | 222 | return ring_buffer__epoll_fd(back->ringbuf); 223 | } 224 | 225 | static int resmon_back_hw_activity(struct resmon_back *base, 226 | struct resmon_stat *stat, 227 | struct resmon_reg *rreg) 228 | { 229 | struct resmon_back_hw *back = 230 | container_of(base, struct resmon_back_hw, base); 231 | int n; 232 | 233 | back->stat = stat; 234 | back->rreg = rreg; 235 | n = ring_buffer__consume(back->ringbuf); 236 | back->rreg = NULL; 237 | back->stat = NULL; 238 | if (n < 0) 239 | return -1; 240 | return 0; 241 | } 242 | 243 | const struct resmon_back_cls resmon_back_cls_hw = { 244 | .init = resmon_back_hw_init, 245 | .fini = resmon_back_hw_fini, 246 | .get_capacity = resmon_back_hw_get_capacity, 247 | .pollfd = resmon_back_hw_pollfd, 248 | .activity = resmon_back_hw_activity, 249 | }; 250 | 251 | struct resmon_back_mock { 252 | struct resmon_back base; 253 | }; 254 | 255 | static struct resmon_back * 256 | resmon_back_mock_init(void) 257 | { 258 | struct resmon_back_mock *back; 259 | 260 | back = malloc(sizeof(*back)); 261 | if (back == NULL) 262 | return NULL; 263 | 264 | *back = (struct resmon_back_mock) { 265 | .base.cls = 
&resmon_back_cls_mock, 266 | }; 267 | 268 | return &back->base; 269 | } 270 | 271 | static void resmon_back_mock_fini(struct resmon_back *back) 272 | { 273 | free(back); 274 | } 275 | 276 | static int resmon_back_mock_get_capacity(struct resmon_back *back, 277 | uint64_t *capacity, 278 | char **error) 279 | { 280 | *capacity = 10000; 281 | return 0; 282 | } 283 | 284 | static int resmon_back_mock_emad_decode_payload(uint8_t *dec, const char *enc, 285 | size_t dec_len) 286 | { 287 | for (size_t i = 0; i < dec_len; i++) { 288 | char buf[3] = {enc[2 * i], enc[2 * i + 1], '\0'}; 289 | char *endptr = NULL; 290 | long byte; 291 | 292 | errno = 0; 293 | byte = strtol(buf, &endptr, 16); 294 | if (errno || *endptr != '\0') 295 | return -1; 296 | dec[i] = byte; 297 | } 298 | return 0; 299 | } 300 | 301 | static void resmon_back_mock_handle_emad(struct resmon_stat *stat, 302 | struct resmon_reg *rreg, 303 | struct resmon_sock *peer, 304 | struct json_object *params_obj, 305 | struct json_object *id) 306 | { 307 | struct json_object *obj; 308 | size_t dec_payload_len; 309 | uint8_t *dec_payload; 310 | const char *payload; 311 | size_t payload_len; 312 | char *error; 313 | int rc; 314 | 315 | rc = resmon_jrpc_dissect_params_emad(params_obj, &payload, 316 | &payload_len, &error); 317 | if (rc != 0) { 318 | resmon_d_respond_invalid_params(peer, id, error); 319 | free(error); 320 | return; 321 | } 322 | 323 | if (payload_len % 2 != 0) { 324 | resmon_d_respond_invalid_params(peer, id, 325 | "EMAD payload has an odd length"); 326 | return; 327 | } 328 | 329 | dec_payload_len = payload_len / 2; 330 | dec_payload = malloc(dec_payload_len); 331 | if (dec_payload == NULL) 332 | goto err_respond_memerr; 333 | 334 | rc = resmon_back_mock_emad_decode_payload(dec_payload, payload, 335 | dec_payload_len); 336 | if (rc != 0) { 337 | resmon_d_respond_invalid_params(peer, id, 338 | "EMAD payload expected in hexdump format"); 339 | goto out; 340 | } 341 | 342 | rc = resmon_reg_process_emad(rreg, 
stat, dec_payload, 343 | dec_payload_len, &error); 344 | if (rc != 0) { 345 | resmon_d_respond_error(peer, id, resmon_jrpc_e_reg_process_emad, 346 | "EMAD processing error", error); 347 | free(error); 348 | goto out; 349 | } 350 | 351 | obj = resmon_jrpc_new_object(id); 352 | if (obj == NULL) 353 | return; 354 | if (json_object_object_add(obj, "result", NULL)) 355 | goto err_free_dec_payload; 356 | 357 | resmon_jrpc_send(peer, obj); 358 | json_object_put(obj); 359 | 360 | out: 361 | free(dec_payload); 362 | return; 363 | 364 | err_free_dec_payload: 365 | free(dec_payload); 366 | json_object_put(obj); 367 | err_respond_memerr: 368 | resmon_d_respond_memerr(peer, id); 369 | } 370 | 371 | static bool resmon_back_mock_handle_method(struct resmon_back *back, 372 | struct resmon_stat *stat, 373 | struct resmon_reg *rreg, 374 | const char *method, 375 | struct resmon_sock *peer, 376 | struct json_object *params_obj, 377 | struct json_object *id) 378 | { 379 | if (strcmp(method, "emad") == 0) { 380 | resmon_back_mock_handle_emad(stat, rreg, peer, params_obj, id); 381 | return true; 382 | } else { 383 | return false; 384 | } 385 | } 386 | 387 | static int resmon_back_mock_pollfd(struct resmon_back *base) 388 | { 389 | return -1; 390 | } 391 | 392 | const struct resmon_back_cls resmon_back_cls_mock = { 393 | .init = resmon_back_mock_init, 394 | .fini = resmon_back_mock_fini, 395 | .get_capacity = resmon_back_mock_get_capacity, 396 | .handle_method = resmon_back_mock_handle_method, 397 | .pollfd = resmon_back_mock_pollfd, 398 | }; 399 | --------------------------------------------------------------------------------