├── autogen.sh ├── Makefile.am ├── tests ├── interrupts-test-shared ├── interrupts ├── irqdrc-auto-assign ├── interrupts-test ├── irqdrc └── interrupts-625 ├── .gitignore ├── src ├── Makefile.am ├── device.h ├── cfg_lex.l ├── strategy.c ├── event.h ├── log.c ├── cfg_grammar.y ├── interface.h ├── irqd.h ├── event.c ├── cpu.h ├── irqd.c ├── evenly.c ├── cpu.c └── interface.c ├── configure.ac ├── DESIGN ├── TODO ├── irqd.spec ├── rc.irqd ├── README ├── irqd.8 └── irqdrc.5 /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | 3 | autoreconf -fi; 4 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # -*- Makefile -*- 2 | 3 | SUBDIRS = src 4 | 5 | man8_MANS = irqd.8 6 | man5_MANS = irqdrc.5 7 | 8 | dist_doc_DATA = README DESIGN 9 | 10 | -------------------------------------------------------------------------------- /tests/interrupts-test-shared: -------------------------------------------------------------------------------- 1 | CPU0 CPU1 CPU2 CPU3 2 | 54: 6 6 7 10 PCI-MSI-edge eth0, eth1 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Makefile 2 | Makefile.in 3 | TAGS 4 | .deps 5 | *.o 6 | 7 | /aclocal.m4 8 | /autom4te.cache 9 | /config.* 10 | /configure 11 | /depcomp 12 | /install-sh 13 | /missing 14 | 15 | cfg_grammar.* 16 | cfg_lex.c 17 | ylwrap 18 | 19 | irqd 20 | -------------------------------------------------------------------------------- /tests/interrupts: -------------------------------------------------------------------------------- 1 | CPU0 CPU1 CPU2 CPU3 2 | 16: 26442 26174 26655 26452 IO-APIC-fasteoi uhci_hcd:usb3, pata_marvell, nvidia, eth1 3 | 46: 198430 198763 199078 198756 PCI-MSI-edge eth0 4 | 
-------------------------------------------------------------------------------- /tests/irqdrc-auto-assign: -------------------------------------------------------------------------------- 1 | 2 | /* CPU2, CPU3 */ 3 | cpuset "network" 0:1 { 4 | strategy "evenly" { 5 | initial-steering-cpus 2; 6 | }; 7 | 8 | devices { 9 | // use this cpuset for all interfaces otherwise unassigned. This 10 | // is only needed if having multiple cpusets. 11 | interface-auto-assign; 12 | }; 13 | }; 14 | 15 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | # -*- Makefile -*- 2 | 3 | CLEANFILES = cfg_lex.c cfg_grammar.c cfg_grammar.h 4 | 5 | BUILT_SOURCES = cfg_grammar.h 6 | 7 | AM_YFLAGS = -d 8 | # -d: enable debug 9 | AM_LFLAGS = -o$(LEX_OUTPUT_ROOT).c 10 | 11 | AM_CFLAGS = ${regular_CFLAGS} ${libglib_CFLAGS} ${libnl_CFLAGS} \ 12 | -D_GNU_SOURCE=1 13 | 14 | sbin_PROGRAMS = irqd 15 | 16 | irqd_SOURCES = irqd.c strategy.c evenly.c interface.c cpu.c event.c \ 17 | log.c cfg_grammar.y cfg_lex.l 18 | irqd_LDADD = ${libglib_LIBS} ${libnl_LIBS} -lrt 19 | -------------------------------------------------------------------------------- /tests/interrupts-test: -------------------------------------------------------------------------------- 1 | CPU0 CPU1 CPU2 CPU3 2 | 54: 6 6 7 10 PCI-MSI-edge eth0-TxRx-0 3 | 55: 0 0 0 0 PCI-MSI-edge eth0-TxRx-1 4 | 56: 4 2 2 2 PCI-MSI-edge eth0-TxRx-2 5 | 57: 1 0 0 0 PCI-MSI-edge eth0-TxRx-3 6 | 62: 0 0 0 0 PCI-MSI-edge eth0:lsc 7 | 63: 0 0 1 0 PCI-MSI-edge eth1 8 | 64: 901 872 879 869 PCI-MSI-edge eth1-TxRx-0 9 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([irqd], [0.8.0]) 2 | AC_PROG_INSTALL 3 | AM_INIT_AUTOMAKE([-Wall foreign]) 4 | AC_PROG_CC 5 | AC_PROG_LEX 6 | AC_PROG_YACC 7 | 8 | 
regular_CPPFLAGS="-D_FORTIFY_SOURCE=2 -D_LARGEFILE_SOURCE=1 -D_LARGE_FILES \ 9 | -D_FILE_OFFSET_BITS=64 -D_REENTRANT"; 10 | regular_CFLAGS="-Wall -Waggregate-return \ 11 | -Wmissing-declarations -Wmissing-prototypes -Wredundant-decls \ 12 | -Wshadow -Wstrict-prototypes -Winline -pipe" 13 | AC_SUBST([regular_CPPFLAGS]) 14 | AC_SUBST([regular_CFLAGS]) 15 | 16 | PKG_CHECK_MODULES([libglib], [glib-2.0]) 17 | PKG_CHECK_MODULES([libnl], [libnl-route-3.0]) 18 | 19 | AC_CONFIG_HEADERS([config.h]) 20 | AC_CONFIG_FILES([Makefile src/Makefile]) 21 | AC_OUTPUT 22 | -------------------------------------------------------------------------------- /DESIGN: -------------------------------------------------------------------------------- 1 | CPUSET 2 | 3 | Each cpuset is a contiguous, non-overlapping range of CPUs. 4 | Not defining any cpuset creates a "default" cpuset, containing 5 | all CPUs, important for backward compatibility. All NICs are 6 | assigned to this cpuset if no other is defined. 7 | 8 | It is possible to have CPUs not assigned to any cpuset, they 9 | are then unused by irqd. 10 | 11 | Each NIC may be assigned to exactly one cpuset, in case no cpuset 12 | is assigned irqd does not balance the NIC interrupts. 13 | NOTE: problematic, because the unbalanced IRQs may break proper 14 | balancing. Goal is therefore to assign all NICs to some cpuset. 15 | 16 | General format for cpusets: 17 | 18 | // first eight CPUs for the network path 19 | cpuset "network" 0 8 { 20 | }; 21 | 22 | -------------------------------------------------------------------------------- /tests/irqdrc: -------------------------------------------------------------------------------- 1 | 2 | /* cpuset "NAME" FROM[:TO] */ 3 | cpuset "network" 0:1 { 4 | // set strategy per cpuset (default: evenly) 5 | // 6 | // evenly - spread IRQs evenly across CPUs 7 | strategy "evenly" { 8 | // Initial number of CPUs used for packet steering (default: 2). 
9 | // If you specify 1 you may be better off with another strategy. 10 | // valid: 1 - 4 11 | initial-steering-cpus 2; 12 | }; 13 | 14 | devices { 15 | // assign interface manually 16 | // interface "eth0"; 17 | 18 | // optionally pass CPU subrange, which can be helpful for 19 | // testing. 20 | interface "eth1" 1; 21 | }; 22 | 23 | // Add interfaces to this cpuset automatically. Only one 24 | // cpuset can use it, otherwise the first cpuset is the 25 | // one receiving new NICs. 26 | interface-auto-assign; 27 | }; 28 | 29 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | * when traversing a cpu_info.ci_queues list there is no way 2 | of differentiating NICs from other devices. One way to solve 3 | this is probably by introducing a 'struct irq', which holds the 4 | actual type of IRQ, or just holds a ref to the 'struct device' 5 | it is part of. 6 | 7 | * use bison %locations 8 | 9 | * IRQs: since introducing split RX/TX IRQs check the IRQ stats 10 | if they are still correct 11 | 12 | * use SIGHUP for reconfiguration 13 | 14 | * config: specifying an invalid range does not give an error 15 | 16 | * load strategies from DSO, allowing to make the balancing 17 | decision dependent on system configuration. 18 | 19 | * strategy "static" { }: make empty curly braces optional 20 | 21 | * balance non-NIC IRQs 22 | 23 | * implement another strategy, using fewer CPUs 24 | 25 | * use sysfs to read CPU info (road to 'CPU hotplug') 26 | 27 | * automake: add libnl version check 28 | 29 | * configuration: ban some IRQs? 
30 | 31 | -------------------------------------------------------------------------------- /src/device.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2012. 16 | */ 17 | #ifndef DEVICE_H 18 | #define DEVICE_H 19 | 20 | struct device { 21 | enum DevType { 22 | DEV_INVAL = 0, 23 | DEV_INTERFACE, 24 | } type; 25 | }; 26 | 27 | static inline void 28 | device_init(struct device *dev, enum DevType type) 29 | { 30 | dev->type = type; 31 | } 32 | 33 | #endif /* DEVICE_H */ 34 | -------------------------------------------------------------------------------- /irqd.spec: -------------------------------------------------------------------------------- 1 | # spec file for irqd 2 | 3 | # norootforbuild 4 | 5 | Name: irqd 6 | License: GPL v2 or later 7 | Summary: IRQ Balancer 8 | Version: 0.7.0 9 | Release: 1.%{_gitrelease} 10 | AutoReqProv: on 11 | BuildRequires: pkg-config libnl-devel glib2-devel 12 | Source0: %{name}.git.tar.gz 13 | Source1: rc.irqd 14 | Source2: selfmon.irqd 15 | Source99: gitinfo 16 | Group: System/Monitoring 17 | Provides: %{_gitprovides} 18 | BuildRoot: %{_tmppath}/%{name}-%{version}-build 19 | Conflicts: irqbalance 20 | 21 | %description 22 | Alternative IRQ balancer. 
23 | 24 | %prep 25 | %setup -n %{name} 26 | 27 | %build 28 | make %{?jobs:-j%jobs} OPTFLAGS="$RPM_OPT_FLAGS" 29 | 30 | %install 31 | %makeinstall 32 | install -m 755 %SOURCE1 -D $RPM_BUILD_ROOT/etc/init.d/irqd 33 | install -m 644 %SOURCE2 -D $RPM_BUILD_ROOT/etc/selfmonng.d/irqd.check 34 | mkdir -p $RPM_BUILD_ROOT/etc/init.d/rc3.d 35 | ln -s ../irqd $RPM_BUILD_ROOT/etc/init.d/rc3.d/S05irqd 36 | ln -s ../irqd $RPM_BUILD_ROOT/etc/init.d/rc3.d/K40irqd 37 | 38 | %clean 39 | rm -rf $RPM_BUILD_ROOT 40 | 41 | %files 42 | %defattr (-,root,root,755) 43 | /usr/sbin/irqd 44 | /etc/init.d/irqd 45 | /etc/selfmonng.d/irqd.check 46 | %config /etc/init.d/rc3.d/S05irqd 47 | %config /etc/init.d/rc3.d/K40irqd 48 | 49 | %changelog 50 | * Tue Apr 12 2011 - heitzenberger@astaro.com 51 | - initial 52 | 53 | -------------------------------------------------------------------------------- /src/cfg_lex.l: -------------------------------------------------------------------------------- 1 | /* definitions */ 2 | 3 | %{ 4 | #include "cfg_grammar.h" 5 | 6 | /* #define T(msg) puts(msg) */ 7 | #define T(msg) 8 | 9 | #define STR_LEN 4096 10 | %} 11 | 12 | %option noyywrap yylineno 13 | %option bison-bridge 14 | /* %option bison-locations */ 15 | 16 | %x STRING 17 | %x COMMENT 18 | 19 | DIGIT [0-9] 20 | ID [a-zA-Z][a-zA-Z0-9-]+ 21 | 22 | char str_buf[STR_LEN], *pch; 23 | 24 | %% 25 | /* rule section */ 26 | 27 | [ \t]+ /* empty */ 28 | \n /* empty */ 29 | \/\/.*\n /* single line comment */ 30 | {DIGIT}+ yylval->val = atoi(yytext); T("NUM"); return T_NUM; 31 | cpuset T("CPUSET"); return T_CPUSET; 32 | devices T("DEVS"); return T_DEVS; 33 | interface T("IFACE"); return T_IFACE; 34 | interface-auto-assign return T_IFACE_AUTO_ASSIGN; 35 | strategy T("STRATEGY"); return T_STRATEGY; 36 | initial-steering-cpus return T_INIT_STEER_CPUS; 37 | {ID} yylval->str = strdup(yytext); T("ID"); return T_ID; 38 | \" pch = str_buf; BEGIN(STRING); 39 | "/*" BEGIN(COMMENT); 40 | \/ T("SLASH"); return '/'; 41 | = T("EQ"); 
return '='; 42 | , T("COMMA"); return ','; 43 | : T("COLON"); return ':'; 44 | ; T("SEM"); return ';'; 45 | \{ T("{"); return '{'; 46 | \} T("}"); return '}'; 47 | \" { 48 | BEGIN(0); 49 | *pch = '\0'; 50 | yylval->str = strdup(str_buf); 51 | T("STR"); 52 | return T_STR; 53 | } 54 | [\r\n]+ /* skip */ 55 | . { 56 | if (pch - str_buf + 1 >= STR_LEN) 57 | YY_FATAL_ERROR("string too long"); 58 | *pch++ = yytext[0]; 59 | } 60 | { 61 | "*\/" BEGIN(0); 62 | [^*\n]+ 63 | } 64 | . /* ignore everything else */ 65 | 66 | %% 67 | 68 | -------------------------------------------------------------------------------- /rc.irqd: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ### BEGIN INIT INFO 4 | # Provides: irqd 5 | # Default-Start: 3 6 | # Default-Stop: 7 | # Description: Alternative IRQ balancer 8 | ### END INIT INFO 9 | 10 | . /lib/lsb/init-functions 11 | 12 | IRQD=/usr/sbin/irqd 13 | PIDFILE=/var/run/irqd.pid 14 | NOSELFMON=/etc/no-selfmonitor/irqd 15 | 16 | test -x $IRQD || exit 5 17 | 18 | NCPUS=$(grep ^processor /proc/cpuinfo | wc -l) 19 | 20 | case "$1" in 21 | start) 22 | if [ "$NCPUS" = "1" ] ; then 23 | log_warning_msg "irqd disabled on single CPU" 24 | exit 0 25 | fi 26 | 27 | mkdir -p /etc/no-selfmonitor 28 | touch $NOSELFMON 29 | log_daemon_msg "Starting IRQ Daemon" "irqd" 30 | start-stop-daemon --start --quiet --oknodo --exec $IRQD 31 | log_end_msg $? 32 | rm -f $NOSELFMON 33 | ;; 34 | stop) 35 | log_daemon_msg "Stopping IRQ Daemon" "irqd" 36 | touch $NOSELFMON 37 | start-stop-daemon --stop --quiet --oknodo --pidfile $PIDFILE 38 | log_end_msg $? 
39 | ;; 40 | try-restart) 41 | $0 status >/dev/null && $0 restart 42 | rc_status 43 | ;; 44 | restart | force-reload ) 45 | log_daemon_msg "Restarting IRQ Daemon" "irqd" 46 | start-stop-daemon --stop --quiet --oknodo --retry 30 --pidfile $PIDFILE 47 | touch $NOSELFMON 48 | $0 stop 49 | $0 start 50 | rm -f $NOSELFMON 51 | ;; 52 | reload) 53 | log_daemon_msg "Reloading IRQ Daemon" "irqd" 54 | start-stop-daemon --stop --signal 1 --quiet --oknodo --pidfile $PIDFILE --exec $IRQD 55 | log_end_msg $? 56 | ;; 57 | status) 58 | status_of_proc -p $PIDFILE $IRQD irqd && exit 0 || exit $? 59 | ;; 60 | *) 61 | echo "Usage: $0 {start|stop|status|try-restart|restart|force-reload|reload}" 62 | exit 1 63 | ;; 64 | esac 65 | 66 | exit 0 67 | 68 | -------------------------------------------------------------------------------- /src/strategy.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 
16 | */ 17 | #include "irqd.h" 18 | #include "cpu.h" 19 | #include "interface.h" 20 | 21 | GSList *strategy_type_list; 22 | 23 | extern struct strategy_type evenly_strategy_type; 24 | 25 | static int 26 | static_balance_queue(struct interface *iface, int queue) 27 | { 28 | const struct cpuset *set = iface->if_cpuset; 29 | struct if_queue_info *qi; 30 | 31 | BUG_ON(iface->if_fixed_range != NULL); 32 | 33 | qi = if_queue(iface, queue); 34 | if_queue_assign_range(qi, &set->cs_range); 35 | 36 | return 0; 37 | } 38 | 39 | struct strategy_type static_strategy_type = { 40 | .name = "static", 41 | .balance_queue = static_balance_queue, 42 | }; 43 | 44 | struct strategy_type * 45 | strategy_find_type(const char *name) 46 | { 47 | GSList *node; 48 | 49 | for (node = strategy_type_list; node; node = g_slist_next(node)) { 50 | struct strategy_type *type = node->data; 51 | 52 | if (!strcmp(type->name, name)) 53 | return type; 54 | } 55 | 56 | return NULL; 57 | } 58 | 59 | int 60 | strategy_init(void) 61 | { 62 | strategy_type_list = g_slist_append(strategy_type_list, 63 | &static_strategy_type); 64 | strategy_type_list = g_slist_append(strategy_type_list, 65 | &evenly_strategy_type); 66 | 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | README 2 | ------ 3 | 4 | This is irqd, an alternative IRQ balancer for Linux kernels. It 5 | balances the IRQs more or less evenly across the CPUs you have in your 6 | system. 7 | 8 | As of version 0.6.2 it requires glib-2.0 and libnl-3.2 to compile. 9 | 10 | It currently relies on the Receive Packet Steering (RPS) 11 | implementation of newer Linux kernels, it is of less use currently if 12 | RPS is not available. For details on RPS see the very good article 13 | from LWN: http://lwn.net/Articles/362339. 
14 | 15 | 16 | STRATEGY 17 | -------- 18 | 19 | It currently implements the most gainful cases only: 20 | 21 | 1. each multiqueue NIC is balanced over as many CPUs as possible. 22 | 23 | 2. for singlequeue NICs start with assigning two CPUs per IRQ. This 24 | way you should experience an improvement over the existing behavior 25 | right from the start. 26 | 27 | 3. for singlequeue NICs it will currently assign up to four CPUs 28 | per IRQ. 29 | 30 | 4. for non-RPS kernels it just balances multiqueue NICs. Singlequeue 31 | NICs on non-RPS kernels will be addressed in the future. 32 | 33 | Implementation of irqd is currently ongoing. 34 | 35 | 36 | RUNNING 37 | ------- 38 | 39 | irqd is currently very minimalistic, only serving a very dedicated 40 | purpose. It reports the current IRQ mapping to the different 41 | CPUs to the file /var/lib/misc/irqd.cpumap. Example contents: 42 | 43 | cpu0: eth0:0 eth1:2 44 | cpu1: eth0:0 eth1:3 45 | cpu2: eth1:0 46 | cpu3: eth1:1 47 | 48 | Here eth0 being a singlequeue NIC and eth1 being multiqueue (with four 49 | queues). 50 | 51 | 52 | BUGS 53 | ---- 54 | 55 | irqd may contain bugs. Any bug report and/or feature request is 56 | welcome, but please look at the included TODO file before 57 | making a request. 58 | 59 | -- 60 | Holger Eitzenberger 61 | -------------------------------------------------------------------------------- /src/event.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 
10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2010. 16 | */ 17 | #ifndef EVENT_H 18 | #define EVENT_H 19 | 20 | #include 21 | 22 | enum EvReturn { 23 | EvError = -1, 24 | EvOk, 25 | EvStop, 26 | }; 27 | 28 | struct ev; 29 | 30 | typedef enum EvReturn (* ev_cb_t)(struct ev *, unsigned short); 31 | typedef int (* ev_cb_done_t)(void *, int); 32 | 33 | #define EV_ID 0xdeadbeaf 34 | 35 | struct ev { 36 | int fd; 37 | unsigned short when; 38 | /** 39 | * Returns %EvOk, %EvStop to stop event processing, or %EvError 40 | * on error. Receives error events. 41 | */ 42 | ev_cb_t cb_read; 43 | /** 44 | * Returns %EvOk, %EvStop to stop event processing, or %EvError 45 | * on error. 46 | */ 47 | ev_cb_t cb_write; 48 | /** 49 | * cb_done() - inform higher layer about destruction 50 | */ 51 | ev_cb_done_t cb_done; 52 | void *arg; 53 | int id; 54 | struct { 55 | unsigned nhandled; 56 | } stat; 57 | }; 58 | 59 | #define EV_READ EPOLLIN 60 | #define EV_WRITE EPOLLOUT 61 | 62 | int ev_init(void); 63 | void ev_fini(void); 64 | struct ev *ev_new(void); 65 | int ev_del(struct ev *); 66 | void ev_free(struct ev *); 67 | void ev_set(struct ev *, int, ev_cb_done_t, void *); 68 | int ev_add(struct ev *, unsigned short); 69 | int ev_mod(struct ev *, unsigned short); 70 | int ev_clear(struct ev *, unsigned short); 71 | void ev_done(struct ev *, int); 72 | int ev_dispatch(void); 73 | 74 | #endif /* EVENT_H */ 75 | -------------------------------------------------------------------------------- /irqd.8: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: irqd 3 | .\" Author: [see the "AUTHOR" section] 4 | .\" Generator: DocBook XSL Stylesheets v1.76.1 5 | .\" Date: 03/06/2014 6 | .\" Manual: \ \& 7 | .\" Source: \ \& 8 | 
.\" Language: English 9 | .\" 10 | .TH "IRQD" "8" "03/06/2014" "\ \&" "\ \&" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification (adjust text to left margin only) 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | irqd \- Interrupt balancer 32 | .SH "SYNOPSIS" 33 | .sp 34 | \fBirqd\fR [OPTIONS] 35 | .SH "DESCRIPTION" 36 | .sp 37 | \fBirqd\fR is a program to balance interrupts on multiple processor machine on a Linux system\&. It features different kind of performance policies, which allows for tuning the balancing for different needs, such as performance or power consumption\&. 38 | .SH "OPTIONS" 39 | .PP 40 | \-c, \-\-config 41 | .RS 4 42 | Specify the configuration file to be used\&. The default is 43 | \fB/etc/irqdrc\fR\&. The syntax is documented in the 44 | \fBirqdrc(5)\fR\&. 45 | .RE 46 | .PP 47 | \-v, \-\-verbose 48 | .RS 4 49 | Increase the verbosity of the program\&. 50 | .RE 51 | .PP 52 | \-\-version 53 | .RS 4 54 | Report the 55 | \fBirqd\fR 56 | version\&. 
57 | .RE 58 | .SH "CONFIGURATION" 59 | .sp 60 | See \fBirqd\&.conf\fR(5) for more details on configuring \fBirqd\fR using the \fBirqd\&.conf\fR file\&. 61 | .SH "BUGS" 62 | .sp 63 | Please send bug reports to to \&. 64 | .SH "SEE ALSO" 65 | .sp 66 | \fBirqdrc\fR(5) 67 | .SH "AUTHOR" 68 | .sp 69 | Holger Eitzenberger 70 | -------------------------------------------------------------------------------- /irqdrc.5: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: irqdrc 3 | .\" Author: [see the "AUTHOR" section] 4 | .\" Generator: DocBook XSL Stylesheets v1.76.1 5 | .\" Date: 03/06/2014 6 | .\" Manual: \ \& 7 | .\" Source: \ \& 8 | .\" Language: English 9 | .\" 10 | .TH "IRQDRC" "5" "03/06/2014" "\ \&" "\ \&" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification (adjust text to left margin only) 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | irqdrc \- configuration file file *irqd* 32 | .SH "SYNOPSIS" 33 | .sp 34 | \fBirqd\fR 35 | .SH "DESCRIPTION" 36 | .sp 37 | \fBirqdrc\fR is the configuration file for \fBirqd\fR\&. 
The syntax is modeled after the syntax used by ISC Bind, which means that most of the statements begin with command, followed by some options, and sometimes a block enclosed in curly braces\&. All commands end with a semicolon\&. The syntax supports both C style 38 | .sp 39 | .if n \{\ 40 | .RS 4 41 | .\} 42 | .nf 43 | /* \&.\&.\&. */ 44 | .fi 45 | .if n \{\ 46 | .RE 47 | .\} 48 | .sp 49 | and C++ single line comments 50 | .sp 51 | .if n \{\ 52 | .RS 4 53 | .\} 54 | .nf 55 | // 56 | .fi 57 | .if n \{\ 58 | .RE 59 | .\} 60 | .sp 61 | The basic building block for the configuration file are cpusets, which define a continuous range of CPUs being used for assigning interrupts\&. It also supports Linux Receive Packet Steering (RPS) as well as Transmit Packet Steering (XPS) if supported by your kernel\&. 62 | .SH "COMMANDS" 63 | .sp 64 | .if n \{\ 65 | .RS 4 66 | .\} 67 | .nf 68 | cpuset "network" 0:3 { 69 | }; 70 | .fi 71 | .if n \{\ 72 | .RE 73 | .\} 74 | .SH "FILES" 75 | .sp 76 | /etc/irqdrc 77 | .SH "SEE ALSO" 78 | .sp 79 | \fBirqd\fR(8), \fBnamed\&.conf(5)\fR 80 | .SH "AUTHOR" 81 | .sp 82 | Holger Eitzenberger 83 | -------------------------------------------------------------------------------- /src/log.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 
10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 16 | */ 17 | #include "irqd.h" 18 | 19 | static int log_buf_len = 1024; 20 | static char *log_buf_stdout; 21 | static char *log_buf_stderr; 22 | 23 | 24 | static void 25 | log_va(FILE *fp, char *buf, const char *file, int line, const char *prefix, 26 | int prio, const char *fmt, va_list ap) 27 | { 28 | char *pch = buf, *end = buf + log_buf_len; 29 | 30 | if (with_debug) 31 | flockfile(fp); 32 | 33 | if (file && line) 34 | pch += snprintf(pch, end - pch, "%s:%d: ", file, line); 35 | if (prefix) 36 | pch += snprintf(pch, end - pch, "%s: ", prefix); 37 | pch += vsnprintf(pch, end - pch, fmt, ap); 38 | 39 | if (with_debug) { 40 | if (pch > buf && pch[-1] != '\n') 41 | strcat(pch, "\n"); 42 | fputs_unlocked(buf, fp); 43 | funlockfile(fp); 44 | } else 45 | syslog(prio, "%s", buf); 46 | } 47 | 48 | void 49 | id_log(const char *fmt, ...) 50 | { 51 | va_list ap; 52 | 53 | va_start(ap, fmt); 54 | log_va(stdout, log_buf_stdout, NULL, 0, NULL, LOG_INFO, fmt, ap); 55 | va_end(ap); 56 | } 57 | 58 | void 59 | id_err(const char *file, int line, const char *fmt, ...) 
60 | { 61 | va_list ap; 62 | 63 | va_start(ap, fmt); 64 | log_va(stderr, log_buf_stderr, file, line, "ERROR", LOG_ERR, fmt, ap); 65 | va_end(ap); 66 | } 67 | 68 | static void 69 | id_fail_va(const char *file, int line, const char *fmt, va_list ap) 70 | { 71 | log_va(stderr, log_buf_stderr, file, line, NULL, LOG_CRIT, fmt, ap); 72 | } 73 | 74 | void 75 | id_bug(const char *file, int line) 76 | { 77 | // As a NULL is not allowed for ARM va_list 78 | va_list ap; 79 | id_fail_va(file, line, "BUG", ap); 80 | abort(); 81 | } 82 | 83 | void 84 | id_oom(const char *file, int line) 85 | { 86 | // As a NULL is not allowed for ARM va_list 87 | va_list ap; 88 | id_fail_va(file, line, "OOM", ap); 89 | errno = ENOMEM; 90 | } 91 | 92 | int 93 | log_init(void) 94 | { 95 | log_buf_stdout = g_malloc(log_buf_len); 96 | log_buf_stderr = g_malloc(log_buf_len); 97 | if (!log_buf_stdout || !log_buf_stderr) { 98 | fprintf(stderr, "log: %m\n"); 99 | return -1; 100 | } 101 | 102 | return 0; 103 | } 104 | 105 | -------------------------------------------------------------------------------- /tests/interrupts-625: -------------------------------------------------------------------------------- 1 | CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 2 | 0: 121 0 0 0 1 0 0 0 IO-APIC-edge timer 3 | 1: 0 1 0 0 0 1 0 0 IO-APIC-edge i8042 4 | 4: 1 3 2 2 0 2 3 2 IO-APIC-edge serial 5 | 7: 0 0 0 0 0 0 0 0 IO-APIC-edge parport0 6 | 8: 5 4 5 2 5 8 5 3 IO-APIC-edge rtc0 7 | 9: 0 0 0 0 0 0 0 0 IO-APIC-fasteoi acpi 8 | 12: 0 0 0 1 0 0 1 1 IO-APIC-edge i8042 9 | 14: 0 0 0 0 0 0 0 0 IO-APIC-edge ata_piix 10 | 15: 0 0 0 0 0 0 0 0 IO-APIC-edge ata_piix 11 | 16: 2419 2394 2400 2397 2415 2396 2400 2418 IO-APIC-fasteoi aacraid 12 | 19: 64 78 81 71 65 72 78 69 IO-APIC-fasteoi ahci, uhci_hcd:usb3 13 | 23: 0 0 0 0 1 0 0 1 IO-APIC-fasteoi ehci_hcd:usb1, uhci_hcd:usb2 14 | 53: 560 598 581 607 596 559 551 573 PCI-MSI-edge eth3 15 | 54: 6 6 7 10 2 4 8 5 PCI-MSI-edge eth0-TxRx-0 16 | 55: 0 0 0 0 2 2 2 4 PCI-MSI-edge eth0-TxRx-1 17 
| 56: 4 2 2 2 0 0 0 0 PCI-MSI-edge eth0-TxRx-2 18 | 57: 1 0 0 0 4 2 2 2 PCI-MSI-edge eth0-TxRx-3 19 | 58: 2 2 4 2 0 0 0 0 PCI-MSI-edge eth0-TxRx-4 20 | 59: 0 0 0 0 3 2 4 2 PCI-MSI-edge eth0-TxRx-5 21 | 60: 2 4 2 2 0 0 0 0 PCI-MSI-edge eth0-TxRx-6 22 | 61: 0 0 0 0 2 4 2 2 PCI-MSI-edge eth0-TxRx-7 23 | 62: 0 0 0 0 0 1 0 0 PCI-MSI-edge eth0:lsc 24 | 63: 0 0 1 0 0 0 0 0 PCI-MSI-edge eth1 25 | 64: 901 872 879 869 869 912 909 883 PCI-MSI-edge eth1-TxRx-0 26 | -------------------------------------------------------------------------------- /src/cfg_grammar.y: -------------------------------------------------------------------------------- 1 | %{ 2 | /* prologue */ 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "irqd.h" 9 | #include "cpu.h" 10 | #include "interface.h" 11 | 12 | /* #define YYERROR_VERBOSE 1 */ 13 | 14 | void yyerror(char *); 15 | void yyerr_printf(const char *, ...); 16 | int yyget_lineno(void); 17 | 18 | static int cfg_if_add(const char *, struct cpuset *, const struct range *); 19 | 20 | static struct cpuset *g_cpuset; 21 | static struct range g_range; 22 | %} 23 | 24 | /* create a pure, reentrant parser */ 25 | %define api.pure 26 | /* %locations */ 27 | 28 | %union { 29 | char *str; 30 | int val; 31 | struct range *range; 32 | } 33 | 34 | %token T_NUM 35 | %token T_ID T_STR; 36 | %token T_CPUSET T_DEVS T_IFACE T_IFACE_AUTO_ASSIGN T_STRATEGY 37 | %token T_INIT_STEER_CPUS 38 | %token ':' ';' '(' ')' '{' '}' ',' 39 | %type range 40 | 41 | %% /* grammar rules and actions */ 42 | 43 | input: /* empty */ 44 | | input stmt; 45 | 46 | stmt: cmd ';'; 47 | 48 | cmd: cpuset; 49 | 50 | cpuset: T_CPUSET T_STR range { 51 | assert(g_cpuset == NULL); 52 | if ((g_cpuset = cpuset_new($2, $3)) == NULL) { 53 | yyerr_printf("cpuset invalid"); 54 | YYERROR; 55 | } 56 | } '{' cpuset_blk '}' { 57 | int ret; 58 | 59 | if (!g_cpuset->cs_strategy.s_type) 60 | cpuset_set_strategy(g_cpuset, "evenly"); 61 | if ((ret = cpuset_list_add(g_cpuset)) < 0) { 62 | 
yyerr_printf("%s", strerror(-ret)); 63 | cpuset_free(g_cpuset); 64 | YYERROR; 65 | } 66 | g_cpuset = NULL; 67 | }; 68 | cpuset_blk: /* empty */ | cpuset_blk cpuset_cmds ';'; 69 | cpuset_cmds: devs | strategy | iface_auto_assign; 70 | 71 | /* FIXME don't allow whitespace here */ 72 | range: T_NUM ':' T_NUM { 73 | g_range.rg_from = $1; 74 | g_range.rg_to = $3; 75 | if (!range_valid(&g_range)) { 76 | /* TODO range invalid */ 77 | YYERROR; 78 | } 79 | 80 | $$ = &g_range; 81 | } | T_NUM { 82 | g_range.rg_from = g_range.rg_to = $1; 83 | if (!range_valid(&g_range)) { 84 | /* TODO invalid range error */ 85 | YYERROR; 86 | } 87 | $$ = &g_range; 88 | }; 89 | 90 | devs: T_DEVS '{' devs_blk '}'; 91 | devs_blk: /* empty */ | devs_blk devs_cmds ';'; 92 | devs_cmds: iface; 93 | iface: T_IFACE T_STR range { 94 | if (cfg_if_add($2, g_cpuset, $3) < 0) { 95 | /* failed to create interface */ 96 | YYERROR; 97 | } 98 | } | T_IFACE T_STR { 99 | if (cfg_if_add($2, g_cpuset, NULL) < 0) { 100 | /* failed to create interface */ 101 | YYERROR; 102 | } 103 | }; 104 | 105 | iface_auto_assign: T_IFACE_AUTO_ASSIGN { 106 | assert(g_cpuset != NULL); 107 | if (cpuset_set_auto_assign(g_cpuset) < 0) { 108 | yyerr_printf("%s: only one cpuset can have 'auto' status", 109 | g_cpuset->cs_name); 110 | YYERROR; 111 | } 112 | }; 113 | strategy: T_STRATEGY T_STR { 114 | assert(g_cpuset != NULL); 115 | if (cpuset_set_strategy(g_cpuset, $2) < 0) { 116 | yyerr_printf("%s: unknown strategy", $2); 117 | YYERROR; 118 | } 119 | } opt_strategy_blk; 120 | opt_strategy_blk: /* empty */ | '{' strategy_blk '}'; 121 | strategy_blk: /* empty */ | strategy_blk strategy_cmds ';'; 122 | strategy_cmds: init_steer_cpus; 123 | init_steer_cpus: T_INIT_STEER_CPUS T_NUM { 124 | assert(g_cpuset != NULL); 125 | /* TODO check value */ 126 | g_cpuset->cs_strategy.u.evenly.init_steer_cpus = $2; 127 | }; 128 | 129 | %% 130 | 131 | /* epilogue */ 132 | 133 | static int 134 | cfg_if_add(const char *name, struct cpuset *set, const struct 
range *range) 135 | { 136 | struct interface *iface; 137 | 138 | assert(set != NULL); 139 | 140 | if ((iface = if_new(name, set)) == NULL) 141 | goto err; 142 | if (cpuset_add_device(set, if_to_dev(iface)) < 0) 143 | goto err; 144 | 145 | if (range && if_assign_fixed_range(iface, range) < 0) 146 | goto err; 147 | 148 | return if_register(iface); 149 | 150 | err: 151 | if_free(iface); 152 | return -1; 153 | } 154 | 155 | void 156 | yyerror(char *msg) 157 | { 158 | fprintf(stderr, "line %d: %s\n", yyget_lineno(), msg); 159 | } 160 | 161 | void 162 | yyerr_printf(const char *fmt, ...) 163 | { 164 | va_list ap; 165 | 166 | fprintf(stderr, "line %d: ", yyget_lineno()); 167 | va_start(ap, fmt); 168 | vfprintf(stderr, fmt, ap); 169 | va_end(ap); 170 | fputc('\n', stderr); 171 | } 172 | 173 | -------------------------------------------------------------------------------- /src/interface.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 
16 | */ 17 | #ifndef INTERFACE_H 18 | #define INTERFACE_H 19 | 20 | #include 21 | #include 22 | 23 | #include "device.h" 24 | 25 | struct interface; 26 | struct cpu_bitmask; 27 | struct cpuset; 28 | 29 | typedef unsigned irq_ctr_t; 30 | 31 | /* 32 | * There are different scenarios possible. The easiest one is 1) a single 33 | * IRQ used for Link Status Control, RX and TX: 34 | * 35 | * 46: 208685 202268 220905 215620 PCI-MSI-edge eth0 36 | * 37 | * There are some NICs with a dedicated LSC IRQ: 38 | * 39 | * 60: 0 1 PCI-MSI-edge eth3 40 | * 61: 0 0 PCI-MSI-edge eth3-rxtx-0 41 | * 42 | * And of course different IRQs used for LSC, RX and TX: 43 | * 44 | * 51: 12187473 14354377 PCI-MSI-edge eth0-rx-0 45 | * 52: 96883 106078 PCI-MSI-edge eth0-tx-0 46 | * 53: 292 131 PCI-MSI-edge eth0 47 | * 48 | * 49 | */ 50 | struct if_queue_info { 51 | unsigned qi_num; 52 | int qi_rx_irq; 53 | int qi_tx_irq; 54 | struct interface *qi_iface; 55 | struct cpu_bitmask *qi_cpu_bitmask; /* both IRQ and RPS affinity */ 56 | irq_ctr_t qi_irq_stats[2][CPU_MAX]; 57 | }; 58 | 59 | struct interface { 60 | /* must come first */ 61 | struct device if_dev; 62 | 63 | #define IF_F_SHARED_IRQ 0x0001 64 | unsigned if_flags; 65 | 66 | int if_irq; /* possibly just LSC */ 67 | 68 | struct if_queue_info *if_queues; 69 | unsigned if_num_queues; 70 | 71 | /* the CPUs we are allowed to run on */ 72 | struct cpuset *if_cpuset; 73 | 74 | /* range specifying a subset of CPUs to be used */ 75 | struct range *if_fixed_range; 76 | 77 | /* Linux net_device_stats */ 78 | struct if_net_device_stats { 79 | uint64_t rx_bytes; /* total bytes received */ 80 | uint64_t rx_packets; /* total packets received */ 81 | uint64_t rx_errors; /* bad packets received */ 82 | uint64_t rx_dropped; /* no space in linux buffers */ 83 | uint64_t rx_fifo_errors; 84 | uint64_t rx_frame_errors; 85 | uint64_t rx_compressed; 86 | uint64_t rx_mcast; 87 | 88 | uint64_t tx_bytes; 89 | uint64_t tx_packets; 90 | uint64_t tx_errors; 91 | uint64_t 
tx_dropped; 92 | uint64_t tx_fifo_errors; 93 | uint64_t tx_collisions; 94 | uint64_t tx_carrier_errors; 95 | uint64_t tx_compressed; 96 | } if_stats[2]; 97 | 98 | char if_name[IFNAMSIZ]; 99 | }; 100 | 101 | struct interface *if_new(const char *, struct cpuset *); 102 | void if_free(struct interface *); 103 | int if_register(struct interface *); 104 | 105 | static inline struct device * 106 | if_to_dev(struct interface *iface) 107 | { 108 | return &iface->if_dev; 109 | } 110 | 111 | static inline struct interface * 112 | dev_to_if(struct device *dev) 113 | { 114 | BUG_ON(dev->type != DEV_INTERFACE); 115 | return (struct interface *)dev; 116 | } 117 | 118 | struct if_queue_info *if_queue(const struct interface *, int); 119 | struct if_queue_info *if_queue_by_name(const char *, int); 120 | int if_queue_assign_range(struct if_queue_info *, const struct range *); 121 | 122 | int if_assign_fixed_range(struct interface *, const struct range *); 123 | 124 | static inline bool 125 | if_is_multiqueue(struct interface *iface) 126 | { 127 | BUG_ON(!iface->if_num_queues); 128 | 129 | return iface->if_num_queues > 1; 130 | } 131 | 132 | int if_init(void); 133 | int if_rtnl_init(void); 134 | void if_fini(void); 135 | bool if_can_rps(const struct interface *); 136 | bool if_can_xps(const struct interface *); 137 | int if_set_steering_cpus(const struct interface *, int, uint64_t, uint64_t); 138 | int if_get_queue_stat(struct if_queue_info *); 139 | 140 | int queue_set_affinity(const struct if_queue_info *, uint64_t); 141 | 142 | int if_assign_cpus(struct interface *); 143 | int if_remove_cpus(struct interface *); 144 | 145 | #endif /* INTERFACE_H */ 146 | -------------------------------------------------------------------------------- /src/irqd.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * 
as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 16 | */ 17 | #ifndef IRQD_H 18 | #define IRQD_H 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #define CPU_MAX 64 41 | 42 | /* number of per-NIC queues supported */ 43 | #define QUEUE_MAX 128 44 | 45 | #define __PRINTF(idx, first) __attribute__((format (printf, idx, first))) 46 | #define __NORETURN __attribute__((noreturn)) 47 | #define __COLD __attribute__((cold)) 48 | #define __UNUSED __attribute__((unused)) 49 | #define __WARN_UNUSED_RESULT __attribute__((warn_unused_result)) 50 | 51 | #define likely(expr) __builtin_expect(!!(expr), 1) 52 | #define unlikely(expr) __builtin_expect(!!(expr), 0) 53 | 54 | #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) 55 | 56 | #define min(x, y) ({ \ 57 | typeof(x) _x = (x); typeof(y) _y = (y); \ 58 | _x < _y ? _x : _y; }) 59 | #define max(x, y) ({ \ 60 | typeof(x) _x = (x); typeof(y) _y = (y); \ 61 | _x > _y ? 
_x : _y; }) 62 | 63 | #define ENV_BUF_SIZE 4096 64 | 65 | struct interface; 66 | struct cpu_info; 67 | 68 | struct evenly_args { 69 | unsigned init_steer_cpus; 70 | }; 71 | 72 | struct strategy; 73 | struct cpuset; 74 | 75 | struct strategy_type { 76 | const char *name; 77 | void (* init)(struct strategy *); 78 | 79 | /** 80 | * Strategy handler to balance an interface queue, called once 81 | * the interface becomes %IFF_UP. 82 | */ 83 | int (* balance_queue)(struct interface *, int); 84 | 85 | /** 86 | * Strategy handler to rebalance in case of a CPU becoming too busy. */ 87 | int (* cpu_busy)(struct cpu_info *); 88 | 89 | /** 90 | * Strategy handler to eventually rebalance in case of an interface 91 | * going down. The handler is called before the queues are removed 92 | * from the CPUs. 93 | */ 94 | int (* interface_down)(struct cpuset *, struct interface *); 95 | }; 96 | 97 | struct strategy { 98 | const struct strategy_type *s_type; 99 | union { 100 | struct evenly_args evenly; 101 | } u; 102 | }; 103 | 104 | int strategy_init(void); 105 | struct strategy_type *strategy_find_type(const char *); 106 | 107 | /* logging */ 108 | int log_init(void); 109 | void id_log(const char *fmt, ...) __PRINTF(1, 2); 110 | void id_err(const char *file, int line, const char *, ...) __PRINTF(3, 4); 111 | void id_err_status(const char *file, int line, const char *, int, 112 | ...) __PRINTF(3, 5); 113 | void id_bug(const char *file, int line) __NORETURN __COLD; 114 | void id_oom(const char *file, int line) __COLD; 115 | 116 | #define log(fmt, args...) id_log(fmt, ##args) 117 | #define err(fmt, args...) id_err(__FILE__, __LINE__, fmt, ##args) 118 | #define err_status(fmt, status, args...) 
id_err_status(__FILE__, __LINE__, \ 119 | fmt, status, ##args) 120 | #define OOM() id_oom(__FILE__, __LINE__) 121 | 122 | #define WARN() id_err(__FILE__, __LINE__, "warning"); 123 | #define BUG() id_bug(__FILE__, __LINE__) 124 | #define BUG_ON(expr) do { if (unlikely(expr)) BUG(); } while (0) 125 | #define WARN_ON(expr) do { if (unlikely(expr)) WARN(); } while (0) 126 | 127 | #ifdef DEBUG 128 | #define dbg(fmt, args...) id_log(fmt, ##args) 129 | #else 130 | #define dbg(fmt, args...) 131 | #endif /* DEBUG */ 132 | 133 | extern enum RpsStatus { 134 | RPS_S_NEED_CHECK = 0, 135 | RPS_S_DISABLED, 136 | RPS_S_ENABLED, 137 | } g_rps_status; 138 | 139 | extern enum XpsStatus { 140 | XPS_S_NEED_CHECK = 0, 141 | XPS_S_DISABLED, 142 | XPS_S_ENABLED, 143 | } g_xps_status; 144 | 145 | extern bool config_is_read; 146 | extern char *irqd_prefix; 147 | extern bool no_daemon; 148 | extern bool with_debug; 149 | extern int verbose; 150 | 151 | int id_set_fd_flags(int, int); 152 | char *id_path(const char *path); 153 | FILE *id_fopen(const char *, const char *); 154 | 155 | int irq_set_affinity(int, uint64_t); 156 | 157 | char *xstrncpy(char *, const char *, size_t); 158 | 159 | #endif /* IRQD_H */ 160 | -------------------------------------------------------------------------------- /src/event.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 
10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 16 | */ 17 | #include "irqd.h" 18 | #include "event.h" 19 | 20 | #include 21 | #include 22 | 23 | #define EPOLL_MAX_EVENTS 10 24 | 25 | /* events allowed to be set by ev_add() */ 26 | #define EV_MASK_ALLOW (EV_READ | EV_WRITE) 27 | 28 | /* events handled by user */ 29 | #define EV_MASK_USER_HANDLER (EV_READ | EV_WRITE) 30 | 31 | static int epoll_fd; 32 | 33 | 34 | int 35 | ev_init(void) 36 | { 37 | if ((epoll_fd = epoll_create1(O_CLOEXEC)) < 0) { 38 | err("epoll_create: %m"); 39 | return -1; 40 | } 41 | 42 | return 0; 43 | } 44 | 45 | void 46 | ev_fini(void) 47 | { 48 | close(epoll_fd); 49 | } 50 | 51 | struct ev * 52 | ev_new(void) 53 | { 54 | struct ev *ev; 55 | 56 | if ((ev = g_malloc0(sizeof(struct ev))) == NULL) { 57 | OOM(); 58 | return NULL; 59 | } 60 | 61 | ev->fd = -1; 62 | ev->id = EV_ID; 63 | 64 | return ev; 65 | } 66 | 67 | void 68 | ev_free(struct ev *ev) 69 | { 70 | if (ev) { 71 | if (ev->fd >= 0) 72 | close(ev->fd); 73 | g_free(ev); 74 | } 75 | } 76 | 77 | /** 78 | * ev_set() - initialize event 79 | * 80 | * @arg ev 81 | * @arg fd 82 | * @arg cb_io callback used for IO 83 | * @arg cb_done callback to inform higher layers (or %NULL) 84 | * @arg arg 85 | */ 86 | void 87 | ev_set(struct ev *ev, int fd, ev_cb_done_t cb_done, void *arg) 88 | { 89 | ev->fd = fd; 90 | ev->cb_done = cb_done; 91 | ev->arg = arg; 92 | ev->id = EV_ID; 93 | } 94 | 95 | int 96 | ev_add(struct ev *ev, unsigned short when) 97 | { 98 | struct epoll_event eev; 99 | 100 | BUG_ON(ev->fd < 0); 101 | BUG_ON(when & ~EV_MASK_ALLOW); 102 | 103 | id_set_fd_flags(ev->fd, O_NONBLOCK); 104 | eev.data.ptr = ev; 105 | eev.events = when | EPOLLET; 106 | if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ev->fd, &eev) < 0) { 107 | err("%s: 
%m", __func__); 108 | goto err; 109 | } 110 | 111 | ev->when |= when; 112 | 113 | return 0; 114 | 115 | err: 116 | return -1; 117 | } 118 | 119 | int 120 | ev_mod(struct ev *ev, unsigned short when) 121 | { 122 | struct epoll_event eev = { 123 | .data.ptr = ev, 124 | }; 125 | int ret; 126 | 127 | if (ev->when & when) 128 | return 0; 129 | eev.events = ev->when | when | EPOLLET; 130 | if (ev->when) 131 | ret = epoll_ctl(epoll_fd, EPOLL_CTL_MOD, ev->fd, &eev); 132 | else 133 | ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, ev->fd, &eev); 134 | if (ret < 0) 135 | err("%s: %m", __func__); 136 | ev->when |= when; 137 | 138 | return ret; 139 | } 140 | 141 | int 142 | ev_clear(struct ev *ev, unsigned short when) 143 | { 144 | struct epoll_event eev = { 145 | .data.ptr = ev, 146 | }; 147 | int ret; 148 | 149 | if ((ev->when & when) == 0) 150 | return 0; 151 | ev->when &= ~when; 152 | eev.events = ev->when; 153 | if (!ev->when) 154 | ret = ev_del(ev); 155 | else 156 | ret = epoll_ctl(epoll_fd, EPOLL_CTL_MOD, ev->fd, &eev); 157 | if (ret < 0) 158 | err("%s: %m", __func__); 159 | 160 | return ret; 161 | } 162 | 163 | /** 164 | * ev_del() - remove an event descriptor 165 | * 166 | * The event is not freed, just in case 167 | */ 168 | int 169 | ev_del(struct ev *ev) 170 | { 171 | if (ev->fd >= 0) { 172 | if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ev->fd, NULL) < 0) { 173 | err("epoll_ctl: %m\n"); 174 | return -1; 175 | } 176 | 177 | dbg("%s: ev=%p", __func__, ev); 178 | } 179 | 180 | return 0; 181 | } 182 | 183 | void 184 | ev_done(struct ev *ev, int why) 185 | { 186 | if (ev->cb_done) 187 | ev->cb_done(ev, why); 188 | else 189 | ev_del(ev); 190 | } 191 | 192 | int 193 | ev_dispatch(void) 194 | { 195 | struct epoll_event events[EPOLL_MAX_EVENTS]; 196 | 197 | for (;;) { 198 | int nfds, i; 199 | 200 | nfds = epoll_wait(epoll_fd, events, ARRAY_SIZE(events), -1); 201 | if (nfds < 0) { 202 | if (errno == EINTR) 203 | continue; 204 | dbg("epoll_wait: %m"); 205 | } 206 | 207 | dbg("%s: 
nfds=%d", __func__, nfds); 208 | for (i = 0; i < nfds; i++) { 209 | struct epoll_event *eev = &events[i]; 210 | struct ev *ev = events[i].data.ptr; 211 | enum EvReturn ret; 212 | 213 | BUG_ON(ev->id != EV_ID); 214 | if (eev->events & EPOLLIN) { 215 | BUG_ON(!ev->cb_read); 216 | ret = ev->cb_read(ev, eev->events & EV_MASK_USER_HANDLER); 217 | ev->stat.nhandled++; 218 | if (ret != EvOk) 219 | goto ev_err; 220 | } 221 | if (eev->events & EPOLLOUT) { 222 | BUG_ON(!ev->cb_write); 223 | ret = ev->cb_write(ev, eev->events & EV_MASK_USER_HANDLER); 224 | ev->stat.nhandled++; 225 | if (ret != EvOk) 226 | goto ev_err; 227 | } 228 | continue; 229 | 230 | ev_err: 231 | if (ret == EvStop) 232 | ev_done(ev, 0); 233 | else if (ret == EvError) 234 | ev_done(ev, 0); 235 | } 236 | } 237 | 238 | return 0; 239 | } 240 | -------------------------------------------------------------------------------- /src/cpu.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 
16 | */ 17 | #ifndef CPU_H 18 | #define CPU_H 19 | 20 | #include 21 | 22 | #define HT_PER_CPU 2 23 | #define HT_MASK ((1 << HT_PER_CPU) - 1) 24 | 25 | #define OLD 0 26 | #define NEW 1 27 | 28 | struct proc_stat_cpu { 29 | unsigned long long psc_user; 30 | unsigned long long psc_nice; 31 | unsigned long long psc_system; 32 | unsigned long long psc_idle; 33 | unsigned long long psc_iowait; 34 | unsigned long long psc_irq; 35 | unsigned long long psc_softirq; 36 | unsigned long long psc_steal; 37 | unsigned long long psc_guest; 38 | unsigned long long psc_softirq_ctr; 39 | }; 40 | 41 | /* /proc/stat */ 42 | struct proc_stat { 43 | size_t ps_len; 44 | unsigned long long ps_ctxt; 45 | unsigned long ps_btime; 46 | unsigned long ps_procs; 47 | unsigned long ps_procs_running; 48 | unsigned long ps_procs_blocked; 49 | struct proc_stat_cpu ps_cpu_total; 50 | }; 51 | 52 | struct if_queue_info; 53 | struct cpuset; 54 | 55 | struct cpu_info { 56 | unsigned ci_num; 57 | GSList *ci_queues; 58 | unsigned ci_num_queues; 59 | unsigned ci_load; /* in percent */ 60 | unsigned ci_si_load; /* softirq load (in percent) */ 61 | 62 | struct cpuset *ci_cpuset; /* or NULL */ 63 | 64 | /* /proc/net/softnet_stat */ 65 | struct softnet_stat { 66 | unsigned total; 67 | unsigned dropped; 68 | unsigned time_squeeze; 69 | unsigned cpu_collision; 70 | } ci_ss[2]; 71 | struct proc_stat_cpu ci_psc; 72 | struct proc_stat_cpu ci_psc_old; 73 | }; 74 | 75 | #define CPU_SS_DIFF(ci, var) ((ci)->ci_ss[NEW].var - (ci)->ci_ss[OLD].var) 76 | 77 | extern GSList *cpu_lru_list; 78 | extern GSList *cpu_si_load_lru_list; 79 | 80 | extern GSList *cpuset_list; 81 | 82 | int cpu_init(void); 83 | void cpu_fini(void); 84 | unsigned cpu_count(void); 85 | struct cpu_info *cpu_add_queue(int, struct interface *, int); 86 | struct cpu_info *cpu_add_queue_lru(struct interface *, int); 87 | int cpu_del_queue(int, struct if_queue_info *qi); 88 | struct cpu_info *cpu_nth(int); 89 | 90 | int cpu_read_stat(void); 91 | int 
cpu_do_stat(void); 92 | void cpu_dump_map(void); 93 | 94 | /* a contigous range of CPUs */ 95 | struct cpu_bitmask { 96 | struct cpuset *cpuset; 97 | unsigned len; 98 | int nbits; 99 | uint8_t data[]; 100 | }; 101 | 102 | struct cpu_bitmask *cpu_bitmask_new(struct cpuset *); 103 | void cpu_bitmask_free(struct cpu_bitmask *); 104 | int cpu_bitmask_set(struct cpu_bitmask *, unsigned) __WARN_UNUSED_RESULT; 105 | int cpu_bitmask_clear(struct cpu_bitmask *, unsigned) __WARN_UNUSED_RESULT; 106 | bool cpu_bitmask_is_set(const struct cpu_bitmask *, unsigned); 107 | int cpu_bitmask_ffs(const struct cpu_bitmask *); 108 | uint64_t cpu_bitmask_mask64(const struct cpu_bitmask *); 109 | 110 | static inline bool cpu_bitmask_is_empty(const struct cpu_bitmask *bmask) 111 | { 112 | return bmask->nbits == 0; 113 | } 114 | 115 | static inline int cpu_bitmask_ncpus(const struct cpu_bitmask *bmask) 116 | { 117 | return bmask->nbits; 118 | } 119 | 120 | struct device; 121 | 122 | struct range { 123 | int rg_from; 124 | int rg_to; 125 | }; 126 | 127 | struct range *range_new(unsigned, unsigned); 128 | void range_free(struct range *); 129 | bool range_valid(const struct range *); 130 | bool cpu_in_range(const struct range *, unsigned); 131 | bool range_in_range(const struct range *, const struct range *); 132 | 133 | struct cpuset { 134 | struct range cs_range; 135 | char *cs_name; 136 | 137 | struct strategy cs_strategy; 138 | 139 | /* CPU info sorted by number of queues/IRQs assigned */ 140 | GSList *cs_cpu_lru_list; 141 | 142 | GSList *cs_dev_list; 143 | }; 144 | 145 | extern struct cpuset *g_cpuset_auto_assign; 146 | 147 | struct cpuset *cpuset_new(const char *, const struct range *); 148 | void cpuset_free(struct cpuset *); 149 | GSList *cpuset_get_by_name(const char *); 150 | int cpuset_add_device(struct cpuset *, struct device *); 151 | int cpuset_list_add(struct cpuset *); 152 | bool cpuset_in(const struct cpuset *, unsigned); 153 | int cpuset_set_auto_assign(struct cpuset *); 154 
| int cpuset_set_strategy(struct cpuset *, const char *); 155 | int cpuset_interface_down(struct cpuset *, struct interface *); 156 | int cpuset_cpu_busy(struct cpuset *, struct cpu_info *); 157 | int cpuset_balance_queue(struct cpuset *, struct interface *, int); 158 | void cpuset_dump(void); 159 | 160 | static inline unsigned 161 | cpuset_len(const struct cpuset *set) 162 | { 163 | return set->cs_range.rg_to - set->cs_range.rg_from + 1; 164 | } 165 | 166 | static inline int 167 | cpuset_first_cpu(const struct cpuset *set) 168 | { 169 | return set->cs_range.rg_from; 170 | } 171 | 172 | static inline unsigned 173 | cpuset_last_cpu(const struct cpuset *set) 174 | { 175 | return set->cs_range.rg_from + cpuset_len(set) - 1; 176 | } 177 | 178 | #endif /* CPU_H */ 179 | -------------------------------------------------------------------------------- /src/irqd.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 
16 | */ 17 | #include 18 | 19 | #include "irqd.h" 20 | #include "event.h" 21 | #include "cpu.h" 22 | #include "interface.h" 23 | #include "cfg_grammar.h" 24 | #include 25 | 26 | #define PID_FILE "irqd.pid" 27 | 28 | 29 | static struct ev sig_ev; 30 | 31 | /* if set allows to access files below /sys and /proc below a subdirectory */ 32 | /* FIXME make automake aware */ 33 | char *cfg_file = "/etc/irqdrc"; 34 | bool config_is_read; 35 | 36 | char *irqd_prefix; 37 | bool no_daemon; 38 | bool with_debug; 39 | int verbose; 40 | enum RpsStatus g_rps_status; 41 | enum XpsStatus g_xps_status; 42 | 43 | extern int yyparse(); 44 | extern void yyset_in(FILE *); 45 | 46 | static int 47 | check_opts(int argc, char *argv[]) 48 | { 49 | static struct option lopts[] = { 50 | { "config", required_argument, NULL, 'c' }, 51 | { "debug", no_argument, NULL, 'd' }, /* implies 'foreground' */ 52 | { "foreground", no_argument, NULL, 'f' }, 53 | { "verbose", 0, NULL, 'v' }, 54 | { "version", 0, NULL, 0 }, 55 | { 0 } 56 | }; 57 | int c, idx = 0; 58 | 59 | while ((c = getopt_long(argc, argv, "c:dfv", lopts, &idx)) != -1) { 60 | if (!c) { /* long-only option */ 61 | switch (idx) { 62 | case 1: /* version */ 63 | break; 64 | 65 | default: 66 | return -1; 67 | } 68 | continue; 69 | } 70 | 71 | switch (c) { 72 | case 'c': 73 | cfg_file = strdup(optarg); 74 | break; 75 | 76 | case 'd': 77 | no_daemon = true; 78 | with_debug = true; 79 | break; 80 | 81 | case 'f': 82 | no_daemon = true; 83 | with_debug = false;; 84 | break; 85 | 86 | case 'v': /* verbose */ 87 | verbose++; 88 | break; 89 | 90 | case '?': 91 | return -1; 92 | } 93 | } 94 | 95 | return 0; 96 | } 97 | 98 | static int 99 | config_read(void) 100 | { 101 | struct cpuset *set; 102 | FILE *fp; 103 | 104 | if ((fp = fopen(cfg_file, "r")) == NULL) { 105 | if (errno == ENOENT) { 106 | log("no config file found"); 107 | goto out_read; 108 | } 109 | 110 | err("%s: %m", cfg_file); 111 | return -1; 112 | } 113 | 114 | yyset_in(fp); 115 | if 
(yyparse() == 1) 116 | goto err; 117 | else if (yyparse() == 2) { 118 | OOM(); 119 | goto err; 120 | } 121 | 122 | out_read: 123 | if (!cpuset_list) { 124 | struct range range = { 0, cpu_count() - 1 }; 125 | 126 | if ((set = cpuset_new("default", &range)) == NULL) 127 | return -1; 128 | cpuset_set_auto_assign(set); 129 | cpuset_set_strategy(set, "evenly"); 130 | cpuset_list_add(set); 131 | } else 132 | cpuset_set_auto_assign(g_slist_nth_data(cpuset_list, 0)); 133 | 134 | if (fp) 135 | fclose(fp); 136 | config_is_read = true; 137 | 138 | return 0; 139 | 140 | err: 141 | fclose(fp); 142 | return -1; 143 | } 144 | 145 | static void 146 | config_dump(void) 147 | { 148 | cpuset_dump(); 149 | } 150 | 151 | /* returned string needs to be freed by caller */ 152 | char * 153 | id_path(const char *path) 154 | { 155 | char *buf = malloc(PATH_MAX); 156 | 157 | BUG_ON(*path != '/'); 158 | 159 | if ((buf = malloc(PATH_MAX)) == NULL) { 160 | OOM(); 161 | return NULL; 162 | } 163 | 164 | snprintf(buf, PATH_MAX, "%s%s", irqd_prefix, path); 165 | buf[PATH_MAX - 1] = '\0'; 166 | 167 | return buf; 168 | } 169 | 170 | int 171 | id_set_fd_flags(int fd, int new_flags) 172 | { 173 | int flags = fcntl(fd, F_GETFD, 0); 174 | 175 | if (flags < 0) 176 | return -1; 177 | return fcntl(fd, F_SETFD, flags | new_flags); 178 | } 179 | 180 | /** 181 | * id_fopen() - wrapper arund fopen() with debug possibilities 182 | * 183 | * Only works for absolute paths. 
184 | */ 185 | FILE * 186 | id_fopen(const char *file, const char *mode) 187 | { 188 | FILE *fp; 189 | 190 | BUG_ON(file[0] != '/'); 191 | if (irqd_prefix) { 192 | char path[2 * PATH_MAX]; 193 | 194 | snprintf(path, sizeof(path), "%s%s", irqd_prefix, file); 195 | path[sizeof(path) - 1] = '\0'; 196 | if ((fp = fopen(path, mode)) == NULL) { 197 | if (errno != ENOENT) 198 | goto err; 199 | } 200 | } 201 | 202 | if (!fp && (fp = fopen(file, mode)) == NULL) 203 | goto err; 204 | /* FIXME remove race */ 205 | id_set_fd_flags(fileno(fp), O_CLOEXEC); 206 | 207 | return fp; 208 | 209 | err: 210 | err("%s: %m", file); 211 | return NULL; 212 | } 213 | 214 | int 215 | irq_set_affinity(int irq, uint64_t mask) 216 | { 217 | char path[PATH_MAX], buf[16]; 218 | int fd, len, nwritten; 219 | 220 | BUG_ON(irq == 0); 221 | snprintf(path, sizeof(path), "/proc/irq/%d/smp_affinity", irq); 222 | if ((fd = open(path, O_WRONLY | O_CLOEXEC)) < 0) { 223 | err("%s: %m", path); 224 | return -1; 225 | } 226 | 227 | if (!(mask >> 32)) { 228 | len = snprintf(buf, sizeof(buf), "%" PRIx64 "\n", mask); 229 | } else { 230 | len = snprintf(buf, sizeof(buf), "%" PRIx64 ",%" PRIx64 "\n", mask >> 32, mask & 0xffffffff); 231 | } 232 | 233 | nwritten = write(fd, buf, len); 234 | if (nwritten < 0) 235 | err("irq%d: error writing smp_affinity: %m", irq); 236 | 237 | close(fd); 238 | 239 | return 0; 240 | } 241 | 242 | char * 243 | xstrncpy(char *dst, const char *src, size_t n) 244 | { 245 | strncpy(dst, src, n); 246 | if (dst[n - 1]) 247 | dst[n - 1] = '\0'; 248 | return dst; 249 | } 250 | 251 | 252 | static void 253 | irqd_at_exit(void) 254 | { 255 | char path[PATH_MAX]; 256 | 257 | snprintf(path, sizeof(path), "%s%s", _PATH_VARRUN, PID_FILE); 258 | unlink(path); 259 | unlink("/var/lib/misc/irqd.cpumap"); 260 | } 261 | 262 | /* write PID file unless already running */ 263 | static int 264 | write_pid(void) 265 | { 266 | char path[PATH_MAX]; 267 | FILE *fp = NULL; 268 | int fd; 269 | 270 | snprintf(path, 
sizeof(path), "%s%s", _PATH_VARRUN, PID_FILE); 271 | if ((fd = open(path, O_RDWR | O_CREAT | O_CLOEXEC, 0644)) < 0) { 272 | err("already running"); 273 | return -1; 274 | } 275 | 276 | if ((fp = fdopen(fd, "r+")) == NULL) { 277 | err("%s: %m", PID_FILE); 278 | close(fd); 279 | 280 | return -1; 281 | } 282 | 283 | fprintf(fp, "%u\n", getpid()); 284 | fclose(fp); 285 | 286 | atexit(irqd_at_exit); 287 | 288 | return 0; 289 | } 290 | 291 | static enum EvReturn 292 | sig_ev_cb(struct ev *ev, unsigned short what) 293 | { 294 | BUG_ON(what != EV_READ); 295 | 296 | do { 297 | struct signalfd_siginfo siginfo; 298 | ssize_t nread; 299 | 300 | if ((nread = read(ev->fd, &siginfo, sizeof(siginfo))) < 0) { 301 | if (errno == EWOULDBLOCK) 302 | break; 303 | err("signalfd: %m"); 304 | } 305 | 306 | switch (siginfo.ssi_signo) { 307 | case SIGTERM: 308 | log("received SIGTERM"); 309 | irqd_at_exit(); 310 | exit(0); 311 | 312 | default: 313 | BUG(); 314 | } 315 | } while (1); 316 | 317 | return EvOk; 318 | } 319 | 320 | int 321 | main(int argc, char *argv[]) 322 | { 323 | sigset_t ss; 324 | int sig_fd; 325 | 326 | log_init(); 327 | 328 | if (check_opts(argc, argv) < 0) 329 | exit(EXIT_FAILURE); 330 | 331 | if (!no_daemon && !with_debug) 332 | openlog("irqd", LOG_PID | LOG_NDELAY, LOG_DAEMON); 333 | 334 | if ((irqd_prefix = getenv("IRQD_PREFIX")) == NULL) 335 | irqd_prefix = ""; 336 | 337 | setlocale(LC_ALL, ""); 338 | 339 | if (geteuid()) { 340 | err("root required"); 341 | exit(1); 342 | } 343 | 344 | ev_init(); 345 | strategy_init(); 346 | 347 | cpu_init(); 348 | if(cpu_count() == 1) { 349 | log("single CPU, nothing to balance"); 350 | exit(0); 351 | } 352 | 353 | if (optind < argc) { 354 | err("extra arguments on command line"); 355 | exit(1); 356 | } 357 | 358 | if_init(); 359 | 360 | if (config_read() < 0) 361 | exit(1); 362 | if (no_daemon && verbose) 363 | config_dump(); 364 | 365 | if (!no_daemon && daemon(0, 0) < 0) { 366 | err("can't start daemon\n"); 367 | exit(1); 368 | } 
369 | 370 | if (write_pid() < 0) 371 | exit(1); 372 | 373 | sigemptyset(&ss); 374 | sigaddset(&ss, SIGTERM); 375 | sigprocmask(SIG_BLOCK, &ss, NULL); 376 | sig_fd = signalfd(-1, &ss, SFD_NONBLOCK | SFD_CLOEXEC); 377 | ev_set(&sig_ev, sig_fd, NULL, NULL); 378 | sig_ev.cb_read = sig_ev_cb; 379 | ev_add(&sig_ev, EV_READ); 380 | 381 | if_rtnl_init(); 382 | 383 | ev_dispatch(); 384 | 385 | if_fini(); 386 | cpu_fini(); 387 | ev_fini(); 388 | 389 | return 0; 390 | } 391 | -------------------------------------------------------------------------------- /src/evenly.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 
16 | */ 17 | #include "irqd.h" 18 | #include "cpu.h" 19 | #include "interface.h" 20 | 21 | /* up to 4 CPUs mapped per queue */ 22 | #define RPS_CPU_MAX_ORDER 2 23 | #define RPS_CPU_MAX (1 << RPS_CPU_MAX_ORDER) 24 | 25 | /* Softirq load threshold (percent) up to which new queues are mapped 26 | to a Softirq */ 27 | #define CPU_SI_MAP_THRESH 50 28 | 29 | static void cpu_dump_queues(const struct cpu_info *) __UNUSED; 30 | static void cpuset_dump_lru(const char *, const struct cpuset *) __UNUSED; 31 | 32 | static bool 33 | cpu_is_idle(const struct cpu_info *ci) 34 | { 35 | return ci->ci_si_load < CPU_SI_MAP_THRESH; 36 | } 37 | 38 | /* maps up to four CPUs for a queue, making sure that the selected 39 | CPU is in the cpuset */ 40 | static struct cpu_info * 41 | rps_select_nearby_cpu(const struct if_queue_info *qi, int cpu) 42 | { 43 | const struct cpuset *set = qi->qi_iface->if_cpuset; 44 | int order; 45 | 46 | for (order = 1; order < RPS_CPU_MAX_ORDER; order++) { 47 | unsigned order_ncpus = 1 << order; 48 | int order_base, probe; 49 | 50 | order_base = (cpu - set->cs_range.rg_from) / order_ncpus * order_ncpus; 51 | for (probe = 0; probe < order_ncpus; probe++) { 52 | unsigned c = set->cs_range.rg_from + order_base + probe; 53 | 54 | if (cpuset_in(set, c) 55 | && !cpu_bitmask_is_set(qi->qi_cpu_bitmask, c)) { 56 | struct cpu_info *new = cpu_nth(c); 57 | 58 | if (!new) 59 | continue; 60 | if (cpu_is_idle(new)) 61 | return new; 62 | } 63 | } 64 | } 65 | 66 | return NULL; 67 | } 68 | 69 | /* 70 | * For multiqueue we assign the queues consecutively, with the first 71 | * being assigned by LRU. 
 */
static struct cpu_info *
assign_mq_queue(struct interface *iface, int queue)
{
	const struct cpuset *cset = iface->if_cpuset;
	const struct if_queue_info *qi_first = if_queue(iface, 0);
	/* queue 0 was placed via LRU; later queues follow consecutively,
	   wrapping around inside the cpuset.
	   NOTE(review): first_used looks like an absolute CPU number (ffs
	   over the whole bitmask) -- if the cpuset does not start at CPU 0
	   the modulo arithmetic may be off by cs_range.rg_from; confirm. */
	int cpu, first_used = cpu_bitmask_ffs(qi_first->qi_cpu_bitmask);

	BUG_ON(first_used < 0);
	cpu = cpuset_first_cpu(cset) + (first_used + queue) % cpuset_len(cset);

	return cpu_add_queue(cpu, iface, queue);
}

/* sanity check: warn if the per-CPU queue counts inside @set differ
   by more than one, i. e. the distribution is no longer "even" */
static void
check_cpuset_lru(const struct cpuset *set)
{
	GSList *node;
	int min = INT_MAX, max = 0;

	for (node = set->cs_cpu_lru_list; node; node = node->next) {
		const struct cpu_info *ci = node->data;

		if (min > ci->ci_num_queues)
			min = ci->ci_num_queues;
		if (max < ci->ci_num_queues)
			max = ci->ci_num_queues;
	}

	if (max - min > 1)
		err("uneven distribution detected for '%s'", set->cs_name);
}

/* strategy entry point: place @queue of @iface on a CPU (queue 0 by
   LRU, others consecutively), and for single-queue NICs with RPS
   enabled additionally steer to up to init_steer_cpus nearby CPUs.
   @return 0 on success, -1 if no CPU could be assigned */
static int
evenly_balance_queue(struct interface *iface, int queue)
{
	struct if_queue_info *qi;
	struct cpu_info *ci;

	BUG_ON(queue < 0);

	if (queue == 0)
		ci = cpu_add_queue_lru(iface, queue);
	else
		ci = assign_mq_queue(iface, queue);
	if (ci == NULL)
		return -1;

	qi = if_queue(iface, queue);
	if (!cpu_bitmask_set(qi->qi_cpu_bitmask, ci->ci_num))
		BUG();

	if (!if_is_multiqueue(iface) && g_rps_status == RPS_S_ENABLED) {
		int ncpus = iface->if_cpuset->cs_strategy.u.evenly.init_steer_cpus;
		int cpu;

		/* each iteration sees the updated bitmask, so a different
		   nearby CPU (or none) is picked every time */
		for (cpu = 1; cpu < ncpus; cpu++) {
			struct cpu_info *ci2 = rps_select_nearby_cpu(qi, ci->ci_num);

			if (ci2) {
				if (cpu_bitmask_set(qi->qi_cpu_bitmask, ci2->ci_num))
					cpu_add_queue(ci2->ci_num, iface, queue);
			}
		}
	}

	check_cpuset_lru(ci->ci_cpuset);

	return 0;
}

static gint
queue_irq_cmp(gconstpointer __a, gconstpointer
__b, gpointer data) 145 | { 146 | const struct cpu_info *ci = data; 147 | const struct if_queue_info *qia = __a, *qib = __b; 148 | 149 | return qia->qi_irq_stats[NEW][ci->ci_num] 150 | - qib->qi_irq_stats[NEW][ci->ci_num]; 151 | } 152 | 153 | /** 154 | * queue_map_cpu - try to map queue to another CPU 155 | * 156 | * Nearby CPUs preferred, in the hope of caching effects. 157 | * 158 | * @return 1: mapped, 0: not mapped, <0: error 159 | */ 160 | static int 161 | queue_map_cpu(struct if_queue_info *qi) 162 | { 163 | struct interface *iface = qi->qi_iface; 164 | struct cpu_info *ci_new; 165 | int cpu = cpu_bitmask_ffs(qi->qi_cpu_bitmask); 166 | uint64_t cpumask; 167 | 168 | BUG_ON(iface->if_num_queues > 1); 169 | if ((ci_new = rps_select_nearby_cpu(qi, cpu)) == NULL) 170 | return 0; 171 | 172 | if (cpu_bitmask_set(qi->qi_cpu_bitmask, ci_new->ci_num)) 173 | cpu_add_queue(ci_new->ci_num, iface, qi->qi_num); 174 | 175 | cpumask = cpu_bitmask_mask64(qi->qi_cpu_bitmask); 176 | if_set_steering_cpus(iface, qi->qi_num, cpumask, cpumask); 177 | 178 | queue_set_affinity(qi, cpumask); 179 | 180 | log("%s:%d: rps_cpus=%#" PRIx64 " smp_affinity=%#" PRIx64, 181 | iface->if_name, qi->qi_num, cpumask, cpumask); 182 | 183 | return 1; 184 | } 185 | 186 | static void 187 | evenly_init(struct strategy *strategy) 188 | { 189 | strategy->u.evenly.init_steer_cpus = 2; 190 | } 191 | 192 | static int 193 | evenly_cpu_busy(struct cpu_info *ci) 194 | { 195 | struct if_queue_info *qi; 196 | GList *queue_irqs_list = NULL; 197 | GList *node; 198 | GSList *snode; 199 | 200 | if (!ci->ci_num_queues) { 201 | /* TODO softirq busy, but no NICs assigned, search for other 202 | devices */ 203 | return 0; 204 | } 205 | 206 | /* One or many queues mapped, search for the busiest queue 207 | (in terms of IRQs handled). NAPI may reduce accuracy though, 208 | as well as shared IRQs. 
*/ 209 | for (snode = ci->ci_queues; snode; snode = snode->next) { 210 | qi = snode->data; 211 | queue_irqs_list = g_list_insert_sorted_with_data(queue_irqs_list, qi, 212 | queue_irq_cmp, ci); 213 | } 214 | 215 | /* for multiqueue NICs simply assume enough queues */ 216 | node = g_list_last(queue_irqs_list); 217 | for (; node; node = g_list_previous(node)) { 218 | qi = node->data; 219 | if (qi->qi_iface->if_num_queues == 1) 220 | break; 221 | } 222 | 223 | if (!node) { 224 | err("cpu%d: no singlequeue NIC found to map", ci->ci_num); 225 | goto done; 226 | } 227 | 228 | if (queue_map_cpu(qi) != 1) { 229 | /* TODO move queue completely */ 230 | } 231 | 232 | done: 233 | g_list_free(queue_irqs_list); 234 | 235 | return 0; 236 | } 237 | 238 | static void 239 | cpu_dump_queues(const struct cpu_info *ci) 240 | { 241 | GSList *node; 242 | 243 | printf("cpu[%d]: q#=%u ", ci->ci_num, ci->ci_num_queues); 244 | 245 | for (node = ci->ci_queues; node != NULL; node = node->next) { 246 | const struct if_queue_info *qi = node->data; 247 | const struct interface *iface = qi->qi_iface; 248 | 249 | printf("%s:%d ", iface->if_name, qi->qi_num); 250 | } 251 | putchar('\n'); 252 | } 253 | 254 | static void 255 | cpuset_dump_lru(const char *prefix, const struct cpuset *set) 256 | { 257 | GSList *node; 258 | 259 | puts(prefix); 260 | for (node = set->cs_cpu_lru_list; node != NULL; node = node->next) { 261 | const struct cpu_info *ci = node->data; 262 | 263 | cpu_dump_queues(ci); 264 | } 265 | } 266 | 267 | /* 268 | * Which IRQ to remove from most busy CPU? Can be a difficult decision. 269 | * 270 | * For now we just choose last IRQ in list. We determine the interface 271 | * it is part of, then remove it completely from the configuration. 272 | * Afterwards we add it to the configuration again. 
273 | * 274 | * By reconfiguring all interface queues we make sure that configuration 275 | * is same compared to device having appeared for the first time, and 276 | * also avoid any issues with multiqueue NICs (e. g. two MQ queues 277 | * assigned to same CPU). 278 | * 279 | * We repeat this process until imbalance is solved. 280 | */ 281 | static int 282 | evenly_interface_down(struct cpuset *set, struct interface *iface_down) 283 | { 284 | if (set->cs_cpu_lru_list == NULL) 285 | return 0; 286 | 287 | do { 288 | struct cpu_info *ci_first, *ci_last; 289 | const struct if_queue_info *qi; 290 | struct interface *iface; 291 | int minq, maxq; 292 | 293 | ci_first = set->cs_cpu_lru_list->data; 294 | minq = ci_first->ci_num_queues; 295 | ci_last = g_slist_last(set->cs_cpu_lru_list)->data; 296 | maxq = ci_last->ci_num_queues; 297 | if (maxq - minq < 2) 298 | break; 299 | 300 | qi = g_slist_last(ci_last->ci_queues)->data; 301 | iface = qi->qi_iface; 302 | if_remove_cpus(iface); 303 | 304 | log("%s: rebalancing interface (%d queue(s))", iface->if_name, 305 | iface->if_num_queues); 306 | if (if_assign_cpus(iface) < 0) 307 | break; 308 | } while (1); 309 | 310 | return 0; 311 | } 312 | 313 | struct strategy_type evenly_strategy_type = { 314 | .name = "evenly", 315 | .init = evenly_init, 316 | .balance_queue = evenly_balance_queue, 317 | .cpu_busy = evenly_cpu_busy, 318 | .interface_down = evenly_interface_down, 319 | }; 320 | -------------------------------------------------------------------------------- /src/cpu.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR 
A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 16 | */ 17 | #include "irqd.h" 18 | #include "cpu.h" 19 | #include "interface.h" 20 | 21 | #define CPUSET_BITS 8 22 | #define CPUSET_SIZE(bits) (((bits) + CPUSET_BITS - 1) & ~(CPUSET_BITS - 1)) 23 | 24 | #define CPU_MAP_FILE "irqd.cpumap" 25 | 26 | 27 | static struct cpu_info *cpus; 28 | GSList *cpu_load_lru_list; 29 | static unsigned num_cpus; 30 | struct proc_stat proc_stat, proc_stat_old; 31 | 32 | /* each CPU belongs to a single cpuset only */ 33 | GSList *cpuset_list; 34 | 35 | struct cpuset *g_cpuset_auto_assign; 36 | 37 | static void dump_cpus(const char *, const GSList *list) __UNUSED; 38 | 39 | static gint 40 | cpu_cmp(gconstpointer __a, gconstpointer __b) 41 | { 42 | const struct cpu_info *a = __a, *b = __b; 43 | 44 | if (b->ci_num_queues != a->ci_num_queues) 45 | return a->ci_num_queues - b->ci_num_queues; 46 | return a->ci_num - b->ci_num; 47 | } 48 | 49 | void 50 | cpu_fini(void) 51 | { 52 | free(cpus); 53 | } 54 | 55 | unsigned 56 | cpu_count(void) 57 | { 58 | return num_cpus; 59 | } 60 | 61 | struct cpu_info * 62 | cpu_nth(int cpu) 63 | { 64 | BUG_ON(cpu < 0); 65 | if (cpu >= num_cpus) 66 | return NULL; 67 | return &cpus[cpu]; 68 | } 69 | 70 | static void 71 | dump_cpus(const char *prefix, const GSList *list) 72 | { 73 | char buf[1024], *pch = buf, *end = buf + 1024; 74 | 75 | snprintf(pch, end - pch, "%s: ", prefix); 76 | for (; list; list = list->next) { 77 | const struct cpu_info *ci = list->data; 78 | 79 | pch += snprintf(pch, end - pch, "cpu%d/%dq ", ci->ci_num, 80 | ci->ci_num_queues); 81 | } 82 | 83 | log("%s", buf); 84 | } 85 | 86 | static int 87 | add_queue(struct cpu_info *ci, struct 
if_queue_info *qi) 88 | { 89 | struct cpuset *set = ci->ci_cpuset; 90 | 91 | ci->ci_num_queues++; 92 | 93 | set->cs_cpu_lru_list = g_slist_remove_link(set->cs_cpu_lru_list, 94 | set->cs_cpu_lru_list); 95 | set->cs_cpu_lru_list = g_slist_insert_sorted(set->cs_cpu_lru_list, ci, 96 | cpu_cmp); 97 | 98 | ci->ci_queues = g_slist_append(ci->ci_queues, qi); 99 | 100 | return 0; 101 | } 102 | 103 | struct cpu_info * 104 | cpu_add_queue(int cpu, struct interface *iface, int queue) 105 | { 106 | struct cpu_info *ci = cpu_nth(cpu); 107 | struct if_queue_info *qi = if_queue(iface, queue); 108 | 109 | if (add_queue(ci, qi) < 0) 110 | return NULL; 111 | return ci; 112 | } 113 | 114 | /* assign queue to CPU, select most idle CPU from a cpuset */ 115 | struct cpu_info * 116 | cpu_add_queue_lru(struct interface *iface, int queue) 117 | { 118 | const struct cpuset *set = iface->if_cpuset; 119 | struct cpu_info *ci = set->cs_cpu_lru_list->data; 120 | struct if_queue_info *qi = if_queue(iface, queue); 121 | 122 | if (add_queue(ci, qi) < 0) 123 | return NULL; 124 | return ci; 125 | } 126 | 127 | int 128 | cpu_del_queue(int cpu, struct if_queue_info *qi) 129 | { 130 | struct cpu_info *ci = cpu_nth(cpu); 131 | struct cpuset *set = ci->ci_cpuset; 132 | 133 | BUG_ON(!ci || ci->ci_num_queues == 0); 134 | ci->ci_queues = g_slist_remove(ci->ci_queues, qi); 135 | ci->ci_num_queues--; 136 | set->cs_cpu_lru_list = g_slist_sort(set->cs_cpu_lru_list, cpu_cmp); 137 | 138 | return -1; 139 | } 140 | 141 | #ifdef DEBUG 142 | #define __SS_WRAP_CHECK(ci, var) ({ \ 143 | typeof((ci)->ci_ss[OLD].var) old = (ci)->ci_ss[OLD].var; \ 144 | typeof((ci)->ci_ss[NEW].var) new = (ci)->ci_ss[NEW].var; \ 145 | if (new < old && old - new > (1 << 31)) BUG(); \ 146 | }) 147 | #else 148 | #define __SS_WRAP_CHECK(ci, var) 149 | #endif /* DEBUG */ 150 | 151 | #define SS_WRAP(ci, var) ({ \ 152 | if ((ci)->ci_ss[NEW].var < (ci)->ci_ss[OLD].var) \ 153 | ci->ci_ss[OLD].var = 0U; \ 154 | __SS_WRAP_CHECK(ci, var); \ 155 | }) 
156 | 157 | static int 158 | read_softnet_stat(void) 159 | { 160 | char *line = NULL; 161 | FILE *fp; 162 | size_t line_len; 163 | int cpu, ret; 164 | 165 | if ((fp = id_fopen("/proc/net/softnet_stat", "r")) == NULL) 166 | BUG(); 167 | 168 | for (cpu = 0; cpu < num_cpus; cpu++) { 169 | struct cpu_info *ci = &cpus[cpu]; 170 | struct softnet_stat *ss = &ci->ci_ss[NEW]; 171 | 172 | if (getline(&line, &line_len, fp) == EOF) 173 | BUG(); 174 | 175 | memcpy(&ci->ci_ss[OLD], &ci->ci_ss[NEW], sizeof(struct softnet_stat)); 176 | 177 | /* there is another field 'received_rps' in newer kernels, which 178 | is currently ignored */ 179 | ret = sscanf(line, "%08x %08x %08x 00000000 00000000 00000000 " 180 | "00000000 00000000 %08x", &ss->total, &ss->dropped, 181 | &ss->time_squeeze, &ss->cpu_collision); 182 | BUG_ON(ret != 4); 183 | 184 | SS_WRAP(ci, total); 185 | SS_WRAP(ci, dropped); 186 | SS_WRAP(ci, time_squeeze); 187 | SS_WRAP(ci, cpu_collision); 188 | } 189 | 190 | g_free(line); 191 | fclose(fp); 192 | 193 | return 0; 194 | } 195 | 196 | static int 197 | read_proc_stat_softirq(struct proc_stat *ps, char *line) 198 | { 199 | char *tok = strtok(line, " \t"); 200 | int cpu = 0; 201 | 202 | BUG_ON(strcmp(tok, "softirq")); 203 | while ((tok = strtok(NULL, " \t")) != NULL) { 204 | struct cpu_info *ci = cpu_nth(cpu); 205 | 206 | ci->ci_psc.psc_softirq_ctr = strtoull(tok, NULL, 10); 207 | } 208 | 209 | return 0; 210 | } 211 | 212 | static int 213 | read_proc_stat(struct proc_stat *ps) 214 | { 215 | size_t line_len = 4096; 216 | char *line = malloc(line_len); 217 | FILE *fp; 218 | int ret; 219 | 220 | if ((fp = id_fopen("/proc/stat", "r")) == NULL) 221 | return -1; 222 | 223 | do { 224 | struct proc_stat_cpu *psc; 225 | int cpu; 226 | 227 | psc = &ps->ps_cpu_total; 228 | if ((getline(&line, &line_len, fp)) == EOF) 229 | break; 230 | ret = sscanf(line, "cpu %Lu %Lu %Lu %Lu %Lu %Lu %Lu %Lu %Lu", 231 | &psc->psc_user, &psc->psc_nice, &psc->psc_system, 232 | &psc->psc_idle, 
&psc->psc_iowait, &psc->psc_irq, 233 | &psc->psc_softirq, &psc->psc_steal, &psc->psc_guest); 234 | BUG_ON(ret != 9); 235 | 236 | /* There could be missing cpu%d entries, e. g. in case of hotplug 237 | or just broken CPUs */ 238 | do { 239 | struct proc_stat_cpu psc_cpu; 240 | struct cpu_info *ci; 241 | 242 | if (getline(&line, &line_len, fp) == EOF) 243 | goto out; 244 | if (!strncmp(line, "intr ", sizeof("intr ") - 1)) 245 | break; 246 | 247 | ret = sscanf(line, "cpu%d %Lu %Lu %Lu %Lu %Lu %Lu %Lu %Lu %Lu", 248 | &cpu, 249 | &psc_cpu.psc_user, &psc_cpu.psc_nice, 250 | &psc_cpu.psc_system, &psc_cpu.psc_idle, 251 | &psc_cpu.psc_iowait, &psc_cpu.psc_irq, 252 | &psc_cpu.psc_softirq, &psc_cpu.psc_steal, 253 | &psc_cpu.psc_guest); 254 | BUG_ON(ret != 10); 255 | ci = cpu_nth(cpu); 256 | BUG_ON(!ci); 257 | memcpy(&ci->ci_psc, &psc_cpu, sizeof(psc_cpu)); 258 | } while (1); 259 | 260 | /* ignore IRQ line for now */ 261 | 262 | if ((ret = fscanf(fp, "ctxt %Lu\n", &ps->ps_ctxt)) != 1) 263 | BUG(); 264 | if ((ret = fscanf(fp, "btime %lu\n", &ps->ps_btime)) != 1) 265 | BUG(); 266 | if ((ret = fscanf(fp, "processes %lu\n", &ps->ps_procs)) != 1) 267 | BUG(); 268 | if ((ret = fscanf(fp, "procs_running %lu\n", 269 | &ps->ps_procs_running)) != 1) 270 | BUG(); 271 | if ((ret = fscanf(fp, "procs_blocked %lu\n", 272 | &ps->ps_procs_blocked)) != 1) 273 | BUG(); 274 | 275 | if (getline(&line, &line_len, fp) == EOF) 276 | break; 277 | if (read_proc_stat_softirq(ps, line) < 0) 278 | break; 279 | } while (0); 280 | 281 | out: 282 | free(line); 283 | fclose(fp); 284 | 285 | return 0; 286 | } 287 | 288 | int 289 | cpu_read_stat(void) 290 | { 291 | int cpu; 292 | 293 | if (read_softnet_stat() < 0) 294 | return -1; 295 | 296 | memcpy(&proc_stat_old, &proc_stat, sizeof(proc_stat)); 297 | for (cpu = 0; cpu < num_cpus; cpu++) { 298 | struct cpu_info *ci = cpu_nth(cpu); 299 | 300 | if (!ci) 301 | continue; 302 | memcpy(&ci->ci_psc_old, &ci->ci_psc, sizeof(ci->ci_psc_old)); 303 | } 304 | if 
(read_proc_stat(&proc_stat) < 0) 305 | return -1; 306 | 307 | return 0; 308 | } 309 | 310 | static gint 311 | cpu_load_cmp(gconstpointer __a, gconstpointer __b) 312 | { 313 | const struct cpu_info *a = __a, *b = __b; 314 | 315 | if (a->ci_si_load != b->ci_si_load) 316 | return a->ci_si_load - b->ci_si_load; 317 | return a->ci_num - b->ci_num; 318 | } 319 | 320 | static int 321 | do_stat_cpu(struct cpu_info *ci) 322 | { 323 | const struct proc_stat_cpu *psc = &ci->ci_psc; 324 | const struct proc_stat_cpu *psco = &ci->ci_psc_old; 325 | unsigned long long frm_busy, frm_busy_old, frm_tot, frm_tot_old; 326 | 327 | frm_busy = psc->psc_user + psc->psc_nice + psc->psc_system 328 | + psc->psc_iowait + psc->psc_irq + psc->psc_softirq 329 | + psc->psc_steal + psc->psc_guest; 330 | frm_tot = frm_busy + psc->psc_idle; 331 | frm_busy_old = psco->psc_user + psco->psc_nice + psco->psc_system 332 | + psco->psc_iowait + psco->psc_irq + psco->psc_softirq 333 | + psco->psc_steal + psco->psc_guest; 334 | frm_tot_old = frm_busy_old + psco->psc_idle; 335 | if (frm_tot > frm_tot_old) { 336 | unsigned d = frm_tot - frm_tot_old; 337 | 338 | if (frm_busy >= frm_busy_old) 339 | ci->ci_load = (frm_busy - frm_busy_old) * 100 / d; 340 | if (psc->psc_softirq >= psco->psc_softirq) 341 | ci->ci_si_load = (psc->psc_softirq - psco->psc_softirq) * 100 / d; 342 | } 343 | 344 | cpu_load_lru_list = g_slist_remove(cpu_load_lru_list, ci); 345 | cpu_load_lru_list = g_slist_insert_sorted(cpu_load_lru_list, 346 | ci, cpu_load_cmp); 347 | 348 | return 0; 349 | } 350 | 351 | int 352 | cpu_do_stat(void) 353 | { 354 | int cpu; 355 | 356 | for (cpu = 0; cpu < num_cpus; cpu++) { 357 | struct cpu_info *ci = cpu_nth(cpu); 358 | 359 | if (ci) 360 | do_stat_cpu(ci); 361 | } 362 | 363 | #if 0 364 | if (verbose > 1) { 365 | char buf[4096], *pch = buf, *end = buf + sizeof(buf); 366 | GSList *node; 367 | 368 | for (node = cpu_load_lru_list; node; node = g_slist_next(node)) { 369 | struct cpu_info *ci = node->data; 370 | 
371 | pch += snprintf(pch, end - pch, "cpu%d=%u/%u", 372 | ci->ci_num, ci->ci_load, ci->ci_si_load); 373 | if (node->next) 374 | *pch++ = ' '; 375 | } 376 | log("LRU: %s", buf); 377 | } 378 | #endif /* 0 */ 379 | 380 | return 0; 381 | } 382 | 383 | void 384 | cpu_dump_map(void) 385 | { 386 | char path[PATH_MAX]; 387 | FILE *fp; 388 | int cpu; 389 | 390 | /* do not use _PATH_VARDB, as on sles11 it points to /var/db, 391 | which doesn't exist, */ 392 | snprintf(path, sizeof(path), "/var/lib/misc/%s", CPU_MAP_FILE); 393 | if ((fp = fopen(path, "w")) == NULL) { 394 | err("%s: %m", CPU_MAP_FILE); 395 | return; 396 | } 397 | id_set_fd_flags(fileno(fp), O_CLOEXEC); 398 | 399 | for (cpu = 0; cpu < num_cpus; cpu++) { 400 | GSList *node; 401 | 402 | if (fprintf(fp, "cpu%d:", cpu) == EOF) 403 | goto out; 404 | 405 | for (node = cpus[cpu].ci_queues; node; node = node->next) { 406 | const struct if_queue_info *qi = node->data; 407 | 408 | if (fprintf(fp, " %s:%d", qi->qi_iface->if_name, qi->qi_num) == EOF) 409 | goto out; 410 | } 411 | 412 | if (fputc('\n', fp) == EOF) 413 | goto out; 414 | } 415 | 416 | out: 417 | if (fclose(fp) == EOF) 418 | err("%s: %m", CPU_MAP_FILE); 419 | } 420 | 421 | struct cpu_bitmask * 422 | cpu_bitmask_new(struct cpuset *set) 423 | { 424 | struct cpu_bitmask *bmask; 425 | 426 | BUG_ON(!num_cpus); 427 | BUG_ON(!set); 428 | bmask = g_malloc0(sizeof(struct cpu_bitmask) + CPUSET_SIZE(num_cpus) / 8); 429 | if (bmask) { 430 | bmask->cpuset = set; 431 | bmask->len = CPUSET_SIZE(num_cpus) / 8; 432 | } else 433 | OOM(); 434 | 435 | return bmask; 436 | } 437 | 438 | void 439 | cpu_bitmask_free(struct cpu_bitmask *bmask) 440 | { 441 | g_free(bmask); 442 | } 443 | 444 | /** 445 | * @return 1: set, 0: already set 446 | */ 447 | int 448 | cpu_bitmask_set(struct cpu_bitmask *bmask, unsigned cpu) 449 | { 450 | const struct cpuset *set = bmask->cpuset; 451 | const struct range *rg = &set->cs_range; 452 | int off = cpu / CPUSET_BITS, bit = cpu % CPUSET_BITS; 453 | 454 
| BUG_ON(cpu < rg->rg_from || cpu > cpuset_last_cpu(set)); 455 | BUG_ON(off >= bmask->len); 456 | if ((bmask->data[off] & (1 << bit)) == 0) { 457 | bmask->data[off] |= (1 << bit); 458 | bmask->nbits++; 459 | 460 | return 1; 461 | } 462 | 463 | return 0; 464 | } 465 | 466 | /** 467 | * @return 1: cleared, 0: already cleared 468 | */ 469 | int 470 | cpu_bitmask_clear(struct cpu_bitmask *bmask, unsigned cpu) 471 | { 472 | const struct cpuset *set = bmask->cpuset; 473 | const struct range *rg = &set->cs_range; 474 | int off = cpu / CPUSET_BITS, bit = cpu % CPUSET_BITS; 475 | 476 | BUG_ON(cpu < rg->rg_from || cpu > cpuset_last_cpu(set)); 477 | BUG_ON(off >= bmask->len); 478 | if (bmask->data[off] & (1 << bit)) { 479 | bmask->data[off] &= ~(1 << bit); 480 | BUG_ON(bmask->nbits == 0); 481 | bmask->nbits--; 482 | 483 | return 1; 484 | } 485 | 486 | return 0; 487 | } 488 | 489 | bool 490 | cpu_bitmask_is_set(const struct cpu_bitmask *bmask, unsigned cpu) 491 | { 492 | const struct cpuset *set = bmask->cpuset; 493 | const struct range *rg = &set->cs_range; 494 | int off = cpu / CPUSET_BITS, bit = cpu % CPUSET_BITS; 495 | 496 | BUG_ON(cpu < rg->rg_from || cpu > cpuset_last_cpu(set)); 497 | BUG_ON(off >= bmask->len); 498 | return (bmask->data[off] & (1 << bit)) != 0; 499 | } 500 | 501 | int 502 | cpu_bitmask_ffs(const struct cpu_bitmask *bmask) 503 | { 504 | int off; 505 | 506 | for (off = 0; off < bmask->len; off++) { 507 | if (bmask->data[off]) { 508 | int bit; 509 | 510 | for (bit = 0; bit < 8; bit++) 511 | if (bmask->data[off] & (1 << bit)) 512 | return off * 8 + bit; 513 | } 514 | } 515 | 516 | return -1; 517 | } 518 | 519 | uint64_t 520 | cpu_bitmask_mask64(const struct cpu_bitmask *bmask) 521 | { 522 | uint64_t mask = 0ULL; 523 | size_t len; 524 | 525 | #if 0 526 | { 527 | int cpu; 528 | 529 | for (cpu = 0; cpu < bmask->len * 8; cpu++) 530 | if (cpu_bitmask_is_set(bmask, cpu)) 531 | mask |= (1LLU << cpu); 532 | } 533 | #endif /* 0 */ 534 | 535 | len = bmask->len > 
sizeof(uint64_t) ? sizeof(uint64_t) : bmask->len; 536 | memcpy(&mask, bmask->data, len); 537 | 538 | return mask; 539 | } 540 | 541 | struct range * 542 | range_new(unsigned from, unsigned to) 543 | { 544 | struct range *range; 545 | 546 | if ((range = g_malloc(sizeof(struct range))) == NULL) 547 | return NULL; 548 | range->rg_from = from; 549 | range->rg_to = to; 550 | 551 | return range; 552 | } 553 | 554 | void 555 | range_free(struct range *range) 556 | { 557 | free(range); 558 | } 559 | 560 | bool 561 | range_valid(const struct range *range) 562 | { 563 | return range->rg_from <= range->rg_to; 564 | } 565 | 566 | bool 567 | cpu_in_range(const struct range *rg, unsigned cpu) 568 | { 569 | return cpu >= rg->rg_from && cpu <= rg->rg_to; 570 | } 571 | 572 | bool 573 | range_in_range(const struct range *rg, const struct range *subrg) 574 | { 575 | return range_valid(subrg) 576 | && subrg->rg_from >= rg->rg_from 577 | && subrg->rg_to <= rg->rg_to; 578 | } 579 | 580 | struct cpuset * 581 | cpuset_new(const char *name, const struct range *range) 582 | { 583 | struct cpuset *set; 584 | int cpu; 585 | 586 | BUG_ON(!range); 587 | BUG_ON(!num_cpus); 588 | if (range->rg_from > range->rg_to 589 | || range->rg_to > num_cpus) { 590 | dbg("cpuset: out of range (from %u, to %u)", range->rg_from, 591 | range->rg_to); 592 | return NULL; 593 | } 594 | 595 | if ((set = g_malloc0(sizeof(struct cpuset))) == NULL) 596 | return NULL; 597 | if ((set->cs_name = strdup(name)) == NULL) { 598 | cpuset_free(set); 599 | return NULL; 600 | } 601 | 602 | memcpy(&set->cs_range, range, sizeof(struct range)); 603 | 604 | for (cpu = range->rg_from; cpu <= range->rg_to; cpu++) { 605 | set->cs_cpu_lru_list = g_slist_append(set->cs_cpu_lru_list, &cpus[cpu]); 606 | BUG_ON(cpus[cpu].ci_cpuset); 607 | cpus[cpu].ci_cpuset = set; 608 | } 609 | 610 | return set; 611 | } 612 | 613 | void 614 | cpuset_free(struct cpuset *set) 615 | { 616 | /* TODO cleanup dev_list */ 617 | if (set) 618 | free(set); 619 | } 
620 | 621 | void 622 | cpuset_dump(void) 623 | { 624 | GSList *node; 625 | 626 | for (node = cpuset_list; node; node = node->next) { 627 | const struct cpuset *set = node->data; 628 | const GSList *dev_node; 629 | 630 | printf("cpuset['%s']: cpus=%d-%d strategy='%s'\n", 631 | set->cs_name, set->cs_range.rg_from, set->cs_range.rg_to, 632 | set->cs_strategy.s_type->name); 633 | for (dev_node = set->cs_dev_list; dev_node; dev_node = dev_node->next) { 634 | struct interface *iface = dev_to_if(dev_node->data); 635 | 636 | printf(" %s\n", iface->if_name); 637 | } 638 | } 639 | } 640 | 641 | int 642 | cpuset_set_auto_assign(struct cpuset *set) 643 | { 644 | if (g_cpuset_auto_assign) 645 | return -EEXIST; 646 | g_cpuset_auto_assign = set; 647 | 648 | return 0; 649 | } 650 | 651 | int 652 | cpuset_set_strategy(struct cpuset *set, const char *name) 653 | { 654 | const struct strategy_type *type = strategy_find_type(name); 655 | 656 | if (!type) 657 | return -EINVAL; 658 | set->cs_strategy.s_type = type; 659 | if (set->cs_strategy.s_type->init != NULL) 660 | set->cs_strategy.s_type->init(&set->cs_strategy); 661 | 662 | return 0; 663 | } 664 | 665 | static bool 666 | cpuset_has_device(const struct cpuset *set, const struct device *dev) 667 | { 668 | const GSList *node; 669 | 670 | for (node = set->cs_dev_list; node; node = node->next) 671 | if (node->data == dev) 672 | return true; 673 | 674 | return false; 675 | } 676 | 677 | int 678 | cpuset_add_device(struct cpuset *set, struct device *dev) 679 | { 680 | BUG_ON(dev->type == DEV_INVAL); 681 | if (cpuset_has_device(set, dev)) 682 | return -EBUSY; 683 | set->cs_dev_list = g_slist_append(set->cs_dev_list, dev); 684 | dbg("%s: added device %p (type %d)", __func__, dev, dev->type); 685 | 686 | return 0; 687 | } 688 | 689 | GSList * 690 | cpuset_get_by_name(const char *name) 691 | { 692 | GSList *node; 693 | 694 | for (node = cpuset_list; node; node = g_slist_next(node)) { 695 | struct cpuset *set = node->data; 696 | 697 | if 
(!strcmp(set->cs_name, name)) 698 | return node; 699 | } 700 | 701 | return NULL; 702 | } 703 | 704 | bool 705 | cpuset_in(const struct cpuset *set, unsigned n) 706 | { 707 | return cpu_in_range(&set->cs_range, n); 708 | } 709 | 710 | int 711 | cpuset_list_add(struct cpuset *new) 712 | { 713 | GSList *node; 714 | 715 | if ((node = cpuset_get_by_name("default")) != NULL) { 716 | struct cpuset *set = node->data; 717 | 718 | cpuset_list = g_slist_delete_link(cpuset_list, node); 719 | cpuset_free(set); 720 | } 721 | 722 | for (node = cpuset_list; node; node = g_slist_next(node)) { 723 | const struct cpuset *set = node->data; 724 | 725 | if (!strcmp(set->cs_name, new->cs_name)) 726 | return -EBUSY; 727 | if (cpuset_in(set, new->cs_range.rg_from) 728 | || cpuset_in(set, cpuset_last_cpu(new))) 729 | return -EINVAL; 730 | } 731 | 732 | cpuset_list = g_slist_append(cpuset_list, new); 733 | 734 | return 0; 735 | } 736 | 737 | int 738 | cpuset_interface_down(struct cpuset *set, struct interface *iface) 739 | { 740 | if (set->cs_strategy.s_type->interface_down) 741 | return set->cs_strategy.s_type->interface_down(set, iface); 742 | 743 | return 0; 744 | } 745 | 746 | int 747 | cpuset_cpu_busy(struct cpuset *set, struct cpu_info *ci) 748 | { 749 | if (set->cs_strategy.s_type->cpu_busy) 750 | return set->cs_strategy.s_type->cpu_busy(ci); 751 | 752 | return 0; 753 | } 754 | 755 | /** 756 | * cpuset_balance_queue() - actually assign queue to CPUs 757 | * 758 | * A fixed CPU range of CPUs takes precedence over other the strategy. 759 | * If no fixed CPU range is specified the strategy handler is 760 | * consulted. 
761 | */ 762 | int 763 | cpuset_balance_queue(struct cpuset *set, struct interface *iface, int queue) 764 | { 765 | struct if_queue_info *qi = if_queue(iface, queue); 766 | uint64_t cpumask; 767 | 768 | /* a fixed range takes precedence over the balance strategy being 769 | used */ 770 | if (iface->if_fixed_range != NULL) { 771 | const struct range *range = iface->if_fixed_range; 772 | int cpu; 773 | 774 | for (cpu = range->rg_from; cpu <= range->rg_to; cpu++) { 775 | if (!cpu_bitmask_set(qi->qi_cpu_bitmask, cpu)) 776 | BUG(); 777 | if (cpu_add_queue(cpu, iface, queue) < 0) 778 | return -1; 779 | } 780 | } else { 781 | if (set->cs_strategy.s_type->balance_queue(iface, queue) < 0) 782 | return -1; 783 | } 784 | 785 | BUG_ON(cpu_bitmask_ncpus(qi->qi_cpu_bitmask) == 0); 786 | 787 | cpumask = cpu_bitmask_mask64(qi->qi_cpu_bitmask); 788 | if (g_rps_status == RPS_S_ENABLED || g_xps_status == XPS_S_ENABLED) 789 | if_set_steering_cpus(iface, queue, cpumask, cpumask); 790 | 791 | queue_set_affinity(qi, cpumask); 792 | 793 | log("%s:%d: affinity irq=%#" PRIx64 " rps/xps=%#" PRIx64, 794 | iface->if_name, queue, cpumask, cpumask); 795 | 796 | return 0; 797 | } 798 | 799 | int 800 | cpu_init(void) 801 | { 802 | int cpu; 803 | 804 | /* TODO read sysfs instead */ 805 | num_cpus = sysconf(_SC_NPROCESSORS_ONLN); 806 | if ((cpus = calloc(num_cpus, sizeof(struct cpu_info))) == NULL) { 807 | OOM(); 808 | return -1; 809 | } 810 | 811 | for (cpu = 0; cpu < num_cpus; cpu++) 812 | cpus[cpu].ci_num = cpu; 813 | 814 | return 0; 815 | } 816 | -------------------------------------------------------------------------------- /src/interface.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This program is free software; you can redistribute it and/or modify 3 | * it under the terms of the GNU General Public License version 2 4 | * as published by the Free Software Foundation 5 | * 6 | * This program is distributed in the hope that it will be useful, 
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | * GNU General Public License for more details. 10 | * 11 | * You should have received a copy of the GNU General Public License 12 | * along with this program; if not, write to the Free Software 13 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 14 | * 15 | * Holger Eitzenberger , Sophos, 2011. 16 | */ 17 | #include "irqd.h" 18 | #include "event.h" 19 | #include "cpu.h" 20 | #include "interface.h" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #define IRQ_INFO_CHIP_NAME_LEN 32 28 | #define IRQ_INFO_ACTION_LEN 64 29 | 30 | enum ProcIrqAction { 31 | PIA_NoMatch = 0, 32 | PIA_LSC, /* Link Control Status */ 33 | PIA_Rx, 34 | PIA_Tx, 35 | PIA_RxTx, 36 | }; 37 | 38 | struct irq_info { 39 | unsigned ii_irq; 40 | char ii_chip_name[IRQ_INFO_CHIP_NAME_LEN]; 41 | irq_ctr_t ii_handled[CPU_MAX]; 42 | char ii_action[IRQ_INFO_ACTION_LEN]; 43 | }; 44 | 45 | #define REBALANCE_IVAL 5 46 | 47 | /* threshold (in percent) of the softirq load, from which on 48 | a rebalance of some queues to a different CPU is scheduled */ 49 | #define REBALANCE_SI_THRESH 70 50 | 51 | static struct nl_sock *nlh; 52 | static struct nl_cache *nlcache; 53 | static struct nl_cache_mngr *mngr; 54 | static struct ev nl_ev; 55 | static struct ev rebalance_ev; 56 | static GHashTable *if_hash; 57 | 58 | static struct cpuset *if_assign_cpuset_by_name(struct interface *, 59 | const char *) __UNUSED; 60 | 61 | struct interface * 62 | if_new(const char *dev, struct cpuset *set) 63 | { 64 | struct interface *iface; 65 | 66 | if ((iface = g_new0(struct interface, 1)) == NULL) { 67 | OOM(); 68 | return NULL; 69 | } 70 | 71 | device_init(&iface->if_dev, DEV_INTERFACE); 72 | 73 | iface->if_cpuset = set; 74 | strncpy(iface->if_name, dev, IFNAMSIZ); 75 | iface->if_queues = g_new0(struct if_queue_info, QUEUE_MAX); 76 | if 
(!iface->if_queues) { 77 | g_free(iface); 78 | iface = NULL; 79 | } 80 | 81 | dbg("new interface '%s' (%p)", dev, iface); 82 | 83 | return iface; 84 | } 85 | 86 | void 87 | if_free(struct interface *iface) 88 | { 89 | if (iface) { 90 | int queue; 91 | 92 | dbg("free interface %p", iface); 93 | for (queue = 0; queue < iface->if_num_queues; queue++) 94 | BUG_ON(!cpu_bitmask_is_empty(if_queue(iface, queue)->qi_cpu_bitmask)); 95 | g_free(iface->if_queues); 96 | g_free(iface); 97 | } 98 | } 99 | 100 | int 101 | if_register(struct interface *iface) 102 | { 103 | BUG_ON(g_hash_table_lookup(if_hash, iface->if_name)); 104 | g_hash_table_insert(if_hash, strdup(iface->if_name), iface); 105 | dbg("registered interface '%s'", iface->if_name); 106 | 107 | return 0; 108 | } 109 | 110 | static void 111 | if_assign_cpuset(struct interface *iface, struct cpuset *set) 112 | { 113 | BUG_ON(iface->if_cpuset); 114 | iface->if_cpuset = set; 115 | } 116 | 117 | static struct cpuset * 118 | if_assign_cpuset_by_name(struct interface *iface, const char *name) 119 | { 120 | GSList *node; 121 | 122 | for (node = cpuset_list; node; node = node->next) { 123 | struct cpuset *set = node->data; 124 | 125 | if (!strcmp(set->cs_name, name)) { 126 | if_assign_cpuset(iface, set); 127 | return set; 128 | } 129 | } 130 | 131 | return NULL; 132 | } 133 | 134 | struct if_queue_info * 135 | if_queue(const struct interface *iface, int queue) 136 | { 137 | BUG_ON(queue < 0 || queue >= QUEUE_MAX); 138 | return &iface->if_queues[queue]; 139 | } 140 | 141 | struct if_queue_info * 142 | if_queue_by_name(const char *dev, int queue) 143 | { 144 | const struct interface *iface; 145 | 146 | if ((iface = g_hash_table_lookup(if_hash, dev)) == NULL) 147 | return NULL; 148 | 149 | return if_queue(iface, queue); 150 | } 151 | 152 | /** 153 | * if_queue_assign_range() - assign CPUs in range to queue 154 | * 155 | * Low-level function to assign CPUs to an interface queue. 
156 | */ 157 | int 158 | if_queue_assign_range(struct if_queue_info *qi, const struct range *range) 159 | { 160 | int cpu; 161 | 162 | for (cpu = range->rg_from; cpu <= range->rg_to; cpu++) { 163 | if (!cpu_bitmask_set(qi->qi_cpu_bitmask, cpu)) 164 | BUG(); 165 | if (cpu_add_queue(cpu, qi->qi_iface, qi->qi_num) < 0) 166 | return -1; 167 | } 168 | 169 | return 0; 170 | } 171 | 172 | /* 173 | * if_assign_fixed_range() - assign an unchangeable subrange 174 | * 175 | * Intended use-case is for single-queue NICs, but all queues 176 | * are pinned if there are multiple queues. 177 | * 178 | * The actual pinning happens at the time the interface comes 179 | * up. 180 | */ 181 | int 182 | if_assign_fixed_range(struct interface *iface, const struct range *range) 183 | { 184 | struct cpuset *set = iface->if_cpuset; 185 | 186 | BUG_ON(set == NULL); 187 | if (!range_in_range(&set->cs_range, range)) { 188 | dbg("range [%u,%u] within '%s' cpuset is invalid", 189 | range->rg_from, range->rg_to, set->cs_name); 190 | return -EINVAL; 191 | } 192 | 193 | BUG_ON(iface->if_fixed_range); 194 | iface->if_fixed_range = range_new(range->rg_from, range->rg_to); 195 | if (!iface->if_fixed_range) 196 | return -ENOMEM; 197 | 198 | return 0; 199 | } 200 | 201 | bool 202 | if_can_rps(const struct interface *iface) 203 | { 204 | char path[PATH_MAX], *fmt; 205 | struct stat st; 206 | 207 | fmt = id_path("/sys/class/net/%s/queues/rx-0/rps_cpus"); 208 | snprintf(path, sizeof(path), fmt, iface->if_name); 209 | g_free(fmt); 210 | if (stat(path, &st) < 0) 211 | return false; 212 | 213 | return true; 214 | } 215 | 216 | bool 217 | if_can_xps(const struct interface *iface) 218 | { 219 | char path[PATH_MAX], *fmt; 220 | struct stat st; 221 | 222 | fmt = id_path("/sys/class/net/%s/queues/tx-0/xps_cpus"); 223 | snprintf(path, sizeof(path), fmt, iface->if_name); 224 | g_free(fmt); 225 | if (stat(path, &st) < 0) 226 | return false; 227 | 228 | return true; 229 | } 230 | 231 | static enum EvReturn 232 | 
rtnl_io_cb(struct ev *ev, unsigned short what) 233 | { 234 | switch (what) { 235 | case EV_READ: 236 | nl_cache_mngr_data_ready(mngr); 237 | break; 238 | 239 | default: 240 | BUG(); 241 | } 242 | 243 | return 0; 244 | } 245 | 246 | static int 247 | write_u64_mask(const char *file, uint64_t mask) 248 | { 249 | char buf[32]; 250 | int fd, len, nwritten; 251 | 252 | if ((fd = open(file, O_WRONLY | O_CLOEXEC)) < 0) { 253 | err("%s: %m", file); 254 | return -1; 255 | } 256 | 257 | len = snprintf(buf, sizeof(buf), "%" PRIx64, mask); 258 | nwritten = write(fd, buf, len); 259 | BUG_ON(nwritten != len); 260 | 261 | close(fd); 262 | 263 | return 0; 264 | } 265 | 266 | int 267 | if_set_steering_cpus(const struct interface *iface, int queue, 268 | uint64_t rps_mask, uint64_t xps_mask) 269 | { 270 | char path[PATH_MAX]; 271 | 272 | if (g_rps_status == RPS_S_ENABLED) { 273 | snprintf(path, sizeof(path), "/sys/class/net/%s/queues/rx-%d/rps_cpus", 274 | iface->if_name, queue); 275 | write_u64_mask(path, rps_mask); 276 | } 277 | 278 | if (g_xps_status == XPS_S_ENABLED) { 279 | snprintf(path, sizeof(path), "/sys/class/net/%s/queues/tx-%d/xps_cpus", 280 | iface->if_name, queue); 281 | write_u64_mask(path, xps_mask); 282 | } 283 | 284 | return 0; 285 | } 286 | 287 | static enum ProcIrqAction 288 | parse_iface_irq_action_tail(const char *tail, int *queue) 289 | { 290 | if (*tail == '\0') 291 | return PIA_LSC; 292 | 293 | if (sscanf(tail, "-TxRx-%u", queue) == 1) 294 | return PIA_RxTx; 295 | /* some Intel e1000e */ 296 | if (sscanf(tail, "-rxtx-%u", queue) == 1) 297 | return PIA_RxTx; 298 | 299 | /* Broadcom NICs (netxen, bnx2) */ 300 | if (sscanf(tail, "[%u]", queue) == 1) 301 | return PIA_RxTx; 302 | /* Broadcom bnx2 */ 303 | if (sscanf(tail, "-%u", queue) == 1) 304 | return PIA_RxTx; 305 | 306 | /* Intel igb driver */ 307 | if (sscanf(tail, "-rx-%u", queue) == 1) 308 | return PIA_Rx; 309 | if (sscanf(tail, "-tx-%u", queue) == 1) 310 | return PIA_Tx; 311 | 312 | /* Qualcomm Atheros 
(alx) */ 313 | if (sscanf(tail, "-TR-%u", queue) == 1) 314 | return PIA_RxTx; 315 | 316 | return PIA_NoMatch; 317 | } 318 | 319 | static enum ProcIrqAction 320 | parse_iface_irq_action(struct interface *iface, const char *action, 321 | int *queue) 322 | { 323 | const int len = strlen(iface->if_name); 324 | 325 | if (strncmp(iface->if_name, action, len)) 326 | return PIA_NoMatch; 327 | 328 | *queue = 0; 329 | 330 | return parse_iface_irq_action_tail(action + len, queue); 331 | } 332 | 333 | static struct if_queue_info * 334 | if_add_queue(struct interface *iface, int queue, int rx_irq, int tx_irq) 335 | { 336 | struct if_queue_info *qi = if_queue(iface, queue); 337 | 338 | if (!qi->qi_cpu_bitmask) { 339 | struct cpuset *set = iface->if_cpuset; 340 | 341 | if ((qi->qi_cpu_bitmask = cpu_bitmask_new(set)) == NULL) 342 | return NULL; 343 | } 344 | qi->qi_num = queue; 345 | qi->qi_iface = iface; 346 | if (rx_irq > 0) 347 | qi->qi_rx_irq = rx_irq; 348 | if (tx_irq > 0) 349 | qi->qi_tx_irq = tx_irq; 350 | 351 | iface->if_num_queues = max(iface->if_num_queues, queue + 1); 352 | 353 | return qi; 354 | } 355 | 356 | /** 357 | * queues_from_interrupts() - parse /proc/interrupts for for NIC 358 | * 359 | * See documentation for if_queue_info for the different cases 360 | * to consider. 
361 | * 362 | * @return 0: ok, -1 on error 363 | */ 364 | static int 365 | queues_from_interrupts(struct interface *iface, size_t qi_len) 366 | { 367 | FILE *fp; 368 | char *line = NULL; 369 | size_t line_len; 370 | int num_cpus = sysconf(_SC_NPROCESSORS_ONLN); 371 | int lineno = 0; 372 | 373 | BUG_ON(g_rps_status == RPS_S_NEED_CHECK); 374 | iface->if_num_queues = 0; 375 | 376 | if ((fp = id_fopen("/proc/interrupts", "r")) == NULL) 377 | goto err_free; 378 | 379 | getline(&line, &line_len, fp); 380 | lineno++; 381 | next_line: 382 | while (!feof(fp)) { 383 | struct if_queue_info *qi = NULL; 384 | char *pch, *tok, *end, *saveptr; 385 | int i, irq, devs = 0; 386 | enum ProcIrqAction pia; 387 | 388 | if (getline(&line, &line_len, fp) == EOF) 389 | break; 390 | lineno++; 391 | pch = g_strstrip(line); 392 | 393 | tok = strtok_r(pch, " \t", &saveptr); 394 | irq = strtoul(tok, &end, 0); 395 | if (*end != ':') 396 | continue; /* not an IRQ line */ 397 | if ((pch = strstr(saveptr, iface->if_name)) == NULL) 398 | continue; /* not a NIC IRQ line */ 399 | 400 | for (i = 0; i < num_cpus; i++) 401 | if ((tok = strtok_r(NULL, " \t", &saveptr)) == NULL) 402 | goto next_line; 403 | 404 | /* chip */ 405 | if ((tok = strtok_r(NULL, " \t", &saveptr)) == NULL) 406 | continue; 407 | 408 | /* action */ 409 | do { 410 | int queue = 0; 411 | 412 | if ((tok = strtok_r(NULL, " \t,", &saveptr)) == NULL) 413 | break; 414 | 415 | pia = parse_iface_irq_action(iface, tok, &queue); 416 | switch (pia) { 417 | case PIA_LSC: 418 | /* this may not be just LSC if both rx_irq and tx_irq 419 | are zero */ 420 | iface->if_irq = irq; 421 | qi = if_add_queue(iface, queue, -1, -1); 422 | break; 423 | 424 | case PIA_Rx: 425 | qi = if_add_queue(iface, queue, irq, -1); 426 | break; 427 | 428 | case PIA_Tx: 429 | qi = if_add_queue(iface, queue, -1, irq); 430 | break; 431 | 432 | case PIA_RxTx: 433 | qi = if_add_queue(iface, queue, irq, irq); 434 | break; 435 | 436 | case PIA_NoMatch: 437 | log("interrupts: 
failed to parse '%s'. Please report.", tok); 438 | qi = NULL; 439 | break; 440 | } 441 | 442 | devs++; 443 | } while (1); 444 | 445 | if (pia == PIA_LSC && devs > 1) 446 | iface->if_flags |= IF_F_SHARED_IRQ; 447 | 448 | if (qi && verbose > 1) 449 | log("%s: irqs: LSC=%d RX=%d TX=%d\n", iface->if_name, 450 | iface->if_irq, qi->qi_rx_irq, qi->qi_tx_irq); 451 | } 452 | 453 | free(line); 454 | /* if (ferror(fp)) ... */ 455 | fclose(fp); 456 | 457 | return 0; 458 | 459 | err_free: 460 | g_free(line); 461 | if (fp) 462 | fclose(fp); 463 | 464 | return -1; 465 | } 466 | 467 | int 468 | queue_set_affinity(const struct if_queue_info *qi, uint64_t cpumask) 469 | { 470 | const struct interface *iface = qi->qi_iface; 471 | 472 | if (qi->qi_rx_irq > 0) { 473 | irq_set_affinity(qi->qi_rx_irq, cpumask); 474 | 475 | if (qi->qi_tx_irq > 0 && qi->qi_tx_irq != qi->qi_rx_irq) 476 | irq_set_affinity(qi->qi_tx_irq, cpumask); 477 | } 478 | 479 | /* virtual interfaces (lo, tun, ...) don't have an IRQ */ 480 | if (iface->if_irq > 0) 481 | irq_set_affinity(iface->if_irq, cpumask); 482 | 483 | return 0; 484 | } 485 | 486 | /** 487 | * if_assign_cpus() - assign CPUs to an interface 488 | * 489 | * High level function to assign CPUs to all queues of an interface. 490 | * It does so by hooking into the strategy handlers. 491 | * 492 | * Number of queues has to be determined earlier. 493 | */ 494 | int 495 | if_assign_cpus(struct interface *iface) 496 | { 497 | int i; 498 | 499 | BUG_ON(iface->if_num_queues == 0); 500 | 501 | for (i = 0; i < iface->if_num_queues; i++) 502 | cpuset_balance_queue(iface->if_cpuset, iface, i); 503 | 504 | return 0; 505 | } 506 | 507 | static int 508 | if_on_up(struct interface *iface, const char *dev) 509 | { 510 | if (g_rps_status == RPS_S_NEED_CHECK) { 511 | g_rps_status = if_can_rps(iface) ? RPS_S_ENABLED : RPS_S_DISABLED; 512 | g_xps_status = if_can_xps(iface) ? 
XPS_S_ENABLED : XPS_S_DISABLED; 513 | 514 | log("RPS %s, XPS %s", 515 | g_rps_status == RPS_S_ENABLED ? "enabled" : "disabled", 516 | g_xps_status == XPS_S_ENABLED ? "enabled" : "disabled"); 517 | } 518 | 519 | if (queues_from_interrupts(iface, QUEUE_MAX) < 0) 520 | return -1; 521 | if (iface->if_num_queues == 0) 522 | if_add_queue(iface, 0, -1, -1); /* lo, tun, etc. */ 523 | 524 | log("%s: detected %d queue(s), '%s' cpuset", iface->if_name, 525 | iface->if_num_queues, iface->if_cpuset->cs_name); 526 | 527 | if_assign_cpus(iface); 528 | 529 | log("%s: up", iface->if_name); 530 | 531 | return 0; 532 | } 533 | 534 | /** 535 | * if_remove_cpus() - remove an interface from all CPUs 536 | * 537 | * Low-level function to remove all queues of an interface from 538 | * all the currently used CPUs. 539 | */ 540 | int 541 | if_remove_cpus(struct interface *iface) 542 | { 543 | struct cpuset *set = iface->if_cpuset; 544 | int queue; 545 | 546 | for (queue = 0; queue < iface->if_num_queues; queue++) { 547 | struct if_queue_info *qi = if_queue(iface, queue); 548 | int cpu; 549 | 550 | for (cpu = set->cs_range.rg_from; cpu <= cpuset_last_cpu(set); cpu++) 551 | if (cpu_bitmask_clear(qi->qi_cpu_bitmask, cpu)) 552 | cpu_del_queue(cpu, qi); 553 | } 554 | 555 | return 0; 556 | } 557 | 558 | static int 559 | if_on_down(struct interface *iface, const char *dev) 560 | { 561 | struct cpuset *set = iface->if_cpuset; 562 | 563 | if_remove_cpus(iface); 564 | cpuset_interface_down(set, iface); 565 | 566 | log("%s: down", iface->if_name); 567 | 568 | return 0; 569 | } 570 | 571 | static int 572 | rtnl_balance_link(struct rtnl_link *lnk) 573 | { 574 | struct interface *iface; 575 | const char *dev; 576 | int flags; 577 | bool change = false; 578 | 579 | if ((dev = rtnl_link_get_name(lnk)) == NULL) 580 | return 0; 581 | 582 | if ((iface = g_hash_table_lookup(if_hash, dev)) == NULL) { 583 | if (g_cpuset_auto_assign) { 584 | if ((iface = if_new(dev, g_cpuset_auto_assign)) == NULL) 585 | return 
-1; 586 | cpuset_add_device(g_cpuset_auto_assign, if_to_dev(iface)); 587 | if_register(iface); 588 | } else { 589 | log("%s: ignored by configuration", dev); 590 | return 0; 591 | } 592 | } 593 | 594 | flags = rtnl_link_get_flags(lnk); 595 | if ((iface->if_flags & IFF_UP) == 0 && (flags & IFF_UP)) { 596 | if (if_on_up(iface, dev) < 0) 597 | goto err; 598 | change = true; 599 | } else if ((iface->if_flags & IFF_UP) && (flags & IFF_UP) == 0) { 600 | if (if_on_down(iface, dev) < 0) 601 | goto err; 602 | change = true; 603 | } 604 | 605 | iface->if_flags = flags; 606 | 607 | if (change) 608 | cpu_dump_map(); 609 | 610 | return 0; 611 | 612 | err: 613 | return -1; 614 | } 615 | 616 | /** 617 | * @return 1: IRQ line, 0: no IRQ line, <1 on error 618 | */ 619 | static int 620 | read_irq_info(char *line, struct irq_info *ii) 621 | { 622 | char *pch, *tok, *end, *saveptr; 623 | int cpu; 624 | 625 | pch = g_strstrip(line); 626 | 627 | /* 628 | * EXAMPLES 629 | * 630 | * 11: 24 XT-PIC-XT eth2, eth7 631 | * 46: 2 0 2 0 0 0 2 2 PCI-MSI-edge eth11-TxRx-1 632 | */ 633 | tok = strtok_r(pch, " \t", &saveptr); 634 | ii->ii_irq = strtoul(tok, &end, 0); 635 | if (*end != ':') 636 | return 0; /* not an IRQ line */ 637 | 638 | for (cpu = 0; cpu < cpu_count(); cpu++) { 639 | if ((tok = strtok_r(NULL, " \t", &saveptr)) == NULL) 640 | return -EINVAL; 641 | 642 | ii->ii_handled[cpu] = strtoul(tok, NULL, 10); 643 | } 644 | 645 | tok = strtok_r(NULL, " \t", &saveptr); 646 | BUG_ON(!tok); 647 | xstrncpy(ii->ii_chip_name, tok, IRQ_INFO_CHIP_NAME_LEN); 648 | 649 | tok = g_strchug(saveptr); 650 | xstrncpy(ii->ii_action, tok ? 
tok : "", IRQ_INFO_ACTION_LEN); 651 | 652 | return 1; 653 | } 654 | 655 | static int 656 | read_net_device_stats(void) 657 | { 658 | char *line = NULL; 659 | size_t line_len; 660 | FILE *fp; 661 | int ret; 662 | 663 | if ((fp = id_fopen("/proc/net/dev", "r")) == NULL) 664 | BUG(); 665 | 666 | getline(&line, &line_len, fp); 667 | getline(&line, &line_len, fp); 668 | while (!feof(fp)) { 669 | struct if_net_device_stats nds; 670 | struct interface *iface; 671 | char *name, *saveptr; 672 | 673 | if (getline(&line, &line_len, fp) == EOF) 674 | break; 675 | 676 | if ((name = strtok_r(line, ": ", &saveptr)) == NULL) 677 | continue; 678 | BUG_ON(strlen(name) > IFNAMSIZ); 679 | 680 | #define __S "%" PRIx64 681 | ret = sscanf(saveptr, __S " " __S " " __S " " __S " "__S " "__S " " 682 | __S " " __S " " __S " " __S " " __S " " __S " " __S " " 683 | __S " " __S " " __S, 684 | #undef __S 685 | /* RX */ 686 | &nds.rx_bytes, &nds.rx_packets, 687 | &nds.rx_errors, &nds.rx_dropped, 688 | &nds.rx_fifo_errors, &nds.rx_frame_errors, 689 | &nds.rx_compressed, &nds.rx_mcast, 690 | /* TX */ 691 | &nds.tx_bytes, &nds.tx_packets, 692 | &nds.tx_errors, &nds.tx_dropped, 693 | &nds.tx_fifo_errors, &nds.tx_collisions, 694 | &nds.tx_carrier_errors, &nds.tx_compressed); 695 | if (ret != 16) 696 | continue; 697 | if ((iface = g_hash_table_lookup(if_hash, name)) == NULL) 698 | continue; /* not UP or not interested */ 699 | 700 | memcpy(&iface->if_stats[OLD], &iface->if_stats[NEW], 701 | sizeof(iface->if_stats[OLD])); 702 | memcpy(&iface->if_stats[NEW], &nds, sizeof(iface->if_stats[NEW])); 703 | } 704 | 705 | g_free(line); 706 | fclose(fp); 707 | 708 | return 0; 709 | } 710 | 711 | static void 712 | queue_update_irqs(struct if_queue_info *qi, const struct irq_info *ii) 713 | { 714 | int cpu; 715 | 716 | memcpy(qi->qi_irq_stats[OLD], qi->qi_irq_stats[NEW], 717 | cpu_count() * sizeof(unsigned)); 718 | memcpy(qi->qi_irq_stats[NEW], ii->ii_handled, 719 | cpu_count() * sizeof(unsigned)); 720 | for (cpu = 
0; cpu < cpu_count(); cpu++) 721 | if (qi->qi_irq_stats[OLD][cpu] > qi->qi_irq_stats[NEW][cpu]) 722 | qi->qi_irq_stats[OLD][cpu] = 0U; 723 | 724 | #ifdef DEBUG 725 | { 726 | char buf[128], *pch = buf, *end = buf + 128; 727 | 728 | for (cpu = 0; cpu < cpu_count(); cpu++) 729 | pch += snprintf(pch, end - pch, "%d:%d ", 730 | cpu, qi->qi_irq_stats[NEW][cpu]); 731 | buf[127] = '\0'; 732 | dbg("irqs: %s:%d: %s", qi->qi_iface->if_name, qi->qi_num, buf); 733 | } 734 | #endif 735 | } 736 | 737 | static void 738 | irq_update_stats(const char *action, const struct irq_info *ii) 739 | { 740 | struct if_queue_info *qi = NULL; 741 | enum ProcIrqAction pia; 742 | const char *tail; 743 | int queue = 0; 744 | 745 | if ((tail = strpbrk(action, "-[")) != NULL) { 746 | pia = parse_iface_irq_action_tail(tail, &queue); 747 | switch (pia) { 748 | break; 749 | 750 | case PIA_Rx: 751 | case PIA_Tx: 752 | case PIA_RxTx: 753 | qi = if_queue_by_name(action, queue); 754 | break; 755 | 756 | case PIA_LSC: 757 | /* can't happen */ 758 | case PIA_NoMatch: 759 | break; 760 | } 761 | } else 762 | qi = if_queue_by_name(action, queue); 763 | 764 | if (qi) 765 | queue_update_irqs(qi, ii); 766 | } 767 | 768 | static int 769 | read_irq_stats(void) 770 | { 771 | char *line = NULL; 772 | size_t line_len; 773 | FILE *fp; 774 | 775 | if ((fp = id_fopen("/proc/interrupts", "r")) == NULL) 776 | goto err; 777 | 778 | getline(&line, &line_len, fp); 779 | while (!feof(fp)) { 780 | struct irq_info ii; 781 | char *tok, *saveptr; 782 | int ret; 783 | 784 | if (getline(&line, &line_len, fp) == EOF) 785 | break; 786 | if ((ret = read_irq_info(line, &ii)) < 0) 787 | goto err; 788 | else if (ret == 0) 789 | continue; 790 | 791 | tok = strtok_r(ii.ii_action, " ,\t", &saveptr); 792 | while (tok) { 793 | irq_update_stats(tok, &ii); 794 | 795 | tok = strtok_r(NULL, " ,\t", &saveptr); 796 | } 797 | } 798 | 799 | fclose(fp); 800 | g_free(line); 801 | 802 | return 0; 803 | 804 | err: 805 | if (fp) 806 | fclose(fp); 807 | 
g_free(line); 808 | return -1; 809 | } 810 | 811 | static enum EvReturn 812 | rebalance_cb(struct ev *ev, unsigned short what) 813 | { 814 | static int turn; 815 | uint64_t exp; 816 | int nread, cpu; 817 | 818 | BUG_ON(what != EV_READ); 819 | if ((nread = read(ev->fd, &exp, sizeof(exp))) < 0) { 820 | if (errno == EAGAIN) 821 | return EvOk; 822 | err("read: %m"); 823 | return EvStop; 824 | } 825 | BUG_ON(nread != sizeof(exp)); 826 | 827 | cpu_read_stat(); 828 | read_net_device_stats(); 829 | read_irq_stats(); 830 | 831 | if (turn++ == 0) 832 | return EvOk; 833 | 834 | cpu_do_stat(); 835 | 836 | for (cpu = 0; cpu < cpu_count(); cpu++) { 837 | struct cpu_info *ci = cpu_nth(cpu); 838 | struct cpuset *set = ci->ci_cpuset; 839 | 840 | #if 0 841 | log("cpu%d: dropped:%u,%u time_squeeze:%u,%u", cpu, 842 | ci->ci_ss[OLD].dropped, ci->ci_ss[NEW].dropped, 843 | ci->ci_ss[OLD].time_squeeze, ci->ci_ss[NEW].time_squeeze); 844 | #endif /* 0 */ 845 | 846 | /* Not all CPUs are part of a cpuset */ 847 | if ((ci->ci_si_load > REBALANCE_SI_THRESH 848 | || CPU_SS_DIFF(ci, dropped) > 0) && set != NULL) 849 | cpuset_cpu_busy(set, ci); 850 | } 851 | 852 | return EvOk; 853 | } 854 | 855 | static int 856 | rebalance_init(void) 857 | { 858 | struct itimerspec its = { 859 | .it_interval = { .tv_sec = REBALANCE_IVAL, }, 860 | .it_value = { .tv_sec = REBALANCE_IVAL, }, 861 | }; 862 | struct timespec now; 863 | int fd; 864 | 865 | if (clock_gettime(CLOCK_REALTIME, &now) < 0) { 866 | err("clock_gettime: %m"); 867 | return -1; 868 | } 869 | 870 | fd = timerfd_create(CLOCK_REALTIME, TFD_NONBLOCK | TFD_CLOEXEC); 871 | if (fd < 0) { 872 | err("timerfd_create: %m"); 873 | return -1; 874 | } 875 | 876 | its.it_value.tv_sec += now.tv_sec; 877 | if (timerfd_settime(fd, TFD_TIMER_ABSTIME, &its, NULL) < 0) { 878 | err("timerfd_settime: %m"); 879 | return -1; 880 | } 881 | 882 | ev_set(&rebalance_ev, fd, NULL, NULL); 883 | rebalance_ev.cb_read = rebalance_cb; 884 | ev_add(&rebalance_ev, EV_READ); 885 | 886 
| log("rebalance started (every %d sec)", REBALANCE_IVAL); 887 | 888 | return 0; 889 | } 890 | 891 | static void 892 | rtnl_interface_cb(struct nl_object *obj, void *arg) 893 | { 894 | char buf[128]; 895 | struct nl_dump_params dp = { 896 | .dp_type = NL_DUMP_LINE, 897 | .dp_buf = buf, 898 | .dp_buflen = sizeof(buf), 899 | }; 900 | 901 | buf[0] = '\0'; 902 | nl_object_dump(obj, &dp); 903 | log("%s", buf); 904 | 905 | rtnl_balance_link((struct rtnl_link *)obj); 906 | } 907 | 908 | static void 909 | rtnl_change_cb(struct nl_cache *cache, struct nl_object *obj, int action, 910 | void *arg) 911 | { 912 | char buf[128]; 913 | struct nl_dump_params dp = { 914 | .dp_type = NL_DUMP_LINE, 915 | .dp_buf = buf, 916 | .dp_buflen = sizeof(buf), 917 | }; 918 | 919 | buf[0] = '\0'; 920 | nl_object_dump(obj, &dp); 921 | log("%s", buf); 922 | 923 | rtnl_balance_link((struct rtnl_link *)obj); 924 | } 925 | 926 | int 927 | if_init(void) 928 | { 929 | if_hash = g_hash_table_new_full(g_str_hash, g_str_equal, free, NULL); 930 | if (!if_hash) { 931 | OOM(); 932 | return -1; 933 | } 934 | 935 | return 0; 936 | } 937 | 938 | int 939 | if_rtnl_init(void) 940 | { 941 | int ret; 942 | 943 | BUG_ON(!cpu_count() || !config_is_read); 944 | if ((nlh = nl_socket_alloc()) == NULL) { 945 | err("unable to allocate netlink handle"); 946 | return -1; 947 | } 948 | 949 | nl_socket_disable_seq_check(nlh); 950 | 951 | ret = nl_cache_mngr_alloc(nlh, NETLINK_ROUTE, NL_AUTO_PROVIDE, &mngr); 952 | if (ret < 0) { 953 | err("%s\n", nl_geterror(ret)); 954 | return -1; 955 | } 956 | 957 | ret = nl_cache_mngr_add(mngr, "route/link", rtnl_change_cb, NULL, 958 | &nlcache); 959 | if (ret < 0) { 960 | err("%s\n", nl_geterror(ret)); 961 | return -1; 962 | } 963 | 964 | ev_set(&nl_ev, nl_cache_mngr_get_fd(mngr), NULL, mngr); 965 | nl_ev.cb_read = rtnl_io_cb; 966 | ev_add(&nl_ev, EV_READ); 967 | log("getting interface notifications"); 968 | 969 | nl_cache_foreach(nlcache, rtnl_interface_cb, NULL); 970 | 971 | return 
rebalance_init(); 972 | } 973 | 974 | void 975 | if_fini(void) 976 | { 977 | nl_cache_mngr_free(mngr); 978 | } 979 | --------------------------------------------------------------------------------