├── CPU-experiment ├── aessample.c ├── aessampletiming.cpp ├── cpu_enc ├── cpu_enc.c ├── do_rdtsc.s ├── iaes_asm_interface.h ├── iaesni.h ├── iaesx64.s ├── intel_aes.c ├── mk.sh ├── rtp.pkt ├── sha1-fast-64.S ├── sha1-fast.c ├── sha1-naive.c ├── sha1 │ ├── sha1-fast-64.S │ ├── sha1-fast.c │ ├── sha1-naive.c │ └── sha1test.c └── yasm ├── IOEngine ├── driver │ ├── Makefile │ ├── affinity.py │ ├── install.py │ ├── ixgbe.h │ ├── ixgbe_82598.c │ ├── ixgbe_82599.c │ ├── ixgbe_api.c │ ├── ixgbe_api.h │ ├── ixgbe_common.c │ ├── ixgbe_common.h │ ├── ixgbe_dcb.c │ ├── ixgbe_dcb.h │ ├── ixgbe_dcb_82598.c │ ├── ixgbe_dcb_82598.h │ ├── ixgbe_dcb_82599.c │ ├── ixgbe_dcb_82599.h │ ├── ixgbe_dcb_nl.c │ ├── ixgbe_ethtool.c │ ├── ixgbe_fcoe.c │ ├── ixgbe_fcoe.h │ ├── ixgbe_main.c │ ├── ixgbe_osdep.h │ ├── ixgbe_param.c │ ├── ixgbe_phy.c │ ├── ixgbe_phy.h │ ├── ixgbe_sysfs.c │ ├── ixgbe_type.h │ ├── kcompat.c │ ├── kcompat.h │ └── kcompat_ethtool.c ├── include │ └── psio.h ├── lib │ ├── Makefile │ └── psio.c └── samples │ ├── echo │ ├── Makefile │ └── echo.c │ ├── list_devices │ ├── Makefile │ └── list_devices.c │ ├── monitoring │ └── thruput.py │ ├── packet_generator │ ├── Makefile │ ├── packet_generator.c │ └── pspgen.c │ ├── rxdump │ ├── Makefile │ └── rxdump.c │ └── tx │ ├── 2pkt1con.pcap │ ├── Makefile │ ├── pkt_buff.c │ ├── pkt_buff.h │ ├── rtp1.pcap │ └── tx.c ├── README.md ├── latency-experiment ├── 10g_experiment │ ├── minus.py │ ├── result.py │ ├── sort.py │ └── static.py ├── 25ms_experiment │ ├── result.py │ ├── sort.py │ └── static.py ├── rxdump │ ├── Makefile │ ├── packet.txt │ └── rxdump.c └── tx │ ├── Makefile │ ├── dpkt.py │ ├── packet.txt │ ├── pkt_buff.c │ ├── pkt_buff.h │ ├── rtp1.pcap │ └── tx.c ├── libgpucrypto ├── Makefile ├── Makefile.dep ├── README ├── aes.cu ├── aes_core.h ├── co_aes_sha1.cu ├── crypto_kernel.h ├── crypto_mem.c ├── crypto_mem.h ├── crypto_size.h ├── libgpucrypto.h ├── sha1.cu ├── sha1.h └── test │ ├── Makefile │ ├── README │ ├── perf.py │ ├── 
rtp.pkt │ └── workright.c ├── src-20G ├── Makefile ├── include │ ├── memcpy.h │ ├── upro_batch.h │ ├── upro_collector.h │ ├── upro_config.h │ ├── upro_context.h │ ├── upro_forwarder.h │ ├── upro_gpu_worker.h │ ├── upro_job.h │ ├── upro_log.h │ ├── upro_macros.h │ ├── upro_memory.h │ ├── upro_timer.h │ └── upro_transworker.h ├── memcpy.c ├── memcpy_sse.c ├── upro.c ├── upro_collector.c ├── upro_forwarder.c ├── upro_gpu_worker.c ├── upro_log.c ├── upro_memory.c └── upro_timer.c └── src ├── Makefile ├── README ├── include ├── upro_batch.h ├── upro_collector.h ├── upro_config.h ├── upro_context.h ├── upro_forwarder.h ├── upro_gpu_worker.h ├── upro_job.h ├── upro_log.h ├── upro_macros.h ├── upro_memory.h ├── upro_timer.h └── upro_transworker.h ├── upro.c ├── upro_collector.c ├── upro_forwarder.c ├── upro_gpu_worker.c ├── upro_log.c ├── upro_memory.c ├── upro_timer.c └── upro_transworker.c /CPU-experiment/cpu_enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kay21s/UPro/c95585a0721325a6855233c97b6a32cdb98763fa/CPU-experiment/cpu_enc -------------------------------------------------------------------------------- /CPU-experiment/cpu_enc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #define NUM_FLOWS 4096 9 | #define THREADS_PER_BLK 128 10 | #define MEMORY_ALIGNMENT 4096 11 | #define BLOCK_SIZE 16 12 | #define HMAC_KEY_SIZE 20 13 | #define HMAC_TAG_SIZE 10 14 | #define ALIGN_UP(x,size) ( ((size_t)x+(size-1))&(~(size-1)) ) 15 | #define TRAN_NONE 1 16 | 17 | void sha1_hash(uint8_t *message, uint32_t len, uint32_t *hash); 18 | extern void sha1_compress(uint32_t *state, uint32_t *block); 19 | 20 | unsigned char test_key_128[16] = { 0x2b,0x7e,0x15,0x16,0x28,0xae,0xd2,0xa6,0xab,0xf7,0x15,0x88,0x09,0xcf,0x4f,0x3c}; 21 | unsigned char test_init_counter[16] = { 
/*
 * Reverses the byte order of a 64-bit value (byte 0 <-> byte 7,
 * byte 1 <-> byte 6, byte 2 <-> byte 5, byte 3 <-> byte 4).
 *
 * v: value to convert.
 * Returns v with its eight bytes in the opposite order; applying the function
 * twice yields the original value.
 *
 * Each byte at position p moves to position 7 - p, so the shift distances are
 * 56, 40, 24 and 8 bits.
 */
uint64_t swap64(uint64_t v)
{
	return	((v & UINT64_C(0x00000000000000ff)) << 56) |
		((v & UINT64_C(0x000000000000ff00)) << 40) |
		((v & UINT64_C(0x0000000000ff0000)) << 24) |
		((v & UINT64_C(0x00000000ff000000)) << 8)  |
		((v & UINT64_C(0x000000ff00000000)) >> 8)  |
		((v & UINT64_C(0x0000ff0000000000)) >> 24) |
		((v & UINT64_C(0x00ff000000000000)) >> 40) |
		((v & UINT64_C(0xff00000000000000)) >> 56);
}
| in = host_in + i * pad_size; 84 | memcpy(in, rtp_pkt, fsize * sizeof(uint8_t)); 85 | 86 | *(uint8_t *)(in + pad_size - 9) = 1 << 7; 87 | *(uint64_t *)(in + pad_size - 8) = len64; 88 | } 89 | 90 | gettimeofday(&start,NULL); 91 | 92 | gettimeofday(&start_aes,NULL); 93 | 94 | for (i = 0; i < NUM_FLOWS; i ++) 95 | { 96 | //i = 0; 97 | 98 | in = host_in + i * pad_size; 99 | uint8_t cc = in[0] & 0x0F; /* Get the number of CSRC identifiers */ 100 | uint16_t header_len = 12 + 4 * cc; /* Get the total header length */ 101 | in = in + header_len; /* Get to the payload */ 102 | int numBlocks, len = fsize - header_len; 103 | 104 | 105 | if (fsize & 0x0f == 0) 106 | numBlocks = fsize >> 4; 107 | else 108 | numBlocks = (fsize >> 4) + 1; 109 | 110 | //printf("header_len = %d, in = %p, numBlocks = %d\n", header_len, in, numBlocks); 111 | 112 | intel_AES_encdec128_CTR(in, in, test_key_128, numBlocks, testCounter); 113 | 114 | if (fsize & 0x0f != 0) { 115 | in = in + fsize; 116 | for (j = 0; j < numBlocks * 16 - fsize; j ++) { 117 | in[j] = 0; 118 | } 119 | } 120 | 121 | } 122 | 123 | gettimeofday(&end_aes,NULL); 124 | 125 | gettimeofday(&start_sha,NULL); 126 | in = host_in; 127 | 128 | unsigned long ji, mm = NUM_FLOWS * sha1_size / 64; 129 | printf("mm = %ld\n", mm); 130 | for (ji = 0; ji < mm ; ji++) 131 | sha1_compress(hash, (uint32_t *)in); 132 | 133 | #if 0 134 | const unsigned long N = 10000000; 135 | unsigned long ji; 136 | uint32_t state[5]; 137 | uint32_t block[16]; 138 | for (ji = 0; ji < N; ji++) 139 | sha1_compress(hash, in); 140 | #endif 141 | #if 0 142 | for (i = 0; i < NUM_FLOWS; i ++) 143 | { 144 | /* Sha1 comeon */ 145 | in = host_in + i * pad_size; 146 | sha1_hash(in, sha1_size, hash); 147 | memcpy(in + sha1_size, hash, HMAC_TAG_SIZE); 148 | } 149 | #endif 150 | gettimeofday(&end_sha,NULL); 151 | 152 | gettimeofday(&end,NULL); 153 | 154 | 155 | diff = 1000000 * (end.tv_sec-start.tv_sec)+ end.tv_usec-start.tv_usec; 156 | printf("Speed is %ld Mbps\n", ((fsize * 8) 
* NUM_FLOWS) / diff); 157 | diff = 1000000 * (end_aes.tv_sec-start_aes.tv_sec)+ end_aes.tv_usec-start_aes.tv_usec; 158 | printf("AES Speed is %ld Mbps\n", (((fsize - 12) * 8) * NUM_FLOWS) / diff); 159 | diff = 1000000 * (end_sha.tv_sec-start_sha.tv_sec)+ end_sha.tv_usec-start_sha.tv_usec; 160 | printf("SHA1 Speed is %ld Mbps\n", ((sha1_size * 8) * NUM_FLOWS) / diff); 161 | //printf("SHA1 Speed is %ld Mbps\n", (N * 64 * 8) / diff); 162 | 163 | 164 | ////////////////////////////////////// 165 | /* 166 | uint32_t state[5]; 167 | uint32_t block[16]; 168 | const unsigned long N = 10000000; 169 | 170 | gettimeofday(&start_sha,NULL); 171 | for (i = 0; i < N; i++) 172 | sha1_compress(state, block); 173 | gettimeofday(&end_sha,NULL); 174 | diff = 1000000 * (end_sha.tv_sec-start_sha.tv_sec)+ end_sha.tv_usec-start_sha.tv_usec; 175 | printf("SHA1 Speed is %ld Mbps\n", (N * 64 * 8) / diff); 176 | */ 177 | return 0; 178 | } 179 | 180 | void sha1_hash(uint8_t *message, uint32_t len, uint32_t *hash) 181 | { 182 | int i; 183 | 184 | hash[0] = 0x67452301; 185 | hash[1] = 0xEFCDAB89; 186 | hash[2] = 0x98BADCFE; 187 | hash[3] = 0x10325476; 188 | hash[4] = 0xC3D2E1F0; 189 | 190 | for (i = 0; i + 64 <= len; i += 64) 191 | sha1_compress(hash, (uint32_t*)(message + i)); 192 | 193 | 194 | assert(len - i == 0); 195 | return; 196 | } 197 | -------------------------------------------------------------------------------- /CPU-experiment/do_rdtsc.s: -------------------------------------------------------------------------------- 1 | [bits 64] 2 | [CPU intelnop] 3 | 4 | ; Copyright (c) 2010, Intel Corporation 5 | ; All rights reserved. 6 | ; 7 | ; Redistribution and use in source and binary forms, with or without 8 | ; modification, are permitted provided that the following conditions are met: 9 | ; 10 | ; * Redistributions of source code must retain the above copyright notice, 11 | ; this list of conditions and the following disclaimer. 
; do_rdtsc: reads the processor time-stamp counter.
; Takes no arguments and leaves the combined 64-bit counter value in rax.
align 16
global do_rdtsc
do_rdtsc:

	rdtsc			; counter is delivered split across edx (high half) and eax (low half)
	shl rdx, 32		; move the high half up to bits 63..32
	or rax, rdx		; merge both halves: rax now holds the full 64-bit value
	ret 0
10 | * * Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * * Neither the name of Intel Corporation nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 21 | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 25 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 26 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef _INTEL_AES_ASM_INTERFACE_H__
#define _INTEL_AES_ASM_INTERFACE_H__


#include "iaesni.h"



// Structure used to pass AES processing data to the asm-level functions.
// Every iEnc*/iDec* routine declared below takes a pointer to one of these.
typedef struct _sAesData
{
	_AES_IN		UCHAR	*in_block;	// input data
	_AES_OUT	UCHAR	*out_block;	// output data
	_AES_IN		UCHAR	*expanded_key;	// round keys, as produced by the i*ExpandKey* routines below
	_AES_INOUT	UCHAR	*iv;		// for CBC mode
						// NOTE(review): presumably also carries the counter block for
						// the *_CTR routines; confirm against intel_aes.c
	_AES_IN		size_t	num_blocks;	// block count; NOTE(review): presumably 16-byte blocks, as documented in iaesni.h
} sAesData;

#if (__cplusplus)
extern "C"
{
#endif
// MYSTDCALL expands to nothing: the __stdcall variant is disabled by the "#if 0".
#if 0
#define MYSTDCALL __stdcall
#else
#define MYSTDCALL 
#endif

// On Linux builds where __LP64__ is not defined, every asm entry point is
// referenced through a name with a leading underscore.
// NOTE(review): presumably this matches the symbol names exported by the
// 32-bit asm objects; confirm against the 32-bit asm sources.
#ifdef __linux__
#ifndef __LP64__
#define iEncExpandKey256 _iEncExpandKey256
#define iEncExpandKey192 _iEncExpandKey192
#define iEncExpandKey128 _iEncExpandKey128
#define iDecExpandKey256 _iDecExpandKey256
#define iDecExpandKey192 _iDecExpandKey192
#define iDecExpandKey128 _iDecExpandKey128
#define iEnc128 _iEnc128
#define iDec128 _iDec128
#define iEnc256 _iEnc256
#define iDec256 _iDec256
#define iEnc192 _iEnc192
#define iDec192 _iDec192
#define iEnc128_CBC _iEnc128_CBC
#define iDec128_CBC _iDec128_CBC
#define iEnc256_CBC _iEnc256_CBC
#define iDec256_CBC _iDec256_CBC
#define iEnc192_CBC _iEnc192_CBC
#define iDec192_CBC _iDec192_CBC
#define iEnc128_CTR _iEnc128_CTR
#define iEnc192_CTR _iEnc192_CTR
#define iEnc256_CTR _iEnc256_CTR
#define do_rdtsc _do_rdtsc
#endif
#endif
// preparing the different key rounds, for enc/dec in asm
// expanded key should be 16-byte aligned
// expanded key should have enough space to hold all key rounds (16 bytes per round) - 256 bytes would cover all cases (AES256 has 14 rounds + 1 xor)
void MYSTDCALL iEncExpandKey256(_AES_IN UCHAR *key, _AES_OUT UCHAR *expanded_key);
void MYSTDCALL iEncExpandKey192(_AES_IN UCHAR *key, _AES_OUT UCHAR *expanded_key);
void MYSTDCALL iEncExpandKey128(_AES_IN UCHAR *key, _AES_OUT UCHAR *expanded_key);

void MYSTDCALL iDecExpandKey256(UCHAR *key, _AES_OUT UCHAR *expanded_key);
void MYSTDCALL iDecExpandKey192(UCHAR *key, _AES_OUT UCHAR *expanded_key);
void MYSTDCALL iDecExpandKey128(UCHAR *key, _AES_OUT UCHAR *expanded_key);


//enc/dec asm functions
void MYSTDCALL iEnc128(sAesData *data);
void MYSTDCALL iDec128(sAesData *data);
void MYSTDCALL iEnc256(sAesData *data);
void MYSTDCALL iDec256(sAesData *data);
void MYSTDCALL iEnc192(sAesData *data);
void MYSTDCALL iDec192(sAesData *data);

// CBC variants of the enc/dec asm functions
void MYSTDCALL iEnc128_CBC(sAesData *data);
void MYSTDCALL iDec128_CBC(sAesData *data);
void MYSTDCALL iEnc256_CBC(sAesData *data);
void MYSTDCALL iDec256_CBC(sAesData *data);
void MYSTDCALL iEnc192_CBC(sAesData *data);
void MYSTDCALL iDec192_CBC(sAesData *data);


// CTR variants; only iEnc* entry points are declared for this mode
void MYSTDCALL iEnc128_CTR(sAesData *data);
void MYSTDCALL iEnc256_CTR(sAesData *data);
void MYSTDCALL iEnc192_CTR(sAesData *data);

// rdtsc function
unsigned long long do_rdtsc(void);


#if (__cplusplus)
}
#endif


#endif
10 | * * Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * * Neither the name of Intel Corporation nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 21 | * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 25 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 26 | * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef _IAESNI_H__
#define _IAESNI_H__

#include /* NOTE(review): the include target is missing in this copy; presumably <stdlib.h> (size_t is used below) - confirm */

// Bit 25 mask. NOTE(review): presumably tested against a CPUID feature
// register by check_for_aes_instructions(); confirm in intel_aes.c.
#define AES_INSTRCTIONS_CPUID_BIT (1<<25)

//indicates input param
#define _AES_IN

//indicates output param
#define _AES_OUT

//indicates input/output param - based on context
#define _AES_INOUT

typedef unsigned char UCHAR;


// NOTE(review): when no `bool` macro exists this maps bool to BOOL, which is
// not defined anywhere in this header.
#ifndef bool
#define bool BOOL
#endif
//test if the processor actually supports the above functions
//executing one the functions below without processor support will cause UD fault
//bool check_for_aes_instructions(void);
#if (__cplusplus)
extern "C" {
#endif
int check_for_aes_instructions(void);

// Defined unconditionally, so the "+ 4" variants of DEFINE_ROUND_KEYS below
// are the ones that get selected.
#define ROUND_KEYS_UNALIGNED_TESTING

// DEFINE_ROUND_KEYS declares a 16-byte aligned 256-byte array _expandedKey and
// a pointer expandedKey into it. With ROUND_KEYS_UNALIGNED_TESTING defined the
// pointer starts 4 bytes into the array (so it is not 16-byte aligned);
// otherwise it points at the start. The Linux branch uses the
// __attribute__((aligned)) spelling, the other branch __declspec(align).
#ifdef __linux__

#ifdef ROUND_KEYS_UNALIGNED_TESTING

#define DEFINE_ROUND_KEYS \
	UCHAR __attribute__ ((aligned (16))) _expandedKey[16*16];	\
	UCHAR *expandedKey = _expandedKey + 4;	\


#else



#define DEFINE_ROUND_KEYS \
	UCHAR __attribute__ ((aligned (16))) _expandedKey[16*16];	\
	UCHAR *expandedKey = _expandedKey;	\

#endif

#else // if not __linux__

#ifdef ROUND_KEYS_UNALIGNED_TESTING

#define DEFINE_ROUND_KEYS \
	__declspec(align(16)) UCHAR _expandedKey[16*16];	\
	UCHAR *expandedKey = _expandedKey + 4;	\


#else



#define DEFINE_ROUND_KEYS \
	__declspec(align(16)) UCHAR _expandedKey[16*16];	\
	UCHAR *expandedKey = _expandedKey;	\


#endif

#endif



// encryption functions
// plainText is pointer to input stream
// cipherText is pointer to buffer to be filled with encrypted (cipher text) data
// key is pointer to enc key (sizes are 16 bytes for AES-128, 24 bytes for AES-192, 32 for AES-256)
// numBlocks is number of 16 bytes blocks to process - note that encryption is done of full 16 byte blocks
void intel_AES_enc128(_AES_IN UCHAR *plainText, _AES_OUT UCHAR *cipherText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks);
void intel_AES_enc192(_AES_IN UCHAR *plainText, _AES_OUT UCHAR *cipherText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks);
void intel_AES_enc256(_AES_IN UCHAR *plainText, _AES_OUT UCHAR *cipherText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks);


// CBC encryption; takes an additional iv pointer
void intel_AES_enc128_CBC(_AES_IN UCHAR *plainText, _AES_OUT UCHAR *cipherText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *iv);
void intel_AES_enc192_CBC(_AES_IN UCHAR *plainText, _AES_OUT UCHAR *cipherText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *iv);
void intel_AES_enc256_CBC(_AES_IN UCHAR *plainText, _AES_OUT UCHAR *cipherText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *iv);


// decryption functions
// cipherText is pointer to encrypted stream
// plainText is pointer to buffer to be filled with original (plain text) data
// key is pointer to enc key (sizes are 16 bytes for AES-128, 24 bytes for AES-192, 32 for AES-256)
// numBlocks is number of 16 bytes blocks to process - note that decryption is done of full 16 byte blocks
void intel_AES_dec128(_AES_IN UCHAR *cipherText, _AES_OUT UCHAR *plainText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks);
void intel_AES_dec192(_AES_IN UCHAR *cipherText, _AES_OUT UCHAR *plainText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks);
void intel_AES_dec256(_AES_IN UCHAR *cipherText, _AES_OUT UCHAR *plainText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks);

// CBC decryption; takes an additional iv pointer
void intel_AES_dec128_CBC(_AES_IN UCHAR *cipherText, _AES_OUT UCHAR *plainText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *iv);
void intel_AES_dec192_CBC(_AES_IN UCHAR *cipherText, _AES_OUT UCHAR *plainText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *iv);
void intel_AES_dec256_CBC(_AES_IN UCHAR *cipherText, _AES_OUT UCHAR *plainText, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *iv);

// CTR mode: a single "encdec" entry point per key size, taking an initial
// counter block in place of an iv
void intel_AES_encdec128_CTR(_AES_IN UCHAR *input, _AES_OUT UCHAR *output, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *initial_counter);
void intel_AES_encdec192_CTR(_AES_IN UCHAR *input, _AES_OUT UCHAR *output, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *initial_counter);
void intel_AES_encdec256_CTR(_AES_IN UCHAR *input, _AES_OUT UCHAR *output, _AES_IN UCHAR *key, _AES_IN size_t numBlocks, _AES_IN UCHAR *initial_counter);


#if (__cplusplus)
}
#endif


#endif
#!/bin/bash
#
# Builds the cpu_enc benchmark:
#   1. assembles iaesx64.s and do_rdtsc.s with the bundled yasm binary,
#   2. compiles intel_aes.c and archives all objects into lib/intel_aes64.a,
#   3. links cpu_enc from cpu_enc.c, sha1-fast-64.S and that archive.

# Stop at the first failing step instead of linking against stale objects.
set -e

# Every path below is relative to the directory holding this script.
cd "$(dirname "$0")"

yasm="./yasm"

# The repository tree lists no obj/ or lib/ directory, and the tools below do
# not create their output directories themselves.
mkdir -p obj lib

asm="iaesx64 do_rdtsc"
for i in $asm; do
	echo do $i.s
	"$yasm" -D__linux__ -g dwarf2 -f elf64 "$i.s" -o "obj/$i.o"
done
gcc -O3 -c intel_aes.c -o obj/intel_aes64.o
ar -r lib/intel_aes64.a obj/*.o

gcc -o cpu_enc cpu_enc.c sha1-fast-64.S lib/intel_aes64.a
/*
 * Storage usage:
 *   Bytes  Location  Description
 *       4  eax       SHA-1 state variable A
 *       4  ebx       SHA-1 state variable B
 *       4  ecx       SHA-1 state variable C
 *       4  edx       SHA-1 state variable D
 *       4  ebp       SHA-1 state variable E
 *       4  esi       Temporary for calculation per round
 *       4  edi       (Last 64 rounds) temporary for calculation per round
 *       8  rdi       (First 16 rounds) base address of block array argument (read-only)
 *       8  r8        Base address of state array argument (read-only)
 *       8  rsp       x86-64 stack pointer
 *      64  [rsp+0]   Circular buffer of most recent 16 key schedule items, 4 bytes each
 *      16  xmm0      Caller's value of rbx (only lower 64 bits are used)
 *      16  xmm1      Caller's value of rbp (only lower 64 bits are used)
 */

/*
 * Macro arguments: a..e are the register names currently playing the roles of
 * the five state variables, i is the round number.
 *
 * ROUND0a (rounds 0-15): loads word i of the input block, byte-swaps it,
 * stores it in schedule slot i on the stack and adds it to e; then leaves
 * f = d ^ (b & (c ^ d)) in esi for ROUNDTAIL.
 */
#define ROUND0a(a,b,c,d,e,i)  \
	movl    (i*4)(%rdi), %esi;  \
	bswapl  %esi;               \
	movl    %esi, (i*4)(%rsp);  \
	addl    %esi, %e;           \
	movl    %c, %esi;           \
	xorl    %d, %esi;           \
	andl    %b, %esi;           \
	xorl    %d, %esi;           \
	ROUNDTAIL(a,b,e,i,0x5A827999)

/*
 * SCHEDULE: computes the next schedule word from the 16-entry circular buffer
 * on the stack, w = rol(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1) with all
 * indices taken mod 16, adds it to e and stores it in slot i mod 16.
 */
#define SCHEDULE(i,e)  \
	movl  (((i- 3)&0xF)*4)(%rsp), %esi;  \
	xorl  (((i- 8)&0xF)*4)(%rsp), %esi;  \
	xorl  (((i-14)&0xF)*4)(%rsp), %esi;  \
	xorl  (((i-16)&0xF)*4)(%rsp), %esi;  \
	roll  $1, %esi;                      \
	addl  %esi, %e;                      \
	movl  %esi, ((i&0xF)*4)(%rsp);

/* ROUND0b (rounds 16-19): same f and constant as ROUND0a, but the message
 * word comes from SCHEDULE instead of the input block. */
#define ROUND0b(a,b,c,d,e,i)  \
	SCHEDULE(i,e)   \
	movl  %c, %esi;  \
	xorl  %d, %esi;  \
	andl  %b, %esi;  \
	xorl  %d, %esi;  \
	ROUNDTAIL(a,b,e,i,0x5A827999)

/* ROUND1 (rounds 20-39): f = b ^ c ^ d. */
#define ROUND1(a,b,c,d,e,i)  \
	SCHEDULE(i,e)   \
	movl  %b, %esi;  \
	xorl  %c, %esi;  \
	xorl  %d, %esi;  \
	ROUNDTAIL(a,b,e,i,0x6ED9EBA1)

/* ROUND2 (rounds 40-59): f = ((c | d) & b) | (c & d), using edi as a second
 * temporary.  -0x70E44324 is 0x8F1BBCDC as a 32-bit two's-complement value. */
#define ROUND2(a,b,c,d,e,i)  \
	SCHEDULE(i,e)     \
	movl  %c, %esi;    \
	movl  %c, %edi;    \
	orl   %d, %esi;    \
	andl  %b, %esi;    \
	andl  %d, %edi;    \
	orl   %edi, %esi;  \
	ROUNDTAIL(a,b,e,i,-0x70E44324)

/* ROUND3 (rounds 60-79): f = b ^ c ^ d.  -0x359D3E2A is 0xCA62C1D6 as a
 * 32-bit two's-complement value. */
#define ROUND3(a,b,c,d,e,i)  \
	SCHEDULE(i,e)   \
	movl  %b, %esi;  \
	xorl  %c, %esi;  \
	xorl  %d, %esi;  \
	ROUNDTAIL(a,b,e,i,-0x359D3E2A)

/* ROUNDTAIL: common end of every round.  Expects f in esi; performs
 * b = rol(b, 30), e += f + k, e += rol(a, 5). */
#define ROUNDTAIL(a,b,e,i,k)  \
	roll  $30, %b;         \
	leal  k(%e,%esi), %e;  \
	movl  %a, %esi;        \
	roll  $5, %esi;        \
	addl  %esi, %e;


/* void sha1_compress(uint32_t *state, uint32_t *block) */
/*
 * Processes one 64-byte block: reads the five state words through the first
 * argument (rdi), the block through the second (rsi), runs the 80 rounds and
 * adds the resulting working variables back into the state array.
 * Because the register roles rotate one position per round and 80 is a
 * multiple of 5, every working variable is back in its starting register
 * after round 79.
 */
.globl sha1_compress
sha1_compress:
	/* Save registers, allocate scratch space */
	movq    %rbx, %xmm0
	movq    %rbp, %xmm1
	subq    $64, %rsp

	/* Load arguments */
	movq    %rdi, %r8
	movl     0(%rdi), %eax  /* a */
	movl     4(%rdi), %ebx  /* b */
	movl     8(%rdi), %ecx  /* c */
	movl    12(%rdi), %edx  /* d */
	movl    16(%rdi), %ebp  /* e */
	movq    %rsi, %rdi

	/* 80 rounds of hashing */
	ROUND0a(eax, ebx, ecx, edx, ebp,  0)
	ROUND0a(ebp, eax, ebx, ecx, edx,  1)
	ROUND0a(edx, ebp, eax, ebx, ecx,  2)
	ROUND0a(ecx, edx, ebp, eax, ebx,  3)
	ROUND0a(ebx, ecx, edx, ebp, eax,  4)
	ROUND0a(eax, ebx, ecx, edx, ebp,  5)
	ROUND0a(ebp, eax, ebx, ecx, edx,  6)
	ROUND0a(edx, ebp, eax, ebx, ecx,  7)
	ROUND0a(ecx, edx, ebp, eax, ebx,  8)
	ROUND0a(ebx, ecx, edx, ebp, eax,  9)
	ROUND0a(eax, ebx, ecx, edx, ebp, 10)
	ROUND0a(ebp, eax, ebx, ecx, edx, 11)
	ROUND0a(edx, ebp, eax, ebx, ecx, 12)
	ROUND0a(ecx, edx, ebp, eax, ebx, 13)
	ROUND0a(ebx, ecx, edx, ebp, eax, 14)
	ROUND0a(eax, ebx, ecx, edx, ebp, 15)
	ROUND0b(ebp, eax, ebx, ecx, edx, 16)
	ROUND0b(edx, ebp, eax, ebx, ecx, 17)
	ROUND0b(ecx, edx, ebp, eax, ebx, 18)
	ROUND0b(ebx, ecx, edx, ebp, eax, 19)
	ROUND1(eax, ebx, ecx, edx, ebp, 20)
	ROUND1(ebp, eax, ebx, ecx, edx, 21)
	ROUND1(edx, ebp, eax, ebx, ecx, 22)
	ROUND1(ecx, edx, ebp, eax, ebx, 23)
	ROUND1(ebx, ecx, edx, ebp, eax, 24)
	ROUND1(eax, ebx, ecx, edx, ebp, 25)
	ROUND1(ebp, eax, ebx, ecx, edx, 26)
	ROUND1(edx, ebp, eax, ebx, ecx, 27)
	ROUND1(ecx, edx, ebp, eax, ebx, 28)
	ROUND1(ebx, ecx, edx, ebp, eax, 29)
	ROUND1(eax, ebx, ecx, edx, ebp, 30)
	ROUND1(ebp, eax, ebx, ecx, edx, 31)
	ROUND1(edx, ebp, eax, ebx, ecx, 32)
	ROUND1(ecx, edx, ebp, eax, ebx, 33)
	ROUND1(ebx, ecx, edx, ebp, eax, 34)
	ROUND1(eax, ebx, ecx, edx, ebp, 35)
	ROUND1(ebp, eax, ebx, ecx, edx, 36)
	ROUND1(edx, ebp, eax, ebx, ecx, 37)
	ROUND1(ecx, edx, ebp, eax, ebx, 38)
	ROUND1(ebx, ecx, edx, ebp, eax, 39)
	ROUND2(eax, ebx, ecx, edx, ebp, 40)
	ROUND2(ebp, eax, ebx, ecx, edx, 41)
	ROUND2(edx, ebp, eax, ebx, ecx, 42)
	ROUND2(ecx, edx, ebp, eax, ebx, 43)
	ROUND2(ebx, ecx, edx, ebp, eax, 44)
	ROUND2(eax, ebx, ecx, edx, ebp, 45)
	ROUND2(ebp, eax, ebx, ecx, edx, 46)
	ROUND2(edx, ebp, eax, ebx, ecx, 47)
	ROUND2(ecx, edx, ebp, eax, ebx, 48)
	ROUND2(ebx, ecx, edx, ebp, eax, 49)
	ROUND2(eax, ebx, ecx, edx, ebp, 50)
	ROUND2(ebp, eax, ebx, ecx, edx, 51)
	ROUND2(edx, ebp, eax, ebx, ecx, 52)
	ROUND2(ecx, edx, ebp, eax, ebx, 53)
	ROUND2(ebx, ecx, edx, ebp, eax, 54)
	ROUND2(eax, ebx, ecx, edx, ebp, 55)
	ROUND2(ebp, eax, ebx, ecx, edx, 56)
	ROUND2(edx, ebp, eax, ebx, ecx, 57)
	ROUND2(ecx, edx, ebp, eax, ebx, 58)
	ROUND2(ebx, ecx, edx, ebp, eax, 59)
	ROUND3(eax, ebx, ecx, edx, ebp, 60)
	ROUND3(ebp, eax, ebx, ecx, edx, 61)
	ROUND3(edx, ebp, eax, ebx, ecx, 62)
	ROUND3(ecx, edx, ebp, eax, ebx, 63)
	ROUND3(ebx, ecx, edx, ebp, eax, 64)
	ROUND3(eax, ebx, ecx, edx, ebp, 65)
	ROUND3(ebp, eax, ebx, ecx, edx, 66)
	ROUND3(edx, ebp, eax, ebx, ecx, 67)
	ROUND3(ecx, edx, ebp, eax, ebx, 68)
	ROUND3(ebx, ecx, edx, ebp, eax, 69)
	ROUND3(eax, ebx, ecx, edx, ebp, 70)
	ROUND3(ebp, eax, ebx, ecx, edx, 71)
	ROUND3(edx, ebp, eax, ebx, ecx, 72)
	ROUND3(ecx, edx, ebp, eax, ebx, 73)
	ROUND3(ebx, ecx, edx, ebp, eax, 74)
	ROUND3(eax, ebx, ecx, edx, ebp, 75)
	ROUND3(ebp, eax, ebx, ecx, edx, 76)
	ROUND3(edx, ebp, eax, ebx, ecx, 77)
	ROUND3(ecx, edx, ebp, eax, ebx, 78)
	ROUND3(ebx, ecx, edx, ebp, eax, 79)

	/* Save updated state */
	addl    %eax,  0(%r8)
	addl    %ebx,  4(%r8)
	addl    %ecx,  8(%r8)
	addl    %edx, 12(%r8)
	addl    %ebp, 16(%r8)

	/* Restore registers */
	movq    %xmm0, %rbx
	movq    %xmm1, %rbp
	addq    $64, %rsp
	ret
ROUND0a(e, a, b, c, d, 1) 51 | ROUND0a(d, e, a, b, c, 2) 52 | ROUND0a(c, d, e, a, b, 3) 53 | ROUND0a(b, c, d, e, a, 4) 54 | ROUND0a(a, b, c, d, e, 5) 55 | ROUND0a(e, a, b, c, d, 6) 56 | ROUND0a(d, e, a, b, c, 7) 57 | ROUND0a(c, d, e, a, b, 8) 58 | ROUND0a(b, c, d, e, a, 9) 59 | ROUND0a(a, b, c, d, e, 10) 60 | ROUND0a(e, a, b, c, d, 11) 61 | ROUND0a(d, e, a, b, c, 12) 62 | ROUND0a(c, d, e, a, b, 13) 63 | ROUND0a(b, c, d, e, a, 14) 64 | ROUND0a(a, b, c, d, e, 15) 65 | ROUND0b(e, a, b, c, d, 16) 66 | ROUND0b(d, e, a, b, c, 17) 67 | ROUND0b(c, d, e, a, b, 18) 68 | ROUND0b(b, c, d, e, a, 19) 69 | ROUND1(a, b, c, d, e, 20) 70 | ROUND1(e, a, b, c, d, 21) 71 | ROUND1(d, e, a, b, c, 22) 72 | ROUND1(c, d, e, a, b, 23) 73 | ROUND1(b, c, d, e, a, 24) 74 | ROUND1(a, b, c, d, e, 25) 75 | ROUND1(e, a, b, c, d, 26) 76 | ROUND1(d, e, a, b, c, 27) 77 | ROUND1(c, d, e, a, b, 28) 78 | ROUND1(b, c, d, e, a, 29) 79 | ROUND1(a, b, c, d, e, 30) 80 | ROUND1(e, a, b, c, d, 31) 81 | ROUND1(d, e, a, b, c, 32) 82 | ROUND1(c, d, e, a, b, 33) 83 | ROUND1(b, c, d, e, a, 34) 84 | ROUND1(a, b, c, d, e, 35) 85 | ROUND1(e, a, b, c, d, 36) 86 | ROUND1(d, e, a, b, c, 37) 87 | ROUND1(c, d, e, a, b, 38) 88 | ROUND1(b, c, d, e, a, 39) 89 | ROUND2(a, b, c, d, e, 40) 90 | ROUND2(e, a, b, c, d, 41) 91 | ROUND2(d, e, a, b, c, 42) 92 | ROUND2(c, d, e, a, b, 43) 93 | ROUND2(b, c, d, e, a, 44) 94 | ROUND2(a, b, c, d, e, 45) 95 | ROUND2(e, a, b, c, d, 46) 96 | ROUND2(d, e, a, b, c, 47) 97 | ROUND2(c, d, e, a, b, 48) 98 | ROUND2(b, c, d, e, a, 49) 99 | ROUND2(a, b, c, d, e, 50) 100 | ROUND2(e, a, b, c, d, 51) 101 | ROUND2(d, e, a, b, c, 52) 102 | ROUND2(c, d, e, a, b, 53) 103 | ROUND2(b, c, d, e, a, 54) 104 | ROUND2(a, b, c, d, e, 55) 105 | ROUND2(e, a, b, c, d, 56) 106 | ROUND2(d, e, a, b, c, 57) 107 | ROUND2(c, d, e, a, b, 58) 108 | ROUND2(b, c, d, e, a, 59) 109 | ROUND3(a, b, c, d, e, 60) 110 | ROUND3(e, a, b, c, d, 61) 111 | ROUND3(d, e, a, b, c, 62) 112 | ROUND3(c, d, e, a, b, 63) 113 | ROUND3(b, c, d, e, a, 
/*
 * SHA-1 compression function, straightforward variant.
 *
 * Expands the 16 input words into the full 80-word message schedule and then
 * runs the 80 rounds over it.
 *
 *   state  five 32-bit chaining words; the result is added into them in place.
 *   block  sixteen 32-bit words of message data. Every word is byte-swapped
 *          unconditionally before use, i.e. the words are expected exactly as
 *          a little-endian CPU reads them from the raw message bytes.
 *          The block is only read, never written.
 *
 * The loops below produce the same values as the former hand-numbered
 * LOADSCHEDULE/SCHEDULE/ROUNDn macro invocations; all arithmetic is modulo
 * 2^32 on uint32_t.
 */
void sha1_compress(uint32_t *state, uint32_t *block) {
	uint32_t schedule[80];
	uint32_t a = state[0];
	uint32_t b = state[1];
	uint32_t c = state[2];
	uint32_t d = state[3];
	uint32_t e = state[4];
	int t;

	/* Words 0..15: the input block with its byte order reversed. */
	for (t = 0; t < 16; t++) {
		uint32_t raw = block[t];
		schedule[t] = (raw << 24) | ((raw & 0xFF00) << 8) | ((raw >> 8) & 0xFF00) | (raw >> 24);
	}

	/* Words 16..79: XOR of four earlier words, rotated left by one bit. */
	for (t = 16; t < 80; t++) {
		uint32_t mix = schedule[t - 3] ^ schedule[t - 8] ^ schedule[t - 14] ^ schedule[t - 16];
		schedule[t] = (mix << 1) | (mix >> 31);
	}

	/* 80 rounds; the round function and constant change every 20 rounds. */
	for (t = 0; t < 80; t++) {
		uint32_t f;
		uint32_t k;
		uint32_t next;

		if (t < 20) {
			f = (b & c) | (~b & d);
			k = 0x5A827999u;
		} else if (t < 40) {
			f = b ^ c ^ d;
			k = 0x6ED9EBA1u;
		} else if (t < 60) {
			f = (b & c) ^ (b & d) ^ (c & d);
			k = 0x8F1BBCDCu;
		} else {
			f = b ^ c ^ d;
			k = 0xCA62C1D6u;
		}

		next = ((a << 5) | (a >> 27)) + f + e + k + schedule[t];
		e = d;
		d = c;
		c = (b << 30) | (b >> 2);
		b = a;
		a = next;
	}

	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
}
/* void sha1_compress(uint32_t *state, uint32_t *block) */
/*
 * Applies one SHA-1 compression step: reads the five state words and the
 * sixteen block words, runs 80 rounds, and adds the result back into the
 * state array in memory.
 *
 * On entry rdi holds the state pointer and rsi the block pointer: the code
 * below loads a..e through rdi, then copies rsi into rdi so that the
 * ROUND0a macro can read the block at (i*4)(%rdi).
 *
 * Registers written here: eax, ebx, ecx, edx, ebp (working variables),
 * esi and edi (per-round temporaries), r8 (state pointer), xmm0/xmm1
 * (hold the caller's rbx/rbp). rbx and rbp are put back before returning.
 */
.globl sha1_compress
sha1_compress:
	/* Save registers, allocate scratch space */
	/* rbx/rbp are parked in xmm0/xmm1 instead of being pushed, so rsp only
	   moves by the 64-byte schedule buffer that the round macros index. */
	movq %rbx, %xmm0
	movq %rbp, %xmm1
	subq $64, %rsp

	/* Load arguments */
	/* Keep the state pointer in r8: rdi is overwritten with the block
	   pointer below and edi is later used as a temporary by ROUND2. */
	movq %rdi, %r8
	movl 0(%rdi), %eax /* a */
	movl 4(%rdi), %ebx /* b */
	movl 8(%rdi), %ecx /* c */
	movl 12(%rdi), %edx /* d */
	movl 16(%rdi), %ebp /* e */
	movq %rsi, %rdi

	/* 80 rounds of hashing */
	/* Each invocation shifts the register roles by one position instead of
	   moving values; the pattern repeats every 5 rounds, so after 80 rounds
	   eax..ebp are back in a..e order for the final additions. */
	ROUND0a(eax, ebx, ecx, edx, ebp, 0)
	ROUND0a(ebp, eax, ebx, ecx, edx, 1)
	ROUND0a(edx, ebp, eax, ebx, ecx, 2)
	ROUND0a(ecx, edx, ebp, eax, ebx, 3)
	ROUND0a(ebx, ecx, edx, ebp, eax, 4)
	ROUND0a(eax, ebx, ecx, edx, ebp, 5)
	ROUND0a(ebp, eax, ebx, ecx, edx, 6)
	ROUND0a(edx, ebp, eax, ebx, ecx, 7)
	ROUND0a(ecx, edx, ebp, eax, ebx, 8)
	ROUND0a(ebx, ecx, edx, ebp, eax, 9)
	ROUND0a(eax, ebx, ecx, edx, ebp, 10)
	ROUND0a(ebp, eax, ebx, ecx, edx, 11)
	ROUND0a(edx, ebp, eax, ebx, ecx, 12)
	ROUND0a(ecx, edx, ebp, eax, ebx, 13)
	ROUND0a(ebx, ecx, edx, ebp, eax, 14)
	ROUND0a(eax, ebx, ecx, edx, ebp, 15)
	ROUND0b(ebp, eax, ebx, ecx, edx, 16)
	ROUND0b(edx, ebp, eax, ebx, ecx, 17)
	ROUND0b(ecx, edx, ebp, eax, ebx, 18)
	ROUND0b(ebx, ecx, edx, ebp, eax, 19)
	ROUND1(eax, ebx, ecx, edx, ebp, 20)
	ROUND1(ebp, eax, ebx, ecx, edx, 21)
	ROUND1(edx, ebp, eax, ebx, ecx, 22)
	ROUND1(ecx, edx, ebp, eax, ebx, 23)
	ROUND1(ebx, ecx, edx, ebp, eax, 24)
	ROUND1(eax, ebx, ecx, edx, ebp, 25)
	ROUND1(ebp, eax, ebx, ecx, edx, 26)
	ROUND1(edx, ebp, eax, ebx, ecx, 27)
	ROUND1(ecx, edx, ebp, eax, ebx, 28)
	ROUND1(ebx, ecx, edx, ebp, eax, 29)
	ROUND1(eax, ebx, ecx, edx, ebp, 30)
	ROUND1(ebp, eax, ebx, ecx, edx, 31)
	ROUND1(edx, ebp, eax, ebx, ecx, 32)
	ROUND1(ecx, edx, ebp, eax, ebx, 33)
	ROUND1(ebx, ecx, edx, ebp, eax, 34)
	ROUND1(eax, ebx, ecx, edx, ebp, 35)
	ROUND1(ebp, eax, ebx, ecx, edx, 36)
	ROUND1(edx, ebp, eax, ebx, ecx, 37)
	ROUND1(ecx, edx, ebp, eax, ebx, 38)
	ROUND1(ebx, ecx, edx, ebp, eax, 39)
	ROUND2(eax, ebx, ecx, edx, ebp, 40)
	ROUND2(ebp, eax, ebx, ecx, edx, 41)
	ROUND2(edx, ebp, eax, ebx, ecx, 42)
	ROUND2(ecx, edx, ebp, eax, ebx, 43)
	ROUND2(ebx, ecx, edx, ebp, eax, 44)
	ROUND2(eax, ebx, ecx, edx, ebp, 45)
	ROUND2(ebp, eax, ebx, ecx, edx, 46)
	ROUND2(edx, ebp, eax, ebx, ecx, 47)
	ROUND2(ecx, edx, ebp, eax, ebx, 48)
	ROUND2(ebx, ecx, edx, ebp, eax, 49)
	ROUND2(eax, ebx, ecx, edx, ebp, 50)
	ROUND2(ebp, eax, ebx, ecx, edx, 51)
	ROUND2(edx, ebp, eax, ebx, ecx, 52)
	ROUND2(ecx, edx, ebp, eax, ebx, 53)
	ROUND2(ebx, ecx, edx, ebp, eax, 54)
	ROUND2(eax, ebx, ecx, edx, ebp, 55)
	ROUND2(ebp, eax, ebx, ecx, edx, 56)
	ROUND2(edx, ebp, eax, ebx, ecx, 57)
	ROUND2(ecx, edx, ebp, eax, ebx, 58)
	ROUND2(ebx, ecx, edx, ebp, eax, 59)
	ROUND3(eax, ebx, ecx, edx, ebp, 60)
	ROUND3(ebp, eax, ebx, ecx, edx, 61)
	ROUND3(edx, ebp, eax, ebx, ecx, 62)
	ROUND3(ecx, edx, ebp, eax, ebx, 63)
	ROUND3(ebx, ecx, edx, ebp, eax, 64)
	ROUND3(eax, ebx, ecx, edx, ebp, 65)
	ROUND3(ebp, eax, ebx, ecx, edx, 66)
	ROUND3(edx, ebp, eax, ebx, ecx, 67)
	ROUND3(ecx, edx, ebp, eax, ebx, 68)
	ROUND3(ebx, ecx, edx, ebp, eax, 69)
	ROUND3(eax, ebx, ecx, edx, ebp, 70)
	ROUND3(ebp, eax, ebx, ecx, edx, 71)
	ROUND3(edx, ebp, eax, ebx, ecx, 72)
	ROUND3(ecx, edx, ebp, eax, ebx, 73)
	ROUND3(ebx, ecx, edx, ebp, eax, 74)
	ROUND3(eax, ebx, ecx, edx, ebp, 75)
	ROUND3(ebp, eax, ebx, ecx, edx, 76)
	ROUND3(edx, ebp, eax, ebx, ecx, 77)
	ROUND3(ecx, edx, ebp, eax, ebx, 78)
	ROUND3(ebx, ecx, edx, ebp, eax, 79)

	/* Save updated state */
	/* The working variables are added to the state words in memory through
	   the pointer saved in r8. */
	addl %eax, 0(%r8)
	addl %ebx, 4(%r8)
	addl %ecx, 8(%r8)
	addl %edx, 12(%r8)
	addl %ebp, 16(%r8)

	/* Restore registers */
	/* Undo the prologue: bring rbx/rbp back from xmm0/xmm1 and release the
	   64-byte schedule buffer. */
	movq %xmm0, %rbx
	movq %xmm1, %rbp
	addq $64, %rsp
	ret
/*
 * Helper macros for the unrolled SHA-1 rounds.
 *
 * All of them are either single parenthesised expressions or wrapped in
 * do { } while (0), and every argument is parenthesised, so they are safe
 * to use as ordinary statements. SHA1_WORD, SHA1_STEP and SHA1_FIVE refer
 * to the locals `schedule`, `block` and a..e of sha1_compress() below.
 */

/* Rotate a 32-bit value left by n bits (0 < n < 32). */
#define SHA1_ROTL(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* Reverse the byte order of a 32-bit word. */
#define SHA1_BSWAP(v) \
	(((v) << 24) | (((v) & 0xFF00) << 8) | (((v) >> 8) & 0xFF00) | ((v) >> 24))

/* The three round functions (rounds 0-19, 20-39 and 60-79, 40-59). */
#define SHA1_CH(x, y, z)     (((x) & (y)) | (~(x) & (z)))
#define SHA1_PARITY(x, y, z) ((x) ^ (y) ^ (z))
#define SHA1_MAJ(x, y, z)    (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

/*
 * Message word for round i: computes it, stores it in the 16-entry circular
 * schedule and yields its value. Rounds 0..15 load a byte-swapped block word;
 * later rounds mix the words from 3, 8, 14 and 16 rounds ago. The offsets are
 * written as +13, +8, +2 and +0 because only the index modulo 16 matters.
 * i is always a literal here, so the selection is resolved at compile time.
 */
#define SHA1_WORD(i) \
	(schedule[(i) & 0xF] = ((i) < 16) \
		? SHA1_BSWAP(block[(i) & 0xF]) \
		: SHA1_ROTL(schedule[((i) + 13) & 0xF] ^ schedule[((i) + 8) & 0xF] ^ \
		            schedule[((i) + 2) & 0xF] ^ schedule[(i) & 0xF], 1))

/* One round: only `e` and `b` are modified; the caller rotates the roles. */
#define SHA1_STEP(f, k, i, a, b, c, d, e) \
	do { \
		(e) += SHA1_ROTL((a), 5) + f((b), (c), (d)) + (k) + SHA1_WORD(i); \
		(b) = SHA1_ROTL((b), 30); \
	} while (0)

/*
 * Five consecutive rounds i..i+4. The variable roles shift by one position
 * per round and are back in a,b,c,d,e order after the fifth.
 */
#define SHA1_FIVE(f, k, i) \
	do { \
		SHA1_STEP(f, k, (i) + 0, a, b, c, d, e); \
		SHA1_STEP(f, k, (i) + 1, e, a, b, c, d); \
		SHA1_STEP(f, k, (i) + 2, d, e, a, b, c); \
		SHA1_STEP(f, k, (i) + 3, c, d, e, a, b); \
		SHA1_STEP(f, k, (i) + 4, b, c, d, e, a); \
	} while (0)

/*
 * SHA-1 compression function, unrolled variant with a rolling 16-word
 * message schedule.
 *
 *   state  five 32-bit chaining words; the result is added into them in place.
 *   block  sixteen 32-bit words of message data; each word is byte-swapped
 *          unconditionally on load. The block is only read.
 */
void sha1_compress(uint32_t *state, uint32_t *block) {
	uint32_t schedule[16];
	uint32_t a = state[0];
	uint32_t b = state[1];
	uint32_t c = state[2];
	uint32_t d = state[3];
	uint32_t e = state[4];

	/* Rounds 0..19 */
	SHA1_FIVE(SHA1_CH, 0x5A827999u, 0);
	SHA1_FIVE(SHA1_CH, 0x5A827999u, 5);
	SHA1_FIVE(SHA1_CH, 0x5A827999u, 10);
	SHA1_FIVE(SHA1_CH, 0x5A827999u, 15);

	/* Rounds 20..39 */
	SHA1_FIVE(SHA1_PARITY, 0x6ED9EBA1u, 20);
	SHA1_FIVE(SHA1_PARITY, 0x6ED9EBA1u, 25);
	SHA1_FIVE(SHA1_PARITY, 0x6ED9EBA1u, 30);
	SHA1_FIVE(SHA1_PARITY, 0x6ED9EBA1u, 35);

	/* Rounds 40..59 */
	SHA1_FIVE(SHA1_MAJ, 0x8F1BBCDCu, 40);
	SHA1_FIVE(SHA1_MAJ, 0x8F1BBCDCu, 45);
	SHA1_FIVE(SHA1_MAJ, 0x8F1BBCDCu, 50);
	SHA1_FIVE(SHA1_MAJ, 0x8F1BBCDCu, 55);

	/* Rounds 60..79 */
	SHA1_FIVE(SHA1_PARITY, 0xCA62C1D6u, 60);
	SHA1_FIVE(SHA1_PARITY, 0xCA62C1D6u, 65);
	SHA1_FIVE(SHA1_PARITY, 0xCA62C1D6u, 70);
	SHA1_FIVE(SHA1_PARITY, 0xCA62C1D6u, 75);

	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
}
#define ROUND0(a,b,c,d,e,i) ROUNDTAIL(a, b, e, ((b & c) | (~b & d)), i, 0x5A827999) 20 | #define ROUND1(a,b,c,d,e,i) ROUNDTAIL(a, b, e, (b ^ c ^ d), i, 0x6ED9EBA1) 21 | #define ROUND2(a,b,c,d,e,i) ROUNDTAIL(a, b, e, ((b & c) ^ (b & d) ^ (c & d)), i, 0x8F1BBCDC) 22 | #define ROUND3(a,b,c,d,e,i) ROUNDTAIL(a, b, e, (b ^ c ^ d), i, 0xCA62C1D6) 23 | 24 | #define ROUNDTAIL(a,b,e,f,i,k) \ 25 | e += (a << 5 | a >> 27) + f + k + schedule[i]; \ 26 | b = b << 30 | b >> 2; 27 | 28 | void sha1_compress(uint32_t *state, uint32_t *block) { 29 | uint32_t a = state[0]; 30 | uint32_t b = state[1]; 31 | uint32_t c = state[2]; 32 | uint32_t d = state[3]; 33 | uint32_t e = state[4]; 34 | 35 | uint32_t schedule[80]; 36 | uint32_t temp; 37 | LOADSCHEDULE( 0) 38 | LOADSCHEDULE( 1) 39 | LOADSCHEDULE( 2) 40 | LOADSCHEDULE( 3) 41 | LOADSCHEDULE( 4) 42 | LOADSCHEDULE( 5) 43 | LOADSCHEDULE( 6) 44 | LOADSCHEDULE( 7) 45 | LOADSCHEDULE( 8) 46 | LOADSCHEDULE( 9) 47 | LOADSCHEDULE(10) 48 | LOADSCHEDULE(11) 49 | LOADSCHEDULE(12) 50 | LOADSCHEDULE(13) 51 | LOADSCHEDULE(14) 52 | LOADSCHEDULE(15) 53 | SCHEDULE(16) 54 | SCHEDULE(17) 55 | SCHEDULE(18) 56 | SCHEDULE(19) 57 | SCHEDULE(20) 58 | SCHEDULE(21) 59 | SCHEDULE(22) 60 | SCHEDULE(23) 61 | SCHEDULE(24) 62 | SCHEDULE(25) 63 | SCHEDULE(26) 64 | SCHEDULE(27) 65 | SCHEDULE(28) 66 | SCHEDULE(29) 67 | SCHEDULE(30) 68 | SCHEDULE(31) 69 | SCHEDULE(32) 70 | SCHEDULE(33) 71 | SCHEDULE(34) 72 | SCHEDULE(35) 73 | SCHEDULE(36) 74 | SCHEDULE(37) 75 | SCHEDULE(38) 76 | SCHEDULE(39) 77 | SCHEDULE(40) 78 | SCHEDULE(41) 79 | SCHEDULE(42) 80 | SCHEDULE(43) 81 | SCHEDULE(44) 82 | SCHEDULE(45) 83 | SCHEDULE(46) 84 | SCHEDULE(47) 85 | SCHEDULE(48) 86 | SCHEDULE(49) 87 | SCHEDULE(50) 88 | SCHEDULE(51) 89 | SCHEDULE(52) 90 | SCHEDULE(53) 91 | SCHEDULE(54) 92 | SCHEDULE(55) 93 | SCHEDULE(56) 94 | SCHEDULE(57) 95 | SCHEDULE(58) 96 | SCHEDULE(59) 97 | SCHEDULE(60) 98 | SCHEDULE(61) 99 | SCHEDULE(62) 100 | SCHEDULE(63) 101 | SCHEDULE(64) 102 | SCHEDULE(65) 103 | SCHEDULE(66) 
104 | SCHEDULE(67) 105 | SCHEDULE(68) 106 | SCHEDULE(69) 107 | SCHEDULE(70) 108 | SCHEDULE(71) 109 | SCHEDULE(72) 110 | SCHEDULE(73) 111 | SCHEDULE(74) 112 | SCHEDULE(75) 113 | SCHEDULE(76) 114 | SCHEDULE(77) 115 | SCHEDULE(78) 116 | SCHEDULE(79) 117 | 118 | ROUND0(a, b, c, d, e, 0) 119 | ROUND0(e, a, b, c, d, 1) 120 | ROUND0(d, e, a, b, c, 2) 121 | ROUND0(c, d, e, a, b, 3) 122 | ROUND0(b, c, d, e, a, 4) 123 | ROUND0(a, b, c, d, e, 5) 124 | ROUND0(e, a, b, c, d, 6) 125 | ROUND0(d, e, a, b, c, 7) 126 | ROUND0(c, d, e, a, b, 8) 127 | ROUND0(b, c, d, e, a, 9) 128 | ROUND0(a, b, c, d, e, 10) 129 | ROUND0(e, a, b, c, d, 11) 130 | ROUND0(d, e, a, b, c, 12) 131 | ROUND0(c, d, e, a, b, 13) 132 | ROUND0(b, c, d, e, a, 14) 133 | ROUND0(a, b, c, d, e, 15) 134 | ROUND0(e, a, b, c, d, 16) 135 | ROUND0(d, e, a, b, c, 17) 136 | ROUND0(c, d, e, a, b, 18) 137 | ROUND0(b, c, d, e, a, 19) 138 | ROUND1(a, b, c, d, e, 20) 139 | ROUND1(e, a, b, c, d, 21) 140 | ROUND1(d, e, a, b, c, 22) 141 | ROUND1(c, d, e, a, b, 23) 142 | ROUND1(b, c, d, e, a, 24) 143 | ROUND1(a, b, c, d, e, 25) 144 | ROUND1(e, a, b, c, d, 26) 145 | ROUND1(d, e, a, b, c, 27) 146 | ROUND1(c, d, e, a, b, 28) 147 | ROUND1(b, c, d, e, a, 29) 148 | ROUND1(a, b, c, d, e, 30) 149 | ROUND1(e, a, b, c, d, 31) 150 | ROUND1(d, e, a, b, c, 32) 151 | ROUND1(c, d, e, a, b, 33) 152 | ROUND1(b, c, d, e, a, 34) 153 | ROUND1(a, b, c, d, e, 35) 154 | ROUND1(e, a, b, c, d, 36) 155 | ROUND1(d, e, a, b, c, 37) 156 | ROUND1(c, d, e, a, b, 38) 157 | ROUND1(b, c, d, e, a, 39) 158 | ROUND2(a, b, c, d, e, 40) 159 | ROUND2(e, a, b, c, d, 41) 160 | ROUND2(d, e, a, b, c, 42) 161 | ROUND2(c, d, e, a, b, 43) 162 | ROUND2(b, c, d, e, a, 44) 163 | ROUND2(a, b, c, d, e, 45) 164 | ROUND2(e, a, b, c, d, 46) 165 | ROUND2(d, e, a, b, c, 47) 166 | ROUND2(c, d, e, a, b, 48) 167 | ROUND2(b, c, d, e, a, 49) 168 | ROUND2(a, b, c, d, e, 50) 169 | ROUND2(e, a, b, c, d, 51) 170 | ROUND2(d, e, a, b, c, 52) 171 | ROUND2(c, d, e, a, b, 53) 172 | ROUND2(b, c, d, e, a, 
54) 173 | ROUND2(a, b, c, d, e, 55) 174 | ROUND2(e, a, b, c, d, 56) 175 | ROUND2(d, e, a, b, c, 57) 176 | ROUND2(c, d, e, a, b, 58) 177 | ROUND2(b, c, d, e, a, 59) 178 | ROUND3(a, b, c, d, e, 60) 179 | ROUND3(e, a, b, c, d, 61) 180 | ROUND3(d, e, a, b, c, 62) 181 | ROUND3(c, d, e, a, b, 63) 182 | ROUND3(b, c, d, e, a, 64) 183 | ROUND3(a, b, c, d, e, 65) 184 | ROUND3(e, a, b, c, d, 66) 185 | ROUND3(d, e, a, b, c, 67) 186 | ROUND3(c, d, e, a, b, 68) 187 | ROUND3(b, c, d, e, a, 69) 188 | ROUND3(a, b, c, d, e, 70) 189 | ROUND3(e, a, b, c, d, 71) 190 | ROUND3(d, e, a, b, c, 72) 191 | ROUND3(c, d, e, a, b, 73) 192 | ROUND3(b, c, d, e, a, 74) 193 | ROUND3(a, b, c, d, e, 75) 194 | ROUND3(e, a, b, c, d, 76) 195 | ROUND3(d, e, a, b, c, 77) 196 | ROUND3(c, d, e, a, b, 78) 197 | ROUND3(b, c, d, e, a, 79) 198 | 199 | state[0] += a; 200 | state[1] += b; 201 | state[2] += c; 202 | state[3] += d; 203 | state[4] += e; 204 | } 205 | -------------------------------------------------------------------------------- /CPU-experiment/sha1/sha1test.c: -------------------------------------------------------------------------------- 1 | /* 2 | * SHA-1 hash in C and x86 assembly 3 | * Copyright (c) 2012 Nayuki Minase 4 | * 5 | * http://nayuki.eigenstate.org/page/fast-sha1-hash-implementation-in-x86-assembly 6 | */ 7 | 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | /* Function prototypes */ 17 | 18 | static int self_check(); 19 | void sha1_hash(uint8_t *message, uint32_t len, uint32_t *hash); 20 | 21 | // Link this program with an external C or x86 compression function 22 | extern void sha1_compress(uint32_t *state, uint32_t *block); 23 | 24 | 25 | /* Main program */ 26 | 27 | int main(int argc, char **argv) { 28 | struct timeval start_sha, end_sha; 29 | if (!self_check()) { 30 | printf("Self-check failed\n"); 31 | return 1; 32 | } 33 | printf("Self-check passed\n"); 34 | 35 | // Benchmark speed 36 | uint32_t state[5]; 37 | uint32_t block[16]; 38 | 
const unsigned long N = 10000000; 39 | clock_t start_time = clock(); 40 | int i; 41 | unsigned long diff; 42 | 43 | gettimeofday(&start_sha,NULL); 44 | for (i = 0; i < N; i++) 45 | sha1_compress(state, block); 46 | gettimeofday(&end_sha,NULL); 47 | 48 | diff = 1000000 * (end_sha.tv_sec-start_sha.tv_sec)+ end_sha.tv_usec-start_sha.tv_usec; 49 | 50 | printf("SHA1 Speed is %ld Mbps\n", (N * 64 * 8) / diff); 51 | printf("Speed: %.1f Mbps/s\n", (double)N * 64 * 8 / (clock() - start_time) * CLOCKS_PER_SEC / 1048576); 52 | 53 | return 0; 54 | } 55 | 56 | 57 | /* Self-check */ 58 | 59 | static int self_check() { 60 | uint32_t hash[5]; 61 | 62 | sha1_hash((uint8_t*)"", 0, hash); 63 | if (hash[0]!=0xDA39A3EE||hash[1]!=0x5E6B4B0D||hash[2]!=0x3255BFEF||hash[3]!=0x95601890||hash[4]!=0xAFD80709) return 0; 64 | 65 | sha1_hash((uint8_t*)"a", 1, hash); 66 | if (hash[0]!=0x86F7E437||hash[1]!=0xFAA5A7FC||hash[2]!=0xE15D1DDC||hash[3]!=0xB9EAEAEA||hash[4]!=0x377667B8) return 0; 67 | 68 | sha1_hash((uint8_t*)"abc", 3, hash); 69 | if (hash[0]!=0xA9993E36||hash[1]!=0x4706816A||hash[2]!=0xBA3E2571||hash[3]!=0x7850C26C||hash[4]!=0x9CD0D89D) return 0; 70 | 71 | sha1_hash((uint8_t*)"message digest", 14, hash); 72 | if (hash[0]!=0xC12252CE||hash[1]!=0xDA8BE899||hash[2]!=0x4D5FA029||hash[3]!=0x0A47231C||hash[4]!=0x1D16AAE3) return 0; 73 | 74 | sha1_hash((uint8_t*)"abcdefghijklmnopqrstuvwxyz", 26, hash); 75 | if (hash[0]!=0x32D10C7B||hash[1]!=0x8CF96570||hash[2]!=0xCA04CE37||hash[3]!=0xF2A19D84||hash[4]!=0x240D3A89) return 0; 76 | 77 | sha1_hash((uint8_t*)"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", 56, hash); 78 | if (hash[0]!=0x84983E44||hash[1]!=0x1C3BD26E||hash[2]!=0xBAAE4AA1||hash[3]!=0xF95129E5||hash[4]!=0xE54670F1) return 0; 79 | 80 | return 1; 81 | } 82 | 83 | 84 | /* Full message hasher */ 85 | 86 | void sha1_hash(uint8_t *message, uint32_t len, uint32_t *hash) { 87 | hash[0] = 0x67452301; 88 | hash[1] = 0xEFCDAB89; 89 | hash[2] = 0x98BADCFE; 90 | hash[3] = 0x10325476; 91 
| hash[4] = 0xC3D2E1F0; 92 | 93 | int i; 94 | for (i = 0; i + 64 <= len; i += 64) 95 | sha1_compress(hash, (uint32_t*)(message + i)); 96 | 97 | uint32_t block[16]; 98 | uint8_t *byteBlock = (uint8_t*)block; 99 | 100 | int rem = len - i; 101 | memcpy(byteBlock, message + i, rem); 102 | 103 | byteBlock[rem] = 0x80; 104 | rem++; 105 | if (64 - rem >= 8) 106 | memset(byteBlock + rem, 0, 56 - rem); 107 | else { 108 | memset(byteBlock + rem, 0, 64 - rem); 109 | sha1_compress(hash, block); 110 | memset(block, 0, 56); 111 | } 112 | 113 | uint64_t longLen = ((uint64_t)len) << 3; 114 | for (i = 0; i < 8; i++) 115 | byteBlock[64 - 1 - i] = (uint8_t)(longLen >> (i * 8)); 116 | sha1_compress(hash, block); 117 | } 118 | -------------------------------------------------------------------------------- /CPU-experiment/yasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kay21s/UPro/c95585a0721325a6855233c97b6a32cdb98763fa/CPU-experiment/yasm -------------------------------------------------------------------------------- /IOEngine/driver/affinity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # This script requires Python 2.6 or higher. 
3 | 4 | from __future__ import print_function, with_statement 5 | import os 6 | import sys 7 | import subprocess 8 | import re 9 | 10 | _exec_cache = {} 11 | 12 | def execute(cmd, cache=False): 13 | global _exec_cache 14 | if cache and cmd in _exec_cache: 15 | return _exec_cache[cmd] 16 | try: 17 | proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 18 | result = proc.communicate()[0] 19 | if cache: 20 | _exec_cache[cmd] = result 21 | return result 22 | except: 23 | return None 24 | 25 | def find_iface_node(name): 26 | ifnode = -1 27 | for line in execute('ethtool -i {0}'.format(ifname), cache=True).splitlines(): 28 | if line.startswith('bus-info:'): 29 | bus_location = line.split(':', 1)[1].strip() 30 | p1, p2, _ = bus_location.split(':') 31 | bus_prefix = '{0}:{1:02x}'.format(p1, int(p2, 16) & 0xf0) 32 | bus_affinity = execute('cat /sys/devices/pci{0}/pci_bus/{0}/cpuaffinity'.format(bus_prefix), cache=True).strip() 33 | for node in range(num_nodes): 34 | node_affinity = execute('cat /sys/devices/system/node/node{0}/cpumap'.format(node), cache=True).strip() 35 | if node_affinity == bus_affinity: 36 | ifnode = node 37 | break 38 | assert ifnode != -1 39 | return ifnode 40 | 41 | if os.getuid() != 0: 42 | print('You must be root!', file=sys.stderr) 43 | sys.exit(1) 44 | 45 | num_cpus = int(execute('cat /proc/cpuinfo | grep -c processor').strip()) 46 | 47 | if len(sys.argv) < 3: 48 | print('usage: %s <#intefaces>' % sys.argv[0]) 49 | sys.exit(1) 50 | 51 | ifname = sys.argv[1] 52 | num_devices = int(sys.argv[2]) 53 | 54 | intrmap = execute('cat /proc/interrupts | grep -i %s-rx-' % ifname).strip().split('\n') 55 | num_nodes = int(execute('cat /proc/cpuinfo | grep \'physical id\' | sort -u | wc -l')) 56 | 57 | for intr in intrmap: 58 | if intr: 59 | irq = int(re.search(r'^\s*(\d+):', intr).group(1)) 60 | queue = int(re.search(r'-(\d+)$', intr).group(1)) 61 | cpu = (queue * num_nodes) + find_iface_node(ifname) 62 | print('echo %x > /proc/irq/%d/smp_affinity' 
% (1 << cpu, irq)) 63 | execute('echo %x > /proc/irq/%d/smp_affinity' % (1 << cpu, irq)) 64 | else: 65 | print('The device {0} is not found on the interrupt table!'.format(ifname), file=sys.stderr) 66 | break 67 | -------------------------------------------------------------------------------- /IOEngine/driver/install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # This script requires Python 2.6 or higher. 3 | 4 | from __future__ import print_function, with_statement 5 | import sys 6 | import os, socket 7 | import subprocess 8 | import time 9 | from optparse import OptionParser 10 | 11 | # Whereever this script is executed, it runs inside its directory. 12 | # (It's for convenience) 13 | base_path = os.path.abspath(os.path.dirname(__file__)) 14 | 15 | def execute(cmd, check_returncode=False): 16 | proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 17 | stdout, stderr = proc.communicate() 18 | if check_returncode and proc.returncode != 0: 19 | raise subprocess.CalledProcessError(proc.returncode, cmd) 20 | return stdout 21 | 22 | def get_num_interfaces(): 23 | num_82598 = int(execute('lspci | grep -c 82598').strip()) 24 | num_82599 = int(execute('lspci | grep -c 82599').strip()) 25 | return num_82598 + num_82599 26 | 27 | def get_num_cpus(): 28 | return int(execute('cat /proc/cpuinfo | grep -c processor').strip()) 29 | 30 | def check_all_links_up(): 31 | output = execute('ifconfig -s | grep xge') 32 | all_links_up = True 33 | for line in output.split('\n'): 34 | flags = line.strip().split() 35 | if len(flags) == 0: 36 | continue 37 | if 'R' not in flags[-1]: 38 | all_links_up = False 39 | return all_links_up 40 | 41 | def validate_ipaddr(addr): 42 | try: 43 | socket.inet_aton(addr) 44 | except socket.error: 45 | return False 46 | else: 47 | return addr.count('.') == 3 48 | 49 | if __name__ == '__main__': 50 | 51 | if os.geteuid() != 0: 52 | print('You must be root!', file=sys.stderr) 
53 | sys.exit(1) 54 | 55 | oparser = OptionParser(usage='%prog [OPTIONS] #RX_QUEUES #TX_QUEUES', 56 | epilog='You can specify 0 instead of actual number of RX/TX queues to allocate one queue for each core.') 57 | oparser.add_option('--itr', type='int', dest='itr', default=956, 58 | help='sets the interrupt throttling rate. (default: 956)') 59 | oparser.add_option('-p', '--postfix', type='int', dest='postfix', default=1, 60 | help='sets the postfix of IP address allocated to the 10G NICs. If x is the postfix and n is the NIC index, the IP addresses will be {IP_PREFIX}.n.x. (default: 1)') 61 | oparser.add_option('--skip-check', dest='skip_check', action='store_true', default=False, 62 | help='skips checking the link state after driver installation. (default: False)') 63 | oparser.add_option('--ip-prefix', dest='ip_prefix', default='10.42', 64 | help='sets the prefix of IP address. (default: 10.42)') 65 | opts, args = oparser.parse_args() 66 | 67 | if len(args) < 2: 68 | oparser.print_help() 69 | sys.exit(1) 70 | 71 | assert opts.postfix >= 1 and opts.postfix <= 254 72 | num_rx_queues = int(args[0]) 73 | num_tx_queues = int(args[1]) 74 | assert 0 <= num_rx_queues <= 16 75 | assert 0 <= num_tx_queues <= 16 76 | assert validate_ipaddr(opts.ip_prefix + '.0.0') 77 | 78 | os.chdir(base_path) 79 | 80 | if not (os.path.exists('./ps_ixgbe.ko')): 81 | print('The compiled kernel module is not found.', file=sys.stderr) 82 | sys.exit(1) 83 | 84 | num_ifs = get_num_interfaces() 85 | num_cpus = get_num_cpus() 86 | 87 | execute('lsmod | grep ^ixgbe > /dev/null && sudo rmmod ixgbe') 88 | execute('lsmod | grep ^ps_ixgbe > /dev/null && sudo rmmod ps_ixgbe') 89 | execute('insmod ./ps_ixgbe.ko RXQ=%s TXQ=%s InterruptThrottleRate=%s' % 90 | (','.join([str(num_rx_queues)] * num_ifs), 91 | ','.join([str(num_tx_queues)] * num_ifs), 92 | ','.join([str(opts.itr)] * num_ifs)) 93 | , True) 94 | 95 | time.sleep(3) 96 | 97 | for i in range(num_ifs): 98 | ifname = 'xge%d' % i 99 | print('Setting 
{0}...'.format(ifname)) 100 | 101 | execute('ethtool -A %s autoneg off rx off tx off' % ifname) 102 | execute('ifconfig %s %s.%d.%s netmask 255.255.255.0 mtu 1500' % (ifname, opts.ip_prefix, i, opts.postfix), True) 103 | 104 | print('OK') 105 | print(execute('./affinity.py %s %d' % (ifname, num_ifs), True).strip()) 106 | 107 | execute('rm -f /dev/packet_shader') 108 | execute('mknod /dev/packet_shader c 1010 0') 109 | execute('chmod 666 /dev/packet_shader') 110 | 111 | if not opts.skip_check: 112 | print('Waiting for all links up...') 113 | time.sleep(2) 114 | check_count = 0 115 | warning_printed = False 116 | while not check_all_links_up(): 117 | time.sleep(1) 118 | check_count += 1 119 | if check_count > 10 and not warning_printed: 120 | print(' If this step takes too long,\n' + \ 121 | ' 1) stop the script and try to reinstall the driver.\n' + \ 122 | ' 2) check if the cables are tightly connected.\n') 123 | warning_printed = True 124 | 125 | print('Ready.') 126 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_common.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 
14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #ifndef _IXGBE_COMMON_H_ 29 | #define _IXGBE_COMMON_H_ 30 | 31 | #include "ixgbe_type.h" 32 | 33 | s32 ixgbe_init_ops_generic(struct ixgbe_hw *hw); 34 | s32 ixgbe_init_hw_generic(struct ixgbe_hw *hw); 35 | s32 ixgbe_start_hw_generic(struct ixgbe_hw *hw); 36 | s32 ixgbe_clear_hw_cntrs_generic(struct ixgbe_hw *hw); 37 | s32 ixgbe_read_pba_num_generic(struct ixgbe_hw *hw, u32 *pba_num); 38 | s32 ixgbe_get_mac_addr_generic(struct ixgbe_hw *hw, u8 *mac_addr); 39 | s32 ixgbe_get_bus_info_generic(struct ixgbe_hw *hw); 40 | void ixgbe_set_lan_id_multi_port_pcie(struct ixgbe_hw *hw); 41 | s32 ixgbe_stop_adapter_generic(struct ixgbe_hw *hw); 42 | 43 | s32 ixgbe_led_on_generic(struct ixgbe_hw *hw, u32 index); 44 | s32 ixgbe_led_off_generic(struct ixgbe_hw *hw, u32 index); 45 | 46 | s32 ixgbe_init_eeprom_params_generic(struct ixgbe_hw *hw); 47 | s32 ixgbe_write_eeprom_generic(struct ixgbe_hw *hw, u16 offset, u16 data); 48 | s32 ixgbe_read_eeprom_generic(struct ixgbe_hw *hw, u16 offset, u16 *data); 49 | s32 ixgbe_read_eeprom_bit_bang_generic(struct ixgbe_hw *hw, u16 offset, 50 | u16 *data); 51 | s32 ixgbe_validate_eeprom_checksum_generic(struct ixgbe_hw *hw, 52 | u16 *checksum_val); 53 | s32 ixgbe_update_eeprom_checksum_generic(struct ixgbe_hw *hw); 54 | 55 | s32 ixgbe_set_rar_generic(struct ixgbe_hw *hw, u32 index, u8 *addr, u32 vmdq, 56 | u32 enable_addr); 57 | s32 ixgbe_clear_rar_generic(struct ixgbe_hw *hw, 
u32 index); 58 | s32 ixgbe_init_rx_addrs_generic(struct ixgbe_hw *hw); 59 | s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, 60 | u32 mc_addr_count, 61 | ixgbe_mc_addr_itr func); 62 | s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list, 63 | u32 addr_count, ixgbe_mc_addr_itr func); 64 | void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq); 65 | s32 ixgbe_enable_mc_generic(struct ixgbe_hw *hw); 66 | s32 ixgbe_disable_mc_generic(struct ixgbe_hw *hw); 67 | s32 ixgbe_enable_rx_dma_generic(struct ixgbe_hw *hw, u32 regval); 68 | 69 | s32 ixgbe_setup_fc(struct ixgbe_hw *hw, s32 packetbuf_num); 70 | s32 ixgbe_fc_enable_generic(struct ixgbe_hw *hw, s32 packetbuf_num); 71 | s32 ixgbe_fc_autoneg(struct ixgbe_hw *hw); 72 | 73 | s32 ixgbe_validate_mac_addr(u8 *mac_addr); 74 | s32 ixgbe_acquire_swfw_sync(struct ixgbe_hw *hw, u16 mask); 75 | void ixgbe_release_swfw_sync(struct ixgbe_hw *hw, u16 mask); 76 | s32 ixgbe_disable_pcie_master(struct ixgbe_hw *hw); 77 | 78 | s32 ixgbe_read_analog_reg8_generic(struct ixgbe_hw *hw, u32 reg, u8 *val); 79 | s32 ixgbe_write_analog_reg8_generic(struct ixgbe_hw *hw, u32 reg, u8 val); 80 | s32 ixgbe_blink_led_start_generic(struct ixgbe_hw *hw, u32 index); 81 | s32 ixgbe_blink_led_stop_generic(struct ixgbe_hw *hw, u32 index); 82 | 83 | #endif /* _IXGBE_COMMON_H_ */ 84 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_dcb.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 
9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #ifndef _DCB_CONFIG_H_ 29 | #define _DCB_CONFIG_H_ 30 | 31 | #include "ixgbe_type.h" 32 | 33 | /* DCB data structures */ 34 | 35 | #define IXGBE_MAX_PACKET_BUFFERS 8 36 | #define MAX_USER_PRIORITY 8 37 | #define MAX_TRAFFIC_CLASS 8 38 | #define MAX_BW_GROUP 8 39 | #define BW_PERCENT 100 40 | 41 | #define DCB_TX_CONFIG 0 42 | #define DCB_RX_CONFIG 1 43 | 44 | /* DCB error Codes */ 45 | #define DCB_SUCCESS 0 46 | #define DCB_ERR_CONFIG -1 47 | #define DCB_ERR_PARAM -2 48 | 49 | /* Transmit and receive Errors */ 50 | /* Error in bandwidth group allocation */ 51 | #define DCB_ERR_BW_GROUP -3 52 | /* Error in traffic class bandwidth allocation */ 53 | #define DCB_ERR_TC_BW -4 54 | /* Traffic class has both link strict and group strict enabled */ 55 | #define DCB_ERR_LS_GS -5 56 | /* Link strict traffic class has non zero bandwidth */ 57 | #define DCB_ERR_LS_BW_NONZERO -6 58 | /* Link strict bandwidth group has non zero bandwidth */ 59 | #define DCB_ERR_LS_BWG_NONZERO -7 60 | /* Traffic class has zero bandwidth */ 61 | #define DCB_ERR_TC_BW_ZERO -8 62 | 63 | #define DCB_NOT_IMPLEMENTED 0x7FFFFFFF 64 | 65 | struct dcb_pfc_tc_debug { 66 | u8 tc; 67 | u8 
pause_status; 68 | u64 pause_quanta; 69 | }; 70 | 71 | enum strict_prio_type { 72 | prio_none = 0, 73 | prio_group, 74 | prio_link 75 | }; 76 | 77 | /* DCB capability definitions */ 78 | #define IXGBE_DCB_PG_SUPPORT 0x00000001 79 | #define IXGBE_DCB_PFC_SUPPORT 0x00000002 80 | #define IXGBE_DCB_BCN_SUPPORT 0x00000004 81 | #define IXGBE_DCB_UP2TC_SUPPORT 0x00000008 82 | #define IXGBE_DCB_GSP_SUPPORT 0x00000010 83 | 84 | #define IXGBE_DCB_8_TC_SUPPORT 0x80 85 | 86 | struct dcb_support { 87 | /* DCB capabilities */ 88 | u32 capabilities; 89 | 90 | /* Each bit represents a number of TCs configurable in the hw. 91 | * If 8 traffic classes can be configured, the value is 0x80. 92 | */ 93 | u8 traffic_classes; 94 | u8 pfc_traffic_classes; 95 | }; 96 | 97 | /* Traffic class bandwidth allocation per direction */ 98 | struct tc_bw_alloc { 99 | u8 bwg_id; /* Bandwidth Group (BWG) ID */ 100 | u8 bwg_percent; /* % of BWG's bandwidth */ 101 | u8 link_percent; /* % of link bandwidth */ 102 | u8 up_to_tc_bitmap; /* User Priority to Traffic Class mapping */ 103 | u16 data_credits_refill; /* Credit refill amount in 64B granularity */ 104 | u16 data_credits_max; /* Max credits for a configured packet buffer 105 | * in 64B granularity.*/ 106 | enum strict_prio_type prio_type; /* Link or Group Strict Priority */ 107 | }; 108 | 109 | enum dcb_pfc_type { 110 | pfc_disabled = 0, 111 | pfc_enabled_full, 112 | pfc_enabled_tx, 113 | pfc_enabled_rx 114 | }; 115 | 116 | /* Traffic class configuration */ 117 | struct tc_configuration { 118 | struct tc_bw_alloc path[2]; /* One each for Tx/Rx */ 119 | enum dcb_pfc_type dcb_pfc; /* Class based flow control setting */ 120 | 121 | u16 desc_credits_max; /* For Tx Descriptor arbitration */ 122 | u8 tc; /* Traffic class (TC) */ 123 | }; 124 | 125 | enum dcb_rx_pba_cfg { 126 | pba_equal, /* PBA[0-7] each use 64KB FIFO */ 127 | pba_80_48 /* PBA[0-3] each use 80KB, PBA[4-7] each use 48KB */ 128 | }; 129 | 130 | struct dcb_num_tcs { 131 | u8 pg_tcs; 132 | 
u8 pfc_tcs; 133 | }; 134 | 135 | struct ixgbe_dcb_config { 136 | struct tc_configuration tc_config[MAX_TRAFFIC_CLASS]; 137 | struct dcb_support support; 138 | struct dcb_num_tcs num_tcs; 139 | u8 bw_percentage[2][MAX_BW_GROUP]; /* One each for Tx/Rx */ 140 | bool pfc_mode_enable; 141 | bool round_robin_enable; 142 | 143 | enum dcb_rx_pba_cfg rx_pba_cfg; 144 | 145 | u32 dcb_cfg_version; /* Not used...OS-specific? */ 146 | u32 link_speed; /* For bandwidth allocation validation purpose */ 147 | }; 148 | 149 | /* DCB driver APIs */ 150 | 151 | /* DCB rule checking function.*/ 152 | s32 ixgbe_dcb_check_config(struct ixgbe_dcb_config *config); 153 | 154 | /* DCB credits calculation */ 155 | s32 ixgbe_dcb_calculate_tc_credits(struct ixgbe_dcb_config *config, 156 | u8 direction); 157 | 158 | /* DCB PFC functions */ 159 | s32 ixgbe_dcb_config_pfc(struct ixgbe_hw *hw, 160 | struct ixgbe_dcb_config *dcb_config); 161 | s32 ixgbe_dcb_get_pfc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats, 162 | u8 tc_count); 163 | 164 | /* DCB traffic class stats */ 165 | s32 ixgbe_dcb_config_tc_stats(struct ixgbe_hw *); 166 | s32 ixgbe_dcb_get_tc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats, 167 | u8 tc_count); 168 | 169 | /* DCB config arbiters */ 170 | s32 ixgbe_dcb_config_tx_desc_arbiter(struct ixgbe_hw *hw, 171 | struct ixgbe_dcb_config *dcb_config); 172 | s32 ixgbe_dcb_config_tx_data_arbiter(struct ixgbe_hw *hw, 173 | struct ixgbe_dcb_config *dcb_config); 174 | s32 ixgbe_dcb_config_rx_arbiter(struct ixgbe_hw *hw, 175 | struct ixgbe_dcb_config *dcb_config); 176 | 177 | /* DCB hw initialization */ 178 | s32 ixgbe_dcb_hw_config(struct ixgbe_hw *hw, struct ixgbe_dcb_config *config); 179 | 180 | 181 | /* DCB definitions for credit calculation */ 182 | #define MAX_CREDIT_REFILL 511 /* 0x1FF * 64B = 32704B */ 183 | #define MINIMUM_CREDIT_REFILL 5 /* 5*64B = 320B */ 184 | #define MINIMUM_CREDIT_FOR_JUMBO 145 /* 145 = UpperBound((9*1024+54)/64B) 185 | * for 9KB jumbo frame */ 
186 | #define DCB_MAX_TSO_SIZE (32 * 1024) /* MAX TSO packet size supported 187 | * in DCB mode; parenthesized so the macro is safe inside larger expressions */ 188 | #define MINIMUM_CREDIT_FOR_TSO (DCB_MAX_TSO_SIZE/64 + 1) /* 513 for 32KB TSO 189 | * packet */ 190 | #define MAX_CREDIT 4095 /* Maximum credit supported: 191 | * 256KB * 1024 / 64B */ 192 | 193 | #endif /* _DCB_CONFIG_H_ */ 194 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_dcb_82598.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #ifndef _DCB_82598_CONFIG_H_ 29 | #define _DCB_82598_CONFIG_H_ 30 | 31 | /* DCB register definitions */ 32 | 33 | #define IXGBE_DPMCS_MTSOS_SHIFT 16 34 | #define IXGBE_DPMCS_TDPAC 0x00000001 /* 0 Round Robin, 35 | * 1 DFP - Deficit Fixed Priority */ 36 | #define IXGBE_DPMCS_TRM 0x00000010 /* Transmit Recycle Mode */ 37 | #define IXGBE_DPMCS_ARBDIS 0x00000040 /* DCB arbiter disable */ 38 | #define IXGBE_DPMCS_TSOEF 0x00080000 /* TSO Expand Factor: 0=x4, 1=x2 */ 39 | 40 | #define IXGBE_RUPPBMR_MQA 0x80000000 /* Enable UP to queue mapping */ 41 | 42 | #define IXGBE_RT2CR_MCL_SHIFT 12 /* Offset to Max Credit Limit setting */ 43 | #define IXGBE_RT2CR_LSP 0x80000000 /* LSP enable bit */ 44 | 45 | #define IXGBE_RDRXCTL_MPBEN 0x00000010 /* DMA config for multiple packet 46 | * buffers enable */ 47 | #define IXGBE_RDRXCTL_MCEN 0x00000040 /* DMA config for multiple cores 48 | * (RSS) enable */ 49 | 50 | #define IXGBE_TDTQ2TCCR_MCL_SHIFT 12 51 | #define IXGBE_TDTQ2TCCR_BWG_SHIFT 9 52 | #define IXGBE_TDTQ2TCCR_GSP 0x40000000 53 | #define IXGBE_TDTQ2TCCR_LSP 0x80000000 54 | 55 | #define IXGBE_TDPT2TCCR_MCL_SHIFT 12 56 | #define IXGBE_TDPT2TCCR_BWG_SHIFT 9 57 | #define IXGBE_TDPT2TCCR_GSP 0x40000000 58 | #define IXGBE_TDPT2TCCR_LSP 0x80000000 59 | 60 | #define IXGBE_PDPMCS_TPPAC 0x00000020 /* 0 Round Robin, 61 | * 1 DFP - Deficit Fixed Priority */ 62 | #define IXGBE_PDPMCS_ARBDIS 0x00000040 /* Arbiter disable */ 63 | #define IXGBE_PDPMCS_TRM 0x00000100 /* Transmit Recycle Mode enable */ 64 | 65 | #define IXGBE_DTXCTL_ENDBUBD 0x00000004 /* Enable DBU buffer division */ 66 | 67 | #define IXGBE_TXPBSIZE_40KB 0x0000A000 /* 40KB Packet Buffer */ 68 | #define IXGBE_RXPBSIZE_48KB 0x0000C000 /* 48KB Packet Buffer */ 69 | #define IXGBE_RXPBSIZE_64KB 0x00010000 /* 64KB Packet Buffer */ 70 | #define IXGBE_RXPBSIZE_80KB 0x00014000 /* 80KB Packet 
Buffer */ 71 | 72 | /* DCB hardware-specific driver APIs */ 73 | 74 | /* DCB PFC functions */ 75 | s32 ixgbe_dcb_config_pfc_82598(struct ixgbe_hw *hw, 76 | struct ixgbe_dcb_config *dcb_config); 77 | s32 ixgbe_dcb_get_pfc_stats_82598(struct ixgbe_hw *hw, 78 | struct ixgbe_hw_stats *stats, 79 | u8 tc_count); 80 | 81 | /* DCB traffic class stats */ 82 | s32 ixgbe_dcb_config_tc_stats_82598(struct ixgbe_hw *hw); 83 | s32 ixgbe_dcb_get_tc_stats_82598(struct ixgbe_hw *hw, 84 | struct ixgbe_hw_stats *stats, 85 | u8 tc_count); 86 | 87 | /* DCB config arbiters */ 88 | s32 ixgbe_dcb_config_tx_desc_arbiter_82598(struct ixgbe_hw *hw, 89 | struct ixgbe_dcb_config *dcb_config); 90 | s32 ixgbe_dcb_config_tx_data_arbiter_82598(struct ixgbe_hw *hw, 91 | struct ixgbe_dcb_config *dcb_config); 92 | s32 ixgbe_dcb_config_rx_arbiter_82598(struct ixgbe_hw *hw, 93 | struct ixgbe_dcb_config *dcb_config); 94 | 95 | /* DCB hw initialization */ 96 | s32 ixgbe_dcb_hw_config_82598(struct ixgbe_hw *hw, 97 | struct ixgbe_dcb_config *config); 98 | 99 | #endif /* _DCB_82598_CONFIG_H */ 100 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_dcb_82599.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 
14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #ifndef _DCB_82599_CONFIG_H_ 29 | #define _DCB_82599_CONFIG_H_ 30 | 31 | /* DCB register definitions */ 32 | #define IXGBE_RTTDCS_TDPAC 0x00000001 /* 0 Round Robin, 33 | * 1 WSP - Weighted Strict Priority 34 | */ 35 | #define IXGBE_RTTDCS_VMPAC 0x00000002 /* 0 Round Robin, 36 | * 1 WRR - Weighted Round Robin 37 | */ 38 | #define IXGBE_RTTDCS_TDRM 0x00000010 /* Transmit Recycle Mode */ 39 | #define IXGBE_RTTDCS_BDPM 0x00400000 /* Bypass Data Pipe - must clear! */ 40 | #define IXGBE_RTTDCS_BPBFSM 0x00800000 /* Bypass PB Free Space - must 41 | * clear! 
42 | */ 43 | #define IXGBE_RTTDCS_SPEED_CHG 0x80000000 /* Link speed change */ 44 | 45 | /* Receive UP2TC mapping */ 46 | #define IXGBE_RTRUP2TC_UP_SHIFT 3 47 | /* Transmit UP2TC mapping */ 48 | #define IXGBE_RTTUP2TC_UP_SHIFT 3 49 | 50 | #define IXGBE_RTRPT4C_MCL_SHIFT 12 /* Offset to Max Credit Limit setting */ 51 | #define IXGBE_RTRPT4C_BWG_SHIFT 9 /* Offset to BWG index */ 52 | #define IXGBE_RTRPT4C_GSP 0x40000000 /* GSP enable bit */ 53 | #define IXGBE_RTRPT4C_LSP 0x80000000 /* LSP enable bit */ 54 | 55 | #define IXGBE_RDRXCTL_MPBEN 0x00000010 /* DMA config for multiple packet 56 | * buffers enable 57 | */ 58 | #define IXGBE_RDRXCTL_MCEN 0x00000040 /* DMA config for multiple cores 59 | * (RSS) enable 60 | */ 61 | 62 | /* RTRPCS Bit Masks */ 63 | #define IXGBE_RTRPCS_RRM 0x00000002 /* Receive Recycle Mode enable */ 64 | /* Receive Arbitration Control: 0 Round Robin, 1 DFP */ 65 | #define IXGBE_RTRPCS_RAC 0x00000004 66 | #define IXGBE_RTRPCS_ARBDIS 0x00000040 /* Arbitration disable bit */ 67 | 68 | /* RTTDT2C Bit Masks */ 69 | #define IXGBE_RTTDT2C_MCL_SHIFT 12 70 | #define IXGBE_RTTDT2C_BWG_SHIFT 9 71 | #define IXGBE_RTTDT2C_GSP 0x40000000 72 | #define IXGBE_RTTDT2C_LSP 0x80000000 73 | 74 | #define IXGBE_RTTPT2C_MCL_SHIFT 12 75 | #define IXGBE_RTTPT2C_BWG_SHIFT 9 76 | #define IXGBE_RTTPT2C_GSP 0x40000000 77 | #define IXGBE_RTTPT2C_LSP 0x80000000 78 | 79 | /* RTTPCS Bit Masks */ 80 | #define IXGBE_RTTPCS_TPPAC 0x00000020 /* 0 Round Robin, 81 | * 1 SP - Strict Priority 82 | */ 83 | #define IXGBE_RTTPCS_ARBDIS 0x00000040 /* Arbiter disable */ 84 | #define IXGBE_RTTPCS_TPRM 0x00000100 /* Transmit Recycle Mode enable */ 85 | #define IXGBE_RTTPCS_ARBD_SHIFT 22 86 | #define IXGBE_RTTPCS_ARBD_DCB 0x4 /* Arbitration delay in DCB mode */ 87 | 88 | #define IXGBE_TXPBSIZE_20KB 0x00005000 /* 20KB Packet Buffer */ 89 | #define IXGBE_TXPBSIZE_40KB 0x0000A000 /* 40KB Packet Buffer */ 90 | #define IXGBE_RXPBSIZE_48KB 0x0000C000 /* 48KB Packet Buffer */ 91 | #define 
IXGBE_RXPBSIZE_64KB 0x00010000 /* 64KB Packet Buffer */ 92 | #define IXGBE_RXPBSIZE_80KB 0x00014000 /* 80KB Packet Buffer */ 93 | #define IXGBE_RXPBSIZE_128KB 0x00020000 /* 128KB Packet Buffer */ 94 | 95 | #define IXGBE_TXPBTHRESH_DCB 0xA /* THRESH value for DCB mode */ 96 | 97 | 98 | /* DCB hardware-specific driver APIs */ 99 | 100 | /* DCB PFC functions */ 101 | s32 ixgbe_dcb_config_pfc_82599(struct ixgbe_hw *hw, 102 | struct ixgbe_dcb_config *dcb_config); 103 | s32 ixgbe_dcb_get_pfc_stats_82599(struct ixgbe_hw *hw, 104 | struct ixgbe_hw_stats *stats, 105 | u8 tc_count); 106 | 107 | /* DCB traffic class stats */ 108 | s32 ixgbe_dcb_config_tc_stats_82599(struct ixgbe_hw *hw); 109 | s32 ixgbe_dcb_get_tc_stats_82599(struct ixgbe_hw *hw, 110 | struct ixgbe_hw_stats *stats, 111 | u8 tc_count); 112 | 113 | /* DCB config arbiters */ 114 | s32 ixgbe_dcb_config_tx_desc_arbiter_82599(struct ixgbe_hw *hw, 115 | struct ixgbe_dcb_config *dcb_config); 116 | s32 ixgbe_dcb_config_tx_data_arbiter_82599(struct ixgbe_hw *hw, 117 | struct ixgbe_dcb_config *dcb_config); 118 | s32 ixgbe_dcb_config_rx_arbiter_82599(struct ixgbe_hw *hw, 119 | struct ixgbe_dcb_config *dcb_config); 120 | 121 | /* DCB hw initialization */ 122 | s32 ixgbe_dcb_hw_config_82599(struct ixgbe_hw *hw, 123 | struct ixgbe_dcb_config *config); 124 | 125 | #endif /* _DCB_82599_CONFIG_H */ 126 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_fcoe.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 
9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #include "ixgbe.h" 29 | 30 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_fcoe.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #ifndef _IXGBE_FCOE_H 29 | #define _IXGBE_FCOE_H 30 | 31 | 32 | #endif /* _IXGBE_FCOE_H */ 33 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_osdep.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | 29 | /* glue for the OS independent part of ixgbe 30 | * includes register access macros 31 | */ 32 | 33 | #ifndef _IXGBE_OSDEP_H_ 34 | #define _IXGBE_OSDEP_H_ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "kcompat.h" 42 | 43 | 44 | #ifndef msleep 45 | #define msleep(x) do { if(in_interrupt()) { \ 46 | /* Don't sleep in interrupt context! */ \ 47 | BUG(); \ 48 | } else { \ 49 | msleep(x); \ 50 | } } while (0) 51 | 52 | #endif 53 | 54 | #undef ASSERT 55 | 56 | #ifdef DBG 57 | #define hw_dbg(hw, S, A...) printk(KERN_DEBUG S, ## A) 58 | #else 59 | #define hw_dbg(hw, S, A...) do {} while (0) 60 | #endif 61 | 62 | #ifdef DBG 63 | #define IXGBE_WRITE_REG(a, reg, value) do {\ 64 | switch (reg) { \ 65 | case IXGBE_EIMS: \ 66 | case IXGBE_EIMC: \ 67 | case IXGBE_EIAM: \ 68 | case IXGBE_EIAC: \ 69 | case IXGBE_EICR: \ 70 | case IXGBE_EICS: \ 71 | printk("%s: Reg - 0x%05X, value - 0x%08X\n", __FUNCTION__, \ 72 | reg, (u32)(value)); /* fallthrough */ \ 73 | default: \ 74 | break; \ 75 | } \ 76 | writel((value), ((a)->hw_addr + (reg))); \ 77 | } while (0) 78 | #else 79 | #define IXGBE_WRITE_REG(a, reg, value) writel((value), ((a)->hw_addr + (reg))) 80 | #endif 81 | 82 | #define IXGBE_READ_REG(a, reg) readl((a)->hw_addr + (reg)) 83 | 84 | #define IXGBE_WRITE_REG_ARRAY(a, reg, offset, value) ( \ 85 | writel((value), ((a)->hw_addr + (reg) + ((offset) << 2)))) 86 | 87 | #define IXGBE_READ_REG_ARRAY(a, reg, offset) ( \ 88 | readl((a)->hw_addr + (reg) + ((offset) << 2))) 89 | 90 | #ifndef writeq /* fallback when no writeq is defined: two 32-bit writes, low dword first, then high dword at addr + 4; wrapped in do/while so it acts as one statement */ 91 | #define writeq(val, addr) do { writel((u32) (val), (addr)); \ 92 | writel((u32) ((u64) (val) >> 32), ((addr) + 4)); } while (0) 93 | #endif 94 | 95 | #define IXGBE_WRITE_REG64(a, reg, value) writeq((value), ((a)->hw_addr + (reg))) 96 | 97 | #define IXGBE_WRITE_FLUSH(a) IXGBE_READ_REG(a, IXGBE_STATUS) 98 | struct ixgbe_hw; 99 | extern u16 
ixgbe_read_pci_cfg_word(struct ixgbe_hw *hw, u32 reg); 100 | extern void ixgbe_write_pci_cfg_word(struct ixgbe_hw *hw, u32 reg, u16 value); 101 | #define IXGBE_READ_PCIE_WORD ixgbe_read_pci_cfg_word 102 | #define IXGBE_WRITE_PCIE_WORD ixgbe_write_pci_cfg_word 103 | #define IXGBE_EEPROM_GRANT_ATTEMPS 100 104 | #define IXGBE_HTONL(_i) htonl(_i) 105 | #define IXGBE_HTONS(_i) htons(_i) 106 | 107 | #endif /* _IXGBE_OSDEP_H_ */ 108 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_phy.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #ifndef _IXGBE_PHY_H_ 29 | #define _IXGBE_PHY_H_ 30 | 31 | #include "ixgbe_type.h" 32 | #define IXGBE_I2C_EEPROM_DEV_ADDR 0xA0 33 | 34 | /* EEPROM byte offsets */ 35 | #define IXGBE_SFF_IDENTIFIER 0x0 36 | #define IXGBE_SFF_IDENTIFIER_SFP 0x3 37 | #define IXGBE_SFF_VENDOR_OUI_BYTE0 0x25 38 | #define IXGBE_SFF_VENDOR_OUI_BYTE1 0x26 39 | #define IXGBE_SFF_VENDOR_OUI_BYTE2 0x27 40 | #define IXGBE_SFF_1GBE_COMP_CODES 0x6 41 | #define IXGBE_SFF_10GBE_COMP_CODES 0x3 42 | #define IXGBE_SFF_CABLE_TECHNOLOGY 0x8 43 | 44 | /* Bitmasks */ 45 | #define IXGBE_SFF_DA_PASSIVE_CABLE 0x4 46 | #define IXGBE_SFF_1GBASESX_CAPABLE 0x1 47 | #define IXGBE_SFF_1GBASELX_CAPABLE 0x2 48 | #define IXGBE_SFF_10GBASESR_CAPABLE 0x10 49 | #define IXGBE_SFF_10GBASELR_CAPABLE 0x20 50 | #define IXGBE_I2C_EEPROM_READ_MASK 0x100 51 | #define IXGBE_I2C_EEPROM_STATUS_MASK 0x3 52 | #define IXGBE_I2C_EEPROM_STATUS_NO_OPERATION 0x0 53 | #define IXGBE_I2C_EEPROM_STATUS_PASS 0x1 54 | #define IXGBE_I2C_EEPROM_STATUS_FAIL 0x2 55 | #define IXGBE_I2C_EEPROM_STATUS_IN_PROGRESS 0x3 56 | 57 | /* Bit-shift macros */ 58 | #define IXGBE_SFF_VENDOR_OUI_BYTE0_SHIFT 24 59 | #define IXGBE_SFF_VENDOR_OUI_BYTE1_SHIFT 16 60 | #define IXGBE_SFF_VENDOR_OUI_BYTE2_SHIFT 8 61 | 62 | /* Vendor OUIs: format of OUI is 0x[byte0][byte1][byte2][00] */ 63 | #define IXGBE_SFF_VENDOR_OUI_TYCO 0x00407600 64 | #define IXGBE_SFF_VENDOR_OUI_FTL 0x00906500 65 | #define IXGBE_SFF_VENDOR_OUI_AVAGO 0x00176A00 66 | #define IXGBE_SFF_VENDOR_OUI_INTEL 0x001B2100 67 | 68 | /* I2C SDA and SCL timing parameters for standard mode */ 69 | #define IXGBE_I2C_T_HD_STA 4 70 | #define IXGBE_I2C_T_LOW 5 71 | #define IXGBE_I2C_T_HIGH 4 72 | #define IXGBE_I2C_T_SU_STA 5 73 | #define IXGBE_I2C_T_HD_DATA 5 74 | #define IXGBE_I2C_T_SU_DATA 1 75 | #define IXGBE_I2C_T_RISE 1 76 | #define IXGBE_I2C_T_FALL 1 77 | #define 
IXGBE_I2C_T_SU_STO 4 78 | #define IXGBE_I2C_T_BUF 5 79 | 80 | 81 | s32 ixgbe_init_phy_ops_generic(struct ixgbe_hw *hw); 82 | bool ixgbe_validate_phy_addr(struct ixgbe_hw *hw, u32 phy_addr); 83 | enum ixgbe_phy_type ixgbe_get_phy_type_from_id(u32 phy_id); 84 | s32 ixgbe_get_phy_id(struct ixgbe_hw *hw); 85 | s32 ixgbe_identify_phy_generic(struct ixgbe_hw *hw); 86 | s32 ixgbe_reset_phy_generic(struct ixgbe_hw *hw); 87 | s32 ixgbe_read_phy_reg_generic(struct ixgbe_hw *hw, u32 reg_addr, 88 | u32 device_type, u16 *phy_data); 89 | s32 ixgbe_write_phy_reg_generic(struct ixgbe_hw *hw, u32 reg_addr, 90 | u32 device_type, u16 phy_data); 91 | s32 ixgbe_setup_phy_link_generic(struct ixgbe_hw *hw); 92 | s32 ixgbe_setup_phy_link_speed_generic(struct ixgbe_hw *hw, 93 | ixgbe_link_speed speed, 94 | bool autoneg, 95 | bool autoneg_wait_to_complete); 96 | s32 ixgbe_get_copper_link_capabilities_generic(struct ixgbe_hw *hw, 97 | ixgbe_link_speed *speed, 98 | bool *autoneg); 99 | 100 | /* PHY specific */ 101 | s32 ixgbe_check_phy_link_tnx(struct ixgbe_hw *hw, 102 | ixgbe_link_speed *speed, 103 | bool *link_up); 104 | s32 ixgbe_setup_phy_link_tnx(struct ixgbe_hw *hw); 105 | s32 ixgbe_get_phy_firmware_version_tnx(struct ixgbe_hw *hw, 106 | u16 *firmware_version); 107 | s32 ixgbe_get_phy_firmware_version_aq(struct ixgbe_hw *hw, 108 | u16 *firmware_version); 109 | 110 | s32 ixgbe_reset_phy_nl(struct ixgbe_hw *hw); 111 | s32 ixgbe_identify_sfp_module_generic(struct ixgbe_hw *hw); 112 | s32 ixgbe_get_sfp_init_sequence_offsets(struct ixgbe_hw *hw, 113 | u16 *list_offset, 114 | u16 *data_offset); 115 | s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, 116 | u8 dev_addr, u8 *data); 117 | s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset, 118 | u8 dev_addr, u8 data); 119 | s32 ixgbe_read_i2c_eeprom_generic(struct ixgbe_hw *hw, u8 byte_offset, 120 | u8 *eeprom_data); 121 | s32 ixgbe_write_i2c_eeprom_generic(struct ixgbe_hw *hw, u8 byte_offset, 122 | u8 
eeprom_data); 123 | #endif /* _IXGBE_PHY_H_ */ 124 | -------------------------------------------------------------------------------- /IOEngine/driver/ixgbe_sysfs.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | 3 | Intel 10 Gigabit PCI Express Linux driver 4 | Copyright(c) 1999 - 2009 Intel Corporation. 5 | 6 | This program is free software; you can redistribute it and/or modify it 7 | under the terms and conditions of the GNU General Public License, 8 | version 2, as published by the Free Software Foundation. 9 | 10 | This program is distributed in the hope it will be useful, but WITHOUT 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | more details. 14 | 15 | You should have received a copy of the GNU General Public License along with 16 | this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 | 19 | The full GNU General Public License is included in this distribution in 20 | the file called "COPYING". 21 | 22 | Contact Information: 23 | e1000-devel Mailing List 24 | Intel Corporation, 5200 N.E. 
Elam Young Parkway, Hillsboro, OR 97124-6497 25 | 26 | *******************************************************************************/ 27 | 28 | #include "ixgbe.h" 29 | 30 | -------------------------------------------------------------------------------- /IOEngine/lib/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-O2 -g -Wall -I../include 2 | 3 | .PHONY: clean 4 | 5 | static: psio.c 6 | rm -f psio.o 7 | gcc $(CFLAGS) -c -o psio.o psio.c 8 | ar rcs libpsio.a psio.o 9 | 10 | shared: psio.c 11 | rm -f psio.o 12 | gcc $(CFLAGS) -fPIC -c -o psio.o psio.c 13 | gcc -shared -Wl,-soname,libpsio.so.1 \ 14 | -o libpsio.so.1.0.0 psio.o 15 | 16 | clean: 17 | rm -f *.o *.a *.so.* 18 | -------------------------------------------------------------------------------- /IOEngine/lib/psio.c: -------------------------------------------------------------------------------- 1 | #ifndef _GNU_SOURCE 2 | #define _GNU_SOURCE 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #include "psio.h" 18 | 19 | static int k2ps_ifindex_map[PS_MAX_DEVICES]; 20 | static struct ps_device device_list[PS_MAX_DEVICES]; 21 | 22 | int ps_list_devices(struct ps_device *devices) 23 | { 24 | struct ps_handle handle; 25 | int ret, i; 26 | 27 | if (ps_init_handle(&handle)) 28 | return -1; 29 | 30 | ret = ioctl(handle.fd, PS_IOC_LIST_DEVICES, devices); 31 | for (i = 0; i < ret; i++) { 32 | k2ps_ifindex_map[devices[i].kifindex] = i; 33 | device_list[i] = devices[i]; 34 | } 35 | 36 | ps_close_handle(&handle); 37 | 38 | return ret; 39 | } 40 | 41 | int ps_init_handle(struct ps_handle *handle) 42 | { 43 | int i; 44 | 45 | memset(handle, 0, sizeof(struct ps_handle)); 46 | 47 | handle->fd = open("/dev/packet_shader", O_RDWR); 48 | if (handle->fd == -1) 49 | return -1; 50 | 51 | for (i = 0; i < PS_MAX_DEVICES; i++) 52 | k2ps_ifindex_map[i] = -1; 53 | 
return 0; 54 | } 55 | 56 | void ps_close_handle(struct ps_handle *handle) 57 | { 58 | close(handle->fd); 59 | handle->fd = -1; 60 | } 61 | 62 | int ps_attach_rx_device(struct ps_handle *handle, struct ps_queue *queue) 63 | { 64 | return ioctl(handle->fd, PS_IOC_ATTACH_RX_DEVICE, queue); 65 | } 66 | 67 | int ps_detach_rx_device(struct ps_handle *handle, struct ps_queue *queue) 68 | { 69 | return ioctl(handle->fd, PS_IOC_DETACH_RX_DEVICE, queue); 70 | } 71 | 72 | int ps_alloc_chunk(struct ps_handle *handle, struct ps_chunk *chunk) 73 | { 74 | memset(chunk, 0, sizeof(*chunk)); 75 | 76 | chunk->info = (struct ps_pkt_info *)malloc( 77 | sizeof(struct ps_pkt_info) * PS_MAX_CHUNK_SIZE); 78 | if (!chunk->info) 79 | return -1; 80 | 81 | chunk->buf = (char *)mmap(NULL, PS_MAX_PACKET_SIZE * PS_MAX_CHUNK_SIZE, 82 | PROT_READ | PROT_WRITE, MAP_SHARED, 83 | handle->fd, 0); 84 | if ((long)chunk->buf == -1) 85 | return -1; 86 | 87 | return 0; 88 | } 89 | 90 | void ps_free_chunk(struct ps_chunk *chunk) 91 | { 92 | free(chunk->info); 93 | munmap(chunk->buf, PS_MAX_PACKET_SIZE * PS_MAX_CHUNK_SIZE); 94 | 95 | chunk->info = NULL; 96 | chunk->buf = NULL; 97 | } 98 | 99 | int ps_recv_chunk(struct ps_handle *handle, struct ps_chunk *chunk) 100 | { 101 | int cnt; 102 | 103 | cnt = ioctl(handle->fd, PS_IOC_RECV_CHUNK, chunk); 104 | if (cnt > 0) { 105 | int i; 106 | int ifindex = chunk->queue.ifindex; 107 | 108 | handle->rx_chunks[ifindex]++; 109 | handle->rx_packets[ifindex] += cnt; 110 | 111 | for (i = 0; i < cnt; i++) 112 | handle->rx_bytes[ifindex] += chunk->info[i].len; 113 | } 114 | 115 | return cnt; 116 | } 117 | 118 | /* Receive chunk from a specified queue. 
*/ 119 | int ps_recv_queue(struct ps_handle *handle, struct ps_chunk *chunk) 120 | { 121 | int cnt; 122 | 123 | cnt = ioctl(handle->fd, PS_IOC_RECV_QUEUE, chunk); 124 | if (cnt > 0) { 125 | int i; 126 | int ifindex = chunk->queue.ifindex; 127 | 128 | handle->rx_chunks[ifindex]++; 129 | handle->rx_packets[ifindex] += cnt; 130 | 131 | for (i = 0; i < cnt; i++) 132 | handle->rx_bytes[ifindex] += chunk->info[i].len; 133 | } 134 | 135 | return cnt; 136 | } 137 | 138 | /* Send the given chunk to the modified driver. */ 139 | int ps_send_chunk(struct ps_handle *handle, struct ps_chunk *chunk) 140 | { 141 | int cnt; 142 | 143 | cnt = ioctl(handle->fd, PS_IOC_SEND_CHUNK, chunk); 144 | if (cnt >= 0) { 145 | int i; 146 | int ifindex = chunk->queue.ifindex; 147 | 148 | handle->tx_chunks[ifindex]++; 149 | handle->tx_packets[ifindex] += cnt; 150 | 151 | for (i = 0; i < cnt; i++) 152 | handle->tx_bytes[ifindex] += chunk->info[i].len; 153 | } 154 | 155 | return cnt; 156 | } 157 | 158 | int ps_slowpath_packet(struct ps_handle *handle, struct ps_packet *packet) 159 | { 160 | return ioctl(handle->fd, PS_IOC_SLOWPATH_PACKET, packet); 161 | } 162 | 163 | int ps_alloc_view_chunk(struct ps_chunk *view_chunk, struct ps_chunk *src_chunk, bool copy_info) 164 | { 165 | memset(view_chunk, 0, sizeof(*view_chunk)); 166 | 167 | view_chunk->info = (struct ps_pkt_info *) malloc( 168 | sizeof(struct ps_pkt_info) * PS_MAX_CHUNK_SIZE); 169 | if (!view_chunk->info) 170 | return -1; 171 | 172 | if (copy_info) { 173 | memcpy(view_chunk->info, src_chunk->info, sizeof(struct ps_pkt_info) * PS_MAX_CHUNK_SIZE); 174 | view_chunk->cnt = src_chunk->cnt; 175 | view_chunk->queue = src_chunk->queue; 176 | } else { 177 | memset(view_chunk->info, 0, sizeof(struct ps_pkt_info) * PS_MAX_CHUNK_SIZE); 178 | view_chunk->cnt = 0; 179 | view_chunk->queue.qidx = -1; 180 | view_chunk->queue.ifindex = -1; 181 | } 182 | 183 | view_chunk->buf = src_chunk->buf; 184 | 185 | return 0; 186 | } 187 | 188 | int 
/*
 * Pin the calling process to `cpu` and, when more than one NUMA node is
 * reported, restrict its memory binding to node (cpu % 2).
 *
 * Returns the result of sched_setaffinity(), or -1 with errno set to EINVAL
 * when `cpu` is outside [0, number of online CPUs), or -1 when the CPU set
 * cannot be allocated.
 */
int bind_cpu(int cpu)
{
	cpu_set_t *cmask;
	struct bitmask *bmask;
	size_t ncpu, setsize;
	int ret;

	ncpu = get_num_cpus();

	if (cpu < 0 || cpu >= (int)ncpu) {
		/* errno values are positive; -EINVAL is a kernel-side idiom. */
		errno = EINVAL;
		return -1;
	}

	cmask = CPU_ALLOC(ncpu);
	if (cmask == NULL)
		return -1;

	setsize = CPU_ALLOC_SIZE(ncpu);
	CPU_ZERO_S(setsize, cmask);
	CPU_SET_S(cpu, setsize, cmask);

	/* The second argument is the mask size in bytes, not the CPU count. */
	ret = sched_setaffinity(0, setsize, cmask);

	CPU_FREE(cmask);

	/* skip NUMA stuff for UMA systems */
	if (numa_max_node() == 0)
		return ret;

	bmask = numa_bitmask_alloc(16);
	assert(bmask);

	/* NOTE(review): cpu % 2 assumes CPUs are interleaved across two nodes - confirm */
	numa_bitmask_setbit(bmask, cpu % 2);
	numa_set_membind(bmask);
	numa_bitmask_free(bmask);

	return ret;
}
handle->tx_packets[ifindex]; 143 | total_rx_packets += handle->rx_packets[ifindex]; 144 | } 145 | 146 | printf("----------\n"); 147 | printf("CPU %d: %ld packets received, %ld packets transmitted\n", 148 | my_cpu, total_rx_packets, total_tx_packets); 149 | 150 | for (i = 0; i < num_devices_attached; i++) { 151 | char *dev = devices[devices_attached[i]].name; 152 | ifindex = devices_attached[i]; 153 | 154 | if (handle->tx_packets[ifindex] == 0) 155 | continue; 156 | 157 | printf(" %s: ", dev); 158 | 159 | printf("RX %ld packets " 160 | "(%ld chunks, %.2f packets per chunk) ", 161 | handle->rx_packets[ifindex], 162 | handle->rx_chunks[ifindex], 163 | handle->rx_packets[ifindex] / 164 | (double)handle->rx_chunks[ifindex]); 165 | 166 | printf("TX %ld packets " 167 | "(%ld chunks, %.2f packets per chunk)\n", 168 | handle->tx_packets[ifindex], 169 | handle->tx_chunks[ifindex], 170 | handle->tx_packets[ifindex] / 171 | (double)handle->tx_chunks[ifindex]); 172 | } 173 | 174 | exit(0); 175 | } 176 | 177 | void echo() 178 | { 179 | struct ps_handle *handle = &handles[my_cpu]; 180 | struct ps_chunk chunk; 181 | 182 | int i; 183 | int working = 0; 184 | 185 | assert(ps_init_handle(handle) == 0); 186 | 187 | for (i = 0; i < num_devices_attached; i++) { 188 | struct ps_queue queue; 189 | if (devices[devices_attached[i]].num_rx_queues <= my_cpu) 190 | continue; 191 | 192 | if (devices[devices_attached[i]].num_tx_queues <= my_cpu) { 193 | printf("WARNING: xge%d has not enough TX queues!\n", 194 | devices_attached[i]); 195 | continue; 196 | } 197 | 198 | working = 1; 199 | queue.ifindex = devices_attached[i]; 200 | queue.qidx = my_cpu; 201 | 202 | printf("attaching RX queue xge%d:%d to CPU%d\n", queue.ifindex, queue.qidx, my_cpu); 203 | assert(ps_attach_rx_device(handle, &queue) == 0); 204 | } 205 | 206 | printf ("fd is %x\n", handle->fd); 207 | 208 | if (!working) 209 | goto done; 210 | 211 | assert(ps_alloc_chunk(handle, &chunk) == 0); 212 | 213 | chunk.recv_blocking = 1; 214 | 
215 | for (;;) { 216 | int ret; 217 | 218 | chunk.cnt = 64; 219 | ret = ps_recv_chunk(handle, &chunk); 220 | 221 | if (ret < 0) { 222 | if (errno == EINTR) 223 | continue; 224 | 225 | if (!chunk.recv_blocking && errno == EWOULDBLOCK) 226 | break; 227 | 228 | assert(0); 229 | } 230 | 231 | int j; 232 | for (j = 0; j < ret; j ++) { 233 | char *buf = chunk.buf + chunk.info[i].offset; 234 | for (i = 0; i < 32; i ++) 235 | printf("%x ", buf[i]); 236 | 237 | printf("\n"); 238 | } 239 | break; 240 | 241 | #if 0 242 | if (!sink) { 243 | chunk.cnt = ret; 244 | ret = ps_send_chunk(handle, &chunk); 245 | assert(ret >= 0); 246 | } 247 | #endif 248 | } 249 | 250 | done: 251 | ps_close_handle(handle); 252 | } 253 | 254 | int main(int argc, char **argv) 255 | { 256 | int num_cpus; 257 | int i ; 258 | 259 | num_cpus = get_num_cpus(); 260 | assert(num_cpus >= 1); 261 | 262 | num_devices = ps_list_devices(devices); 263 | if (num_devices == -1) { 264 | perror("ps_list_devices"); 265 | exit(1); 266 | } 267 | 268 | parse_opt(argc, argv); 269 | 270 | for (i = 0; i < 4; i++) { 271 | int ret = fork(); 272 | assert(ret >= 0); 273 | 274 | my_cpu = i; 275 | 276 | if (ret == 0) { 277 | bind_cpu(i); 278 | signal(SIGINT, handle_signal); 279 | 280 | echo(); 281 | return 0; 282 | } 283 | } 284 | 285 | signal(SIGINT, SIG_IGN); 286 | 287 | while (1) { 288 | int ret = wait(NULL); 289 | if (ret == -1 && errno == ECHILD) 290 | break; 291 | } 292 | 293 | return 0; 294 | } 295 | -------------------------------------------------------------------------------- /IOEngine/samples/list_devices/Makefile: -------------------------------------------------------------------------------- 1 | LIBS = -lpsio 2 | LIB_DIR = -L../../lib 3 | INC = -I../../include 4 | CFLAGS = $(INC) -O2 -g -Wall 5 | 6 | .PHONY: clean 7 | 8 | all: list_devices 9 | 10 | list_devices: list_devices.c 11 | gcc $(CFLAGS) -o list_devices list_devices.c $(LIB_DIR) $(LIBS) 12 | 13 | clean: 14 | rm -f list_devices 15 | 
-------------------------------------------------------------------------------- /IOEngine/samples/list_devices/list_devices.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "psio.h" 4 | 5 | int main() 6 | { 7 | int num_devices; 8 | struct ps_device devices[PS_MAX_DEVICES]; 9 | 10 | int i; 11 | 12 | num_devices = ps_list_devices(devices); 13 | if (num_devices == -1) { 14 | perror("ps_list_devices"); 15 | return 1; 16 | } 17 | 18 | printf("found %d device(s).\n", num_devices); 19 | 20 | for (i = 0; i < num_devices; i++) { 21 | struct ps_device *dev = &devices[i]; 22 | char *t = (char *)&dev->ip_addr; 23 | 24 | printf("%d: %s ", 25 | dev->ifindex, 26 | dev->name); 27 | 28 | printf("(%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX) ", 29 | dev->dev_addr[0], 30 | dev->dev_addr[1], 31 | dev->dev_addr[2], 32 | dev->dev_addr[3], 33 | dev->dev_addr[4], 34 | dev->dev_addr[5]); 35 | 36 | printf("%u.%u.%u.%u ", t[0], t[1], t[2], t[3]); 37 | 38 | printf("%d RX, %d TX queues; ", 39 | dev->num_rx_queues, 40 | dev->num_tx_queues); 41 | 42 | printf("node %d\n", dev->numa_node); 43 | } 44 | 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /IOEngine/samples/monitoring/thruput.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | # A simple live monitoring script for throughputs and other statistics 4 | # using ethtool. A fully utilized 10 Gbps link showes about 14.2M pps 5 | # with 64 B packets. 6 | 7 | # The best environment for this script is Python 3.x or higher. 8 | # However, you can run it even on Python 2.6 depending on your system. 9 | # The primary reason to use 3.x versions is "," formatting of numbers. 10 | # If you do not need that, just don't use '-f' option. 11 | # To see the result values in a naturally sorted order, use Python 2.7 12 | # or higher (due to OrderedDict). 
def execute(cmd, check_returncode=False):
    """Run `cmd` through the shell and return its standard output as text.

    cmd              -- command line, passed to the shell as-is.
    check_returncode -- when true, a non-zero exit status raises
                        subprocess.CalledProcessError.

    The raised error carries the captured output in its `output` attribute,
    so callers that print `e.output` see what the command wrote. Bytes that
    are not ASCII are replaced rather than raising UnicodeDecodeError.
    """
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    # Only stdout is piped; the command's stderr goes to our own stderr.
    stdout, _ = proc.communicate()
    text = stdout.decode('ascii', 'replace')
    if check_returncode and proc.returncode != 0:
        err = subprocess.CalledProcessError(proc.returncode, cmd)
        # Set as an attribute so this also works where the constructor
        # does not accept an output argument.
        err.output = text
        raise err
    return text
set('rx_queue_{0}_packets'.format(i) for i in range(num_cpus)) | \ 67 | set('tx_queue_{0}_bytes'.format(i) for i in range(num_cpus)) | \ 68 | set('rx_queue_{0}_bytes'.format(i) for i in range(num_cpus)) | \ 69 | set(('tx_bytes', 'rx_bytes', 'tx_errors', 'rx_errors', 70 | 'tx_packets', 'rx_packets', 'tx_dropped', 'rx_dropped', 71 | 'rx_missed_errors')) 72 | last_stats = {} 73 | stats = {} 74 | 75 | if opts.format_numbers: 76 | if sys.version_info < (3, 1): 77 | print('Comman-separated number formatting requires Python 3.1 or higher.', file=sys.stderr) 78 | sys.exit(1) 79 | int_format = '{0:>16,d}' 80 | else: 81 | int_format = '{0:>16d}' 82 | 83 | devs = [] 84 | print() 85 | while len(devs) == 0: 86 | devs = execute('ifconfig -s | grep xge | awk \'{print $1}\'').splitlines() 87 | print("\033[2K\033[1Ano xge devices found, waiting...") 88 | time.sleep(1) 89 | for dev in devs: 90 | last_stats[dev] = get_stats(dev) 91 | last_timestamp = time.time() 92 | 93 | try: 94 | while True: 95 | strbuf = [] 96 | strbuf.append('\033[2J\033[0;0H') 97 | strbuf.append(time.ctime() + '\n') 98 | strbuf.append('Thrughputs per second:\n') 99 | strbuf.append('{0:<20}'.format('FIELD')) 100 | try: 101 | for dev in devs: 102 | strbuf.append('{0:>16}'.format(dev)) 103 | stats[dev] = get_stats(dev) 104 | except subprocess.CalledProcessError as e: 105 | if e.returncode == 71: # means "No such device", maybe reinstalling the module. 
106 | time.sleep(0.5) 107 | continue 108 | else: 109 | print("Unexpected ethtool return code: {0}".format(e.returncode), file=sys.stderr) 110 | print(e.output, file=sys.stderr) 111 | sys.exit(1) 112 | if len(devs) >= 2: 113 | strbuf.append('{0:>16}'.format('TOTAL')) 114 | strbuf.append('\n') 115 | 116 | timestamp = time.time() 117 | interval = timestamp - last_timestamp 118 | 119 | keys = filter(lambda k: k in interested_fields, stats[devs[0]].keys()) 120 | 121 | for key in keys: 122 | is_diff = False 123 | 124 | for dev in devs: 125 | if stats[dev][key] != last_stats[dev][key]: 126 | is_diff = True 127 | 128 | if opts.verbose or is_diff: 129 | strbuf.append('{0:<20}'.format(key)) 130 | total_diff = 0 131 | for dev in devs: 132 | diff = stats[dev][key] - last_stats[dev][key] 133 | diff /= interval 134 | total_diff += diff 135 | strbuf.append(int_format.format(int(diff))) 136 | if len(devs) >= 2: 137 | strbuf.append(int_format.format(int(total_diff))) 138 | strbuf.append('\n') 139 | 140 | print(''.join(strbuf)) 141 | time.sleep(opts.interval) 142 | last_stats = copy.copy(stats) 143 | last_timestamp = timestamp 144 | except KeyboardInterrupt: 145 | print() 146 | # vim: ts=8 sts=4 sw=4 et fo=croql 147 | -------------------------------------------------------------------------------- /IOEngine/samples/packet_generator/Makefile: -------------------------------------------------------------------------------- 1 | LIBS = -lpsio -lnuma 2 | LIB_DIR = -L../../lib 3 | INC = -I../../include 4 | CFLAGS = -O2 -g -Wall $(INC) 5 | 6 | .PHONY: clean 7 | 8 | all: packet_generator pspgen 9 | 10 | pspgen: pspgen.c 11 | gcc $(CFLAGS) -o pspgen pspgen.c $(LIB_DIR) $(LIBS) -lpthread 12 | 13 | packet_generator: packet_generator.c 14 | gcc $(CFLAGS) -o packet_generator packet_generator.c $(LIB_DIR) $(LIBS) 15 | 16 | clean: 17 | rm -f packet_generator pspgen *.o 18 | -------------------------------------------------------------------------------- /IOEngine/samples/rxdump/Makefile: 
-------------------------------------------------------------------------------- 1 | LIBS = -lpsio 2 | LIB_DIR = -L../../lib 3 | INC = -I../../include 4 | CFLAGS = -O2 -g -Wall $(INC) 5 | 6 | .PHONY: clean 7 | 8 | all: rxdump 9 | 10 | rxdump: rxdump.c 11 | gcc $(CFLAGS) -o rxdump rxdump.c $(LIB_DIR) $(LIBS) 12 | 13 | clean: 14 | rm -f rxdump 15 | -------------------------------------------------------------------------------- /IOEngine/samples/tx/2pkt1con.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kay21s/UPro/c95585a0721325a6855233c97b6a32cdb98763fa/IOEngine/samples/tx/2pkt1con.pcap -------------------------------------------------------------------------------- /IOEngine/samples/tx/Makefile: -------------------------------------------------------------------------------- 1 | LIBS = -lpsio -lnuma 2 | LIB_DIR = -L../../lib 3 | 4 | .PHONY: clean 5 | 6 | all: tx 7 | 8 | tx: tx.c pkt_buff.c 9 | gcc -g -Wall -o tx tx.c pkt_buff.c $(LIB_DIR) $(LIBS) 10 | 11 | clean: 12 | rm -f tx 13 | -------------------------------------------------------------------------------- /IOEngine/samples/tx/pkt_buff.c: -------------------------------------------------------------------------------- 1 | #include "pkt_buff.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #define QUEUE_NUM 8 7 | #define FILE_CACHE_SIZE (1024*1024*500) /*BYTE*/ 8 | 9 | 10 | char fname[QUEUE_NUM][256]={ 11 | "rtp1.pcap", 12 | "rtp1.pcap", 13 | "rtp1.pcap", 14 | "rtp1.pcap", 15 | "rtp1.pcap", 16 | "rtp1.pcap", 17 | "rtp1.pcap", 18 | "rtp1.pcap", 19 | #if 0 20 | "/home/kay/trace/fix_split0.pcap", 21 | "/home/kay/trace/fix_split1.pcap", 22 | "/home/kay/trace/fix_split2.pcap", 23 | "/home/kay/trace/fix_split3.pcap", 24 | "/home/kay/trace/fix_split4.pcap", 25 | "/home/kay/trace/fix_split5.pcap", 26 | "/home/kay/trace/fix_split6.pcap", 27 | "/home/kay/trace/fix_split7.pcap", 28 | "/home/kay/trace/split0.pcap", 29 | 
"/home/kay/trace/split1.pcap", 30 | "/home/kay/trace/split2.pcap", 31 | "/home/kay/trace/split3.pcap", 32 | "/home/kay/trace/split4.pcap", 33 | "/home/kay/trace/split5.pcap", 34 | "/home/kay/trace/split6.pcap", 35 | "/home/kay/trace/split7.pcap", 36 | #endif 37 | }; /*trace file*/ 38 | 39 | file_cache_t *file_cache_head=NULL; 40 | 41 | 42 | void prep_skb(file_cache_t *fct,char** pdata); 43 | int check_pcap(file_cache_t *fct); 44 | void hex_printf(unsigned char *str,int len); 45 | void release_pkt_buff_part(void); 46 | file_cache_t* preload_pcap_file(int queue_map); 47 | 48 | #define FOFFSET(n) fct->offset+=n 49 | #define TCPDUMP_MAGIC 0xa1b2c3d4 /*no swap, and tcpdump pcap format*/ 50 | /* 51 | *build next skb from buffer cache 52 | */ 53 | u_char *prep_next_skb(file_cache_t *fct,u_int32_t *pktlen) 54 | { 55 | if (fct == NULL) { 56 | printf("<1>no file buffer cache \n");return NULL; 57 | } 58 | /* if end */ 59 | if (fct->offset == fct->size) 60 | fct->offset = sizeof(pf_hdr_t); 61 | 62 | /*set packet data and hdr pointer,? 
no copy*/ 63 | p_hdr_t *hdr = (p_hdr_t*)(fct->fcache + fct->offset); 64 | u_int32_t caplen = hdr->ncl_len; 65 | FOFFSET(sizeof(p_hdr_t)); 66 | u_char *pktdata = fct->fcache + fct->offset; 67 | FOFFSET(hdr->ncl_len); 68 | 69 | if (fct->offset > fct->size) { 70 | printf("<1>pcap file is not integrated\n"); 71 | return NULL; 72 | } 73 | 74 | *pktlen = caplen; 75 | 76 | return pktdata; 77 | } 78 | 79 | int check_pcap(file_cache_t *fct) 80 | { 81 | u_int32_t magic; 82 | memcpy(&magic, fct->fcache + fct->offset, sizeof(magic)); 83 | FOFFSET(sizeof(magic)); 84 | if (magic != TCPDUMP_MAGIC) { 85 | printf("<1> not a tcpdump file\n"); 86 | return 0; 87 | } 88 | fct->hdr.magic = magic; 89 | 90 | memcpy(&(fct->hdr)+sizeof(magic), fct->fcache+fct->offset, sizeof(fct->hdr)-sizeof(magic)); 91 | FOFFSET(sizeof(fct->hdr) - sizeof(magic)); 92 | 93 | if (fct->offset >= fct->size) { 94 | printf("<1> not a complete pcap file\n"); 95 | return 0; 96 | } 97 | return 1; 98 | } 99 | 100 | /* 101 | *one thread one trace 102 | */ 103 | file_cache_t *preload_pcap_file(int queue_map) 104 | { 105 | 106 | FILE *fp; 107 | const char *fcache=NULL; 108 | unsigned long size; 109 | file_cache_t *fct; 110 | char errbuf[256]; 111 | 112 | fp = fopen(fname[queue_map], "r"); 113 | 114 | if(fp != NULL) { 115 | 116 | fcache = malloc(FILE_CACHE_SIZE); 117 | if (fcache == NULL) { 118 | printf("<1> vmalloc file cache failed!\n"); 119 | fclose(fp); 120 | return NULL; 121 | } 122 | 123 | if ((size = fread((void *)fcache, (size_t)1, (size_t)FILE_CACHE_SIZE, fp)) == 0) { 124 | free(fcache); 125 | printf("<1>kernel file read failed.\n"); 126 | fclose(fp); 127 | return NULL; 128 | } else if (size == FILE_CACHE_SIZE){ 129 | free(fcache); 130 | fclose(fp); 131 | printf("<1>file cache size is not enough to buffer file\n"); 132 | return NULL; 133 | } else { 134 | printf("<1>loading %ld BYTE size from trace\n",size); 135 | } 136 | 137 | fclose(fp); 138 | 139 | } else { 140 | printf("<1>fopen failed!\n"); 141 | return 
NULL; 142 | } 143 | 144 | /*save malloc pointer for vfree*/ 145 | fct = malloc(sizeof(file_cache_t)); 146 | memset(fct, 0, sizeof(file_cache_t)); 147 | 148 | if (file_cache_head == NULL) { 149 | fct->next = NULL; 150 | fct->size = size; 151 | fct->fcache = fcache; 152 | file_cache_head = fct; 153 | } else { 154 | fct->next = file_cache_head; 155 | fct->fcache = fcache; 156 | fct->size = size; 157 | file_cache_head = fct; 158 | } 159 | /* avoid warning in compliation*/ 160 | if (errbuf[0] != 0); 161 | return fct; 162 | } 163 | 164 | void release_pkt_buff_part(){ 165 | 166 | /*free vmalloc buffer cache*/ 167 | file_cache_t *fct=file_cache_head; 168 | file_cache_t *next=file_cache_head->next; 169 | while(fct!=NULL) { 170 | free(fct->fcache); 171 | free(fct); 172 | fct=next; 173 | if(next!=NULL) 174 | next=next->next; 175 | } 176 | 177 | printf("<1>buffer cache free done!\n"); 178 | return ; 179 | } 180 | 181 | void hex_printf(unsigned char *str,int len){ 182 | 183 | int times=len/16; 184 | int last=len%16; 185 | unsigned char *p=str; 186 | int i=0; 187 | for(i=0;idata:%2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x\n", \ 189 | p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]); 190 | p+=16; 191 | } 192 | printf("<1>remained %d data have been shown\n",last); 193 | 194 | } 195 | -------------------------------------------------------------------------------- /IOEngine/samples/tx/pkt_buff.h: -------------------------------------------------------------------------------- 1 | #ifndef _PKT_BUFF_H 2 | #define _PKT_BUFF_H 3 | #include 4 | #include 5 | 6 | /*pcap file format*/ 7 | typedef struct pf_hdr { 8 | u_int32_t magic; 9 | u_int16_t version_major; 10 | u_int16_t tversion_minor; 11 | int32_t thiszone; /* gmt to local correction */ 12 | u_int32_t sigfigs; /* accuracy of timestamps */ 13 | u_int32_t snaplen; /* max length saved portion of each pkt */ 14 | u_int32_t linktype; /* data link type (LINKTYPE_*) */ 15 | } 
pf_hdr_t; 16 | 17 | typedef struct pcaprec_hdr_s { 18 | u_int32_t ts_sec; /* timestamp seconds */ 19 | u_int32_t ts_usec; /* timestamp microseconds */ 20 | u_int32_t ncl_len; /* number of octets of packet saved in file */ 21 | u_int32_t rig_len; /* actual length of packet */ 22 | } p_hdr_t; 23 | 24 | struct file_cache{ 25 | char *fcache; 26 | unsigned long offset; 27 | unsigned long size; 28 | struct file_cache *next; 29 | /*pcap header*/ 30 | pf_hdr_t hdr; 31 | }; 32 | 33 | typedef struct file_cache file_cache_t; 34 | 35 | extern u_char *prep_next_skb(file_cache_t *fct,u_int32_t *pktlen); 36 | extern int check_pcap(file_cache_t *fct); 37 | extern void hex_printk(unsigned char *str,int len); 38 | extern void release_pkt_buff_part(void); 39 | extern file_cache_t* preload_pcap_file(int queue_map); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /IOEngine/samples/tx/rtp1.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kay21s/UPro/c95585a0721325a6855233c97b6a32cdb98763fa/IOEngine/samples/tx/rtp1.pcap -------------------------------------------------------------------------------- /IOEngine/samples/tx/tx.c: -------------------------------------------------------------------------------- 1 | #ifndef _GNU_SOURCE 2 | #define _GNU_SOURCE 3 | #endif 4 | #define __USE_GNU 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include "pkt_buff.h" 18 | //#include 19 | 20 | #include "../../include/psio.h" 21 | 22 | #define PS_MAX_CPUS 32 23 | 24 | int num_devices; 25 | struct ps_device devices[PS_MAX_DEVICES]; 26 | 27 | int num_devices_attached; 28 | int devices_attached[PS_MAX_DEVICES]; 29 | 30 | struct ps_handle handles[PS_MAX_CPUS]; 31 | 32 | int my_cpu, my_queue; 33 | 34 | struct timeval startime; 35 | struct timeval endtime; 36 | 37 | int get_num_cpus() 
/*
 * Pin the calling process to one CPU and, on multi-node NUMA machines,
 * restrict its memory allocations to a node chosen from the CPU number.
 *
 * cpu: zero-based CPU index; must be smaller than the count returned by
 *      get_num_cpus().
 *
 * Returns the result of sched_setaffinity() (0 on success, -1 with errno
 * set on failure). Returns -1 with errno set when `cpu` is out of range,
 * the CPU count is unusable, or an allocation fails.
 */
int bind_cpu(int cpu)
{
	cpu_set_t *cmask;
	struct bitmask *bmask;
	size_t setsize;
	int ncpus;
	int ret;
	int saved_errno;

	ncpus = get_num_cpus();

	if (ncpus < 1 || cpu < 0 || cpu >= ncpus) {
		/* errno values are positive; the original stored -EINVAL */
		errno = EINVAL;
		return -1;
	}

	cmask = CPU_ALLOC(ncpus);
	if (cmask == NULL) {
		errno = ENOMEM;
		return -1;
	}

	/*
	 * The *_S macros and sched_setaffinity() take the size of the set in
	 * bytes, not the number of CPUs it can describe.
	 */
	setsize = CPU_ALLOC_SIZE(ncpus);
	CPU_ZERO_S(setsize, cmask);
	CPU_SET_S(cpu, setsize, cmask);

	ret = sched_setaffinity(0, setsize, cmask);

	/* keep the errno of sched_setaffinity() visible to the caller */
	saved_errno = errno;
	CPU_FREE(cmask);
	errno = saved_errno;

	/*
	 * libnuma functions are only defined after numa_available() succeeded.
	 * Also skip NUMA stuff for UMA systems.
	 */
	if (numa_available() < 0 || numa_max_node() == 0)
		return ret;

	bmask = numa_bitmask_alloc(16);
	if (bmask == NULL) {
		errno = ENOMEM;
		return -1;
	}

	/* same placement rule as before: even CPUs -> node 0, odd CPUs -> node 1 */
	numa_bitmask_setbit(bmask, cpu % 2);
	numa_set_membind(bmask);
	numa_bitmask_free(bmask);

	return ret;
}
&handles[my_cpu]; 137 | 138 | uint64_t total_tx_packets = 0; 139 | uint64_t total_tx_bytes = 0; 140 | 141 | int i; 142 | int ifindex; 143 | 144 | struct timeval subtime; 145 | 146 | gettimeofday(&endtime, NULL); 147 | timersub(&endtime, &startime, &subtime); 148 | 149 | usleep(10000 * (my_cpu + 1)); 150 | 151 | assert (num_devices_attached == 1); 152 | for (i = 0; i < num_devices_attached; i++) { 153 | ifindex = devices_attached[i]; 154 | total_tx_packets += handle->tx_packets[ifindex]; 155 | total_tx_bytes += handle->tx_bytes[ifindex]; 156 | } 157 | 158 | printf("----------\n"); 159 | printf("CPU %d: %ld packets transmitted, elapse time : %lds, Send Speed : %lf Mpps, %5.2f Gbps, Aveage Len. = %ld\n", 160 | my_cpu, total_tx_packets, subtime.tv_sec, 161 | (double)(total_tx_packets) / (double) (subtime.tv_sec*1000000+subtime.tv_usec), 162 | (double)(total_tx_bytes*8) / (double) ((subtime.tv_sec*1000000+subtime.tv_usec) * 1000), 163 | total_tx_bytes/total_tx_packets); 164 | 165 | 166 | for (i = 0; i < num_devices_attached; i++) { 167 | char *dev = devices[devices_attached[i]].name; 168 | ifindex = devices_attached[i]; 169 | 170 | if (handle->tx_packets[ifindex] == 0) 171 | continue; 172 | 173 | printf(" %s: ", dev); 174 | 175 | printf("TX %ld packets " 176 | "(%ld chunks, %.2f packets per chunk)\n", 177 | handle->tx_packets[ifindex], 178 | handle->tx_chunks[ifindex], 179 | handle->tx_packets[ifindex] / 180 | (double)handle->tx_chunks[ifindex]); 181 | } 182 | 183 | exit(0); 184 | } 185 | 186 | void echo() 187 | { 188 | struct ps_handle *handle = &handles[my_cpu]; 189 | struct ps_chunk chunk; 190 | file_cache_t *fct; 191 | unsigned int pktlen; 192 | u_char *pktdata; 193 | 194 | int i; 195 | int working = 0; 196 | 197 | 198 | assert(ps_init_handle(handle) == 0); 199 | assert(ps_alloc_chunk(handle, &chunk) == 0); 200 | 201 | assert(num_devices_attached == 1); 202 | 203 | for (i = 0; i < num_devices_attached; i++) { 204 | working = 1; 205 | chunk.queue.ifindex = 
devices_attached[i]; 206 | chunk.queue.qidx = my_queue; 207 | printf("attach ifindex : %d\n", devices_attached[i]); 208 | 209 | printf("attaching RX queue xge%d:%d to CPU%d\n", chunk.queue.ifindex, chunk.queue.qidx, my_cpu); 210 | assert(ps_attach_rx_device(handle, &(chunk.queue)) == 0); 211 | } 212 | 213 | if (!working) 214 | goto done; 215 | 216 | //Preload pcap file --Kay 217 | if ((fct = preload_pcap_file(my_cpu)) != NULL) { 218 | printf("Loading done, core %d\n", my_cpu); 219 | if (!check_pcap(fct)) 220 | printf("It is not trace file, core %d\n", my_cpu); 221 | } else { 222 | printf("Loading failed, core %d\n", my_cpu); 223 | } 224 | 225 | 226 | 227 | chunk.cnt = 1024; // Change this chunk size to improve TX performance --Kay 228 | chunk.recv_blocking = 1; 229 | 230 | pktdata = prep_next_skb(fct, &pktlen); 231 | 232 | 233 | gettimeofday(&startime, NULL); 234 | for (;;) { 235 | // =========================================================== 236 | for (i=0; i < chunk.cnt; i++) { 237 | chunk.info[i].offset = i * PS_MAX_PACKET_SIZE; 238 | chunk.info[i].len = pktlen; 239 | memcpy_aligned(chunk.buf + chunk.info[i].offset, 240 | pktdata, 241 | pktlen); 242 | } 243 | 244 | // =========================================================== 245 | int ret = ps_send_chunk(handle, &chunk); 246 | assert(ret >= 0); 247 | } 248 | 249 | done: 250 | ps_close_handle(handle); 251 | } 252 | 253 | int main(int argc, char **argv) 254 | { 255 | int num_cpus; 256 | int i=0; 257 | 258 | num_cpus = get_num_cpus(); 259 | num_cpus = 1; 260 | assert(num_cpus >= 1); 261 | 262 | num_devices = ps_list_devices(devices); 263 | if (num_devices == -1) { 264 | perror("ps_list_devices"); 265 | exit(1); 266 | } 267 | 268 | parse_opt(argc, argv); 269 | 270 | num_cpus = 4; 271 | for (i = 0; i < num_cpus; i ++) { 272 | int ret = fork(); 273 | assert(ret >= 0); 274 | 275 | my_cpu = i * 2 + 1; 276 | my_queue = i; 277 | 278 | if (ret == 0) { 279 | bind_cpu(i); 280 | signal(SIGINT, handle_signal); 281 | 282 | 
#!/usr/bin/python2.6
import sys


def first_field(line):
    """Return the first space-separated field of `line` as an int."""
    return int(line.strip().split(' ')[0])


def compute_differences(recv_lines, send_lines):
    """Return recv - send for every positionally paired line.

    Only the first field of each line is used. Pairing stops at the end of
    the shorter input instead of raising IndexError.
    """
    return [first_field(r) - first_field(s)
            for r, s in zip(recv_lines, send_lines)]


def main(argv):
    """Write per-line differences to the result file and print their mean.

    argv: [program, recv file, send file, result file].
    Returns the process exit status.
    """
    if len(argv) != 4:
        print("need recv, send, result filename")
        return 1
    print("%s %s %s" % (argv[1], argv[2], argv[3]))

    with open(argv[1]) as recv_file:
        recv = recv_file.readlines()
    with open(argv[2]) as send_file:
        send = send_file.readlines()

    diffs = compute_differences(recv, send)

    with open(argv[3], 'w') as result:
        for diff in diffs:
            result.write(str(diff) + '\n')

    if not diffs:
        # nothing to average; avoid dividing by zero
        print("no samples")
        return 1

    # floor division keeps the integer average of the Python 2 original
    print(sum(diffs) // len(diffs))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/python2.6
import sys


def summarize(values):
    """Return (integer mean, minimum, maximum) of a non-empty int sequence.

    The extrema come from the data itself, so they are correct for samples
    above 100000 and for negative samples. Raises ValueError when `values`
    is empty.
    """
    if not values:
        raise ValueError("no samples")
    # floor division matches Python 2 integer division of the original
    return (sum(values) // len(values), min(values), max(values))


def main(argv):
    """Print mean, min and max of the first field of every line of a file.

    argv: [program, input file]. Returns the process exit status.
    """
    if len(argv) != 2:
        print("need file name")
        return 1
    print(argv[1])

    with open(argv[1]) as infile:
        samples = [int(line.strip().split(' ')[0]) for line in infile]

    if not samples:
        print("no samples")
        return 1

    print("%d %d %d" % summarize(samples))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/python2.6
import sys

if __name__ == "__main__":
    # Two arguments are required: the file to read and the file to write.
    if len(sys.argv) != 3:
        print("filename")
        sys.exit(1)

    src_path = sys.argv[1]
    dst_path = sys.argv[2]
    print("%s %s" % (src_path, dst_path))

    raw_lines = open(src_path).readlines()
    sink = open(dst_path, 'w')

    # Keep the first two space-separated fields of every line ...
    split_lines = [text.strip().split(' ') for text in raw_lines]
    field_pairs = [[parts[0], parts[1]] for parts in split_lines]

    # ... convert both to integers and order the pairs (first field, then second).
    numeric_pairs = sorted([int(pair[0]), int(pair[1])] for pair in field_pairs)

    # Emit one "a b" line per pair.
    for pair in numeric_pairs:
        sink.write(str(pair[0]) + ' ' + str(pair[1]) + '\n')

    sink.close()
"""Extract the Ethernet payload of the first packet of a pcap file.

Usage: dpkt.py <input.pcap> <output file>

The payload (the IP packet when the frame carries IP) is written to the
output file as raw bytes.
"""
import dpkt
import sys

if len(sys.argv) != 3:
    sys.stderr.write("usage: %s <input.pcap> <output file>\n" % sys.argv[0])
    sys.exit(1)

f = open(sys.argv[1], "rb")
try:
    pcap = dpkt.pcap.Reader(f)

    # The reader is an iterator of (timestamp, frame bytes) tuples and cannot
    # be indexed, so take the first record by iterating.
    ts, buf = next(iter(pcap))
finally:
    f.close()

eth = dpkt.ethernet.Ethernet(buf)
ip = eth.data

# Serialise the payload explicitly and write it in binary mode; bytes() is an
# alias of str() on Python 2.6+, so this works on Python 2 and 3.
m = open(sys.argv[2], "wb")
try:
    m.write(bytes(ip))
finally:
    m.close()
/*
 * Advance the read cursor of the file cache by n bytes.
 * Expands to an expression and relies on a pointer named `fct` being in
 * scope at the point of use. Argument and expansion are parenthesised so the
 * macro is safe inside larger expressions.
 */
#define FOFFSET(n) (fct->offset += (n))
#define TCPDUMP_MAGIC 0xa1b2c3d4 /*no swap, and tcpdump pcap format*/
file buffer cache \n");return NULL; 57 | } 58 | /* if end */ 59 | if (fct->offset == fct->size) 60 | fct->offset = sizeof(pf_hdr_t); 61 | 62 | /*set packet data and hdr pointer,? no copy*/ 63 | p_hdr_t *hdr = (p_hdr_t*)(fct->fcache + fct->offset); 64 | u_int32_t caplen = hdr->ncl_len; 65 | FOFFSET(sizeof(p_hdr_t)); 66 | u_char *pktdata = fct->fcache + fct->offset; 67 | FOFFSET(hdr->ncl_len); 68 | 69 | if (fct->offset > fct->size) { 70 | printf("<1>pcap file is not integrated\n"); 71 | return NULL; 72 | } 73 | 74 | *pktlen = caplen; 75 | 76 | return pktdata; 77 | } 78 | 79 | int check_pcap(file_cache_t *fct) 80 | { 81 | u_int32_t magic; 82 | memcpy(&magic, fct->fcache + fct->offset, sizeof(magic)); 83 | FOFFSET(sizeof(magic)); 84 | if (magic != TCPDUMP_MAGIC) { 85 | printf("<1> not a tcpdump file\n"); 86 | return 0; 87 | } 88 | fct->hdr.magic = magic; 89 | 90 | memcpy(&(fct->hdr)+sizeof(magic), fct->fcache+fct->offset, sizeof(fct->hdr)-sizeof(magic)); 91 | FOFFSET(sizeof(fct->hdr) - sizeof(magic)); 92 | 93 | if (fct->offset >= fct->size) { 94 | printf("<1> not a complete pcap file\n"); 95 | return 0; 96 | } 97 | return 1; 98 | } 99 | 100 | /* 101 | *one thread one trace 102 | */ 103 | file_cache_t *preload_pcap_file(int queue_map) 104 | { 105 | 106 | FILE *fp; 107 | const char *fcache=NULL; 108 | unsigned long size; 109 | file_cache_t *fct; 110 | char errbuf[256]; 111 | 112 | fp = fopen(fname[queue_map], "r"); 113 | 114 | if(fp != NULL) { 115 | 116 | fcache = malloc(FILE_CACHE_SIZE); 117 | if (fcache == NULL) { 118 | printf("<1> vmalloc file cache failed!\n"); 119 | fclose(fp); 120 | return NULL; 121 | } 122 | 123 | if ((size = fread((void *)fcache, (size_t)1, (size_t)FILE_CACHE_SIZE, fp)) == 0) { 124 | free(fcache); 125 | printf("<1>kernel file read failed.\n"); 126 | fclose(fp); 127 | return NULL; 128 | } else if (size == FILE_CACHE_SIZE){ 129 | free(fcache); 130 | fclose(fp); 131 | printf("<1>file cache size is not enough to buffer file\n"); 132 | return NULL; 
133 | } else { 134 | printf("<1>loading %ld BYTE size from trace\n",size); 135 | } 136 | 137 | fclose(fp); 138 | 139 | } else { 140 | printf("<1>fopen failed!\n"); 141 | return NULL; 142 | } 143 | 144 | /*save malloc pointer for vfree*/ 145 | fct = malloc(sizeof(file_cache_t)); 146 | memset(fct, 0, sizeof(file_cache_t)); 147 | 148 | if (file_cache_head == NULL) { 149 | fct->next = NULL; 150 | fct->size = size; 151 | fct->fcache = fcache; 152 | file_cache_head = fct; 153 | } else { 154 | fct->next = file_cache_head; 155 | fct->fcache = fcache; 156 | fct->size = size; 157 | file_cache_head = fct; 158 | } 159 | /* avoid warning in compliation*/ 160 | if (errbuf[0] != 0); 161 | return fct; 162 | } 163 | 164 | void release_pkt_buff_part(){ 165 | 166 | /*free vmalloc buffer cache*/ 167 | file_cache_t *fct=file_cache_head; 168 | file_cache_t *next=file_cache_head->next; 169 | while(fct!=NULL) { 170 | free(fct->fcache); 171 | free(fct); 172 | fct=next; 173 | if(next!=NULL) 174 | next=next->next; 175 | } 176 | 177 | printf("<1>buffer cache free done!\n"); 178 | return ; 179 | } 180 | 181 | void hex_printf(unsigned char *str,int len){ 182 | 183 | int times=len/16; 184 | int last=len%16; 185 | unsigned char *p=str; 186 | int i=0; 187 | for(i=0;idata:%2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x %2x\n", \ 189 | p[0],p[1],p[2],p[3],p[4],p[5],p[6],p[7],p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15]); 190 | p+=16; 191 | } 192 | printf("<1>remained %d data have been shown\n",last); 193 | 194 | } 195 | -------------------------------------------------------------------------------- /latency-experiment/tx/pkt_buff.h: -------------------------------------------------------------------------------- 1 | #ifndef _PKT_BUFF_H 2 | #define _PKT_BUFF_H 3 | #include 4 | #include 5 | 6 | /*pcap file format*/ 7 | typedef struct pf_hdr { 8 | u_int32_t magic; 9 | u_int16_t version_major; 10 | u_int16_t tversion_minor; 11 | int32_t thiszone; /* gmt to local correction */ 12 | 
u_int32_t sigfigs; /* accuracy of timestamps */ 13 | u_int32_t snaplen; /* max length saved portion of each pkt */ 14 | u_int32_t linktype; /* data link type (LINKTYPE_*) */ 15 | } pf_hdr_t; 16 | 17 | typedef struct pcaprec_hdr_s { 18 | u_int32_t ts_sec; /* timestamp seconds */ 19 | u_int32_t ts_usec; /* timestamp microseconds */ 20 | u_int32_t ncl_len; /* number of octets of packet saved in file */ 21 | u_int32_t rig_len; /* actual length of packet */ 22 | } p_hdr_t; 23 | 24 | struct file_cache{ 25 | char *fcache; 26 | unsigned long offset; 27 | unsigned long size; 28 | struct file_cache *next; 29 | /*pcap header*/ 30 | pf_hdr_t hdr; 31 | }; 32 | 33 | typedef struct file_cache file_cache_t; 34 | 35 | extern u_char *prep_next_skb(file_cache_t *fct,u_int32_t *pktlen); 36 | extern int check_pcap(file_cache_t *fct); 37 | extern void hex_printk(unsigned char *str,int len); 38 | extern void release_pkt_buff_part(void); 39 | extern file_cache_t* preload_pcap_file(int queue_map); 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /latency-experiment/tx/rtp1.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kay21s/UPro/c95585a0721325a6855233c97b6a32cdb98763fa/latency-experiment/tx/rtp1.pcap -------------------------------------------------------------------------------- /libgpucrypto/Makefile: -------------------------------------------------------------------------------- 1 | GPUCRYPTO_DIR = ./ 2 | 3 | #################################################### 4 | # OS Name (Linux or Darwin) 5 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 6 | OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) 7 | 8 | # Flags to detect 32-bit or 64-bit OS platform 9 | OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/") 10 | OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") 11 | 12 | # These flags will override any settings 
13 | ifeq ($(i386),1) 14 | OS_SIZE = 32 15 | OS_ARCH = i686 16 | endif 17 | 18 | ifeq ($(x86_64),1) 19 | OS_SIZE = 64 20 | OS_ARCH = x86_64 21 | endif 22 | 23 | # Flags to detect either a Linux system (linux) or Mac OSX (darwin) 24 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) 25 | 26 | # Location of the CUDA Toolkit binaries and libraries 27 | CUDA_PATH ?= /usr/local/cuda-5.5 28 | CUDA_INC_PATH ?= $(CUDA_PATH)/include 29 | CUDA_BIN_PATH ?= $(CUDA_PATH)/bin 30 | ifneq ($(DARWIN),) 31 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 32 | else 33 | ifeq ($(OS_SIZE),32) 34 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 35 | else 36 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64 37 | endif 38 | endif 39 | 40 | # Common binaries 41 | NVCC ?= $(CUDA_BIN_PATH)/nvcc 42 | GCC := gcc 43 | 44 | # CUDA code generation flags 45 | GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 46 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 47 | GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) 48 | 49 | # OS-specific build flags 50 | ifneq ($(DARWIN),) 51 | LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart 52 | CCFLAGS := -arch $(OS_ARCH) 53 | else 54 | ifeq ($(OS_SIZE),32) 55 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 56 | CCFLAGS := -m32 57 | else 58 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 59 | CCFLAGS := -m64 60 | endif 61 | endif 62 | 63 | # OS-architecture specific flags 64 | ifeq ($(OS_SIZE),32) 65 | NVCCFLAGS := -m32 66 | else 67 | NVCCFLAGS := -m64 68 | endif 69 | 70 | # Debug build flags 71 | ifeq ($(dbg),1) 72 | CCFLAGS += -g 73 | NVCCFLAGS += -g -G 74 | TARGET := debug 75 | else 76 | TARGET := release 77 | endif 78 | 79 | 80 | # Common includes and paths for CUDA 81 | NVCCINCLUDES := -I$(CUDA_INC_PATH) -I. 
# Output layout: objects go to $(OBJS_DIR), the static library to $(TARGET_DIR).
OBJS_DIR	= objs
TARGET_DIR	= lib
TARGET_FILE	= libgpucrypto.a
TARGET		= $(addprefix $(TARGET_DIR)/, $(TARGET_FILE))

.SUFFIXES : .cu .c .o

CU_SRC_FILES = $(wildcard *.cu)
CC_SRC_FILES = $(wildcard *.c)
HEADER_FILES = $(wildcard *.h)

SRC_FILES = $(CU_SRC_FILES) $(CC_SRC_FILES)
OBJS_FILE = $(CU_SRC_FILES:.cu=.o) $(CC_SRC_FILES:.c=.o)

OBJS = $(addprefix $(OBJS_DIR)/, $(OBJS_FILE))
DEPS = Makefile.dep

.PHONY : all clean

all: $(TARGET)

# The archive only needs its own output directory to exist.
$(TARGET): $(DEPS) $(OBJS) | $(TARGET_DIR)
	ar rcs $@ $(OBJS)

$(TARGET_DIR):
	mkdir -p $(TARGET_DIR)

$(OBJS_DIR):
	mkdir -p $(OBJS_DIR)

# Generate header dependencies and prefix every object with the objects dir.
$(DEPS): $(SRC_FILES) $(HEADER_FILES)
	$(CC) -MM -MP -x c++ $(CU_SRC_FILES) $(CC_SRC_FILES) | sed 's![^:]*\.o:!$(OBJS_DIR)/&!g' > $@

# Objects are written into $(OBJS_DIR), so the directory must exist before
# the first compile; order-only keeps its timestamp from forcing rebuilds.
$(OBJS_DIR)/%.o : %.c | $(OBJS_DIR)
	$(GCC) $(CCFLAGS) $(NVCCINCLUDES) -c $< -o $@

$(OBJS_DIR)/%.o : %.cu | $(OBJS_DIR)
	$(NVCC) $(NVCCFLAGS) $(GENCODE_FLAGS) $(NVCCINCLUDES) -c $< -o $@

clean:
	rm -f $(TARGET) $(OBJS) $(DEPS)

ifneq ($(MAKECMDGOALS), clean)
-include $(DEPS)
endif
-------------------------------------------------------------------------------- 1 | GENERAL 2 | ======= 3 | libgpucrypto is a subset of SSLShader software that implements 4 | a few cryptographic algorithms: AES, SHA1, RSA using CUDA. 5 | This code is distributed under a BSD-style license. 6 | Read LICENSE for more details. 7 | 8 | It requires CUDA 1.2 or above capable devices. 9 | We have tested our code on GTX285, GTX480, and GTX580. 10 | 11 | Below is the machine specification we used for the test. 12 | 13 | CPU: Intel X5650 2.66Ghz x 2 14 | M/B: Super Micro X8DAH 15 | RAM: DDR3 4GB x 6 16 | 17 | libgpucrypto is developed under a Linux environment, 18 | and it is dependent on CUDA and OpenSSL. 19 | We have tested our code on CUDA 3.2 and 4.0 20 | along with OpenSSL 1.0.0 (Mar 29, 2010). 21 | Below, we list versions of software that we have tested. 22 | 23 | * CUDA 4.0 24 | CUDA driver : 270.41.19 25 | CUDA toolkit : 4.0.17 26 | CUDA SDK : 4.0.17 27 | 28 | * CUDA 3.2 29 | CUDA driver : 260.19.26 30 | CUDA toolkit : 3.2.16 31 | CUDA SDK : 3.2.16 32 | 33 | * O/S 34 | Ubuntu 10.04 LTS 64bit 35 | 36 | We never tested our code on other Linux distributions nor 32 bit systems. 37 | Please do not ask for help on distribution specific issues. 38 | 39 | 40 | SETUP 41 | ===== 42 | 43 | 1) install required libraries 44 | you can download CUDA stuff at 45 | http://developer.nvidia.com/cuda-toolkit-40 46 | 47 | 2) install OpenSSL libraries and headers 48 | you can download OpenSSL at 49 | http://openssl.org/source/ 50 | 51 | 3) configure the following variables in Makefile.in 52 | OPENSSL_DIR 53 | CUDA_TOOLKIT_DIR 54 | CUDA_SDK_DIR 55 | 56 | if you're using the system default OpenSSL development library, 57 | then you can leave it blank. 
58 | 59 | 4) build libgpucrypto 60 | make 61 | 62 | 5) try running test code 63 | 64 | examples) 65 | #./bin/aes_test -m ENC 66 | #./bin/rsa_test -m MP 67 | #./bin/sha_test 68 | 69 | you can see more detailed usage by running the program w/o arguments or w/ an incorrect one :). 70 | 71 | 72 | HOW TO USE 73 | ========== 74 | Please see the test folder for example code that uses libgpucrypto. 75 | Using this code requires some prior knowledge of CUDA programming, and 76 | the GPU code is written in a way to optimize performance rather than usability, 77 | and the data structure its API receives is quite complex to document. 78 | Please see the functions and code below for how to use the GPU library. 79 | 80 | RSA: rsa_test.cc: test_latency_rsa 81 | AES: aes_test.cc: test_latency_ase_cbc_encrypt, test_latency_aes_cbc_decrypt 82 | SHA1: sha_test.cc: test_latency_hmac_sha1 83 | 84 | AES and SHA1 test code has a function to transform 85 | a somewhat human friendly data structure to the GPU code's structure. 86 | See aes_cbc_encrypt_prepare, aes_cbc_decrypt_prepare, and hmac_sha1_prepare 87 | to better understand the exact data structure used for GPU code. 88 | 89 | 90 | NOTE 91 | ==== 92 | * Support for multi-threaded applications 93 | Sharing a device context between threads does not work 94 | with CUDA 3.2 or earlier versions. 95 | CUDA 4.0 supports sharing GPU context among multiple threads, 96 | but we have not tested this capability with our code. 97 | 98 | * 64-bit native integer arithmetic for RSA 99 | RSA code exploits 64-bit native integer support in CUDA 2.x devices. 100 | Use of 64-bit native integer or not is decided during the build process. 101 | If you have a CUDA 2.x device, and our code does not utilize it correctly, 102 | then you may fix it by modifying the IS_FERMI variable in the Makefile.in file. 103 | 104 | * Alignment issue 105 | AES and SHA1 GPU code can handle data that is not a multiple of 16 bytes, 106 | however input data should be aligned before being passed into the GPU. 
/*
 * AES-128 counter mode over RTP payloads (the HMAC SHA-1 part lives in other
 * kernels). One thread handles one flow/packet; the AES blocks of a single
 * packet are processed sequentially by that thread.
 *
 * input_buf / output_buf: packet buffers; the packet of flow idx starts at
 *                         pkt_offset[idx]. The kernel asserts that both
 *                         buffers address the same bytes (in-place operation).
 * aes_keys:               16 key bytes per flow.
 * ivs:                    AES_BLOCK_SIZE counter bytes per flow; the counter
 *                         is incremented in place once per processed block.
 * pkt_offset:             byte offset of each packet inside the buffers.
 * length:                 packet length in bytes, RTP header included.
 * num_flows:              number of valid entries in the per-flow arrays.
 * checkbits:              not used by this kernel.
 */
__global__ void aes_ctr_128_kernel (
		uint8_t *input_buf,
		uint8_t *output_buf,
		const uint8_t *aes_keys,
		uint8_t *ivs,
		const uint32_t *pkt_offset,
		const uint16_t *length,
		const unsigned int num_flows,
		uint8_t *checkbits)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	__shared__ uint32_t shared_Te0[256];
	__shared__ uint32_t shared_Te1[256];
	__shared__ uint32_t shared_Te2[256];
	__shared__ uint32_t shared_Te3[256];
	__shared__ uint32_t shared_Rcon[10];

	/* Private keystream block, 128 bits */
	uint32_t keystream[4];

	/* The threads of the block copy the T boxes and round constants together */
	for (unsigned index = threadIdx.x; index < 256; index += blockDim.x) {
		shared_Te0[index] = Te0_ConstMem[index];
		shared_Te1[index] = Te1_ConstMem[index];
		shared_Te2[index] = Te2_ConstMem[index];
		shared_Te3[index] = Te3_ConstMem[index];
	}

	for (unsigned index = threadIdx.x; index < 10; index += blockDim.x)
		shared_Rcon[index] = rcon[index];

	/*
	 * Make sure the tables are complete. The barrier comes before any early
	 * return so that every thread of the block takes part in it.
	 */
	__syncthreads();

	/* threads beyond the last flow have nothing to encrypt */
	if (idx < 0 || (unsigned int)idx >= num_flows)
		return;

	/* Actual packet length; pkt_offset[idx + 1] - pkt_offset[idx] also
	 * covers HMAC padding and the HMAC tag. */
	uint16_t len = length[idx];

	/* a packet shorter than the fixed RTP header has no first byte to parse */
	if (len < 12) {
		printf("idx = %d, len <= 0, exit.\n", idx);
		return;
	}

	/* Skip the RTP header to locate the data to be encrypted */
	uint8_t *in = pkt_offset[idx] + input_buf;
	uint8_t cc = in[0] & 0x0F;		/* number of CSRC identifiers */
	uint16_t header_len = 12 + 4 * cc;	/* total header length */

	/*
	 * Check before subtracting: len is unsigned, so a packet shorter than
	 * its header would otherwise wrap around to a huge payload length.
	 */
	if (len <= header_len) {
		printf("idx = %d, len <= 0, exit.\n", idx);
		return;
	}
	len -= header_len;			/* payload bytes to encrypt */

	in = in + header_len;
	uint8_t *out = pkt_offset[idx] + output_buf + header_len;

	assert(out == in);

	const uint8_t *key = idx * 16 + aes_keys;
	uint64_t *iv = (uint64_t *) (idx * AES_BLOCK_SIZE + ivs);

	while (len >= AES_BLOCK_SIZE) {
		/* for the ith block, its input is ((iv + i) mod 2^128) */
		iv[0] ++;
		if (iv[0] == 0)
			iv[1] ++;

		/* Get the keystream here */
		AES_128_encrypt((uint8_t *)iv, (uint8_t *)keystream, key,
				shared_Te0, shared_Te1, shared_Te2, shared_Te3, shared_Rcon);

		/* XOR one 16-byte block, one 32-bit word at a time */
		for (unsigned w = 0; w < 4; ++w)
			((uint32_t *)out)[w] = ((uint32_t *)in)[w] ^ keystream[w];

		len -= AES_BLOCK_SIZE;
		in  += AES_BLOCK_SIZE;
		out += AES_BLOCK_SIZE;
	}

	if (len) {
		/* trailing partial block: same counter step, byte-wise XOR */
		iv[0] ++;
		if (iv[0] == 0)
			iv[1] ++;

		AES_128_encrypt((uint8_t *)iv, (uint8_t *)keystream, key,
				shared_Te0, shared_Te1, shared_Te2, shared_Te3, shared_Rcon);

		for (unsigned n = 0; n < len; ++n)
			out[n] = in[n] ^ ((uint8_t *)keystream)[n];
	}

	return;
}
#define SHA1_THREADS_PER_BLK 32 8 | #define MAX_KEY_SIZE 64 9 | #define MAX_HASH_SIZE 20 10 | 11 | void AES_cbc_128_encrypt_gpu(const uint8_t *in_d, 12 | uint8_t *out_d, 13 | const uint32_t* pkt_offset_d, 14 | const uint8_t *keys_d, 15 | uint8_t *ivs_d, 16 | const unsigned int numFlows, 17 | uint8_t *checkbits_d, 18 | const unsigned int threads_per_blk, 19 | cudaStream_t stream); 20 | 21 | void hmac_sha1_gpu(char* buf, char* keys, uint32_t *offsets, uint16_t *lengths, 22 | uint32_t *outputs, int N, uint8_t * checkbits, 23 | unsigned threads_per_blk, cudaStream_t stream); 24 | 25 | #endif /* CRYPTO_KERNEL_H */ 26 | -------------------------------------------------------------------------------- /libgpucrypto/crypto_mem.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void *libgpu_device_mem_alloc(unsigned long size) 5 | { 6 | void *mem; 7 | cudaMalloc(&mem, size); 8 | return mem; 9 | } 10 | 11 | 12 | void libgpu_device_mem_free(uint8_t *mem) 13 | { 14 | if (mem) { 15 | cudaFree(mem); 16 | mem = NULL; 17 | } 18 | } 19 | 20 | void *libgpu_pinned_mem_alloc(unsigned long size) 21 | { 22 | void *mem; 23 | cudaHostAlloc(&mem, size, cudaHostAllocWriteCombined); 24 | return mem; 25 | } 26 | 27 | void libgpu_pinned_mem_free(uint8_t *mem) 28 | { 29 | if (mem) { 30 | cudaFreeHost(mem); 31 | mem = NULL; 32 | } 33 | } 34 | 35 | void libgpu_transfer_to_device(void *to, void *from, int size, cudaStream_t stream_id) 36 | { 37 | cudaMemcpyAsync(to, from, size, cudaMemcpyHostToDevice, stream_id); 38 | } 39 | 40 | void libgpu_transfer_to_host(void *to, void *from, int size, cudaStream_t stream_id) 41 | { 42 | cudaMemcpyAsync(to, from, size, cudaMemcpyDeviceToHost, stream_id); 43 | } 44 | 45 | void libgpu_sync() 46 | { 47 | cudaDeviceSynchronize(); 48 | } 49 | -------------------------------------------------------------------------------- /libgpucrypto/crypto_mem.h: 
-------------------------------------------------------------------------------- 1 | #ifndef CUDA_MEM_H 2 | #define CUDA_MEM_H 3 | 4 | #include 5 | 6 | void *libgpu_device_mem_alloc(unsigned long size); 7 | void libgpu_device_mem_free(uint8_t *mem); 8 | void *libgpu_pinned_mem_alloc(unsigned long size); 9 | void libgpu_pinned_mem_free(uint8_t *mem); 10 | 11 | void libgpu_sync(); 12 | void libgpu_transfer_to_host(void *to, void *from, int size, int stream_id); 13 | void libgpu_transfer_to_device(void *to, void *from, int size, int stream_id); 14 | #endif 15 | -------------------------------------------------------------------------------- /libgpucrypto/crypto_size.h: -------------------------------------------------------------------------------- 1 | #ifndef CRYPTO_SIZE_H 2 | #define CRYPTO_SIZE_H 3 | 4 | /* Define sizes in bytes */ 5 | 6 | 7 | # if 0 // this is for SSL 8 | #define AES_KEY_SIZE 16 // 128/8 = 16 bytes 9 | #define AES_IV_SIZE 16 // 16 bytes 10 | #define PKT_OFFSET_SIZE 4 // 32 bits = 4 bytes 11 | #define HMAC_KEY_SIZE 64 // 64 bytes 12 | #define LENGTH_SIZE 4 // 32 bits = 4 bytes 13 | 14 | #endif 15 | 16 | // Here comes SRTP 17 | 18 | #define AES_KEY_SIZE 16 // 128/8 = 16 bytes 19 | #define AES_IV_SIZE 16 // 16 bytes 20 | 21 | #define PKT_OFFSET_SIZE 4 // 32 bits = 4 bytes 22 | #define PKT_LENGTH_SIZE 2 // 16 bits = 2 bytes 23 | 24 | #define HMAC_KEY_SIZE 20 // 160 bits 25 | #define HMAC_TAG_SIZE 10 // output of SHA1 is 80 bits 26 | 27 | /* CIPHER SET */ 28 | #define AES_CTR_HMAC_SHA1 0x1 // Currently, we have this 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /libgpucrypto/libgpucrypto.h: -------------------------------------------------------------------------------- 1 | #ifndef CO_AES_SHA1_H 2 | #define CO_AES_SHA1_H 3 | 4 | #include 5 | 6 | void co_aes_sha1_gpu( 7 | uint8_t *in, 8 | uint8_t *out, 9 | uint8_t *aes_keys, 10 | uint8_t *ivs, 11 | uint8_t *hmac_keys, 12 | uint32_t *pkt_offset, 13 
#include <stdint.h>
#include <stdio.h>
#include <cuda_runtime.h>

#include "sha1.h"
#include "crypto_size.h"

/*
 * One thread per packet: hash the packet found at buf + offsets[index]
 * (lengths[index] bytes) with the per-flow key and store the first 10 bytes
 * of the digest right behind the message, at the next 4-byte boundary.
 *
 * buf       device buffer holding all packets (tags are written in place)
 * keys      device array of per-flow HMAC keys, HMAC_KEY_SIZE bytes apart
 * offsets   byte offset of each packet inside buf
 * lengths   message length of each packet
 * N         number of packets
 * checkbits currently unused (the write below is commented out)
 *
 * xorpads(), computeSHA1Block(), swap() and hash_digest_t come from sha1.h.
 */
__global__ void computeHMAC_SHA1(char* buf, char* keys, uint32_t *offsets, uint16_t *lengths, int N, uint8_t * checkbits)
{
	uint32_t w_register[16];

	int index = blockIdx.x * blockDim.x + threadIdx.x;
	if (index < N) {
		uint32_t *w = w_register;
		hash_digest_t h;
		uint32_t offset = offsets[index];
		uint16_t length = lengths[index];
		/* The tag is placed after the message, rounded up to 4 bytes. */
		uint16_t sha1_output_pos = (length + 3) & ~0x03;
		uint32_t *out = (uint32_t *)(buf + offset + sha1_output_pos);
		/* Both pads must be derived from the SAME per-flow key. The
		 * original indexed the opad key with a stride of 64 bytes while
		 * the ipad key used HMAC_KEY_SIZE, so every flow but the first
		 * mixed two different keys. */
		uint32_t *key = (uint32_t *)(keys + HMAC_KEY_SIZE * index);

		for (unsigned i = 0; i < 16; i++)
			w[i] = 0x36363636;
		xorpads(w, key);

		h.h1 = 0x67452301;
		h.h2 = 0xEFCDAB89;
		h.h3 = 0x98BADCFE;
		h.h4 = 0x10325476;
		h.h5 = 0xC3D2E1F0;

		//SHA1 compute on ipad
		computeSHA1Block((char*)w, w, 0, 64, h);

		//SHA1 compute on message
		unsigned num_iter = (length + 63 + 9) / 64;
		for (unsigned i = 0; i < num_iter; i++)
			computeSHA1Block(buf + offset, w, i * 64, length, h);

		/* Store h1, h2 and the first two bytes of h3: a 10-byte tag. */
		*(out)   = swap(h.h1);
		*(out+1) = swap(h.h2);
		uint32_t temp = swap(h.h3);
		*(uint16_t *)(out+2) = ((uint16_t *)&temp)[0];

		h.h1 = 0x67452301;
		h.h2 = 0xEFCDAB89;
		h.h3 = 0x98BADCFE;
		h.h4 = 0x10325476;
		h.h5 = 0xC3D2E1F0;

		for (unsigned i = 0; i < 16; i++)
			w[i] = 0x5c5c5c5c;

		xorpads(w, key);

		//SHA 1 compute on opads
		computeSHA1Block((char*)w, w, 0, 64, h);

		//SHA 1 compute on (hash of ipad|m)
		/* NOTE(review): only 10 bytes of the inner digest were stored at
		 * `out` above, yet 20 are passed here, and the result of this outer
		 * pass is never written back (the store below is disabled). Looks
		 * like the stored tag is the truncated inner hash — confirm whether
		 * that is intended. */
		computeSHA1Block((char*)out, w, 0, 20, h);
		/*
		*(out) = swap(h.h1);
		*(out+1) = swap(h.h2);
		temp = swap(h.h3);
		*(uint16_t *)(out+2) = ((uint16_t *)&temp)[0];
		*/
	}
	__syncthreads();

	//if (threadIdx.x == 0)
	//	*(checkbits + blockIdx.x) = 1;

}

/*
 * Host-side launcher for computeHMAC_SHA1: one thread per packet, in blocks
 * of threads_per_blk, on `stream` (0 selects the default stream). The launch
 * is asynchronous and no error is reported from here.
 */
extern "C" void launch_sha1_gpu(
		char*		buf,
		char*		keys,
		uint32_t	*offsets,
		uint16_t	*lengths,
		int		N,
		uint8_t		*checkbits,
		unsigned	threads_per_blk,
		cudaStream_t	stream)
{
	printf("SHA1 Launched \n");

	/* An empty batch needs no launch; a zero block size would divide by zero. */
	if (N <= 0 || threads_per_blk == 0)
		return;

	/* Round up so that a trailing partial block is still covered. */
	int num_blks = (N + threads_per_blk - 1) / threads_per_blk;
	if (stream == 0) {
		computeHMAC_SHA1<<<num_blks, threads_per_blk>>>(
			buf, keys, offsets, lengths, N, checkbits);
	} else {
		computeHMAC_SHA1<<<num_blks, threads_per_blk, 0, stream>>>(
			buf, keys, offsets, lengths, N, checkbits);
	}
}
$(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) 17 | 18 | # Flags to detect 32-bit or 64-bit OS platform 19 | OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/") 20 | OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") 21 | 22 | # These flags will override any settings 23 | ifeq ($(i386),1) 24 | OS_SIZE = 32 25 | OS_ARCH = i686 26 | endif 27 | 28 | ifeq ($(x86_64),1) 29 | OS_SIZE = 64 30 | OS_ARCH = x86_64 31 | endif 32 | 33 | # Flags to detect either a Linux system (linux) or Mac OSX (darwin) 34 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) 35 | 36 | # Location of the CUDA Toolkit binaries and libraries 37 | CUDA_PATH ?= /usr/local/cuda-5.5 38 | CUDA_INC_PATH ?= $(CUDA_PATH)/include 39 | CUDA_BIN_PATH ?= $(CUDA_PATH)/bin 40 | ifneq ($(DARWIN),) 41 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 42 | else 43 | ifeq ($(OS_SIZE),32) 44 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 45 | else 46 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64 47 | endif 48 | endif 49 | 50 | # Common binaries 51 | NVCC ?= $(CUDA_BIN_PATH)/nvcc 52 | 53 | # CUDA code generation flags 54 | GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 55 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 56 | GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) 57 | 58 | # OS-specific build flags 59 | ifneq ($(DARWIN),) 60 | LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart 61 | CCFLAGS := -arch $(OS_ARCH) 62 | else 63 | ifeq ($(OS_SIZE),32) 64 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 65 | CCFLAGS := -m32 66 | else 67 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 68 | CCFLAGS := -m64 69 | endif 70 | endif 71 | 72 | # OS-architecture specific flags 73 | ifeq ($(OS_SIZE),32) 74 | NVCCFLAGS := -m32 75 | else 76 | NVCCFLAGS := -m64 77 | endif 78 | 79 | # Debug build flags 80 | ifeq ($(dbg),1) 81 | CCFLAGS += -g 82 | NVCCFLAGS += -g -G 83 | TARGET := debug 84 | else 85 | TARGET := release 86 | endif 87 | 88 | 89 | # Common includes and paths for CUDA 90 | 
NVCCINCLUDES := -I$(CUDA_INC_PATH) -I. -I/usr/local/cuda-5.0/samples/common/inc/ 91 | 92 | ################################################################ 93 | 94 | HEADER_DIR = ../ 95 | LIBCRYPTO_DIR = ../libgpucrypto 96 | OBJECT_DIR = objs 97 | TARGET_DIR = . 98 | 99 | TARGET_FILE = run 100 | TARGET = $(addprefix $(TARGET_DIR)/, $(TARGET_FILE)) 101 | 102 | SOURCE_FILES = $(wildcard *.c) 103 | HEADER_FILES = $(wildcard *.h) $(wildcard *.h) 104 | OBJECT_FILE = $(SOURCE_FILES:.c=.o) 105 | 106 | OBJECT = $(addprefix $(OBJECT_DIR)/, $(OBJECT_FILE)) 107 | 108 | all: $(TARGET) 109 | 110 | $(TARGET): $(OBJECT) | $(TARGET_DIR) $(OBJECT_DIR) 111 | $(CC) -g $(OBJECT) -o $@ $(LIBS) $(LDFLAGS) 112 | 113 | $(OBJECT_DIR): 114 | mkdir $(OBJECT_DIR) 115 | 116 | 117 | $(OBJECT_DIR)/%.o : %.c 118 | $(CC) $(CFLAGS) -I $(HEADER_DIR) -I $(CUDA_INC_PATH) -I $(LIBCRYPTO_DIR) -c $< -o $@ 119 | 120 | .PHONY : clean 121 | clean: 122 | rm -rf $(TARGET) $(OBJECT) 123 | -------------------------------------------------------------------------------- /libgpucrypto/test/README: -------------------------------------------------------------------------------- 1 | ./run [number of packets] [number of stream] 2 | -------------------------------------------------------------------------------- /libgpucrypto/test/perf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | 4 | speed = 10000000000 #10Gbps 5 | total_pkt = speed/(8*1328) # how many packets per second 6 | 7 | #set I from 10 to 500 8 | startI = 10.0 9 | endI = 62.0 10 | 11 | def drange(start, stop, step): 12 | r = start 13 | while r < stop: 14 | yield r 15 | r += step 16 | 17 | 18 | for stream in range(1, 7): 19 | print stream 20 | result_file = open("result"+str(stream), "w+") 21 | time_list = drange(startI, endI, 2.0) 22 | for time in time_list: 23 | pkt_num = int(total_pkt * (time/1000)) 24 | cmd = r'./run ' + str(pkt_num/stream) + ' ' + str(stream) 25 | output = 
os.popen(cmd).read().strip() 26 | result_file.write(str(int(time))+' '+output+'\n') 27 | print time, stream 28 | -------------------------------------------------------------------------------- /libgpucrypto/test/rtp.pkt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kay21s/UPro/c95585a0721325a6855233c97b6a32cdb98763fa/libgpucrypto/test/rtp.pkt -------------------------------------------------------------------------------- /src-20G/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | 3 | ################################################################ 4 | # OS Name (Linux or Darwin) 5 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 6 | OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) 7 | 8 | # Flags to detect 32-bit or 64-bit OS platform 9 | OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/") 10 | OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") 11 | 12 | # These flags will override any settings 13 | ifeq ($(i386),1) 14 | OS_SIZE = 32 15 | OS_ARCH = i686 16 | endif 17 | 18 | ifeq ($(x86_64),1) 19 | OS_SIZE = 64 20 | OS_ARCH = x86_64 21 | endif 22 | 23 | # Flags to detect either a Linux system (linux) or Mac OSX (darwin) 24 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) 25 | 26 | # Location of the CUDA Toolkit binaries and libraries 27 | CUDA_PATH ?= /usr/local/cuda-5.0 28 | CUDA_INC_PATH ?= $(CUDA_PATH)/include 29 | CUDA_BIN_PATH ?= $(CUDA_PATH)/bin 30 | CUDA_SDK_DIR = /home/$(USER)/NVIDIA_CUDA-5.0_Samples/ 31 | 32 | ifneq ($(DARWIN),) 33 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 34 | else 35 | ifeq ($(OS_SIZE),32) 36 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 37 | else 38 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64 39 | endif 40 | endif 41 | 42 | # Common binaries 43 | NVCC ?= $(CUDA_BIN_PATH)/nvcc 44 | 45 | # CUDA code generation flags 46 | GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 47 | 
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 48 | GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) 49 | 50 | # OS-specific build flags 51 | ifneq ($(DARWIN),) 52 | LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart 53 | CCFLAGS := -arch $(OS_ARCH) 54 | else 55 | ifeq ($(OS_SIZE),32) 56 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 57 | CCFLAGS := -m32 58 | else 59 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 60 | CCFLAGS := -m64 61 | endif 62 | endif 63 | 64 | # OS-architecture specific flags 65 | ifeq ($(OS_SIZE),32) 66 | NVCCFLAGS := -m32 67 | else 68 | NVCCFLAGS := -m64 69 | endif 70 | 71 | # Debug build flags 72 | ifeq ($(dbg),1) 73 | CCFLAGS += -g 74 | NVCCFLAGS += -g -G 75 | TARGET := debug 76 | else 77 | TARGET := release 78 | endif 79 | 80 | 81 | # Common includes and paths for CUDA 82 | NVCCINCLUDES := -I$(CUDA_INC_PATH) -I. -I$(CUDA_SDK_DIR)common/inc/ 83 | 84 | ################################################################ 85 | 86 | CFLAGS = -Wall -g -D__USE_GNU -D_GNU_SOURCE -DUPRO_MALLOC -DDEBUG_0 -DTRANSFER_SEPERATE 87 | # AFFINITY_1: collector[0,1,2,3] forwarders[4,5,6,7] 88 | # AFFINITY_2: collector[0,2,4,6] forwarders[1,3,5,7] 89 | # AFFINITY_3: collector[1,3,5,7] forwarders[2,4,6,8] 90 | # AFFINITY_4: collector[1,2,3,4,..] forwarders[5,6,7,8,..] gpu_worker[0] 91 | CFLAGS += -DCPU_AFFINITY -DAFFINITY_4 92 | CFLAGS += -DNOT_FORWARD_0 -DNOT_GPU_0 -DNOT_COLLECT_0 93 | CFLAGS += -DCOLLECTOR_PERFORMANCE_TEST_0 94 | CFLAGS += -DUSE_LOCK_0 -DMEMCPY_C_0 -DMEMCPY_SSE_0 95 | 96 | LIBS = -lrt -lpthread -L../IOEngine/lib -lpsio 97 | LIBS += -L../libgpucrypto/lib -lgpucrypto 98 | 99 | HEADER_DIR = ./include 100 | IOENGINE_DIR = ../IOEngine/include 101 | LIBCRYPTO_DIR = ../libgpucrypto 102 | OBJECT_DIR = objs 103 | TARGET_DIR = . 
#ifndef MEMCPY_H
#define MEMCPY_H

#include <stddef.h>	/* size_t */

/*
 * Alternative memory-copy routines. The implementations are not part of
 * this header; the signatures mirror memcpy(): destination, source and a
 * byte count, returning a pointer.
 * NOTE(review): presumably these return the destination pointer and require
 * non-overlapping buffers like memcpy() — confirm against the definitions.
 */
void *memcpy_sse2(void *to, const void *from, size_t len);
void *memcpy_c(void *dest, const void *src, size_t count);

#endif
#ifndef UPRO_COLLECTOR_H
#define UPRO_COLLECTOR_H

#include "psio.h"

/* Upper bound on the number of collectors. */
#define MAX_COLLECTOR_NUM 12

/*
 * Per-collector state: the packet I/O handle plus running traffic counters
 * and timestamps.
 * NOTE(review): uint64_t and struct timeval are not declared by anything
 * included directly here; presumably psio.h pulls them in — confirm.
 */
typedef struct upro_collector_s {
	struct ps_handle handle;	/* I/O engine handle (declared in psio.h) */
	uint64_t total_packets;		/* packet counter */
	uint64_t total_bytes;		/* byte counter */
	struct timeval startime;
	struct timeval endtime;
	struct timeval subtime;
	/* 64-byte alignment: presumably one cache line per collector so that
	 * threads do not share lines — TODO confirm */
} __attribute__((aligned(64))) upro_collector_t;

#endif

#ifndef UPRO_CONTEXT_H
#define UPRO_CONTEXT_H

#include "upro_batch.h"

/*
 * Start-up arguments handed to the entry function of each worker kind.
 * The forwarder and collector contexts have identical layouts.
 */

/* Argument block for upro_forwarder_main(). */
typedef struct upro_forwarder_context_s {
	upro_batch_t *batch;	/* batch structure this worker operates on */
	int ifindex;		/* presumably the network interface index — confirm */
	int core_id;		/* presumably the CPU core to run on — confirm */
	int queue_id;
	int initialized;
	int id;
} upro_forwarder_context_t;

/* Argument block for upro_collector_main(). */
typedef struct upro_collector_context_s {
	upro_batch_t *batch;	/* batch structure this worker operates on */
	int ifindex;		/* presumably the network interface index — confirm */
	int core_id;		/* presumably the CPU core to run on — confirm */
	int queue_id;
	int initialized;
	int id;
} upro_collector_context_t;

/* Argument block for upro_gpu_worker_main(). */
typedef struct upro_gpu_worker_context_s {
	upro_batch_t *cpu_batch_set;
	int core_id; /* which core should gpu worker run */
	/* Add more info passing to GPU worker here ... */
} upro_gpu_worker_context_t;

/*
 * Worker entry points, each taking its own context type.
 * NOTE(review): these take typed pointers rather than void *; if they are
 * started with pthread_create() the caller has to cast — confirm at the
 * call sites.
 */
void *upro_gpu_worker_main(upro_gpu_worker_context_t *context);
void *upro_collector_main(upro_collector_context_t *context);
void *upro_forwarder_main(upro_forwarder_context_t *context);

#endif
#ifndef UPRO_MACROS_H
#define UPRO_MACROS_H

#include <stdio.h>	/* printf(), used by the upro_info/err/warn/trace macros */
#include <stdlib.h>	/* abort(), used by upro_bug() */

/*
 * Users of the UPRO_NET_* macros additionally need <arpa/inet.h> for htonl().
 *
 * Every macro that expands to an expression is fully parenthesized, as are
 * its arguments, so that it can be used inside larger expressions and with
 * compound arguments. Macros that mention an argument more than once
 * evaluate it more than once: do not pass expressions with side effects.
 */

/* Boolean */
#define UPRO_FALSE   0
#define UPRO_TRUE    (!UPRO_FALSE)
#define UPRO_ERROR   (-1)

/* Architecture */
#define INTSIZE      sizeof(int)

/* Print macros */
#define UPRO_INFO     0x1000
#define UPRO_ERR      0X1001
#define UPRO_WARN     0x1002
#define UPRO_BUG      0x1003


//#define upro_info(...)  upro_print(UPRO_INFO, __VA_ARGS__)
//#define upro_err(...)   upro_print(UPRO_ERR, __VA_ARGS__)
//#define upro_warn(...)  upro_print(UPRO_WARN, __VA_ARGS__)
//#define upro_trace(...) upro_print(UPRO_WARN, __VA_ARGS__)
#define upro_info     printf
#define upro_err      printf
#define upro_warn     printf
#define upro_trace    printf

/* Transport type */
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif

#ifdef __GNUC__ /* GCC supports this since 2.3. */
 #define PRINTF_WARNINGS(a,b) __attribute__ ((format (printf, a, b)))
#else
 #define PRINTF_WARNINGS(a,b)
#endif

#ifdef __GNUC__ /* GCC supports this since 2.7. */
 #define UNUSED_PARAM __attribute__ ((unused))
#else
 #define UNUSED_PARAM
#endif

/*
 * Validation macros
 * -----------------
 * Based on article http://lwn.net/Articles/13183/
 *
 * ---
 * ChangeSet 1.803, 2002/10/18 16:28:57-07:00, torvalds@home.transmeta.com
 *
 * Make a polite version of BUG_ON() - WARN_ON() which doesn't
 * kill the machine.
 *
 * Damn I hate people who kill the machine for no good reason.
 * ---
 *
 */

#define upro_unlikely(x) __builtin_expect((x),0)
#define upro_likely(x) __builtin_expect((x),1)
/* ##__VA_ARGS__ drops the comma so that upro_prefetch(p) with no optional
 * arguments expands to a valid call. */
#define upro_prefetch(x, ...) __builtin_prefetch(x, ##__VA_ARGS__)

/* 1 when x equals UPRO_TRUE or UPRO_FALSE, 0 otherwise (x is evaluated twice). */
#define upro_is_bool(x) (((x) == UPRO_TRUE || (x) == UPRO_FALSE) ? 1 : 0)

/* Abort the program when `condition` is non-zero.
 * NOTE(review): upro_print() is not declared in this header and the other
 * upro_print-based macros above are disabled — confirm it still exists
 * before using upro_bug(). */
#define upro_bug(condition) do {                                          \
        if (upro_unlikely((condition)!=0)) {                              \
            upro_print(UPRO_BUG, "Bug found in %s() at %s:%d",              \
                     __FUNCTION__, __FILE__, __LINE__);                 \
            abort();                                                    \
        }                                                               \
    } while(0)

/*
 * Macros to calculate sub-net data using ip address and sub-net prefix
 */

#define UPRO_NET_IP_OCTECT(addr,pos) (((addr) >> (8 * (pos))) & 255)
/* A /0 prefix is handled separately: shifting a 32-bit value by 32 is undefined. */
#define UPRO_NET_NETMASK(addr,net) ((net) == 0 ? 0u : htonl((0xffffffff << (32 - (net)))))
#define UPRO_NET_BROADCAST(addr,net) ((addr) | ~UPRO_NET_NETMASK(addr,net))
#define UPRO_NET_NETWORK(addr,net) ((addr) & UPRO_NET_NETMASK(addr,net))
#define UPRO_NET_WILDCARD(addr,net) (UPRO_NET_BROADCAST(addr,net) ^ UPRO_NET_NETWORK(addr,net))
#define UPRO_NET_HOSTMIN(addr,net) ((net) == 31 ? UPRO_NET_NETWORK(addr,net) : (UPRO_NET_NETWORK(addr,net) + 0x01000000))
/* No trailing semicolon: the macro is an expression like its siblings. */
#define UPRO_NET_HOSTMAX(addr,net) ((net) == 31 ? UPRO_NET_BROADCAST(addr,net) : (UPRO_NET_BROADCAST(addr,net) - 0x01000000))

#if __GNUC__ >= 4
 #define UPRO_EXPORT __attribute__ ((visibility ("default")))
#else
 #define UPRO_EXPORT
#endif

// TRACE
#define UPRO_TRACE(...) do {} while (0)

#endif
10 | * FIXME: 11 | * 1s = 1000ms (millisecond) 12 | * 1ms = 1000us (microsecond) 13 | * 1us = 1000ns (nanosecond) 14 | * this counter returns in terms of us 15 | */ 16 | 17 | 18 | typedef struct upro_timer_s { 19 | uint64_t freq; 20 | uint64_t clocks; 21 | uint64_t start; 22 | } upro_timer_t; 23 | 24 | int upro_timer_init(); 25 | int upro_timer_start(upro_timer_t *timer); 26 | int upro_timer_restart(upro_timer_t *timer); 27 | int upro_timer_stop(upro_timer_t *timer); 28 | int upro_timer_reset(upro_timer_t *timer); 29 | double upro_timer_get_total_time(upro_timer_t *timer); 30 | double upro_timer_get_elapsed_time(upro_timer_t *timer); 31 | 32 | #endif 33 | 34 | -------------------------------------------------------------------------------- /src-20G/include/upro_transworker.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_TRANSWORKER_H 2 | #define UPRO_TRANSWORKER_H 3 | 4 | typedef struct upro_transworker_context_s { 5 | int core_id; 6 | int initialized; 7 | } upro_transworker_context_t; 8 | 9 | void upro_transworker_main(upro_transworker_context_t *context); 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /src-20G/upro_log.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "upro_log.h" 6 | #include "upro_memory.h" 7 | #include "upro_config.h" 8 | 9 | extern upro_config_t *config; 10 | 11 | #define LOG_PRINT 1 12 | 13 | // Add "num" variable based on original version 14 | void upro_sample_set_msg(upro_log_sample_t *sample, const char *fmt, const char *msg, double num) 15 | { 16 | sample->isMsg = 1; 17 | 18 | sample->fmt = upro_mem_malloc(strlen(fmt)+1); 19 | strcpy(sample->fmt, fmt); 20 | 21 | sample->msg = upro_mem_malloc(strlen(msg)+1); 22 | strcpy(sample->msg, msg); 23 | 24 | sample->num = num; 25 | } 26 | 27 | void upro_sample_set_timer(upro_log_sample_t *sample, const 
char *fmt, const char *msg, double timer, unsigned int nbytes, int loops) 28 | { 29 | sample->isMsg = 0; 30 | sample->timer = timer; 31 | 32 | if (loops != 0) sample->loops = loops; 33 | if (nbytes > 0) sample->nbytes = nbytes; 34 | 35 | if (strlen(msg) > 0) { 36 | sample->fmt = upro_mem_malloc(strlen( fmt ) + 1); 37 | strcpy(sample->fmt, fmt); 38 | } 39 | 40 | if (strlen(msg) > 0) { 41 | sample->msg = upro_mem_malloc(strlen( msg ) + 1); 42 | strcpy(sample->msg, msg); 43 | } 44 | } 45 | 46 | void upro_sample_print(upro_log_sample_t *sample) 47 | { 48 | if(sample->isMsg == 1) { 49 | printf(sample->fmt, sample->msg, sample->num); 50 | } else { 51 | double bwd = (((double) sample->nbytes * sample->loops )/ sample->timer) / 1e9; 52 | printf(sample->fmt, sample->msg, sample->timer, bwd) ; 53 | } 54 | } 55 | 56 | /* ---------------------------------------------------------------------- */ 57 | 58 | void upro_log_init(upro_log_t *log) 59 | { 60 | log->idx = 0; 61 | log->loops = 0; 62 | log->loop_entries = 0; 63 | log->loop_timers = 0; 64 | log->samples = upro_mem_malloc(config->log_sample_num * sizeof(upro_log_sample_t)); 65 | } 66 | 67 | void upro_log_loop_marker(upro_log_t *log) 68 | { 69 | log->loop_timers = 0; 70 | log->loops ++; 71 | #if defined(LOG_PRINT) 72 | printf("\n---------------------------%d\n", log->loops); 73 | #endif 74 | } 75 | 76 | void upro_log_msg(upro_log_t *log, const char *format, const char *msg, const double num) 77 | { 78 | #if defined(LOG_PRINT) 79 | printf(format, msg, num); 80 | #else 81 | upro_sample_set_msg(&(log->samples[log->idx ++]), format, msg, num); 82 | log->loop_entries ++; 83 | #endif 84 | } 85 | 86 | void upro_log_timer(upro_log_t *log, const char *format, const char *msg, double timer, unsigned int nbytes, int loops) 87 | { 88 | upro_sample_set_timer(&(log->samples[log->idx ++]), format, msg, timer, nbytes, loops); 89 | log->loop_entries ++; 90 | log->loop_timers ++; 91 | } 92 | 93 | void upro_log_print(upro_log_t *log) 94 | { 95 
| int i; 96 | 97 | for(i = 0; i < log->loop_entries; i++) { 98 | upro_sample_print(&(log->samples[i])); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src-20G/upro_memory.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "upro_memory.h" 7 | #include "upro_macros.h" 8 | 9 | ALLOCSZ_ATTR(1) 10 | void *upro_mem_malloc(const size_t size) 11 | { 12 | void *aux = malloc(size); 13 | 14 | if (upro_unlikely(!aux && size)) { 15 | perror("malloc"); 16 | return NULL; 17 | } 18 | 19 | return aux; 20 | } 21 | 22 | ALLOCSZ_ATTR(1) 23 | void *upro_mem_calloc(const size_t size) 24 | { 25 | void *buf = calloc(1, size); 26 | if (upro_unlikely(!buf)) { 27 | return NULL; 28 | } 29 | 30 | return buf; 31 | } 32 | 33 | ALLOCSZ_ATTR(2) 34 | void *upro_mem_realloc(void *ptr, const size_t size) 35 | { 36 | void *aux = realloc(ptr, size); 37 | 38 | if (upro_unlikely(!aux && size)) { 39 | perror("realloc"); 40 | return NULL; 41 | } 42 | 43 | return aux; 44 | } 45 | 46 | void upro_mem_free(void *ptr) 47 | { 48 | free(ptr); 49 | } 50 | -------------------------------------------------------------------------------- /src-20G/upro_timer.c: -------------------------------------------------------------------------------- 1 | #include "upro_timer.h" 2 | 3 | #include 4 | #include 5 | 6 | int upro_timer_init(upro_timer_t *timer) 7 | { 8 | timer->freq = 1000; 9 | 10 | return 0; 11 | } 12 | 13 | int upro_timer_start(upro_timer_t *timer) 14 | { 15 | struct timespec s; 16 | clock_gettime(CLOCK_REALTIME, &s); 17 | timer->start = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 18 | 19 | return 0; 20 | } 21 | 22 | int upro_timer_restart(upro_timer_t *timer) 23 | { 24 | struct timespec s; 25 | clock_gettime(CLOCK_REALTIME, &s); 26 | timer->start = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 27 | 28 | timer->clocks = 0; 29 | 30 | return 0; 31 | } 32 | 
33 | int upro_timer_stop(upro_timer_t *timer) 34 | { 35 | uint64_t n; 36 | 37 | struct timespec s; 38 | clock_gettime(CLOCK_REALTIME, &s); 39 | n = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 40 | 41 | n -= timer->start; 42 | timer->start = 0; 43 | timer->clocks += n; 44 | 45 | return 0; 46 | } 47 | 48 | int upro_timer_reset(upro_timer_t *timer) 49 | { 50 | timer->clocks = 0; 51 | 52 | return 0; 53 | } 54 | 55 | double upro_timer_get_total_time(upro_timer_t *timer) 56 | { 57 | //returns millisecond as unit -- second * 1000 58 | return (double)(timer->clocks) / (double) 1e6; 59 | } 60 | 61 | double upro_timer_get_elapsed_time(upro_timer_t *timer) 62 | { 63 | uint64_t n; 64 | 65 | struct timespec s; 66 | clock_gettime(CLOCK_REALTIME, &s); 67 | n = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 68 | 69 | return (double)(n - timer->start) / (double) 1e6; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | 3 | ################################################################ 4 | # OS Name (Linux or Darwin) 5 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 6 | OSLOWER = $(shell uname -s 2>/dev/null | tr [:upper:] [:lower:]) 7 | 8 | # Flags to detect 32-bit or 64-bit OS platform 9 | OS_SIZE = $(shell uname -m | sed -e "s/i.86/32/" -e "s/x86_64/64/") 10 | OS_ARCH = $(shell uname -m | sed -e "s/i386/i686/") 11 | 12 | # These flags will override any settings 13 | ifeq ($(i386),1) 14 | OS_SIZE = 32 15 | OS_ARCH = i686 16 | endif 17 | 18 | ifeq ($(x86_64),1) 19 | OS_SIZE = 64 20 | OS_ARCH = x86_64 21 | endif 22 | 23 | # Flags to detect either a Linux system (linux) or Mac OSX (darwin) 24 | DARWIN = $(strip $(findstring DARWIN, $(OSUPPER))) 25 | 26 | # Location of the CUDA Toolkit binaries and libraries 27 | CUDA_PATH ?= /usr/local/cuda-5.0 28 | CUDA_INC_PATH ?= $(CUDA_PATH)/include 29 | 
CUDA_BIN_PATH ?= $(CUDA_PATH)/bin 30 | CUDA_SDK_DIR = /home/$(USER)/NVIDIA_CUDA-5.0_Samples/ 31 | 32 | ifneq ($(DARWIN),) 33 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 34 | else 35 | ifeq ($(OS_SIZE),32) 36 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib 37 | else 38 | CUDA_LIB_PATH ?= $(CUDA_PATH)/lib64 39 | endif 40 | endif 41 | 42 | # Common binaries 43 | NVCC ?= $(CUDA_BIN_PATH)/nvcc 44 | 45 | # CUDA code generation flags 46 | GENCODE_SM20 := -gencode arch=compute_20,code=sm_20 47 | GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 48 | GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) 49 | 50 | # OS-specific build flags 51 | ifneq ($(DARWIN),) 52 | LDFLAGS := -Xlinker -rpath $(CUDA_LIB_PATH) -L$(CUDA_LIB_PATH) -lcudart 53 | CCFLAGS := -arch $(OS_ARCH) 54 | else 55 | ifeq ($(OS_SIZE),32) 56 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 57 | CCFLAGS := -m32 58 | else 59 | LDFLAGS := -L$(CUDA_LIB_PATH) -lcudart 60 | CCFLAGS := -m64 61 | endif 62 | endif 63 | 64 | # OS-architecture specific flags 65 | ifeq ($(OS_SIZE),32) 66 | NVCCFLAGS := -m32 67 | else 68 | NVCCFLAGS := -m64 69 | endif 70 | 71 | # Debug build flags 72 | ifeq ($(dbg),1) 73 | CCFLAGS += -g 74 | NVCCFLAGS += -g -G 75 | TARGET := debug 76 | else 77 | TARGET := release 78 | endif 79 | 80 | 81 | # Common includes and paths for CUDA 82 | NVCCINCLUDES := -I$(CUDA_INC_PATH) -I. -I$(CUDA_SDK_DIR)common/inc/ 83 | 84 | ################################################################ 85 | 86 | CFLAGS = -Wall -g -D__USE_GNU -D_GNU_SOURCE -DUPRO_MALLOC -DDEBUG_0 -DTRANSFER_SEPERATE 87 | # AFFINITY_1: collector[0,1,2,3] forwarders[4,5,6,7] 88 | # AFFINITY_2: collector[0,2,4,6] forwarders[1,3,5,7] 89 | # AFFINITY_3: collector[1,3,5,7] forwarders[2,4,6,8] 90 | # AFFINITY_4: collector[1,2,3,4,..] forwarders[5,6,7,8,..] 
#             (AFFINITY_4, continued) gpu_worker[0]
CFLAGS += -DCPU_AFFINITY -DAFFINITY_4
CFLAGS += -DNOT_FORWARD_0 -DNOT_COLLECT_0 -DNOT_GPU_0
CFLAGS += -DCOLLECTOR_PERFORMANCE_TEST_0
CFLAGS += -DUSE_LOCK

LIBS = -lrt -lpthread -L../IOEngine/lib -lpsio
LIBS += -L../libgpucrypto/lib -lgpucrypto

HEADER_DIR = ./include
IOENGINE_DIR = ../IOEngine/include
LIBCRYPTO_DIR = ../libgpucrypto
OBJECT_DIR = objs
TARGET_DIR = .

TARGET_FILE = upro
TARGET = $(addprefix $(TARGET_DIR)/, $(TARGET_FILE))

SOURCE_FILES = $(wildcard *.c)
# Headers in this directory and in $(HEADER_DIR); the original listed
# $(wildcard *.h) twice and never looked inside the include directory.
HEADER_FILES = $(wildcard *.h) $(wildcard $(HEADER_DIR)/*.h)
OBJECT_FILE = $(SOURCE_FILES:.c=.o)

OBJECT = $(addprefix $(OBJECT_DIR)/, $(OBJECT_FILE))

.PHONY: all clean doc

all: $(TARGET)

$(TARGET): $(OBJECT) | $(TARGET_DIR) $(OBJECT_DIR)
	$(CC) -pg $(OBJECT) -o $@ $(LIBS) $(LDFLAGS)

# -p: do not fail when the directory already exists
$(OBJECT_DIR):
	mkdir -p $(OBJECT_DIR)

# The object directory is an order-only prerequisite of every object, so it
# is created before the first compile.  Previously it was only an order-only
# prerequisite of the final link, which runs after the objects are built, so
# a clean checkout failed with "objs/foo.o: No such file or directory".
# Objects also depend on the headers so that a header change triggers a
# rebuild.
$(OBJECT_DIR)/%.o : %.c $(HEADER_FILES) | $(OBJECT_DIR)
	$(CC) -pg $(CFLAGS) -I $(HEADER_DIR) -I $(IOENGINE_DIR) -I $(LIBCRYPTO_DIR) $(NVCCINCLUDES) -c $< -o $@
# -I $(CUDA_INC_PATH) -I $(LIBCRYPTO_DIR)

clean:
	rm -rf $(TARGET) $(OBJECT)

doc: $(SOURCE_FILES) doxygen.config
	doxygen doxygen.config
gpu_worker[0] 13 | 14 | NOT_COLLECT: do not copy the packets into the buffer 15 | NOT_GPU: the GPU worker just hands packets to the forwarder, without processing them on the GPU 16 | NOT_FORWARD: the forwarder just marks the buffer as available, without forwarding 17 | 18 | COLLECTOR_PERFORMANCE_TEST: evaluate the performance of the collector; packets are copied to a specified location (Memcpy, destination without cache miss). The GPU worker and the forwarder will not be launched. 19 | NOT_GPU+NOT_FORWARD: evaluate the performance of the collector; each packet is copied to a different location (Memcpy, destination always cache miss). 20 | 21 | 22 | 2) Options in upro.c (config) 23 | 24 | Through command line : ./upro -i 25 -n 4 # 25ms, cpu_worker_num = 4 25 | 26 | I : GPU batch time 27 | cpu_worker_num : Collector-Forwarder pair number 28 | gpu_worker_num : Currently always 1 29 | client_interface/server_interface : If they are the same, the proxy forwards packets through the port where it receives them. (xge0/xge1) 30 | io_batch_num : For IOEngine, the number of packets in each batch 31 | -------------------------------------------------------------------------------- /src/include/upro_batch.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_BATCH_H 2 | #define UPRO_BATCH_H 3 | 4 | #include 5 | #include "upro_job.h" 6 | 7 | typedef struct upro_batch_buf_s 8 | { 9 | /* keys, pkt_offsets, and ivs, are all stored in the input buffer */ 10 | void *input_buf; 11 | void *aes_key_pos; 12 | void *aes_iv_pos; 13 | void *pkt_offset_pos; 14 | void *length_pos; //length of RTP packet payload 15 | void *hmac_key_pos; 16 | 17 | void *input_buf_d; 18 | // void *output_buf_d; 19 | void *aes_key_pos_d; 20 | void *aes_iv_pos_d; 21 | void *pkt_offset_pos_d; 22 | void *length_pos_d; //length of RTP packet payload 23 | void *hmac_key_pos_d; 24 | 25 | // Job for forwarding 26 | upro_job_t *job_list; 27 | int job_num; 28 | 29 | int buf_size; 30 | int buf_length; 31 | 32 | void *hdr_buf; 33
| int hdr_length; 34 | } upro_batch_buf_t; 35 | 36 | 37 | /* Each CPU worker holds such a data structure */ 38 | typedef struct upro_batch_s 39 | { 40 | upro_batch_buf_t buf[3]; 41 | 42 | volatile int collector_buf_id; 43 | volatile int forwarder_buf_id; 44 | volatile int available_buf_id[2]; 45 | int gpu_buf_id; 46 | 47 | /* GPU worker notify CPU worker 48 | * buf_has_been_taken tell CPU worker which buf has just been taken, 49 | * processed_buf_id tell CPU worker which buf has been processed. 50 | * they all should be -1, if there are no events. 51 | * GPU write it (0/1), and CPU clears it to -1 to claim its own action. 52 | */ 53 | pthread_mutex_t mutex_forwarder_buf_id; 54 | pthread_mutex_t mutex_available_buf_id; 55 | pthread_mutex_t mutex_batch_launch; 56 | } upro_batch_t; 57 | 58 | extern pthread_key_t worker_batch_struct; 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/include/upro_collector.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_COLLECTOR_H 2 | #define UPRO_COLLECTOR_H 3 | 4 | #include "psio.h" 5 | 6 | #define MAX_COLLECTOR_NUM 12 7 | 8 | typedef struct upro_collector_s { 9 | struct ps_handle handle; 10 | uint64_t total_packets; 11 | uint64_t total_bytes; 12 | struct timeval startime; 13 | struct timeval endtime; 14 | struct timeval subtime; 15 | } __attribute__((aligned(64))) upro_collector_t; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/include/upro_config.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_CONFIG_H 2 | #define UPRO_CONFIG_H 3 | 4 | #include "psio.h" 5 | 6 | # define MAX_WORKER_NUM 12 7 | 8 | typedef struct upro_config_s { 9 | unsigned int cpu_worker_num; 10 | unsigned int gpu_worker_num; 11 | unsigned int worker_num; // cpu_worker_num + gpu_worker_num 12 | unsigned int iterations; 13 | unsigned int 
log_sample_num; 14 | 15 | unsigned int eiu_hdr_len; 16 | 17 | unsigned int gpu; 18 | unsigned long batch_buf_max_size; 19 | unsigned long batch_job_max_num; 20 | 21 | unsigned int *core_ids; 22 | 23 | // Most important argument for realtime scheduling algorithm 24 | unsigned int I; // 40ms, 30ms ... 25 | unsigned int type; 26 | 27 | int io_batch_num; 28 | int client_ifindex; 29 | int server_ifindex; 30 | 31 | char client_interface[5]; 32 | char server_interface[5]; 33 | 34 | struct ps_device client_device; 35 | struct ps_device server_device; 36 | } upro_config_t; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/include/upro_context.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef UPRO_CONTEXT_H 3 | #define UPRO_CONTEXT_H 4 | 5 | #include "upro_batch.h" 6 | 7 | typedef struct upro_forwarder_context_s { 8 | upro_batch_t *batch; 9 | int core_id; 10 | int queue_id; 11 | int initialized; 12 | } upro_forwarder_context_t; 13 | 14 | typedef struct upro_collector_context_s { 15 | upro_batch_t *batch; 16 | int core_id; 17 | int queue_id; 18 | int initialized; 19 | } upro_collector_context_t; 20 | 21 | typedef struct upro_gpu_worker_context_s { 22 | upro_batch_t *cpu_batch_set; 23 | int core_id; /* which core should gpu worker run */ 24 | /* Add more info passing to GPU worker here ... 
*/ 25 | } upro_gpu_worker_context_t; 26 | 27 | void *upro_gpu_worker_main(upro_gpu_worker_context_t *context); 28 | void *upro_collector_main(upro_collector_context_t *context); 29 | void *upro_forwarder_main(upro_forwarder_context_t *context); 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /src/include/upro_forwarder.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_FORWARDER_H 2 | #define UPRO_FORWARDER_H 3 | 4 | #include "psio.h" 5 | 6 | typedef struct upro_forwarder_s { 7 | struct ps_handle handle; 8 | uint64_t total_packets; 9 | uint64_t total_bytes; 10 | struct timeval startime; 11 | struct timeval endtime; 12 | struct timeval subtime; 13 | } __attribute__((aligned(64))) upro_forwarder_t; 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/include/upro_gpu_worker.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_GPU_WORKER_H 2 | #define UPRO_GPU_WORKER_H 3 | 4 | #include "upro_batch.h" 5 | 6 | typedef struct upro_gpu_worker_s { 7 | upro_batch_buf_t **bufs[3]; /* Three buf sets */ 8 | int cur_buf_id; 9 | int total_bytes; 10 | } upro_gpu_worker_t; 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/include/upro_job.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_JOB_H 2 | #define UPRO_JOB_H 3 | 4 | typedef struct upro_job_s 5 | { 6 | int hdr_length; 7 | int payload_length; 8 | char *hdr_ptr; 9 | char *payload_ptr; 10 | } upro_job_t; 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /src/include/upro_log.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_LOG_H 2 | #define UPRO_LOG_H 3 | 4 | typedef struct upro_log_sample_s { 5 | unsigned int 
isMsg; 6 | unsigned int isErr; 7 | double timer; 8 | unsigned int nbytes; 9 | int loops; 10 | char * fmt; 11 | char * msg; 12 | double num; 13 | } upro_log_sample_t; 14 | 15 | typedef struct upro_log_s { 16 | unsigned int idx; 17 | unsigned int loops; 18 | unsigned int loop_entries; 19 | unsigned int loop_timers; 20 | 21 | upro_log_sample_t *samples; 22 | } upro_log_t; 23 | 24 | void upro_log_init(upro_log_t *log); 25 | void upro_log_loop_marker(upro_log_t *log); 26 | void upro_log_msg(upro_log_t *log, const char *format, const char *msg, const double num); 27 | void upro_log_timer(upro_log_t *log, const char *format, const char *msg, double timer, unsigned int nbytes, int loops); 28 | void upro_log_print(upro_log_t *log); 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/upro_macros.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_MACROS_H 2 | #define UPRO_MACROS_H 3 | 4 | #include 5 | 6 | /* Boolean */ 7 | #define UPRO_FALSE 0 8 | #define UPRO_TRUE !UPRO_FALSE 9 | #define UPRO_ERROR -1 10 | 11 | /* Architecture */ 12 | #define INTSIZE sizeof(int) 13 | 14 | /* Print macros */ 15 | #define UPRO_INFO 0x1000 16 | #define UPRO_ERR 0X1001 17 | #define UPRO_WARN 0x1002 18 | #define UPRO_BUG 0x1003 19 | 20 | 21 | //#define upro_info(...) upro_print(UPRO_INFO, __VA_ARGS__) 22 | //#define upro_err(...) upro_print(UPRO_ERR, __VA_ARGS__) 23 | //#define upro_warn(...) upro_print(UPRO_WARN, __VA_ARGS__) 24 | //#define upro_trace(...) upro_print(UPRO_WARN, __VA_ARGS__) 25 | #define upro_info printf 26 | #define upro_err printf 27 | #define upro_warn printf 28 | #define upro_trace printf 29 | 30 | /* Transport type */ 31 | #ifndef ARRAY_SIZE 32 | # define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) 33 | #endif 34 | 35 | #ifdef __GNUC__ /* GCC supports this since 2.3. 
*/ 36 | #define PRINTF_WARNINGS(a,b) __attribute__ ((format (printf, a, b))) 37 | #else 38 | #define PRINTF_WARNINGS(a,b) 39 | #endif 40 | 41 | #ifdef __GNUC__ /* GCC supports this since 2.7. */ 42 | #define UNUSED_PARAM __attribute__ ((unused)) 43 | #else 44 | #define UNUSED_PARAM 45 | #endif 46 | 47 | /* 48 | * Validation macros 49 | * ----------------- 50 | * Based on article http://lwn.net/Articles/13183/ 51 | * 52 | * --- 53 | * ChangeSet 1.803, 2002/10/18 16:28:57-07:00, torvalds@home.transmeta.com 54 | * 55 | * Make a polite version of BUG_ON() - WARN_ON() which doesn't 56 | * kill the machine. 57 | * 58 | * Damn I hate people who kill the machine for no good reason. 59 | * --- 60 | * 61 | */ 62 | 63 | #define upro_unlikely(x) __builtin_expect((x),0) 64 | #define upro_likely(x) __builtin_expect((x),1) 65 | #define upro_prefetch(x, ...) __builtin_prefetch(x, __VA_ARGS__) 66 | 67 | #define upro_is_bool(x) ((x == UPRO_TRUE || x == UPRO_FALSE) ? 1 : 0) 68 | 69 | #define upro_bug(condition) do { \ 70 | if (upro_unlikely((condition)!=0)) { \ 71 | upro_print(UPRO_BUG, "Bug found in %s() at %s:%d", \ 72 | __FUNCTION__, __FILE__, __LINE__); \ 73 | abort(); \ 74 | } \ 75 | } while(0) 76 | 77 | /* 78 | * Macros to calculate sub-net data using ip address and sub-net prefix 79 | */ 80 | 81 | #define UPRO_NET_IP_OCTECT(addr,pos) (addr >> (8 * pos) & 255) 82 | #define UPRO_NET_NETMASK(addr,net) htonl((0xffffffff << (32 - net))) 83 | #define UPRO_NET_BROADCAST(addr,net) (addr | ~UPRO_NET_NETMASK(addr,net)) 84 | #define UPRO_NET_NETWORK(addr,net) (addr & UPRO_NET_NETMASK(addr,net)) 85 | #define UPRO_NET_WILDCARD(addr,net) (UPRO_NET_BROADCAST(addr,net) ^ UPRO_NET_NETWORK(addr,net)) 86 | #define UPRO_NET_HOSTMIN(addr,net) net == 31 ? UPRO_NET_NETWORK(addr,net) : (UPRO_NET_NETWORK(addr,net) + 0x01000000) 87 | #define UPRO_NET_HOSTMAX(addr,net) net == 31 ? 
#ifndef UPRO_MEM_H
#define UPRO_MEM_H

#include <stddef.h> /* size_t */

/*
 * ALLOCSZ_ATTR(size_arg_index[, count_arg_index]) tags an allocator with
 * GCC's alloc_size attribute and expands to nothing on compilers that are
 * not known to support it.
 *
 * Two defects of the original test are fixed here:
 *  - it spelled the minor-version macro `__GNUC__MINOR__`; an undefined
 *    identifier evaluates to 0 inside #if, so the minor version was ignored;
 *  - it compared against 430, but with the major*100+minor encoding GCC 4.3
 *    is 403, so every GCC 4.x release was excluded.
 */
#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 403) /* gcc >= 4.3 */
# define ALLOCSZ_ATTR(x,...) __attribute__ ((alloc_size(x, ##__VA_ARGS__)))
#else
# define ALLOCSZ_ATTR(x,...)
#endif

/* Allocation wrappers; each returns NULL when the request cannot be met. */
void *upro_mem_malloc(const size_t size);
void *upro_mem_calloc(const size_t size);
void *upro_mem_realloc(void *ptr, const size_t size);
void upro_mem_free(void *ptr);

#endif
10 | * FIXME: 11 | * 1s = 1000ms (millisecond) 12 | * 1ms = 1000us (microsecond) 13 | * 1us = 1000ns (nanosecond) 14 | * this counter returns in terms of us 15 | */ 16 | 17 | 18 | typedef struct upro_timer_s { 19 | uint64_t freq; 20 | uint64_t clocks; 21 | uint64_t start; 22 | } upro_timer_t; 23 | 24 | int upro_timer_init(); 25 | int upro_timer_start(upro_timer_t *timer); 26 | int upro_timer_restart(upro_timer_t *timer); 27 | int upro_timer_stop(upro_timer_t *timer); 28 | int upro_timer_reset(upro_timer_t *timer); 29 | double upro_timer_get_total_time(upro_timer_t *timer); 30 | double upro_timer_get_elapsed_time(upro_timer_t *timer); 31 | 32 | #endif 33 | 34 | -------------------------------------------------------------------------------- /src/include/upro_transworker.h: -------------------------------------------------------------------------------- 1 | #ifndef UPRO_TRANSWORKER_H 2 | #define UPRO_TRANSWORKER_H 3 | 4 | typedef struct upro_transworker_context_s { 5 | int core_id; 6 | int initialized; 7 | } upro_transworker_context_t; 8 | 9 | void upro_transworker_main(upro_transworker_context_t *context); 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /src/upro_log.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "upro_log.h" 6 | #include "upro_memory.h" 7 | #include "upro_config.h" 8 | 9 | extern upro_config_t *config; 10 | 11 | #define LOG_PRINT 1 12 | 13 | // Add "num" variable based on original version 14 | void upro_sample_set_msg(upro_log_sample_t *sample, const char *fmt, const char *msg, double num) 15 | { 16 | sample->isMsg = 1; 17 | 18 | sample->fmt = upro_mem_malloc(strlen(fmt)+1); 19 | strcpy(sample->fmt, fmt); 20 | 21 | sample->msg = upro_mem_malloc(strlen(msg)+1); 22 | strcpy(sample->msg, msg); 23 | 24 | sample->num = num; 25 | } 26 | 27 | void upro_sample_set_timer(upro_log_sample_t *sample, const char 
*fmt, const char *msg, double timer, unsigned int nbytes, int loops) 28 | { 29 | sample->isMsg = 0; 30 | sample->timer = timer; 31 | 32 | if (loops != 0) sample->loops = loops; 33 | if (nbytes > 0) sample->nbytes = nbytes; 34 | 35 | if (strlen(msg) > 0) { 36 | sample->fmt = upro_mem_malloc(strlen( fmt ) + 1); 37 | strcpy(sample->fmt, fmt); 38 | } 39 | 40 | if (strlen(msg) > 0) { 41 | sample->msg = upro_mem_malloc(strlen( msg ) + 1); 42 | strcpy(sample->msg, msg); 43 | } 44 | } 45 | 46 | void upro_sample_print(upro_log_sample_t *sample) 47 | { 48 | if(sample->isMsg == 1) { 49 | printf(sample->fmt, sample->msg, sample->num); 50 | } else { 51 | double bwd = (((double) sample->nbytes * sample->loops )/ sample->timer) / 1e9; 52 | printf(sample->fmt, sample->msg, sample->timer, bwd) ; 53 | } 54 | } 55 | 56 | /* ---------------------------------------------------------------------- */ 57 | 58 | void upro_log_init(upro_log_t *log) 59 | { 60 | log->idx = 0; 61 | log->loops = 0; 62 | log->loop_entries = 0; 63 | log->loop_timers = 0; 64 | log->samples = upro_mem_malloc(config->log_sample_num * sizeof(upro_log_sample_t)); 65 | } 66 | 67 | void upro_log_loop_marker(upro_log_t *log) 68 | { 69 | log->loop_timers = 0; 70 | log->loops ++; 71 | #if defined(LOG_PRINT) 72 | printf("\n---------------------------%d\n", log->loops); 73 | #elif defined(NO_PRINT) 74 | return; 75 | #endif 76 | } 77 | 78 | void upro_log_msg(upro_log_t *log, const char *format, const char *msg, const double num) 79 | { 80 | #if defined(LOG_PRINT) 81 | printf(format, msg, num); 82 | #elif defined(NO_PRINT) 83 | return; 84 | #else 85 | upro_sample_set_msg(&(log->samples[log->idx ++]), format, msg, num); 86 | log->loop_entries ++; 87 | #endif 88 | } 89 | 90 | void upro_log_timer(upro_log_t *log, const char *format, const char *msg, double timer, unsigned int nbytes, int loops) 91 | { 92 | upro_sample_set_timer(&(log->samples[log->idx ++]), format, msg, timer, nbytes, loops); 93 | log->loop_entries ++; 94 | 
log->loop_timers ++; 95 | } 96 | 97 | void upro_log_print(upro_log_t *log) 98 | { 99 | int i; 100 | 101 | for(i = 0; i < log->loop_entries; i++) { 102 | upro_sample_print(&(log->samples[i])); 103 | } 104 | } 105 | 106 | #if 0 107 | void upro_log_print_summary(upro_log_t *log, int skip) 108 | { 109 | int i, nl, current; 110 | double sum; 111 | 112 | for(i = 0; i < log->loop_entries; i++) 113 | { 114 | if(log->samples[i].isMsg()) 115 | { 116 | bool foundError = false; 117 | 118 | for(nl = 0; nl < log->loops; nl++) 119 | { 120 | current = i + nl * log->loop_entries; 121 | 122 | if(log->samples[current].isErr()) 123 | { 124 | log->samples[current].printSample(); 125 | foundError = true; 126 | break; 127 | } 128 | } 129 | 130 | if(!foundError) 131 | log->samples[i].printSample(); 132 | } 133 | else 134 | { 135 | sum = 0; 136 | 137 | for(nl = skip; nl < log->loops; nl++) 138 | { 139 | sum += log->samples[i + nl * log->loop_entries].getTimer(); 140 | } 141 | 142 | log->samples[i].setTimer("", "", sum / (log->loops-skip), 0, 0); 143 | log->samples[i].printSample(); 144 | } 145 | } 146 | } 147 | #endif 148 | -------------------------------------------------------------------------------- /src/upro_memory.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "upro_memory.h" 7 | #include "upro_macros.h" 8 | 9 | ALLOCSZ_ATTR(1) 10 | void *upro_mem_malloc(const size_t size) 11 | { 12 | void *aux = malloc(size); 13 | 14 | if (upro_unlikely(!aux && size)) { 15 | perror("malloc"); 16 | return NULL; 17 | } 18 | 19 | return aux; 20 | } 21 | 22 | ALLOCSZ_ATTR(1) 23 | void *upro_mem_calloc(const size_t size) 24 | { 25 | void *buf = calloc(1, size); 26 | if (upro_unlikely(!buf)) { 27 | return NULL; 28 | } 29 | 30 | return buf; 31 | } 32 | 33 | ALLOCSZ_ATTR(2) 34 | void *upro_mem_realloc(void *ptr, const size_t size) 35 | { 36 | void *aux = realloc(ptr, size); 37 | 38 | if 
(upro_unlikely(!aux && size)) { 39 | perror("realloc"); 40 | return NULL; 41 | } 42 | 43 | return aux; 44 | } 45 | 46 | void upro_mem_free(void *ptr) 47 | { 48 | free(ptr); 49 | } 50 | -------------------------------------------------------------------------------- /src/upro_timer.c: -------------------------------------------------------------------------------- 1 | #include "upro_timer.h" 2 | 3 | #include 4 | #include 5 | 6 | int upro_timer_init(upro_timer_t *timer) 7 | { 8 | timer->freq = 1000; 9 | 10 | return 0; 11 | } 12 | 13 | int upro_timer_start(upro_timer_t *timer) 14 | { 15 | struct timespec s; 16 | clock_gettime(CLOCK_REALTIME, &s); 17 | timer->start = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 18 | 19 | return 0; 20 | } 21 | 22 | int upro_timer_restart(upro_timer_t *timer) 23 | { 24 | struct timespec s; 25 | clock_gettime(CLOCK_REALTIME, &s); 26 | timer->start = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 27 | 28 | timer->clocks = 0; 29 | 30 | return 0; 31 | } 32 | 33 | int upro_timer_stop(upro_timer_t *timer) 34 | { 35 | uint64_t n; 36 | 37 | struct timespec s; 38 | clock_gettime(CLOCK_REALTIME, &s); 39 | n = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 40 | 41 | n -= timer->start; 42 | timer->start = 0; 43 | timer->clocks += n; 44 | 45 | return 0; 46 | } 47 | 48 | int upro_timer_reset(upro_timer_t *timer) 49 | { 50 | timer->clocks = 0; 51 | 52 | return 0; 53 | } 54 | 55 | double upro_timer_get_total_time(upro_timer_t *timer) 56 | { 57 | //returns millisecond as unit -- second * 1000 58 | return (double)(timer->clocks) / (double) 1e6; 59 | } 60 | 61 | double upro_timer_get_elapsed_time(upro_timer_t *timer) 62 | { 63 | uint64_t n; 64 | 65 | struct timespec s; 66 | clock_gettime(CLOCK_REALTIME, &s); 67 | n = (uint64_t)s.tv_sec * 1e9 + (uint64_t)s.tv_nsec; 68 | 69 | return (double)(n - timer->start) / (double) 1e6; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /src/upro_transworker.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "upro_transworker.h" 13 | #include "upro_config.h" 14 | #include "upro_memory.h" 15 | #include "upro_macros.h" 16 | #include "upro_batch.h" 17 | #include "psio.h" 18 | 19 | extern upro_config_t *config; 20 | extern pthread_mutex_t mutex_worker_init; 21 | 22 | int upro_transworker_init(upro_transworker_context_t *context) 23 | { 24 | unsigned long mask = 1 << context->core_id; 25 | 26 | if (sched_setaffinity(0, sizeof(unsigned long), (cpu_set_t *)&mask) < 0) { 27 | upro_err("Err set affinity in transworker\n"); 28 | exit(0); 29 | } 30 | 31 | pthread_mutex_lock(&mutex_worker_init); 32 | context->initialized = 1; 33 | pthread_mutex_unlock(&mutex_worker_init); 34 | 35 | return 0; 36 | } 37 | 38 | void upro_transworker_main(upro_transworker_context_t *context) 39 | { 40 | struct ps_handle handle; 41 | struct ps_chunk chunk; 42 | struct ps_queue queue; 43 | int ret, i; 44 | 45 | upro_transworker_init(context); 46 | 47 | /* handle and queue init */ 48 | assert(ps_init_handle(&handle) == 0); 49 | 50 | /* attach all the queue in the client side */ 51 | for (i = 0; i < config->client_device.num_rx_queues; i ++) { 52 | queue.ifindex = config->client_ifindex; 53 | queue.qidx = i; 54 | assert(ps_attach_rx_device(&handle, &queue) == 0); 55 | } 56 | 57 | assert(ps_alloc_chunk(&handle, &chunk) == 0); 58 | chunk.recv_blocking = 1; 59 | 60 | printf("Transworker is working ...\n"); 61 | 62 | /* receive and forward */ 63 | for (;;) { 64 | chunk.cnt = 16; 65 | chunk.queue.ifindex = config->client_ifindex; 66 | 67 | ret = ps_recv_chunk(&handle, &chunk); 68 | if (ret < 0) { 69 | if (errno == EINTR) 70 | continue; 71 | if (!chunk.recv_blocking && errno == EWOULDBLOCK) 72 | break; 73 | assert(0); 74 | } 75 | 76 | chunk.cnt = ret; 77 | chunk.queue.ifindex = 
config->server_ifindex; 78 | 79 | ret = ps_send_chunk(&handle, &chunk); 80 | assert(ret >= 0); 81 | } 82 | 83 | return; 84 | } 85 | --------------------------------------------------------------------------------