├── .gitignore ├── .idea ├── misc.xml ├── modules.xml ├── rnn_benchmarks.iml ├── vcs.xml └── workspace.xml ├── 1x320-LSTM ├── __init__.py ├── bench_keras-tensorflow_LSTM.py ├── bench_keras-tensorflow_cudnnLSTM.py ├── bench_keras-theano_LSTM.py ├── bench_lasagne_LSTMLayer.py ├── bench_pytorch_LSTMCell-basic.py ├── bench_pytorch_LSTMCell-fused.py ├── bench_pytorch_cudnnLSTM.py ├── bench_tensorflow_LSTMBlockCell.py ├── bench_tensorflow_LSTMBlockFusedCell.py ├── bench_tensorflow_LSTMCell.py ├── bench_tensorflow_cudnnLSTM.py └── lib_pytorchLSTM.py ├── 4x320-LSTM ├── __init__.py ├── bench_lasagne_LSTMLayer.py ├── bench_pytorch_cudnnLSTM.py ├── bench_tensorflow_LSTMBlockCell.py ├── bench_tensorflow_LSTMCell.py └── bench_tensorflow_cudnnLSTM.py ├── 4x320-LSTM_ctc ├── __init__.py ├── bench_lasagne_LSTMLayer.py ├── bench_pytorch_cudnnLSTM.py ├── bench_tensorflow_LSTMBlockCell.py └── bench_tensorflow_LSTMCell.py ├── README.md ├── main ├── framework_comparison │ ├── main.py │ └── plot.py └── pytorch_comparison │ ├── main.py │ ├── plot.py │ └── unifier.py ├── results └── 10 │ ├── framework_comparison │ ├── 1x320-LSTM_cross-entropy.png │ ├── 1x320-LSTM_cross-entropy_100.png │ ├── 4x320-BIDIR-LSTM_CTC.png │ ├── 4x320-BIDIR-LSTM_cross-entropy.png │ └── readme.md │ └── pytorch_comparison │ ├── 1x320-LSTM_cross-entropy.png │ ├── 1x320-LSTM_cross-entropy_100.png │ ├── 4x320-BIDIR-LSTM_CTC.png │ ├── 4x320-BIDIR-LSTM_cross-entropy.png │ ├── readme.md │ └── results.csv ├── support.py └── utils ├── analyse_pandas.py ├── disable_cores.sh ├── enable_cores.sh ├── plot_all.sh └── rm_results.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.pdf 3 | *.pyc 4 | 5 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/rnn_benchmarks.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 119 | 120 | 121 | 122 | gru 123 | GRUC 124 | res 125 | cuda 126 | LSTMCw 127 | LSTMC 128 | rnn_size 129 | sync 130 | memor 131 | params 132 | seq 133 | pack 134 | mask 135 | h2 136 | CTC 137 | max_le 138 | seq_len 139 | seqlen 140 | 141 | 142 | 143 | 145 | 146 | 193 | 194 | 195 | 196 | 197 | true 198 | DEFINITION_ORDER 199 | 200 | 201 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 229 | 230 | 233 | 234 | 235 | 236 | 239 | 240 | 243 | 244 | 247 | 248 | 249 | 250 | 253 | 254 | 257 | 258 | 261 | 262 | 263 | 264 | 267 | 268 | 271 | 272 | 275 | 
276 | 277 | 278 | 281 | 282 | 285 | 286 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 324 | 325 | 326 | 327 | 345 | 346 | 364 | 365 | 383 | 384 | 402 | 403 | 422 | 423 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 472 | 473 | 486 | 487 | 505 | 506 | 518 | 519 | project 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 554 | 555 | 571 | 572 | 587 | 588 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 639 | 640 | 641 | 643 | 644 | 645 | 646 | 1486730635354 647 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 681 | 682 | 685 | 688 | 689 | 690 | 692 | 693 | 694 | 695 | 696 | file://$PROJECT_DIR$/1x320-GRU/bench_lasagne.py 697 | 18 698 | 700 | 701 | file://$PROJECT_DIR$/support.py 702 | 106 703 | 705 | 706 | file://$PROJECT_DIR$/support.py 707 | 103 708 | 710 | 711 | file://$PROJECT_DIR$/4x320-LSTM/bench_pytorch.py 712 | 38 713 | 715 | 716 | file://$PROJECT_DIR$/4x320-LSTM/bench_tensorflow.py 717 | 60 718 | 720 | 721 | file://$PROJECT_DIR$/4x320-LSTM/bench_pytorch.py 722 | 18 723 | 725 | 726 | file://$PROJECT_DIR$/support.py 727 | 93 728 | 730 | 731 | file://$USER_HOME$/envs/pytorch_latest/lib/python2.7/site-packages/warpctc_pytorch/__init__.py 732 | 25 733 | 735 | 736 | file://$PROJECT_DIR$/4x320-LSTM_ctc/bench_pytorch.py 737 | 83 738 | 740 | 741 | file://$PROJECT_DIR$/4x320-LSTM_ctc/bench_tensorflow.py 742 | 86 743 | 745 | 746 | 747 | 748 | 749 | 751 | 752 | 753 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 
789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 853 | 854 | 855 | 856 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | 1027 | 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 1034 | 1035 | 1036 | 1037 | 1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1044 | -------------------------------------------------------------------------------- /1x320-LSTM/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/1x320-LSTM/__init__.py -------------------------------------------------------------------------------- 
import os
import time as timer

import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model
from keras.utils import to_categorical

from support import toy_batch, default_params, write_results, print_results, check_results

# Benchmark script: trains an LSTM + softmax classifier with Keras on a fixed toy
# batch, timing every train_on_batch call and handing the timings to the shared
# helpers in `support`.

# Experiment_type
bench = 'keras-{}_LSTM'.format(keras.backend.backend())  # active Keras backend name is filled in at run time
version = keras.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (the time axis is left unspecified)
x = Input(shape=(None, inp_dims), dtype='float32', name='input')

# Create network: LSTM that returns only its last output, then a bias-free softmax projection
fw_cell = LSTM(rnn_size, return_sequences=False, implementation=2)(x)
h3 = Dense(classes, activation='softmax', use_bias=False)(fw_cell)
model = Model(inputs=x, outputs=h3)
start = timer.perf_counter()
# Pass the shared learning rate explicitly. The string shortcut 'Adam' would
# silently use Keras' built-in default and ignore default_params(), unlike the
# other framework benchmarks, which all feed `learning_rate` to Adam.
# NOTE(review): `lr` is the Keras 2.x keyword; newer releases call it `learning_rate`.
model.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss='categorical_crossentropy')
end = timer.perf_counter()
print('>>> Model compilation took {:.1f} seconds'.format(end - start))

# Print parameter count
params = model.count_params()
print('# network parameters: ' + str(params))

# Check for correct sizes
assert (model.layers[-1].input_shape == (None, rnn_size))  # final projection input size (rnn_size)
assert (model.layers[-1].get_weights()[0].shape == (rnn_size, classes))  # final projection output size (rnn_size, classes)
output = model.predict(bX)
assert (output.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for i in range(batches):
    batch_start = timer.perf_counter()
    # The one-hot conversion of the labels is deliberately left inside the
    # timed region, exactly as in the original measurement.
    loss = model.train_on_batch(x=bX, y=to_categorical(bY, num_classes=classes))
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(loss)
train_end = timer.perf_counter()

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
import os
import time as timer

import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model
from keras.utils import to_categorical

from support import toy_batch, default_params, write_results, print_results, check_results

# Benchmark script: trains an LSTM + softmax classifier with Keras on a fixed toy
# batch and records the wall-clock time of every train_on_batch call. The backend
# name reported in `bench` is whatever keras.backend.backend() returns at run time.

# Experiment_type
bench = 'keras-{}_LSTM'.format(keras.backend.backend())
version = keras.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (the time axis is left unspecified)
x = Input(shape=(None, inp_dims), dtype='float32', name='input')

# Create network: LSTM that returns only its last output, then a bias-free softmax projection
fw_cell = LSTM(rnn_size, return_sequences=False, implementation=2)(x)
h3 = Dense(classes, activation='softmax', use_bias=False)(fw_cell)
model = Model(inputs=x, outputs=h3)
start = timer.perf_counter()
# Build Adam with the learning rate from default_params(). Compiling with the
# plain string 'Adam' would leave `learning_rate` unused and fall back to the
# Keras default, making this run inconsistent with the other benchmarks.
# NOTE(review): `lr` is the Keras 2.x keyword; newer releases call it `learning_rate`.
model.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss='categorical_crossentropy')
end = timer.perf_counter()
print('>>> Model compilation took {:.1f} seconds'.format(end - start))

# Print parameter count
params = model.count_params()
print('# network parameters: ' + str(params))

# Check for correct sizes
assert (model.layers[-1].input_shape == (None, rnn_size))  # final projection input size (rnn_size)
assert (model.layers[-1].get_weights()[0].shape == (rnn_size, classes))  # final projection output size (rnn_size, classes)
output = model.predict(bX)
assert (output.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for i in range(batches):
    batch_start = timer.perf_counter()
    # Label one-hot encoding stays inside the timed region, as in the original.
    loss = model.train_on_batch(x=bX, y=to_categorical(bY, num_classes=classes))
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(loss)
train_end = timer.perf_counter()

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
= '1x320-LSTM_cross-entropy' 14 | 15 | # Get data 16 | bX, b_lenX, bY, classes = toy_batch() 17 | batch_size, seq_len, inp_dims = bX.shape 18 | rnn_size, learning_rate, batches = default_params() 19 | 20 | # Create symbolic vars 21 | x = T.ftensor3('x') 22 | y = T.ivector('y') 23 | 24 | # Create network 25 | network = lasagne.layers.InputLayer(shape=(None, None, inp_dims), input_var=x) # Input layer 26 | network = lasagne.layers.LSTMLayer(network, num_units=rnn_size, hid_init=lasagne.init.GlorotUniform()) # RNN layer 27 | network = lasagne.layers.SliceLayer(network, -1, axis=1) # slice last time step 28 | network = lasagne.layers.DenseLayer(network, num_units=classes, nonlinearity=lasagne.nonlinearities.softmax, 29 | b=None) # Output projection 30 | 31 | # Print parameter count 32 | params = lasagne.layers.count_params(network) 33 | print('>>> # network parameters: ' + str(params)) 34 | 35 | # Create loss, optimizer and train function 36 | prediction = lasagne.layers.get_output(network) 37 | loss = lasagne.objectives.categorical_crossentropy(predictions=prediction, targets=y) 38 | loss = loss.mean() 39 | update_params = lasagne.layers.get_all_params(network, trainable=True) 40 | updates = lasagne.updates.adam(loss, update_params, learning_rate=learning_rate) 41 | fn_inputs = [x, y] 42 | prediction_det = lasagne.layers.get_output(network, deterministic=True) 43 | 44 | start = timer.perf_counter() 45 | train_fn = theano.function(fn_inputs, loss, updates=updates) 46 | output_fn = theano.function([x], prediction_det) 47 | end = timer.perf_counter() 48 | print('>>> Theano function compilation took {:.1f} seconds'.format(end - start)) 49 | 50 | # Check for correct sizes 51 | assert (network.input_shape == (None, rnn_size)) # final projection input size (Batch_size x rnn_size) 52 | assert (network.W.eval().shape == (rnn_size, classes)) # final projection kernel size (rnn_size x classes) 53 | output = output_fn(bX) 54 | output_fn.sync_shared() 55 | assert (output.shape == 
class Net(nn.Module):
    """LSTM cell unrolled step by step in Python, followed by a bias-free linear layer.

    The layer sizes are taken from the module-level ``inp_dims``, ``rnn_size``
    and ``classes``; the recurrent cell is the project-local
    ``libLSTM.LSTMCell``.
    """

    def __init__(self):
        super().__init__()
        self.lstm = libLSTM.LSTMCell(input_size=inp_dims, hidden_size=rnn_size, bias=True)
        self.fc = nn.Linear(rnn_size, classes, bias=False)

    def forward(self, x):
        """Project the hidden state of the final time step.

        ``x`` is consumed one slice at a time along its first axis; its size
        is unpacked as (steps, batch, features).
        """
        num_steps, num_seqs, _ = x.size()

        # Zero-initialised hidden and cell state, created and then moved to the GPU.
        hidden = Variable(torch.zeros(num_seqs, rnn_size)).cuda()
        cell = Variable(torch.zeros(num_seqs, rnn_size)).cuda()

        per_step = []
        for step in range(num_steps):
            hidden, cell = self.lstm(x[step], (hidden, cell))
            per_step.append(hidden)

        # All step outputs are stacked first; only the last step is projected.
        stacked = torch.stack(per_step)
        return self.fc(stacked[-1, :, :])
class Net(nn.Module):
    """``nn.LSTMCell`` unrolled over time in a Python loop, plus a bias-free linear layer.

    Sizes come from the module-level ``inp_dims``, ``rnn_size`` and ``classes``.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTMCell(input_size=inp_dims, hidden_size=rnn_size, bias=True)
        self.fc = nn.Linear(rnn_size, classes, bias=False)

    def forward(self, x):
        """Run the cell over every slice of ``x`` and project the last hidden state.

        The size of ``x`` is unpacked as (steps, batch, features) and ``x`` is
        indexed along its first axis.
        """
        num_steps, num_seqs, _ = x.size()

        # Initial hidden / cell state: zeros, created and then moved to the GPU.
        hidden = Variable(torch.zeros(num_seqs, rnn_size)).cuda()
        cell = Variable(torch.zeros(num_seqs, rnn_size)).cuda()

        per_step = []
        for step in range(num_steps):
            hidden, cell = self.lstm(x[step], (hidden, cell))
            per_step.append(hidden)

        # Stack the hidden state of every step, then keep only the final one.
        stacked = torch.stack(per_step)
        return self.fc(stacked[-1, :, :])
class Net(nn.Module):
    """One-layer, unidirectional ``nn.LSTM`` followed by a bias-free linear layer.

    Sizes come from the module-level ``inp_dims``, ``rnn_size`` and ``classes``.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=inp_dims, hidden_size=rnn_size, num_layers=1, bias=True, bidirectional=False)
        self.fc = nn.Linear(rnn_size, classes, bias=False)

    def forward(self, x):
        """Feed ``x`` through the LSTM and project the output at the last index of the first axis."""
        # The returned (h_n, c_n) state tuple is not needed here.
        outputs, _ = self.lstm(x)
        return self.fc(outputs[-1, :, :])
import os
import time as timer

import tensorflow as tf

from support import toy_batch, default_params, write_results, print_results, check_results

# Benchmark script: builds an LSTMBlockCell network with a bias-free dense
# projection in TensorFlow graph mode, trains it on a fixed toy batch and records
# the wall-clock time of every training step.

# Experiment_type
bench = 'tensorflow_LSTMBlockCell'
version = tf.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars
x = tf.placeholder(tf.float32, [None, None, inp_dims])
seq_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network
fw_cell = tf.contrib.rnn.LSTMBlockCell(rnn_size)
h1, _ = tf.nn.dynamic_rnn(cell=fw_cell, inputs=x, sequence_length=seq_len, dtype=tf.float32)
h2 = h1[:, -1, :]  # output at the last index of the second axis
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = False  # dynamic allocation of VRAM

# Print parameter count: total number of elements over all trainable variables.
params = sum(variable.get_shape().num_elements() for variable in tf.trainable_variables())
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)
    # Check for correct sizes (public TensorShape API instead of the private
    # Tensor._shape_as_list helper)
    assert (h2.shape.as_list() == [None, rnn_size])  # final projection input size (rnn_size)
    assert (tf.trainable_variables(scope='dense/kernel:0')[0].shape.as_list() == [rnn_size, classes])  # final projection output size (rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX})
    assert (output.shape == (batch_size, classes))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(batches):
        batch_start = timer.perf_counter()
        # sess.run returns only after the fetched loss value is available on the host.
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

# Results handling
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
tf.global_variables_initializer() 40 | config = tf.ConfigProto() 41 | # config.gpu_options.allow_growth = False # dynamic allocation of VRAM 42 | 43 | # Print parameter count 44 | params = 0 45 | for variable in tf.trainable_variables(): 46 | # shape is an array of tf.Dimension 47 | shape = variable.get_shape() 48 | variable_parameters = 1 49 | for dim in shape: 50 | variable_parameters *= dim.value 51 | params += variable_parameters 52 | print('# network parameters: ' + str(params)) 53 | 54 | with tf.Session(config=config) as sess: 55 | sess.run(init) 56 | # Check for correct sizes 57 | assert (h2._shape_as_list() == [None, rnn_size]) # final projection input size (rnn_size) 58 | assert (tf.trainable_variables(scope='dense/kernel:0')[0].shape.as_list() == [rnn_size, classes]) # final projection output size (rnn_size, classes) 59 | output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX}) 60 | assert (output.shape == (batch_size, classes)) 61 | 62 | # Start training 63 | batch_time = [] 64 | batch_loss = [] 65 | train_start=timer.perf_counter() 66 | for i in range(batches): 67 | batch_start = timer.perf_counter() 68 | _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX}) 69 | batch_end = timer.perf_counter() 70 | batch_time.append(batch_end - batch_start) 71 | batch_loss.append(loss_val) 72 | train_end = timer.perf_counter() 73 | 74 | # Results handling 75 | print_results(batch_time) 76 | check_results(batch_loss, batch_time, train_start, train_end) 77 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params, 78 | run_time=batch_time, version=version) 79 | -------------------------------------------------------------------------------- /1x320-LSTM/bench_tensorflow_LSTMCell.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time as timer 3 | 4 | import tensorflow as tf 5 | 6 | from support import toy_batch, 
# 1x320-LSTM/bench_tensorflow_LSTMCell.py
#
# Benchmark script: one tf.nn.rnn_cell.LSTMCell layer unrolled with
# tf.nn.dynamic_rnn, followed by a bias-free dense projection and trained with
# sparse softmax cross-entropy on the toy batch from `support.toy_batch()`.
import os
import time as timer

import tensorflow as tf

from support import toy_batch, default_params, write_results, print_results, check_results

# Experiment_type
bench = 'tensorflow_LSTMCell'
version = tf.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars
x = tf.placeholder(tf.float32, [None, None, inp_dims])
seq_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network
fw_cell = tf.nn.rnn_cell.LSTMCell(rnn_size)
h1, _ = tf.nn.dynamic_rnn(cell=fw_cell, inputs=x, sequence_length=seq_len, dtype=tf.float32)
h2 = h1[:, -1, :]  # output at the last index of the second (time) axis
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = False  # dynamic allocation of VRAM

# Print parameter count: sum of the element counts of all trainable variables
params = 0
for variable in tf.trainable_variables():
    params += variable.get_shape().num_elements()
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)

    # Check for correct sizes
    assert (h2.shape.as_list() == [None, rnn_size])  # final projection input size (rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert (dense_kernel.shape.as_list() == [rnn_size, classes])  # final projection kernel size (rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX})
    assert (output.shape == (batch_size, classes))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for _ in range(batches):
        batch_start = timer.perf_counter()
        # Fetching the loss value makes the call return only after the step ran.
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
# 1x320-LSTM/bench_tensorflow_cudnnLSTM.py
#
# Benchmark script: one-layer tf.contrib.cudnn_rnn.CudnnLSTM followed by a
# bias-free dense projection, trained with sparse softmax cross-entropy on the
# toy batch from `support.toy_batch()`.
import os
import time as timer

import numpy as np
import tensorflow as tf

from support import toy_batch, default_params, write_results, print_results, check_results

# Experiment_type
bench = 'tensorflow_cudnnLSTM'
version = tf.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data (batch_size is read while bX is still batch-major)
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# cudnn compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))

# Create symbolic vars
x = tf.placeholder(tf.float32, [None, None, inp_dims])
seq_len = tf.placeholder(tf.int32, [None])  # fed below but not consumed by the cuDNN layer
y = tf.placeholder(tf.int32, [None])

# Create network
cudnn_lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=1, num_units=rnn_size)
h1, _ = cudnn_lstm(x)
h2 = h1[-1, :, :]  # output at the last index of the leading (time) axis
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = False  # dynamic allocation of VRAM

# Print parameter count
params = 0
for variable in tf.trainable_variables():
    if 'cudnn_lstm' in str(variable):
        # NOTE(review): presumably the cuDNN layer keeps its weights in a
        # variable whose static shape cannot be used for counting, hence the
        # canonical shapes are summed instead -- confirm against the TF docs.
        params += np.sum(cudnn_lstm.canonical_bias_shapes)
        params += np.sum([rows * cols for rows, cols in cudnn_lstm.canonical_weight_shapes])
    else:
        params += variable.get_shape().num_elements()
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)

    # Check for correct sizes
    assert (h2.shape.as_list() == [None, rnn_size])  # final projection input size (rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert (dense_kernel.shape.as_list() == [rnn_size, classes])  # final projection kernel size (rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX})
    assert (output.shape == (batch_size, classes))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for _ in range(batches):
        batch_start = timer.perf_counter()
        # Fetching the loss value makes the call return only after the step ran.
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
"""copied together from pytorch/nn/modules/rnn.py, pytorch/nn/_functions/rnn.py"""

import math

import torch
import torch.nn.functional as F
from torch.nn import Parameter


class RNNCellBase(torch.nn.Module):
    """Shared ``repr`` support and argument validation for RNN cells.

    Subclasses are expected to set ``input_size`` and ``hidden_size``.
    """

    def extra_repr(self):
        """Return the argument summary that ``repr(module)`` displays."""
        s = '{input_size}, {hidden_size}'
        if 'bias' in self.__dict__ and self.bias is not True:
            s += ', bias={bias}'
        if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
            s += ', nonlinearity={nonlinearity}'
        return s.format(**self.__dict__)

    def check_forward_input(self, input):
        """Raise ``RuntimeError`` if ``input.size(1)`` is not ``input_size``."""
        if input.size(1) != self.input_size:
            raise RuntimeError(
                "input has inconsistent input_size: got {}, expected {}".format(
                    input.size(1), self.input_size))

    def check_forward_hidden(self, input, hx, hidden_label=''):
        """Raise ``RuntimeError`` if ``hx`` does not match the batch or hidden size.

        ``hidden_label`` is only used to identify the offending state in the
        error message.
        """
        if input.size(0) != hx.size(0):
            raise RuntimeError(
                "Input batch size {} doesn't match hidden{} batch size {}".format(
                    input.size(0), hidden_label, hx.size(0)))

        if hx.size(1) != self.hidden_size:
            raise RuntimeError(
                "hidden{} has inconsistent hidden_size: got {}, expected {}".format(
                    hidden_label, hx.size(1), self.hidden_size))


class LSTMCell(RNNCellBase):
    r"""A long short-term memory (LSTM) cell.

    .. math::

        \begin{array}{ll}
        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f * c + i * g \\
        h' = o \tanh(c') \\
        \end{array}

    where :math:`\sigma` is the sigmoid function.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If `False`, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, (h_0, c_0)
        - **input** of shape `(batch, input_size)`: tensor containing input features
        - **h_0** of shape `(batch, hidden_size)`: tensor containing the initial hidden
          state for each element in the batch.
        - **c_0** of shape `(batch, hidden_size)`: tensor containing the initial cell state
          for each element in the batch.

        If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.

    Outputs: h_1, c_1
        - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state
          for each element in the batch
        - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state
          for each element in the batch

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(4*hidden_size x input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(4*hidden_size x hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`

    Examples::

        >>> rnn = nn.LSTMCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> cx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
        ...     hx, cx = rnn(input[i], (hx, cx))
        ...     output.append(hx)
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        # The four gates (in, forget, cell, out) are stacked along dim 0.
        self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.Tensor(4 * hidden_size))
            self.bias_hh = Parameter(torch.Tensor(4 * hidden_size))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Initialise every parameter uniformly in ``[-1/sqrt(hidden_size), 1/sqrt(hidden_size)]``."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, input, hx=None):
        """Run one LSTM step and return ``(h_1, c_1)``.

        Args:
            input: tensor of shape ``(batch, input_size)``.
            hx: optional ``(h_0, c_0)`` pair, each ``(batch, hidden_size)``.
                When omitted, both states default to zeros with the dtype and
                device of ``input``, as the class docstring promises.

        Raises:
            RuntimeError: if the input or state sizes are inconsistent.
        """
        self.check_forward_input(input)
        if hx is None:
            zeros = input.new_zeros(input.size(0), self.hidden_size)
            hx = (zeros, zeros)
        self.check_forward_hidden(input, hx[0], '[0]')
        self.check_forward_hidden(input, hx[1], '[1]')
        return self.LSTMCell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )

    def LSTMCell(self, input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
        """Compute the LSTM update from explicit weights; return ``(hy, cy)``."""
        hx, cx = hidden
        gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

        # Split the stacked pre-activations into the four gates along dim 1.
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh.
        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)

        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * torch.tanh(cy)

        return hy, cy
# 4x320-LSTM/bench_lasagne_LSTMLayer.py
#
# Benchmark script: four stacked bidirectional Lasagne LSTM layers followed by
# a bias-free softmax projection of the last time step, trained with
# categorical cross-entropy on the toy batch from `support.toy_batch()`.
import os
import time as timer

import lasagne
import theano
import theano.tensor as T

from support import toy_batch, default_params, write_results, print_results, check_results

# Experiment_type
bench = 'lasagne_LSTMLayer'
version = lasagne.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Get data
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars
x = T.ftensor3('x')
y = T.ivector('y')


def get_bench_net_lstm(input_var, inp_dim, rnn_size):
    """Build the 4-layer bidirectional LSTM classifier and return its output layer.

    Args:
        input_var: symbolic 3-D input bound to the network's InputLayer.
        inp_dim: size of the last (feature) axis of the input.
        rnn_size: number of units of every forward and every backward LSTM.

    Returns:
        The final DenseLayer (softmax, no bias). Its width is taken from the
        module-level ``classes``.
    """
    # Input layer; batch and sequence axes are left unspecified (None).
    layer = lasagne.layers.InputLayer(shape=(None, None, inp_dim), input_var=input_var)

    # RNN layers: each level runs a forward and a backward LSTM over the same
    # input and concatenates their outputs along the feature axis.
    for _ in range(4):
        forward = lasagne.layers.LSTMLayer(layer, num_units=rnn_size,
                                           hid_init=lasagne.init.GlorotUniform())
        backward = lasagne.layers.LSTMLayer(layer, num_units=rnn_size,
                                            hid_init=lasagne.init.GlorotUniform(), backwards=True)
        layer = lasagne.layers.ConcatLayer([forward, backward], axis=2)

    # Keep only the last index of axis 1, then project to the classes.
    last_step = lasagne.layers.SliceLayer(layer, -1, axis=1)
    return lasagne.layers.DenseLayer(last_step, num_units=classes,
                                     nonlinearity=lasagne.nonlinearities.softmax, b=None)


# Create network
network = get_bench_net_lstm(x, inp_dims, rnn_size)

# Print parameter count
params = lasagne.layers.count_params(network)
print('>>> # network parameters: ' + str(params))

# Create loss, optimizer and train function
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(predictions=prediction, targets=y)
loss = loss.mean()

update_params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(loss, update_params, learning_rate=learning_rate)

fn_inputs = [x, y]

start = timer.perf_counter()
train_fn = theano.function(fn_inputs, loss, updates=updates)
prediction_det = lasagne.layers.get_output(network, deterministic=True)
output_fn = theano.function([x], prediction_det)
end = timer.perf_counter()
print('>>> Theano function compilation took {:.1f} seconds'.format(end - start))

# Check for correct sizes
assert (network.input_shape == (None, 2 * rnn_size))  # final projection input size (batch x 2*rnn_size)
assert (network.W.eval().shape == (2 * rnn_size, classes))  # final projection kernel size (2*rnn_size x classes)
output = output_fn(bX)
output_fn.sync_shared()
assert (output.shape == (batch_size, classes))  # output size

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()  # start of training
for _ in range(batches):
    batch_start = timer.perf_counter()  # start of batch
    # Separate name so the symbolic `loss` expression above is not overwritten.
    loss_val = train_fn(bX, bY)
    train_fn.sync_shared()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()  # end of batch
    batch_time.append(batch_end - batch_start)
    batch_loss.append(loss_val)
train_end = timer.perf_counter()  # end of training

# Results handling
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
# 4x320-LSTM/bench_pytorch_cudnnLSTM.py
#
# Benchmark script: 4-layer bidirectional nn.LSTM on the GPU followed by a
# bias-free linear projection of the last time step, trained with
# cross-entropy on the toy batch from `support.toy_batch()`.
import os
import time as timer

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from support import toy_batch, default_params, write_results, print_results, check_results

# Experiment_type
bench = 'pytorch_cudnnLSTM'
version = torch.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Get data (batch_size is read while bX is still batch-major)
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# PyTorch compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))


# Create Network
class Net(nn.Module):
    """Four-layer bidirectional LSTM with a linear read-out of the last step."""

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size=inp_dims, hidden_size=rnn_size, num_layers=4, bias=True, bidirectional=True)
        # Forward and backward outputs are concatenated, hence rnn_size * 2.
        self.fc = nn.Linear(rnn_size * 2, classes, bias=False)

    def forward(self, x):
        """Return the projection of the LSTM output at the last time index."""
        sequence_out, _ = self.lstm(x)
        return self.fc(sequence_out[-1, :, :])


net = Net()
net.cuda()

# Print parameter count
params = sum(param.numel() for param in net.parameters())
print('# network parameters: ' + str(params))

# Create optimizer
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # loss definition

# Check for correct sizes
assert (net.fc.in_features == 2 * rnn_size)  # final projection input size (2*rnn_size)
kernel_shape = net.fc.weight.cpu().data.numpy().shape
assert (kernel_shape == (classes, 2 * rnn_size))  # final projection kernel size (classes, 2*rnn_size)
bXt = Variable(torch.from_numpy(bX).cuda())
torch.cuda.synchronize()
output = net(bXt)
output_numpy = output.data.cpu().numpy()
assert (output_numpy.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for _ in range(batches):
    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_start = timer.perf_counter()

    # The host-to-device copies are inside the timed region.
    bXt = Variable(torch.from_numpy(bX).cuda())
    bYt = Variable(torch.from_numpy(bY).cuda())

    optimizer.zero_grad()
    output = net(bXt)
    loss = criterion(output, bYt.long())
    loss.backward()
    optimizer.step()

    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(float(loss.data.cpu().numpy()))
train_end = timer.perf_counter()  # end of training

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
# 4x320-LSTM/bench_tensorflow_LSTMBlockCell.py
#
# Benchmark script: four stacked bidirectional LSTMBlockCell layers followed
# by a bias-free dense projection, trained with sparse softmax cross-entropy
# on the toy batch from `support.toy_batch()`.
import os
import time as timer

import tensorflow as tf

from support import toy_batch, default_params, write_results, print_results, check_results

# Experiment_type
bench = 'tensorflow_LSTMBlockCell'
version = tf.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Get data
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars
x = tf.placeholder(tf.float32, [None, None, inp_dims])
x_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network: one forward and one backward cell per layer, four layers
fw_cell = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)]
bw_cell = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)]

h1, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw=fw_cell, cells_bw=bw_cell,
                                                          inputs=x, sequence_length=x_len, dtype=tf.float32)
h2 = h1[:, -1, :]  # output at the last index of the second (time) axis
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

# Print parameter count: sum of the element counts of all trainable variables
params = 0
for variable in tf.trainable_variables():
    params += variable.get_shape().num_elements()
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)

    # Check for correct sizes
    assert (h2.shape.as_list() == [None, 2 * rnn_size])  # final projection input size (2*rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert (dense_kernel.shape.as_list() == [2 * rnn_size, classes])  # final projection kernel size (2*rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, x_len: b_lenX})
    assert (output.shape == (batch_size, classes))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for _ in range(batches):
        batch_start = timer.perf_counter()
        # Fetching the loss value makes the call return only after the step ran.
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
# 4x320-LSTM/bench_tensorflow_LSTMCell.py
#
# Benchmark script: four stacked bidirectional tf.nn.rnn_cell.LSTMCell layers
# followed by a bias-free dense projection, trained with sparse softmax
# cross-entropy on the toy batch from `support.toy_batch()`.
import os
import time as timer

import tensorflow as tf

from support import toy_batch, default_params, write_results, print_results, check_results

# Experiment_type
bench = 'tensorflow_LSTMCell'
version = tf.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Get data
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars
x = tf.placeholder(tf.float32, [None, None, inp_dims])
x_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network: one forward and one backward cell per layer, four layers
fw_cell = [tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)]
bw_cell = [tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)]

h1, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw=fw_cell, cells_bw=bw_cell,
                                                          inputs=x, sequence_length=x_len, dtype=tf.float32)
h2 = h1[:, -1, :]  # output at the last index of the second (time) axis
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

# Print parameter count: sum of the element counts of all trainable variables
params = 0
for variable in tf.trainable_variables():
    params += variable.get_shape().num_elements()
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)

    # Check for correct sizes
    assert (h2.shape.as_list() == [None, 2 * rnn_size])  # final projection input size (2*rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert (dense_kernel.shape.as_list() == [2 * rnn_size, classes])  # final projection kernel size (2*rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, x_len: b_lenX})
    assert (output.shape == (batch_size, classes))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for _ in range(batches):
        batch_start = timer.perf_counter()
        # Fetching the loss value makes the call return only after the step ran.
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
# 4x320-LSTM/bench_tensorflow_cudnnLSTM.py
#
# Benchmark script: 4-layer bidirectional tf.contrib.cudnn_rnn.CudnnLSTM
# followed by a bias-free dense projection, trained with sparse softmax
# cross-entropy on the toy batch from `support.toy_batch()`.
import os
import time as timer

import numpy as np
import tensorflow as tf

from support import toy_batch, default_params, write_results, print_results, check_results

# Experiment_type
bench = 'tensorflow_cudnnLSTM'
version = tf.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Get data (batch_size is read while bX is still batch-major)
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# cudnn compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))

# Create symbolic vars
x = tf.placeholder(tf.float32, [None, None, inp_dims])
x_len = tf.placeholder(tf.int32, [None])  # fed below but not consumed by the cuDNN layer
y = tf.placeholder(tf.int32, [None])

# Create network
cudnn_lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=4, num_units=rnn_size, direction='bidirectional')
h1, _ = cudnn_lstm(x)
h2 = h1[-1, :, :]  # output at the last index of the leading (time) axis
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

# Print parameter count
params = 0
for variable in tf.trainable_variables():
    if 'cudnn_lstm' in str(variable):
        # NOTE(review): presumably the cuDNN layer keeps its weights in a
        # variable whose static shape cannot be used for counting, hence the
        # canonical shapes are summed instead -- confirm against the TF docs.
        params += np.sum(cudnn_lstm.canonical_bias_shapes)
        params += np.sum([rows * cols for rows, cols in cudnn_lstm.canonical_weight_shapes])
    else:
        params += variable.get_shape().num_elements()
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)

    # Check for correct sizes
    assert (h2.shape.as_list() == [None, 2 * rnn_size])  # final projection input size (2*rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert (dense_kernel.shape.as_list() == [2 * rnn_size, classes])  # final projection kernel size (2*rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, x_len: b_lenX})
    assert (output.shape == (batch_size, classes))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for _ in range(batches):
        batch_start = timer.perf_counter()
        # Fetching the loss value makes the call return only after the step ran.
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
default_params() 21 | 22 | # Create symbolic vars 23 | input_var = T.ftensor3('bX') 24 | input_var_lens = T.ivector('b_lenX') 25 | mask_var = T.matrix('maskX') 26 | target_var = T.imatrix('bY') 27 | 28 | 29 | # Create network 30 | def get_bench_net_lstm(input_var, mask_var, inp_dim, rnn_size, classes): 31 | # Input layer 32 | l_in = lasagne.layers.InputLayer(shape=(None, None, inp_dim), input_var=input_var) 33 | 34 | # Masking layer 35 | l_mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var) 36 | 37 | # Allows arbitrary sizes 38 | batch_size, seq_len, _ = input_var.shape 39 | 40 | # RNN layers 41 | h1f = lasagne.layers.LSTMLayer(l_in, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform()) 42 | h1b = lasagne.layers.LSTMLayer(l_in, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform(), 43 | backwards=True) 44 | h1 = lasagne.layers.ConcatLayer([h1f, h1b], axis=2) 45 | 46 | h2f = lasagne.layers.LSTMLayer(h1, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform()) 47 | h2b = lasagne.layers.LSTMLayer(h1, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform(), 48 | backwards=True) 49 | h2 = lasagne.layers.ConcatLayer([h2f, h2b], axis=2) 50 | 51 | h3f = lasagne.layers.LSTMLayer(h2, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform()) 52 | h3b = lasagne.layers.LSTMLayer(h2, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform(), 53 | backwards=True) 54 | h3 = lasagne.layers.ConcatLayer([h3f, h3b], axis=2) 55 | 56 | h4f = lasagne.layers.LSTMLayer(h3, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform()) 57 | h4b = lasagne.layers.LSTMLayer(h3, num_units=rnn_size, mask_input=l_mask, hid_init=lasagne.init.GlorotUniform(), 58 | backwards=True) 59 | h4 = lasagne.layers.ConcatLayer([h4f, h4b], axis=2) 60 | 61 | h5 = non_flattening_dense(h4, batch_size=batch_size, seq_len=seq_len, num_units=classes, 62 | 
nonlinearity=lasagne.nonlinearities.linear) 63 | 64 | h6 = lasagne.layers.DimshuffleLayer(h5, (1, 0, 2)) 65 | 66 | return h6 67 | 68 | 69 | def non_flattening_dense(l_in, batch_size, seq_len, *args, **kwargs): 70 | # Flatten down the dimensions for everything but the features 71 | l_flat = lasagne.layers.ReshapeLayer(l_in, (-1, [2])) 72 | # Make a dense layer connected to it 73 | l_dense = lasagne.layers.DenseLayer(l_flat, b=None, *args, **kwargs,) 74 | # Reshape it back out 75 | l_nonflat = lasagne.layers.ReshapeLayer(l_dense, (batch_size, seq_len, l_dense.output_shape[1])) 76 | return l_nonflat 77 | 78 | 79 | # Create network 80 | network = get_bench_net_lstm(input_var=input_var, mask_var=mask_var, inp_dim=inp_dims, rnn_size=rnn_size, 81 | classes=classes) 82 | 83 | # Create loss, optimizer and train function 84 | prediction = lasagne.layers.get_output(network) 85 | loss = T.mean(ctc(prediction, target_var, input_var_lens)) 86 | params = lasagne.layers.get_all_params(network, trainable=True) 87 | updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate) 88 | fn_inputs = [input_var, input_var_lens, mask_var, target_var] 89 | 90 | start = timer.perf_counter() 91 | train_fn = theano.function(fn_inputs, loss, updates=updates) 92 | output_fn = theano.function([input_var, mask_var], prediction) 93 | end = timer.perf_counter() 94 | print('>>> Theano function compilation took {:.1f} seconds'.format(end - start)) 95 | 96 | # Print parameter count 97 | params = lasagne.layers.count_params(network) 98 | print('# network parameters: ' + str(params)) 99 | 100 | # Check for correct sizes 101 | output_layer = network.input_layer.input_layer 102 | assert (output_layer.input_shape == (None, 2*rnn_size)) # final projection input size (Batch_size x rnn_size) 103 | assert (output_layer.W.eval().shape == (2*rnn_size, classes)) # final projection kernel size (rnn_size x classes) 104 | output = output_fn(bX, maskX) 105 | output_fn.sync_shared() 106 | assert 
(output.shape == (seq_len, batch_size, classes)) # output size 107 | 108 | # Start training 109 | batch_time = [] 110 | batch_loss = [] 111 | train_start = timer.perf_counter() # start of training 112 | for i in range(batches): 113 | batch_start = timer.perf_counter() # start of batch 114 | loss = train_fn(bX, b_lenX, maskX, bY) 115 | train_fn.sync_shared() # synchronize function call for precise time measurement 116 | batch_end = timer.perf_counter() # end of batch 117 | batch_time.append(batch_end - batch_start) 118 | batch_loss.append(loss) 119 | train_end = timer.perf_counter() # end of training 120 | 121 | # Results handling 122 | print_results(batch_time) 123 | check_results(batch_loss, batch_time, train_start, train_end) 124 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params, 125 | run_time=batch_time, version=version) -------------------------------------------------------------------------------- /4x320-LSTM_ctc/bench_pytorch_cudnnLSTM.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time as timer 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 10 | from warpctc_pytorch import CTCLoss 11 | 12 | from support import toy_batch_ctc, default_params, write_results, print_results, check_results 13 | 14 | # Experiment_type 15 | bench = 'pytorch_cudnnLSTM' 16 | version = torch.__version__ 17 | experiment = '4x320-BIDIR-LSTM_CTC' 18 | 19 | # Get data 20 | bX, b_lenX, maskX, bY, b_lenY, classes = toy_batch_ctc() 21 | batch_size, seq_len, inp_dims = bX.shape 22 | rnn_size, learning_rate, batches = default_params() 23 | 24 | # PyTorch compatibility: time first, batch second 25 | bX = np.transpose(bX, (1, 0, 2)) 26 | 27 | 28 | # Create Network 29 | class Net(nn.Module): 30 | def 
__init__(self): 31 | super(Net, self).__init__() 32 | self.lstm = nn.LSTM(input_size=inp_dims, hidden_size=rnn_size, num_layers=4, bias=True, bidirectional=True) 33 | self.fc = nn.Linear(rnn_size * 2, classes, bias=False) 34 | 35 | def forward(self, x): 36 | h1p, state = self.lstm(x) 37 | h1, lens = pad_packed_sequence(h1p) 38 | h2 = self.fc(h1) 39 | return h2 40 | 41 | 42 | net = Net() 43 | net.cuda() 44 | 45 | # Print parameter count 46 | params = 0 47 | for param in list(net.parameters()): 48 | sizes = 1 49 | for el in param.size(): 50 | sizes = sizes * el 51 | params += sizes 52 | print('# network parameters: ' + str(params)) 53 | 54 | # Create optimizer 55 | optimizer = optim.Adam(net.parameters(), lr=learning_rate) 56 | criterion = CTCLoss() 57 | 58 | # Check for correct sizes 59 | assert (net.fc.in_features == 2*rnn_size) # final projection input size (rnn_size) 60 | assert (net.fc.weight.cpu().data.numpy().shape == ( 61 | classes, 2*rnn_size)) # final projection kernel size (classes, rnn_size) 62 | bXt = Variable(torch.from_numpy(bX).cuda()) 63 | bXt = pack_padded_sequence(bXt, b_lenX[::-1]) 64 | torch.cuda.synchronize() 65 | output = net(bXt) 66 | output_numpy = output.data.cpu().numpy() 67 | assert (output_numpy.shape == (seq_len, batch_size, classes)) 68 | 69 | # Start training 70 | batch_time = [] 71 | batch_loss = [] 72 | train_start = timer.perf_counter() 73 | for i in range(batches): 74 | torch.cuda.synchronize() # synchronize function call for precise time measurement 75 | batch_start = timer.perf_counter() 76 | 77 | bXt = Variable(torch.from_numpy(bX).cuda()) 78 | bXt = pack_padded_sequence(bXt, b_lenX[::-1]) # Pack those sequences for masking, plz 79 | b_lenXt = Variable(torch.from_numpy(b_lenX)) 80 | bYt = Variable(torch.from_numpy(bY)) 81 | b_lenYt = Variable(torch.from_numpy(b_lenY)) 82 | 83 | optimizer.zero_grad() 84 | output = net(bXt) 85 | loss = criterion(output, bYt, b_lenXt, b_lenYt) 86 | loss.backward() 87 | optimizer.step() 88 | 89 | 
torch.cuda.synchronize() # synchronize function call for precise time measurement 90 | batch_end = timer.perf_counter() 91 | batch_time.append(batch_end - batch_start) 92 | batch_loss.append(float(loss.data.cpu().numpy())) 93 | train_end = timer.perf_counter() # end of training 94 | 95 | # Write results 96 | print_results(batch_time) 97 | check_results(batch_loss, batch_time, train_start, train_end) 98 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params, 99 | run_time=batch_time, version=version) 100 | -------------------------------------------------------------------------------- /4x320-LSTM_ctc/bench_tensorflow_LSTMBlockCell.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time as timer 3 | 4 | import tensorflow as tf 5 | 6 | from support import toy_batch_ctc, default_params, write_results, print_results, target_converter, \ 7 | sparse_tuple_from, check_results 8 | 9 | # Experiment_type 10 | bench = 'tensorflow_LSTMBlockCell' 11 | version = tf.__version__ 12 | experiment = '4x320-BIDIR-LSTM_CTC' 13 | 14 | # Get data 15 | bX, b_lenX, maskX, bY, b_lenY, classes = toy_batch_ctc() 16 | batch_size, seq_len, inp_dims = bX.shape 17 | rnn_size, learning_rate, batches = default_params() 18 | 19 | # Create symbolic vars 20 | x = tf.placeholder(tf.float32, [None, None, inp_dims]) 21 | x_len = tf.placeholder(tf.int32, [None]) 22 | y = tf.sparse_placeholder(tf.int32) 23 | 24 | weights = {'out': tf.Variable(tf.truncated_normal(shape=[2 * rnn_size, classes], stddev=0.1), name='W_out')} 25 | 26 | # Create network 27 | def get_EESEN(x, rnn_size, weights, x_len, classes): 28 | shape = tf.shape(x) 29 | batch_size, max_timesteps = shape[0], shape[1] 30 | 31 | with tf.name_scope('MultiLSTM'): 32 | fw_cell = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)] 33 | bw_cell = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)] 34 | 35 | h1, _, _ = 
tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw=fw_cell, cells_bw=bw_cell, 36 | inputs=x, sequence_length=x_len, dtype=tf.float32) 37 | 38 | with tf.name_scope('Affine'): 39 | h1_rs = tf.reshape(h1, [-1, 2 * rnn_size]) 40 | logits = tf.matmul(h1_rs, weights['out']) 41 | logits = tf.reshape(logits, [batch_size, max_timesteps, classes]) 42 | logits = tf.transpose(logits, (1, 0, 2)) 43 | 44 | return logits, h1 45 | 46 | 47 | pred, h1 = get_EESEN(x=x, rnn_size=rnn_size, weights=weights, x_len=x_len, classes=classes) 48 | 49 | # Create loss, optimizer and train function 50 | loss = tf.reduce_mean(tf.nn.ctc_loss(inputs=pred, labels=y, sequence_length=x_len, time_major=True)) 51 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 52 | 53 | train_step = optimizer.minimize(loss) 54 | 55 | # Initialize session 56 | init = tf.global_variables_initializer() 57 | config = tf.ConfigProto() 58 | # config.gpu_options.allow_growth = True 59 | 60 | # Print parameter count 61 | params = 0 62 | for variable in tf.trainable_variables(): 63 | # shape is an array of tf.Dimension 64 | shape = variable.get_shape() 65 | variable_parametes = 1 66 | for dim in shape: 67 | variable_parametes *= dim.value 68 | params += variable_parametes 69 | print('# network parameters: ' + str(params)) 70 | 71 | with tf.Session(config=config) as sess: 72 | sess.run(init) 73 | bY = target_converter(bY, b_lenY) 74 | bY = sparse_tuple_from(bY) 75 | 76 | # Check for correct sizes 77 | assert (h1._shape_as_list() == [None, None, 2*rnn_size]) # final projection input size (rnn_size) 78 | assert (weights['out'].shape.as_list() == [2*rnn_size, classes]) # final projection kernel size (rnn_size, classes) 79 | output = sess.run(pred, feed_dict={x: bX, y: bY, x_len: b_lenX}) 80 | assert (output.shape == (seq_len, batch_size, classes)) 81 | 82 | # Start training 83 | batch_time = [] 84 | batch_loss = [] 85 | train_start=timer.perf_counter() 86 | for i in range(batches): 87 | batch_start = 
timer.perf_counter() 88 | _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX}) 89 | batch_end = timer.perf_counter() 90 | batch_time.append(batch_end - batch_start) 91 | batch_loss.append(loss_val) 92 | train_end = timer.perf_counter() 93 | 94 | # Results handling 95 | print_results(batch_time) 96 | check_results(batch_loss, batch_time, train_start, train_end) 97 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params, 98 | run_time=batch_time, version=version) 99 | -------------------------------------------------------------------------------- /4x320-LSTM_ctc/bench_tensorflow_LSTMCell.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time as timer 3 | 4 | import tensorflow as tf 5 | 6 | from support import toy_batch_ctc, default_params, write_results, print_results, target_converter, \ 7 | sparse_tuple_from, check_results 8 | 9 | # Experiment_type 10 | bench = 'tensorflow_LSTMCell' 11 | version = tf.__version__ 12 | experiment = '4x320-BIDIR-LSTM_CTC' 13 | 14 | # Get data 15 | bX, b_lenX, maskX, bY, b_lenY, classes = toy_batch_ctc() 16 | batch_size, seq_len, inp_dims = bX.shape 17 | rnn_size, learning_rate, batches = default_params() 18 | 19 | # Create symbolic vars 20 | x = tf.placeholder(tf.float32, [None, None, inp_dims]) 21 | x_len = tf.placeholder(tf.int32, [None]) 22 | y = tf.sparse_placeholder(tf.int32) 23 | 24 | print(bX.shape) 25 | print(b_lenX.shape) 26 | print(bY.shape) 27 | 28 | weights = {'out': tf.Variable(tf.truncated_normal(shape=[2 * rnn_size, classes], stddev=0.1), name='W_out')} 29 | biases = {'out': tf.Variable(tf.zeros([classes]), name='b_out')} 30 | 31 | 32 | # Create network 33 | def get_EESEN(x, rnn_size, weights, biases, x_len, classes): 34 | shape = tf.shape(x) 35 | batch_size, max_timesteps = shape[0], shape[1] 36 | 37 | with tf.name_scope('MultiLSTM'): 38 | fw_cell = 
[tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)] 39 | bw_cell = [tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)] 40 | 41 | h1, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw=fw_cell, cells_bw=bw_cell, 42 | inputs=x, sequence_length=x_len, dtype=tf.float32) 43 | 44 | with tf.name_scope('Affine'): 45 | h1_rs = tf.reshape(h1, [-1, 2 * rnn_size]) 46 | logits = tf.matmul(h1_rs, weights['out']) + biases['out'] 47 | logits = tf.reshape(logits, [batch_size, max_timesteps, classes]) 48 | logits = tf.transpose(logits, (1, 0, 2)) 49 | 50 | return logits, h1 51 | 52 | 53 | pred, h1 = get_EESEN(x=x, rnn_size=rnn_size, weights=weights, biases=biases, x_len=x_len, classes=classes) 54 | 55 | # Create loss, optimizer and train function 56 | loss = tf.reduce_mean(tf.nn.ctc_loss(inputs=pred, labels=y, sequence_length=x_len, time_major=True)) 57 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 58 | 59 | train_step = optimizer.minimize(loss) 60 | 61 | # Initialize session 62 | init = tf.global_variables_initializer() 63 | config = tf.ConfigProto() 64 | # config.gpu_options.allow_growth = True 65 | 66 | # Print parameter count 67 | params = 0 68 | for variable in tf.trainable_variables(): 69 | # shape is an array of tf.Dimension 70 | shape = variable.get_shape() 71 | variable_parametes = 1 72 | for dim in shape: 73 | variable_parametes *= dim.value 74 | params += variable_parametes 75 | print('# network parameters: ' + str(params)) 76 | 77 | with tf.Session(config=config) as sess: 78 | sess.run(init) 79 | bY = target_converter(bY, b_lenY) 80 | bY = sparse_tuple_from(bY) 81 | 82 | # Check for correct sizes 83 | assert (h1._shape_as_list() == [None, None, 2*rnn_size]) # final projection input size (rnn_size) 84 | assert (weights['out'].shape.as_list() == [2*rnn_size, classes]) # final projection kernel size (rnn_size, classes) 85 | output = sess.run(pred, feed_dict={x: bX, y: bY, x_len: b_lenX}) 86 | assert (output.shape == (seq_len, batch_size, 
classes)) 87 | 88 | # Start training 89 | batch_time = [] 90 | batch_loss = [] 91 | train_start=timer.perf_counter() 92 | for i in range(batches): 93 | batch_start = timer.perf_counter() 94 | _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX}) 95 | batch_end = timer.perf_counter() 96 | batch_time.append(batch_end - batch_start) 97 | batch_loss.append(loss_val) 98 | train_end = timer.perf_counter() 99 | 100 | # Results handling 101 | print_results(batch_time) 102 | check_results(batch_loss, batch_time, train_start, train_end) 103 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params, 104 | run_time=batch_time, version=version) 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rnn_benchmarks 2 | Welcome to the rnn_benchmarks repository! We offer: 3 | - A training speed comparison of different LSTM implementations across deep learning frameworks 4 | - Common input sizes, network configurations and cost functions from automatic speech recognition 5 | - Best-practice scripts to learn coding up a network, optimizers, loss functions etc. 6 | 7 | ## Update June 4th 2018 8 | - Arxiv paper: [LSTM Benchmarks for Deep Learning Frameworks](https://arxiv.org/abs/1806.01818) 9 | - [LSTM benchmarks between PyTorch 0.4, TensorFlow 1.8, Keras 2.1.6 and latest Lasagne](https://github.com/stefbraun/rnn_benchmarks/tree/master/results/10/framework_comparison) 10 | 11 | 12 | - [LSTM benchmarks between PyTorch versions 0.1.12 to 0.4.0](https://github.com/stefbraun/rnn_benchmarks/tree/master/results/10/pytorch_comparison) 13 | 14 | 15 | ## Run the benchmarks 16 | Go to the folder 'main' and execute the 'main.py' script in the corresponding benchmark folder. 
Before running 'main.py', you need to provide the paths to the Python environments that contain the corresponding frameworks. The 'main.py' script creates a 'commands.sh' script that will execute the benchmarks. The measured execution times will be written to 'results/results.csv'. The toy data and default parameters are provided by 'support.py', to make sure every script uses the same hyperparameters. 17 | 18 | -------------------------------------------------------------------------------- /main/framework_comparison/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | from pathlib import Path 5 | 6 | # Parameters 7 | cuda_device = 1 8 | dry = 1 # Run benches or not 9 | python_path = Path(__file__).resolve().parents[2] 10 | 11 | command_list = ['echo {} > {}'.format(os.path.join(python_path, 'results', 'framework_comparison'), 12 | os.path.join(python_path, 'results', 'conf'))] 13 | 14 | # write path to dataframe to config file 15 | with open(os.path.join(python_path, 'results', 'conf'), 'w') as f: 16 | f.write(os.path.join(python_path, 'results', 'framework_comparison')) 17 | 18 | # Please define your virtual environments for testing 19 | interpreter = {} 20 | interpreter[ 21 | 'lasagne'] = 'MKL_THREADING_LAYER=GNU LIBRARY_PATH=/usr/local/cuda-9.0/lib64 /home/brauns/anaconda3/envs/theano/bin/python' 22 | interpreter[ 23 | 'keras-theano'] = 'MKL_THREADING_LAYER=GNU LIBRARY_PATH=/usr/local/cuda-9.0/lib64 /home/brauns/anaconda3/envs/theano/bin/python' 24 | interpreter['tensorflow'] = '/home/brauns/anaconda3/envs/tensorflow/bin/python' 25 | interpreter['keras-tensorflow'] = '/home/brauns/anaconda3/envs/tensorflow/bin/python' 26 | interpreter['pytorch'] = '/home/brauns/anaconda3/envs/pt4/bin/python' 27 | 28 | # Experiments 29 | all_experiments = ['1x320-LSTM', '4x320-LSTM', '4x320-LSTM_ctc'] 30 | 31 | # Run benches 32 | for experiment in all_experiments: 33 | experiment_folder = 
os.path.join(python_path, experiment) 34 | all_benches = [script for script in os.listdir(experiment_folder) if 'bench' in script] 35 | 36 | for bench in all_benches: 37 | print('=' * 100) 38 | _, framework, cell = bench.split('_') 39 | 40 | if 'keras' not in framework: 41 | interpreter_path = interpreter[framework] 42 | script_path = os.path.join(experiment_folder, bench) 43 | command = 'CUDA_VISIBLE_DEVICES={} PYTHONPATH={} {} {}'.format(cuda_device, python_path, interpreter_path, 44 | script_path) 45 | else: 46 | backend = framework.split('-')[1] 47 | interpreter_path = interpreter[framework] 48 | script_path = os.path.join(experiment_folder, bench) 49 | command = 'CUDA_VISIBLE_DEVICES={} KERAS_BACKEND={} PYTHONPATH={} {} {}'.format(cuda_device, backend, 50 | python_path, 51 | interpreter_path, 52 | script_path) 53 | print(command) 54 | command_list.append(command) 55 | if dry == 0: 56 | proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) 57 | proc.wait() 58 | 59 | command_list = map(lambda x: x + '\n', command_list) 60 | with open(os.path.join(sys.path[0], 'commands.sh'), 'w') as f: 61 | f.writelines(command_list) 62 | -------------------------------------------------------------------------------- /main/framework_comparison/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | from matplotlib.lines import Line2D 9 | from matplotlib.ticker import MaxNLocator 10 | 11 | sns.set_style('darkgrid') 12 | 13 | import matplotlib.pylab as pylab 14 | 15 | lparams = ['legend.fontsize', 'axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize'] 16 | fontsize = 11.5 17 | params = {key: fontsize for key in lparams} 18 | pylab.rcParams.update(params) 19 | 20 | def match_case(row): 21 | for old, new in [('pytorch', 'PyTorch'), ('tensorflow', 
'TensorFlow'), ('lasagne', 'Lasagne'), 22 | ('keras', 'Keras'), ('theano', 'Theano'), ('cudnnLSTM', 'cuDNNLSTM')]: 23 | row['bench'] = row['bench'].replace(old, new) 24 | return row 25 | 26 | 27 | def linebreak(row): 28 | row['bench'] = '\n'.join(row['bench'].split('_')) 29 | return row 30 | 31 | 32 | def framework(row): 33 | row['framework'] = row['bench'].split('_')[0] 34 | if 'keras' in row['bench']: 35 | row['framework'] = 'keras' 36 | elif 'Keras' in row['bench']: 37 | row['framework'] = 'Keras' 38 | return row 39 | 40 | 41 | def get_color_palette(unique_benchs): 42 | colors = [] 43 | for bench in unique_benchs: 44 | if ('tensorflow' in bench) or ('TensorFlow' in bench): 45 | c = "#377eb8" 46 | c = '#4c72b0' 47 | if ('pytorch' in bench) or ('PyTorch' in bench): 48 | c = "#e41a1c" 49 | # c='#C44E52' 50 | c='#de2d26' 51 | if ('lasagne' in bench) or ('Lasagne' in bench): 52 | c = "#696969" 53 | if ('keras' in bench) or ('Keras' in bench): 54 | c = "#4daf4a" 55 | c= '#55A868' 56 | colors.append(c) 57 | return colors 58 | 59 | 60 | # Load file 61 | repo_path = Path(__file__).resolve().parents[2] 62 | logfile = os.path.join(repo_path, 'results', 'framework_comparison', 'results.csv') 63 | df = pd.read_csv(logfile) 64 | 65 | # Parameters 66 | experiments = list(df['experiment'].unique()) 67 | 68 | # for exp, ax in zip(experiments, axs.reshape(-1)): 69 | for exp in experiments: 70 | 71 | dfp = df[df['experiment'] == exp] 72 | dfp = dfp.apply(match_case, axis=1) 73 | dfp = dfp.apply(framework, axis=1) 74 | dfp = dfp.apply(linebreak, axis=1) 75 | dfp = dfp.groupby('bench').tail(400) 76 | dfp['mean'] = dfp.groupby('bench').transform('mean')['runtime'] 77 | dfp = dfp.sort_values(['mean'], ascending=True) 78 | dfp['runtime'] = dfp['runtime'] * 1000 79 | 80 | # Uber-plotting skillz: ax control 81 | fig_width = 8 82 | ax_height = len(dfp['bench'].unique()) * 0.5 83 | 84 | left_inch = 1.75 85 | left_rel = left_inch / fig_width 86 | ax_width_rel = 1 - left_rel - 0.005 87 | 88 
| bottom_inch = 0.45 89 | top_inch = 0.2 90 | fig_height = ax_height + bottom_inch + top_inch 91 | 92 | bottom_rel = bottom_inch / fig_height 93 | ax_height_rel = (fig_height - bottom_inch - top_inch - 0.01) / fig_height 94 | 95 | fig = plt.figure(figsize=(fig_width, fig_height)) 96 | ax_pad = (left_rel, bottom_rel, ax_width_rel, ax_height_rel) 97 | 98 | ax = fig.add_axes((ax_pad)) 99 | 100 | # Start plotting 101 | colors = get_color_palette(dfp['bench'].unique()) 102 | sns.set_palette(colors) 103 | 104 | custom_lines = [Line2D([0], [0], color=c, lw=4) for c in list(pd.unique(colors))] 105 | ax.legend(custom_lines, list(dfp['framework'].unique())) 106 | 107 | # dfp = dfp.apply(unbreak, axis=1) 108 | sns.barplot(ax=ax, data=dfp, y='bench', x='runtime', ci='sd') 109 | ax.set_title(exp.replace('_', '-')) 110 | ax.set_xlabel('Time per batch [milliseconds]') 111 | 112 | min_width = 1e6 113 | max_width = 0 114 | for p, c in zip(ax.patches, colors): 115 | min_width = np.min([min_width, p.get_width()]) 116 | max_width = np.max([max_width, p.get_width()]) 117 | 118 | ax.set_xlim((0, 1.6 * max_width)) 119 | max_x = np.max(ax.get_xlim()) 120 | 121 | for p, c in zip(ax.patches, colors): 122 | # print(max_x) 123 | if min_width > 10: 124 | ax.text(p.get_width() + max_x / 8, p.get_y() + p.get_height() / 1.3, 125 | '{:4.0f}ms ::: {:3.1f}x'.format(p.get_width(), p.get_width() / min_width), 126 | fontsize=fontsize+1.5, fontweight='bold', color=c, ha='center', va='bottom') 127 | else: 128 | ax.text(p.get_width() + max_x / 8, p.get_y() + p.get_height() / 1.3, 129 | '{:4.1f}ms ::: {:3.1f}x'.format(p.get_width(), p.get_width() / min_width), 130 | fontsize=fontsize+1.5, fontweight='bold', color=c, ha='center', va='bottom') 131 | 132 | ax.set_ylabel('') 133 | ax.xaxis.set_major_locator(MaxNLocator(prune='upper')) 134 | plt.setp(ax.get_xticklabels()[-1], visible=False) 135 | 136 | output_file = os.path.join(repo_path, 'results/framework_comparison/{}'.format(exp)) 137 | 138 | 
fig.savefig(output_file, dpi=300) 139 | fig.savefig(output_file + '.pdf', dpi=300) 140 | -------------------------------------------------------------------------------- /main/pytorch_comparison/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | from pathlib import Path 5 | 6 | # Parameters 7 | cuda_device = 1 8 | python_path = Path(__file__).resolve().parents[2] 9 | dry = 1 10 | 11 | command_list = ['echo {} > {}'.format(os.path.join(python_path, 'results', 'pytorch_comparison'), 12 | os.path.join(python_path, 'results', 'conf'))] 13 | 14 | # write path to dataframe to config file 15 | with open(os.path.join(python_path, 'results', 'conf'), 'w') as f: 16 | f.write(os.path.join(python_path, 'results', 'pytorch_comparison')) 17 | 18 | # Please define your virtual environments for testing 19 | all_interpreters = ['/home/brauns/anaconda3/envs/pt{}/bin/python'.format(i) for i in range(1, 5)] 20 | 21 | # Experiments 22 | all_experiments = ['1x320-LSTM', '4x320-LSTM', '4x320-LSTM_ctc'] 23 | 24 | # Run benches 25 | for experiment in all_experiments: 26 | experiment_folder = os.path.join(python_path, experiment) 27 | all_benches = [script for script in os.listdir(experiment_folder) if ('bench' in script) and 'pytorch' in script] 28 | 29 | for interpreter in all_interpreters: 30 | for bench in all_benches: 31 | print('=' * 100) 32 | _, framework, cell = bench.split('_') 33 | 34 | interpreter_path = interpreter 35 | script_path = os.path.join(experiment_folder, bench) 36 | command = 'CUDA_VISIBLE_DEVICES={} PYTHONPATH={} {} {}'.format(cuda_device, python_path, interpreter_path, 37 | script_path) 38 | 39 | print(command) 40 | command_list.append(command) 41 | if dry == 0: 42 | proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) 43 | proc.wait() 44 | 45 | command_list = map(lambda x: x + '\n', command_list) 46 | with open(os.path.join(sys.path[0], 'commands.sh'), 'w') 
as f: 47 | f.writelines(command_list) 48 | -------------------------------------------------------------------------------- /main/pytorch_comparison/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | 9 | sns.set_style('darkgrid') 10 | 11 | import matplotlib.pylab as pylab 12 | from matplotlib.ticker import MaxNLocator 13 | 14 | 15 | lparams = ['legend.fontsize', 'axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize'] 16 | fontsize = 11.5 17 | params = {key: fontsize for key in lparams} 18 | pylab.rcParams.update(params) 19 | 20 | def match_case(row): 21 | for old, new in [('pytorch', 'PyTorch'), ('tensorflow', 'TensorFlow'), ('lasagne', 'Lasagne'), 22 | ('keras', 'Keras'), ('theano', 'Theano'), ('cudnnLSTM', 'cuDNNLSTM')]: 23 | row['bench'] = row['bench'].replace(old, new) 24 | return row 25 | 26 | def linebreak(row): 27 | row['bench'] = '\n'.join(row['bench'].split('_')) 28 | return row 29 | 30 | 31 | def get_color_palette(): 32 | colors = list(reversed(['#fcbba1', '#fc9272', '#fb6a4a', '#de2d26'])) 33 | 34 | # colors = list(['#C44E52', '#ec8386', '#fdb6b8', '#fed6d7']) 35 | # colors=sns.color_palette('Reds') 36 | 37 | 38 | return colors 39 | 40 | 41 | # Load file 42 | repo_path = Path(__file__).resolve().parents[2] 43 | logfile = os.path.join(repo_path, 'results', 'pytorch_comparison', 'results.csv') 44 | df = pd.read_csv(logfile) 45 | 46 | # Parameters 47 | experiments = list(df['experiment'].unique()) 48 | 49 | # for exp, ax in zip(experiments, axs.reshape(-1)): 50 | for exp in experiments: 51 | 52 | dfp = df[df['experiment'] == exp] 53 | dfp = dfp.apply(match_case, axis=1) 54 | dfp = dfp.apply(linebreak, axis=1) 55 | dfp = dfp.groupby(['bench', 'version']).tail(400) 56 | dfp['mean'] = dfp.groupby('bench').transform('mean')['runtime'] 57 | dfp = 
dfp.sort_values(['mean', 'version', 'bench'], ascending=[True, False, False]) 58 | dfp['runtime'] = dfp['runtime'] * 1000 59 | 60 | # Uber-plotting skillz: ax control 61 | fig_width = 8 62 | ax_height = np.min([len(dfp['version'].unique()) * len(dfp['bench'].unique()), 11]) * 0.5 63 | 64 | left_inch = 1.75 65 | left_rel = left_inch / fig_width 66 | ax_width_rel = 1 - left_rel - 0.005 67 | bottom_inch = 0.45 68 | top_inch = 0.2 69 | fig_height = ax_height + bottom_inch + top_inch 70 | 71 | bottom_rel = bottom_inch / fig_height 72 | ax_height_rel = (fig_height - bottom_inch - top_inch - 0.01) / fig_height 73 | 74 | fig = plt.figure(figsize=(fig_width, fig_height)) 75 | ax_pad = (left_rel, bottom_rel, ax_width_rel, ax_height_rel) 76 | 77 | ax = fig.add_axes((ax_pad)) 78 | 79 | # Start plotting 80 | colors = get_color_palette() 81 | sns.set_palette(colors) 82 | 83 | sns.barplot(ax=ax, data=dfp, y='bench', x='runtime', hue='version', ci='sd') 84 | ax.set_title(exp) 85 | xl = ax.set_xlabel('Time per batch [milliseconds]') 86 | 87 | all_width = [p.get_width() for p in ax.patches] 88 | min_width = np.min(all_width) 89 | max_width = np.max(all_width) 90 | 91 | ax.set_xlim((0, 1.6 * max_width)) 92 | max_x = np.max(ax.get_xlim()) 93 | 94 | for p in ax.patches: 95 | # print(max_x) 96 | if min_width > 10: 97 | ax.text(p.get_width() + max_x / 8, p.get_y() + p.get_height() / 1.3, 98 | '{:4.0f}ms ::: {:3.1f}x'.format(p.get_width(), p.get_width() / min_width), 99 | fontsize=fontsize+1.5, fontweight='bold', color='dimgrey', ha='center', va='bottom') 100 | else: 101 | ax.text(p.get_width() + max_x / 8, p.get_y() + p.get_height() / 1.3, 102 | '{:4.1f}ms ::: {:3.1f}x'.format(p.get_width(), p.get_width() / min_width), 103 | fontsize=fontsize+1.5, fontweight='bold', color='grey', ha='center', va='bottom') 104 | 105 | ax.set_ylabel('') 106 | ax.legend(loc=1) 107 | 108 | ax.xaxis.set_major_locator(MaxNLocator(prune='upper')) 109 | plt.setp(ax.get_xticklabels()[-1], visible=False) 110 | 111 
| output_file = os.path.join(repo_path, 'results/pytorch_comparison/{}'.format(exp)) 112 | 113 | fig.savefig(output_file, dpi=300) 114 | fig.savefig(output_file + '.pdf') 115 | -------------------------------------------------------------------------------- /main/pytorch_comparison/unifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | 6 | repo_path = Path(__file__).resolve().parents[2] 7 | 8 | # Get frames 9 | df1 = pd.read_csv(os.path.join(repo_path, 'results', 'framework_comparison', 'results.csv')) 10 | df2 = pd.read_csv(os.path.join(repo_path, 'results', 'pytorch_comparison', 'results.csv')) 11 | 12 | # Get version in framework comparison 13 | df1pt = df1[df1['bench'].str.contains('pytorch')] 14 | pytorch_version = df1pt['version'].unique()[0] 15 | print('Replacing pytorch version {}'.format(pytorch_version)) 16 | 17 | # Prepare pytorch comparison dataframe 18 | df2pt = df2.copy() 19 | df2pt.drop(df2pt[df2pt['version'] == pytorch_version].index, inplace=True) 20 | 21 | # Prepare framework comparison dataframe 22 | df2pt = df2pt.append(df1pt) 23 | df2pt.reset_index 24 | 25 | # save csv 26 | df2pt.to_csv(os.path.join(repo_path, 'results', 'pytorch_comparison', 'results.csv'), index=None) 27 | 5 + 5 28 | -------------------------------------------------------------------------------- /results/10/framework_comparison/1x320-LSTM_cross-entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/1x320-LSTM_cross-entropy.png -------------------------------------------------------------------------------- /results/10/framework_comparison/1x320-LSTM_cross-entropy_100.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/1x320-LSTM_cross-entropy_100.png -------------------------------------------------------------------------------- /results/10/framework_comparison/4x320-BIDIR-LSTM_CTC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/4x320-BIDIR-LSTM_CTC.png -------------------------------------------------------------------------------- /results/10/framework_comparison/4x320-BIDIR-LSTM_cross-entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/4x320-BIDIR-LSTM_cross-entropy.png -------------------------------------------------------------------------------- /results/10/framework_comparison/readme.md: -------------------------------------------------------------------------------- 1 | # Framework comparison 2 | - PyTorch, TensorFlow, Lasagne, Keras/TensorFlow and Keras/Theano benchmarks 3 | - LSTM implementations by cuDNN, fused kernels and naive approaches 4 | - Fixed sequence-length data with cross-entropy loss and variable sequence-length data with CTC loss 5 | - Input sizes 64x100x123 and 32x1000x123 (batch size x time steps x channels) 6 | - Network sizes 1x320 and 4x320 (number of layers x number of LSTM units) 7 | 8 | ## Framework versions 9 | Framework | Version | Release |Backend | cuda | cuDNN 10 | -|-|-|-|-|- 11 | PyTorch | 0.4.0 | [April 2018](https://github.com/PyTorch/PyTorch/releases/tag/v0.4.0) | - | 9.0 | 7102 12 | TensorFlow | 1.8.0 | [April 2018](https://github.com/TensorFlow/TensorFlow/releases/tag/v1.8.0) |- | 9.0 | 7005 13 | Lasagne | 0.2.1dev | [April 2018 
](https://github.com/Lasagne/Lasagne/commit/7992faa80fa5233a786e2582a605e854cea7d1cf) | [Theano 1.0.1](https://github.com/Theano/Theano/releases/tag/rel-1.0.1) | 9.0 | 7005 14 | Keras | 2.1.6 | [April 2018](https://github.com/Keras-team/Keras/releases/tag/2.1.6) |[Theano 1.0.1](https://github.com/Theano/Theano/releases/tag/rel-1.0.1), [TensorFlow 1.8.0](https://github.com/TensorFlow/TensorFlow/releases/tag/v1.8.0)| 9.0 | 7005 15 | 16 | ## LSTM implementations 17 | 18 | Library | Name | Details 19 | -|-|- 20 | PyTorch | [`LSTMCell-basic`](https://github.com/stefbraun/rnn_benchmarks/blob/master/1x320-LSTM/bench_pytorch_LSTMCell-basic.py) | Custom code, pure PyTorch implementation, easy to modify. Loop over time with Python `for` loop 21 | PyTorch | [`LSTMCell-fused`](http://PyTorch.org/docs/stable/nn.html?highlight=lstmcell#torch.nn.LSTMCell) | LSTM with optimized kernel for single time steps. Loop over time with Python `for` loop 22 | PyTorch |[`cuDNNLSTM`](http://PyTorch.org/docs/stable/nn.html?highlight=lstm#torch.nn.LSTM) | Wrapper to cuDNN LSTM implementation 23 | TensorFlow | [`LSTMCell`](https://www.TensorFlow.org/versions/r1.8/api_docs/python/tf/contrib/rnn/LSTMCell)| Pure TensorFlow implementation, easy to modify. Loop over time with `tf.while_loop`. Uses `dynamic_rnn` 24 | TensorFlow | [`LSTMBlockCell`](https://www.TensorFlow.org/versions/r1.8/api_docs/python/tf/contrib/rnn/LSTMBlockCell)| Optimized LSTM with single operation per time-step. Loop over time with `tf.while_loop`. Uses `dynamic_rnn` 25 | TensorFlow | [`LSTMBlockFusedCell`](https://www.TensorFlow.org/versions/r1.8/api_docs/python/tf/contrib/rnn/LSTMBlockFusedCell)| Optimized LSTM with single operation over all time steps. Loop over time is part of the operation. 
26 | TensorFlow | [`cuDNNLSTM`](https://www.tensorflow.org/api_docs/python/tf/contrib/cudnn_rnn/CudnnLSTM)| Wrapper to cuDNN LSTM implementation 27 | Lasagne | [`LSTMLayer`](http://Lasagne.readthedocs.io/en/latest/modules/layers/recurrent.html?highlight=gru#Lasagne.layers.LSTMLayer)| Pure Theano implementation, easy to modify. Loop over time with `theano.scan` 28 | Keras | [`cuDNNLSTM`](https://Keras.io/layers/recurrent/#cuDNNlstm) | Wrapper to cuDNN LSTM implementation 29 | Keras | [`LSTM`](https://Keras.io/layers/recurrent/#lstm)| Pure Theano/TensorFlow implementation, easy to modify. Loop over time with `theano.scan` or `tf.while_loop` 30 | 31 | ## Loss functions and input data 32 | The loss functions are varied with the input data: 33 | 1. Cross-entropy for fixed sequence length data 34 | - default implementation from each framework 35 | 2. Connectionist Temporal Classification (CTC) for variable sequence length data 36 | - warp_ctc for [Theano+Lasagne](http://deeplearning.net/software/Theano/library/tensor/nnet/ctc.html?highlight=ctc#module-Theano.tensor.nnet.ctc) and [PyTorch](https://github.com/SeanNaren/warp-ctc) 37 | - TensorFlow default [CTC implementation](https://www.TensorFlow.org/api_docs/python/tf/nn/ctc_loss) 38 | 39 | 40 | Benchmark name | Layers x LSTM units | # Classes & output units | Loss | Input size [NxTxC] 1 | Sequence length | Labels per sample| Benchmark scenario 41 | -|-|-|-|-|-|-|- 42 | 1x320/CE-short | 1x320 unidirectional | 10 Dense | cross entropy | 64x100x123 | fixed
[100] | 1 | Real-world2 43 | 1x320/CE-long | 1x320 unidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed
[1000] | 1 | Synthetic 44 | 4x320/CE-long | 4x320 bidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed
[1000] | 1 | Synthetic 45 | 4x320/CTC-long | 4x320 bidirectional | 59 Dense | CTC| 32x1000x123 | variable
[500..1000] | 100 | Real-world3 46 | 52 | 1N=number of samples, T=time-steps, C=feature channels
53 | 2ASR-task on TIDIGITS/isolated digit recognition, default training set (0.7 hours of speech): 123- 54 | dimensional filterbank features with 100fps, average sequence length of 98, alphabet size of 10 digits and 55 | 1 label per sample
56 | 3ASR-task on WSJ/continuous speech recognition, pre-processing with [EESEN](https://github.com/srvk/eesen) on training subset 57 | si-284 (81h of speech): 123-dimensional filterbank features with 100fps, average sequence length 783, alphabet 58 | size of 59 characters and average number of characters per sample 102 59 | ## Results 60 | - Xeon W-2195 CPU, GTX 1080 Founders Edition, Ubuntu 16.04 61 | - The results reflect the mean time to fully process a batch (forward + backward pass). 62 | - The measurements are taken over 500 runs, and the first 100 are discarded as warm-up. 63 | 64 | Benchmark | Results 65 | -|- 66 | 1x320/CE-short
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 64x100x123
fixed sequence length
---
433k parameters
| 67 | 1x320/CE-long
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
576k parameters
| 68 | 4x320/CE-long
---
L1-4: 4x320 bidir LSTM
L5: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
8.5M parameters
| 69 | 4x320/CTC-long
L1-4: 4x320 bidir LSTM
L5: 59 Dense
---
CTC loss
input size 32x1000x123
variable sequence length
---
8.5M parameters
| 70 | 71 | Remarks: 72 | - The benchmark scripts are carefully written, but not optimized to squeeze that last bit of 73 | performance out of them. They should reflect typical day-to-day research applications. 74 | - Due to time constraints, only the 1x320 LSTM benchmark covers all considered frameworks. 75 | For the multi-layer 4x320 networks, only implementations that provided helper functions to 76 | create stacked bidirectional networks were evaluated. An exception to this rule was made 77 | for Lasagne, in order to include a Theano-based contender for this scenario. 78 | - The TensorFlow benchmarks use the `feed_dict` input method that is simple to implement, 79 | but slower than the [`tf.data` API](https://www.tensorflow.org/performance/performance_guide#input_pipeline_optimization). Implementing a high-performance input pipeline in TensorFlow is not trivial, and only the feed_dict approach allowed for a similar implementation complexity as in the PyTorch and Lasagne cases. 80 | - The TensorFlow `cuDNNLSTM` was not tested with variable length data as it does not support 81 | such input (see [issue 6633](https://github.com/TensorFlow/TensorFlow/issues/6633)). 82 | - The TensorFlow benchmark uses the integrated `tf.nn.ctc_loss` instead of the warp- 83 | ctc library, even though there is a TensorFlow binding available ([Link](https://github.com/baidu-research/warp-ctc)). The performance 84 | difference has not been measured. 85 | - PyTorch 0.4.0 merged the Tensor and Variable classes and does not need the Variable 86 | wrapper anymore. The Variable wrapper has a negligible performance impact on version 87 | 0.4.0, but is required for older PyTorch releases in the PyTorch version comparison. 88 | - The CTC benchmark was not carried out on PyTorch 0.1.12_2 as the compilation process was too complex. The packed sequence implementation has a large impact on performance for v0.2.0_4 (see [pull request 4512](https://github.com/PyTorch/PyTorch/pull/4512)). 
89 | -------------------------------------------------------------------------------- /results/10/pytorch_comparison/1x320-LSTM_cross-entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/1x320-LSTM_cross-entropy.png -------------------------------------------------------------------------------- /results/10/pytorch_comparison/1x320-LSTM_cross-entropy_100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/1x320-LSTM_cross-entropy_100.png -------------------------------------------------------------------------------- /results/10/pytorch_comparison/4x320-BIDIR-LSTM_CTC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/4x320-BIDIR-LSTM_CTC.png -------------------------------------------------------------------------------- /results/10/pytorch_comparison/4x320-BIDIR-LSTM_cross-entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/4x320-BIDIR-LSTM_cross-entropy.png -------------------------------------------------------------------------------- /results/10/pytorch_comparison/readme.md: -------------------------------------------------------------------------------- 1 | # PyTorch comparison 2 | - PyTorch 0.1 to 0.4 version comparison 3 | - LSTM implementations by cuDNN, fused kernels and naive approaches 4 | - Fixed sequence-length data with cross-entropy loss and variable sequence-length data with CTC loss 5 | 
- Input sizes 64x100x123 and 32x1000x123 (batch size x time steps x channels) 6 | - Network sizes 1x320 and 4x320 (number of layers x number of LSTM units) 7 | 8 | ## Framework versions 9 | Framework | Version | Release |Backend | cuda | cuDNN 10 | -|-|-|-|-|- 11 | PyTorch | 0.4.0 | [April 2018](https://github.com/PyTorch/PyTorch/releases/tag/v0.4.0) | - | 9.0 | 7102 12 | PyTorch | 0.3.1post2 | [February 2018](https://github.com/PyTorch/PyTorch/releases/tag/v0.3.1) | - | 8.0 | 7005 13 | PyTorch | 0.2.0_4 | [August 2017](https://github.com/PyTorch/PyTorch/releases/tag/v0.2.0) | - | 8.0 | 6021 14 | PyTorch | 0.1.12_2 | [May 2017](https://github.com/PyTorch/PyTorch/releases/tag/v0.1.12) | - | 8.0 | 6021 15 | 16 | ## LSTM implementations 17 | 18 | Library | Name | Details 19 | -|-|- 20 | PyTorch | [`LSTMCell-basic`](https://github.com/stefbraun/rnn_benchmarks/blob/master/1x320-LSTM/bench_pytorch_LSTMCell-basic.py) | Custom code, pure PyTorch implementation, easy to modify. Loop over time with Python `for` loop 21 | PyTorch | [`LSTMCell-fused`](http://PyTorch.org/docs/stable/nn.html?highlight=lstmcell#torch.nn.LSTMCell) | LSTM with optimized kernel for single time steps. Loop over time with Python `for` loop 22 | PyTorch |[`cuDNNLSTM`](http://PyTorch.org/docs/stable/nn.html?highlight=lstm#torch.nn.LSTM) | Wrapper to cuDNN LSTM implementation 23 | 24 | ## Loss functions and input data 25 | The loss functions are varied with the input data: 26 | 1. Cross-entropy for fixed sequence length data 27 | - default implementation from PyTorch 28 | 2. Connectionist Temporal Classification (CTC) for variable sequence length data 29 | - warp_ctc for [PyTorch](https://github.com/SeanNaren/warp-ctc) 30 | 31 | Benchmark name | Layers x LSTM units | # Classes & output units | Loss | Input size [NxTxC] 1 | Sequence length | Labels per sample| Benchmark scenario 32 | -|-|-|-|-|-|-|- 33 | 1x320/CE-short | 1x320 unidirectional | 10 Dense | cross entropy | 64x100x123 | fixed
[100] | 1 | Real-world2 34 | 1x320/CE-long | 1x320 unidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed
[1000] | 1 | Synthetic 35 | 4x320/CE-long | 4x320 bidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed
[1000] | 1 | Synthetic 36 | 4x320/CTC-long | 4x320 bidirectional | 59 Dense | CTC| 32x1000x123 | variable
[500..1000] | 100 | Real-world3 37 | 43 | 1N=number of samples, T=time-steps, C=feature channels
44 | 2ASR-task on TIDIGITS/isolated digit recognition, default training set (0.7 hours of speech): 123- 45 | dimensional filterbank features with 100fps, average sequence length of 98, alphabet size of 10 digits and 46 | 1 label per sample
47 | 3ASR-task on WSJ/continuous speech recognition, pre-processing with [EESEN](https://github.com/srvk/eesen) on training subset 48 | si-284 (81h of speech): 123-dimensional filterbank features with 100fps, average sequence length 783, alphabet 49 | size of 59 characters and average number of characters per sample 102 50 | ## Results 51 | - Xeon W-2195 CPU, GTX 1080 Founders Edition, Ubuntu 16.04 52 | - The results reflect the mean time to fully process a batch (forward + backward pass). 53 | - The measurements are taken over 500 runs, and the first 100 are discarded as warm-up. 54 | 55 | Benchmark | Results 56 | -|- 57 | 1x320/CE-short
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 64x100x123
fixed sequence length
---
433k parameters
| 58 | 1x320/CE-long
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
576k parameters
| 59 | 4x320/CE-long
---
L1-4: 4x320 bidir LSTM
L5: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
8.5M parameters
| 60 | 4x320/CTC-long
L1-4: 4x320 bidir LSTM
L5: 59 Dense
---
CTC loss
input size 32x1000x123
variable sequence length
---
8.5M parameters
| 61 | 62 | Remarks: 63 | - The benchmark scripts are carefully written, but not optimized to squeeze that last bit of 64 | performance out of them. They should reflect typical day-to-day research applications. 65 | - Due to time constraints, only the 1x320 LSTM benchmark covers all considered frameworks. 66 | For the multi-layer 4x320 networks, only implementations that provided helper functions to 67 | create stacked bidirectional networks were evaluated. 68 | - PyTorch 0.4.0 merged the Tensor and Variable classes and does not need the Variable 69 | wrapper anymore. The Variable wrapper has a negligible performance impact on version 70 | 0.4.0, but is required for older PyTorch releases in the PyTorch version comparison. 71 | - The CTC benchmark was not carried out on PyTorch 0.1.12_2 as the compilation process was too complex. The packed sequence implementation has a large impact on performance for v0.2.0_4 (see [issue 4512](https://github.com/PyTorch/PyTorch/pull/4512)). 72 | -------------------------------------------------------------------------------- /support.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import OrderedDict 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os.path 7 | import pandas as pd 8 | 9 | 10 | def default_params(): 11 | rnn_size = 320 12 | learning_rate = 1e-3 13 | batches = 500 14 | return rnn_size, learning_rate, batches 15 | 16 | 17 | def toy_batch(seed=11, shape=(64, 100, 123), classes=10): 18 | batch_size, max_len, features = shape 19 | np.random.seed(seed) 20 | 21 | # Samples 22 | bX = np.float32(np.random.uniform(-1, 1, (shape))) 23 | b_lenX = np.int32(np.ones(batch_size) * max_len) 24 | 25 | # Targets 26 | bY = np.int32(np.random.randint(low=0, high=classes - 1, size=batch_size)) 27 | 28 | return bX, b_lenX, bY, classes 29 | 30 | 31 | def toy_batch_ctc(seed=11, shape=(32, 1000, 123), classes=59): 32 | batch_size, max_len, 
features = shape 33 | np.random.seed(seed) 34 | 35 | # Samples 36 | bX = np.float32(np.random.uniform(-1, 1, (shape))) 37 | b_lenX = np.int32(np.linspace(max_len / 2, max_len, batch_size)) 38 | # print(b_lenX) 39 | maskX = np.zeros((batch_size, max_len), dtype='float32') 40 | for i, len_sample in enumerate(b_lenX): 41 | maskX[i, :len_sample] = np.ones((1, len_sample)) 42 | 43 | # Targets 44 | bY = np.int32(np.random.randint(low=1, high=classes - 1, 45 | size=batch_size * 100)) # remember warp-ctc: 0 is the blank label, tensorflow-ctc: -1 is the blank label 46 | b_lenY = np.int32(np.ones(batch_size) * 100) # labels per sample comes from WSJ-si84 47 | 48 | return bX, b_lenX, maskX, bY, b_lenY, classes 49 | 50 | 51 | def check_results(batch_loss_list, batch_time_list, train_start, train_end): 52 | 53 | # Initialize 54 | abort = 0 55 | 56 | # 0. Check if loss is numeric (not NAN and not inf) 57 | check_loss=[np.isfinite(loss) for loss in batch_loss_list] 58 | if False not in check_loss: 59 | print('>>> Loss check 1/2 passed: loss is finite {}'.format(np.unique(check_loss))) 60 | else: 61 | print('!!! Loss check 1/2 failed: loss is NOT finite {}'.format(np.unique(check_loss))) 62 | abort = 1 63 | 64 | # 1. Check if loss is decreasing 65 | check_loss=np.diff(batch_loss_list) 66 | if np.sum(check_loss)<0: 67 | print('>>> Loss check 2/2 passed: loss is globally decreasing') 68 | else: 69 | print('!!! Loss check 2/2 failed: loss is NOT globally decreasing') 70 | abort=1 71 | 72 | # 2. 
Check deviation between the full loop time and the sum of individual batches 73 | loop_time = train_end - train_start 74 | batch_time_sum = np.sum(batch_time_list) 75 | factor = loop_time / batch_time_sum 76 | deviation = np.abs((1 - factor) * 100) 77 | 78 | if deviation < 1: # Less than 1% deviation 79 | print('>>> Timing check passed - < 1% deviation between loop time and sum of batches ::: Loop time {:.3f} ::: Sum of batch times {:.3f} ::: Deviation [%] {:.3f}'.format(loop_time, 80 | batch_time_sum, 81 | deviation)) 82 | else: 83 | print('!!! Timing check failed - Deviation > 1% ::: Loop time {:.3f} ::: Sum of batch times {:.3f} :::' 84 | ' Deviation [%] {:.3f}'.format(loop_time, batch_time_sum, deviation)) 85 | abort=1 86 | 87 | if abort==1: 88 | sys.exit('!!! Abort benchmark.') 89 | print('=' * 100) 90 | 91 | 92 | def write_results(script_name, bench, experiment, parameters, run_time, version=None, 93 | logfile=None): 94 | 95 | if logfile == None: 96 | # Get path 97 | repo_path = os.path.dirname(os.path.realpath(__file__)) 98 | 99 | with open(os.path.join(repo_path, 'results', 'conf')) as f: 100 | mode = f.readline().strip() 101 | 102 | logfile = os.path.join(repo_path, 'results', mode, 'results.csv') 103 | 104 | # Prepare header 105 | if os.path.isfile(logfile) == False: 106 | df = pd.DataFrame(index=None, columns=['name', 'bench', 'version', 'experiment', 'parameters', 'runtime']) 107 | df.to_csv(logfile, index=None) 108 | 109 | # Prepare new results 110 | row_list = [] 111 | for rt in run_time: 112 | row = OrderedDict() 113 | row['experiment'] = experiment 114 | row['bench'] = bench 115 | row['version'] = version 116 | row['name'] = script_name 117 | row['parameters'] = parameters 118 | row['runtime'] = rt 119 | 120 | row_list.append(row) 121 | 122 | dfa = pd.DataFrame.from_dict(row_list) 123 | 124 | # Append new results 125 | df = pd.read_csv(logfile) 126 | df = df.append(dfa) 127 | df.to_csv(logfile, index=None) 128 | 129 | 130 | def 
print_results(run_time): 131 | if len(run_time) > 100: 132 | run_time = run_time[100:] 133 | else: 134 | print('!!! First 100 batches are considered as warm-up. Please run more batches') 135 | run_time=np.asarray(run_time)*1000 136 | print( 137 | '>>> Time per batch [ms] ::: Mean {:.1f} ::: Std {:.1f} ::: Median {:.1f} ::: 99Percentile {:.1f} ::: Min {:.1f} ::: Max {:.1f}'.format( 138 | np.mean(run_time), np.std(run_time), 139 | np.median(run_time), np.percentile(run_time, 99), np.min(run_time), np.max(run_time))) 140 | 141 | def plot_results(time): 142 | fig, ax = plt.subplots() 143 | ax.scatter(range(len(time)), time) 144 | ax.grid() 145 | ax.set_xlabel('Batch #') 146 | ax.set_ylabel('Time per Batch [sec]') 147 | return fig, ax 148 | 149 | 150 | # Helper functions for label conversion from warp-ctc to tf-ctc format:-( 151 | def target_converter(bY, b_lenY): 152 | b_lenY_cs = np.cumsum(b_lenY)[:-1] 153 | bY_conv = np.split(bY, b_lenY_cs) 154 | return bY_conv 155 | 156 | 157 | def sparse_tuple_from(sequences, dtype=np.int32): 158 | """Create a sparse representention of x. 
159 | Args: 160 | sequences: a list of lists of type dtype where each element is a sequence 161 | Returns: 162 | A tuple with (indices, values, shape) 163 | """ 164 | indices = [] 165 | values = [] 166 | 167 | for n, seq in enumerate(sequences): 168 | indices.extend(zip([n] * len(seq), range(len(seq)))) 169 | values.extend(seq) 170 | 171 | indices = np.asarray(indices, dtype=np.int64) 172 | values = np.asarray(values, dtype=dtype) 173 | shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64) 174 | 175 | return indices, values, shape 176 | -------------------------------------------------------------------------------- /utils/analyse_pandas.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import pandas as pd 5 | 6 | parser = argparse.ArgumentParser(description='Process results dataframe') 7 | parser.add_argument('--file', default=None, 8 | help='Dataframe to process.') 9 | args = parser.parse_args() 10 | 11 | # Load file 12 | logfile = os.path.join(args.file) 13 | df = pd.read_csv(logfile) 14 | 15 | # assert (int(df.groupby(['experiment', 'bench', 'version']).count()['runtime'].unique()) == 500) 16 | df['runtime'] = df['runtime'] * 1000 17 | 18 | df=df.groupby(['experiment','bench']).tail(400) 19 | df['mean'] = df.groupby(['experiment','bench']).transform('mean')['runtime'] 20 | df['std'] = df.groupby(['experiment','bench']).transform('std')['runtime'] 21 | df = df.sort_values(['mean'], ascending=True) 22 | grp=df.groupby(['experiment','bench'], as_index=False).tail(1).round(1) 23 | print(grp.to_string()) 24 | -------------------------------------------------------------------------------- /utils/disable_cores.sh: -------------------------------------------------------------------------------- 1 | num_cores=4 2 | num_cores_ht=$((18+$num_cores)) 3 | echo num_cores_ht 4 | for i in `seq $num_cores 1 17`; 5 | do echo 0 > /sys/devices/system/cpu/cpu$i/online; 6 | 
done 7 | 8 | for i in `seq $num_cores_ht 1 36`; 9 | do echo 0 > /sys/devices/system/cpu/cpu$i/online; 10 | done 11 | 12 | -------------------------------------------------------------------------------- /utils/enable_cores.sh: -------------------------------------------------------------------------------- 1 | for i in $(seq 36 $END); 2 | do echo 1 > /sys/devices/system/cpu/cpu$i/online; 3 | done 4 | -------------------------------------------------------------------------------- /utils/plot_all.sh: -------------------------------------------------------------------------------- 1 | unzip -o results/results_100.zip -d results/ 2 | python main/framework_comparison/plot.py 3 | mv results/framework_comparison/1x320-LSTM_cross-entropy.pdf results/framework_comparison/1x320-LSTM_cross-entropy_100.pdf 4 | mv results/framework_comparison/1x320-LSTM_cross-entropy.png results/framework_comparison/1x320-LSTM_cross-entropy_100.png 5 | 6 | python main/pytorch_comparison/plot.py 7 | mv results/pytorch_comparison/1x320-LSTM_cross-entropy.pdf results/pytorch_comparison/1x320-LSTM_cross-entropy_100.pdf 8 | mv results/pytorch_comparison/1x320-LSTM_cross-entropy.png results/pytorch_comparison/1x320-LSTM_cross-entropy_100.png 9 | 10 | unzip -o results/results_1k.zip -d results/ 11 | python main/framework_comparison/plot.py 12 | python main/pytorch_comparison/plot.py 13 | -------------------------------------------------------------------------------- /utils/rm_results.sh: -------------------------------------------------------------------------------- 1 | rm results/framework_comparison/results.csv results/pytorch_comparison/results.csv 2 | 3 | 4 | --------------------------------------------------------------------------------