├── .gitignore
├── .idea
    ├── misc.xml
    ├── modules.xml
    ├── rnn_benchmarks.iml
    ├── vcs.xml
    └── workspace.xml
├── 1x320-LSTM
    ├── __init__.py
    ├── bench_keras-tensorflow_LSTM.py
    ├── bench_keras-tensorflow_cudnnLSTM.py
    ├── bench_keras-theano_LSTM.py
    ├── bench_lasagne_LSTMLayer.py
    ├── bench_pytorch_LSTMCell-basic.py
    ├── bench_pytorch_LSTMCell-fused.py
    ├── bench_pytorch_cudnnLSTM.py
    ├── bench_tensorflow_LSTMBlockCell.py
    ├── bench_tensorflow_LSTMBlockFusedCell.py
    ├── bench_tensorflow_LSTMCell.py
    ├── bench_tensorflow_cudnnLSTM.py
    └── lib_pytorchLSTM.py
├── 4x320-LSTM
    ├── __init__.py
    ├── bench_lasagne_LSTMLayer.py
    ├── bench_pytorch_cudnnLSTM.py
    ├── bench_tensorflow_LSTMBlockCell.py
    ├── bench_tensorflow_LSTMCell.py
    └── bench_tensorflow_cudnnLSTM.py
├── 4x320-LSTM_ctc
    ├── __init__.py
    ├── bench_lasagne_LSTMLayer.py
    ├── bench_pytorch_cudnnLSTM.py
    ├── bench_tensorflow_LSTMBlockCell.py
    └── bench_tensorflow_LSTMCell.py
├── README.md
├── main
    ├── framework_comparison
    │   ├── main.py
    │   └── plot.py
    └── pytorch_comparison
    │   ├── main.py
    │   ├── plot.py
    │   └── unifier.py
├── results
    └── 10
    │   ├── framework_comparison
    │       ├── 1x320-LSTM_cross-entropy.png
    │       ├── 1x320-LSTM_cross-entropy_100.png
    │       ├── 4x320-BIDIR-LSTM_CTC.png
    │       ├── 4x320-BIDIR-LSTM_cross-entropy.png
    │       └── readme.md
    │   └── pytorch_comparison
    │       ├── 1x320-LSTM_cross-entropy.png
    │       ├── 1x320-LSTM_cross-entropy_100.png
    │       ├── 4x320-BIDIR-LSTM_CTC.png
    │       ├── 4x320-BIDIR-LSTM_cross-entropy.png
    │       ├── readme.md
    │       └── results.csv
├── support.py
└── utils
    ├── analyse_pandas.py
    ├── disable_cores.sh
    ├── enable_cores.sh
    ├── plot_all.sh
    └── rm_results.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.pdf
3 | *.pyc
4 |
5 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/rnn_benchmarks.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
119 |
120 |
121 |
122 | gru
123 | GRUC
124 | res
125 | cuda
126 | LSTMCw
127 | LSTMC
128 | rnn_size
129 | sync
130 | memor
131 | params
132 | seq
133 | pack
134 | mask
135 | h2
136 | CTC
137 | max_le
138 | seq_len
139 | seqlen
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 | true
198 | DEFINITION_ORDER
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 | project
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 |
594 |
595 |
596 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 | 1486730635354
647 |
648 |
649 | 1486730635354
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 |
663 |
664 |
665 |
666 |
667 |
668 |
669 |
670 |
671 |
672 |
673 |
674 |
675 |
676 |
677 |
678 |
679 |
680 |
681 |
682 |
683 |
684 |
685 |
686 |
687 |
688 |
689 |
690 |
691 |
692 |
693 |
694 |
695 |
696 | file://$PROJECT_DIR$/1x320-GRU/bench_lasagne.py
697 | 18
698 |
699 |
700 |
701 | file://$PROJECT_DIR$/support.py
702 | 106
703 |
704 |
705 |
706 | file://$PROJECT_DIR$/support.py
707 | 103
708 |
709 |
710 |
711 | file://$PROJECT_DIR$/4x320-LSTM/bench_pytorch.py
712 | 38
713 |
714 |
715 |
716 | file://$PROJECT_DIR$/4x320-LSTM/bench_tensorflow.py
717 | 60
718 |
719 |
720 |
721 | file://$PROJECT_DIR$/4x320-LSTM/bench_pytorch.py
722 | 18
723 |
724 |
725 |
726 | file://$PROJECT_DIR$/support.py
727 | 93
728 |
729 |
730 |
731 | file://$USER_HOME$/envs/pytorch_latest/lib/python2.7/site-packages/warpctc_pytorch/__init__.py
732 | 25
733 |
734 |
735 |
736 | file://$PROJECT_DIR$/4x320-LSTM_ctc/bench_pytorch.py
737 | 83
738 |
739 |
740 |
741 | file://$PROJECT_DIR$/4x320-LSTM_ctc/bench_tensorflow.py
742 | 86
743 |
744 |
745 |
746 |
747 |
748 |
749 |
750 |
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 |
760 |
761 |
762 |
763 |
764 |
765 |
766 |
767 |
768 |
769 |
770 |
771 |
772 |
773 |
774 |
775 |
776 |
777 |
778 |
779 |
780 |
781 |
782 |
783 |
784 |
785 |
786 |
787 |
788 |
789 |
790 |
791 |
792 |
793 |
794 |
795 |
796 |
797 |
798 |
799 |
800 |
801 |
802 |
803 |
804 |
805 |
806 |
807 |
808 |
809 |
810 |
811 |
812 |
813 |
814 |
815 |
816 |
817 |
818 |
819 |
820 |
821 |
822 |
823 |
824 |
825 |
826 |
827 |
828 |
829 |
830 |
831 |
832 |
833 |
834 |
835 |
836 |
837 |
838 |
839 |
840 |
841 |
842 |
843 |
844 |
845 |
846 |
847 |
848 |
849 |
850 |
851 |
852 |
853 |
854 |
855 |
856 |
857 |
858 |
859 |
860 |
861 |
862 |
863 |
864 |
865 |
866 |
867 |
868 |
869 |
870 |
871 |
872 |
873 |
874 |
875 |
876 |
877 |
878 |
879 |
880 |
881 |
882 |
883 |
884 |
885 |
886 |
887 |
888 |
889 |
890 |
891 |
892 |
893 |
894 |
895 |
896 |
897 |
898 |
899 |
900 |
901 |
902 |
903 |
904 |
905 |
906 |
907 |
908 |
909 |
910 |
911 |
912 |
913 |
914 |
915 |
916 |
917 |
918 |
919 |
920 |
921 |
922 |
923 |
924 |
925 |
926 |
927 |
928 |
929 |
930 |
931 |
932 |
933 |
934 |
935 |
936 |
937 |
938 |
939 |
940 |
941 |
942 |
943 |
944 |
945 |
946 |
947 |
948 |
949 |
950 |
951 |
952 |
953 |
954 |
955 |
956 |
957 |
958 |
959 |
960 |
961 |
962 |
963 |
964 |
965 |
966 |
967 |
968 |
969 |
970 |
971 |
972 |
973 |
974 |
975 |
976 |
977 |
978 |
979 |
980 |
981 |
982 |
983 |
984 |
985 |
986 |
987 |
988 |
989 |
990 |
991 |
992 |
993 |
994 |
995 |
996 |
997 |
998 |
999 |
1000 |
1001 |
1002 |
1003 |
1004 |
1005 |
1006 |
1007 |
1008 |
1009 |
1010 |
1011 |
1012 |
1013 |
1014 |
1015 |
1016 |
1017 |
1018 |
1019 |
1020 |
1021 |
1022 |
1023 |
1024 |
1025 |
1026 |
1027 |
1028 |
1029 |
1030 |
1031 |
1032 |
1033 |
1034 |
1035 |
1036 |
1037 |
1038 |
1039 |
1040 |
1041 |
1042 |
1043 |
1044 |
--------------------------------------------------------------------------------
/1x320-LSTM/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/1x320-LSTM/__init__.py
--------------------------------------------------------------------------------
/1x320-LSTM/bench_keras-tensorflow_LSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import keras
5 | from keras.layers import Input, LSTM, Dense
6 | from keras.models import Model
7 | from keras.utils import to_categorical
8 |
9 | from support import toy_batch, default_params, write_results, print_results, check_results
10 |
# Benchmark script: trains a single-layer LSTM classifier with Keras on one fixed toy batch
# and records the wall-clock time of every training step.

# Experiment type
bench = 'keras-{}_LSTM'.format(keras.backend.backend())
version = keras.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data and the hyper-parameters shared by all benchmark scripts
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (batch and time axes are left unspecified)
x = Input(shape=(None, inp_dims), dtype='float32', name='input')

# Create network: LSTM that returns only its final output, then a bias-free softmax projection
fw_cell = LSTM(rnn_size, return_sequences=False, implementation=2)(x)
h3 = Dense(classes, activation='softmax', use_bias=False)(fw_cell)
model = Model(inputs=x, outputs=h3)

# Compile with an explicit Adam instance so that the learning rate returned by
# default_params() is actually applied. The string alias 'Adam' ignored it and
# used the Keras built-in default instead.
start = timer.perf_counter()
model.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss='categorical_crossentropy')
end = timer.perf_counter()
print('>>> Model compilation took {:.1f} seconds'.format(end - start))

# Print parameter count
params = model.count_params()
print('# network parameters: ' + str(params))

# Check for correct sizes
assert (model.layers[-1].input_shape == (None, rnn_size))  # final projection input size (rnn_size)
assert (model.layers[-1].get_weights()[0].shape == (rnn_size, classes))  # final projection kernel size (rnn_size, classes)
output = model.predict(bX)
assert (output.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for i in range(batches):
    batch_start = timer.perf_counter()
    # NOTE: the one-hot conversion of the labels happens inside the timed region.
    loss = model.train_on_batch(x=bX, y=to_categorical(bY, num_classes=classes))
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(loss)
train_end = timer.perf_counter()

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
60 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_keras-tensorflow_cudnnLSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import keras
5 | from keras.layers import Input, Dense, CuDNNLSTM
6 | from keras.models import Model
7 | from keras.utils import to_categorical
8 |
9 | from support import toy_batch, default_params, write_results, print_results, check_results
10 |
# Benchmark script: trains a single-layer CuDNNLSTM classifier with Keras on one fixed toy
# batch and records the wall-clock time of every training step.

# Experiment type
bench = 'keras-{}_cudnnLSTM'.format(keras.backend.backend())
version = keras.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data and the hyper-parameters shared by all benchmark scripts
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (batch and time axes are left unspecified)
x = Input(shape=(None, inp_dims), dtype='float32', name='input')

# Create network: CuDNNLSTM that returns only its final output, then a bias-free softmax projection
fw_cell = CuDNNLSTM(rnn_size, return_sequences=False)(x)
h3 = Dense(classes, activation='softmax', use_bias=False)(fw_cell)
model = Model(inputs=x, outputs=h3)

# Compile with an explicit Adam instance so that the learning rate returned by
# default_params() is actually applied. The string alias 'Adam' ignored it and
# used the Keras built-in default instead.
start = timer.perf_counter()
model.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss='categorical_crossentropy')
end = timer.perf_counter()
print('>>> Model compilation took {:.1f} seconds'.format(end - start))

# Print parameter count
params = model.count_params()
print('# network parameters: ' + str(params))

# Check for correct sizes
assert (model.layers[-1].input_shape == (None, rnn_size))  # final projection input size (rnn_size)
assert (model.layers[-1].get_weights()[0].shape == (rnn_size, classes))  # final projection kernel size (rnn_size, classes)
output = model.predict(bX)
assert (output.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for i in range(batches):
    batch_start = timer.perf_counter()
    # NOTE: the one-hot conversion of the labels happens inside the timed region.
    loss = model.train_on_batch(x=bX, y=to_categorical(bY, num_classes=classes))
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(loss)
train_end = timer.perf_counter()

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
60 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_keras-theano_LSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import keras
5 | from keras.layers import Input, LSTM, Dense
6 | from keras.models import Model
7 | from keras.utils import to_categorical
8 |
9 | from support import toy_batch, default_params, write_results, print_results, check_results
10 |
# Benchmark script: trains a single-layer LSTM classifier with Keras on one fixed toy batch
# and records the wall-clock time of every training step. The backend name is taken from
# keras.backend.backend() at run time.

# Experiment type
bench = 'keras-{}_LSTM'.format(keras.backend.backend())
version = keras.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data and the hyper-parameters shared by all benchmark scripts
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (batch and time axes are left unspecified)
x = Input(shape=(None, inp_dims), dtype='float32', name='input')

# Create network: LSTM that returns only its final output, then a bias-free softmax projection
fw_cell = LSTM(rnn_size, return_sequences=False, implementation=2)(x)
h3 = Dense(classes, activation='softmax', use_bias=False)(fw_cell)
model = Model(inputs=x, outputs=h3)

# Compile with an explicit Adam instance so that the learning rate returned by
# default_params() is actually applied. The string alias 'Adam' ignored it and
# used the Keras built-in default instead.
start = timer.perf_counter()
model.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss='categorical_crossentropy')
end = timer.perf_counter()
print('>>> Model compilation took {:.1f} seconds'.format(end - start))

# Print parameter count
params = model.count_params()
print('# network parameters: ' + str(params))

# Check for correct sizes
assert (model.layers[-1].input_shape == (None, rnn_size))  # final projection input size (rnn_size)
assert (model.layers[-1].get_weights()[0].shape == (rnn_size, classes))  # final projection kernel size (rnn_size, classes)
output = model.predict(bX)
assert (output.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for i in range(batches):
    batch_start = timer.perf_counter()
    # NOTE: the one-hot conversion of the labels happens inside the timed region.
    loss = model.train_on_batch(x=bX, y=to_categorical(bY, num_classes=classes))
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(loss)
train_end = timer.perf_counter()

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
60 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_lasagne_LSTMLayer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import lasagne
5 | import theano
6 | import theano.tensor as T
7 |
8 | from support import toy_batch, default_params, write_results, print_results, check_results
9 |
# Benchmark script: trains a single-layer Lasagne LSTM classifier on one fixed toy batch
# and records the wall-clock time of every training step.

# Experiment type
bench = 'lasagne_LSTMLayer'
version = lasagne.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data and the hyper-parameters shared by all benchmark scripts
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Symbolic inputs: float32 3-D tensor for the features, int32 vector for the labels
x = T.ftensor3('x')
y = T.ivector('y')

# Build the network layer by layer
l_in = lasagne.layers.InputLayer(shape=(None, None, inp_dims), input_var=x)  # input layer
l_lstm = lasagne.layers.LSTMLayer(l_in, num_units=rnn_size,
                                  hid_init=lasagne.init.GlorotUniform())  # RNN layer
l_last = lasagne.layers.SliceLayer(l_lstm, -1, axis=1)  # keep only index -1 along axis 1 (last time step)
l_out = lasagne.layers.DenseLayer(l_last, num_units=classes,
                                  nonlinearity=lasagne.nonlinearities.softmax,
                                  b=None)  # bias-free output projection

# Print parameter count
params = lasagne.layers.count_params(l_out)
print('>>> # network parameters: ' + str(params))

# Loss, Adam updates and the deterministic prediction graph
train_prediction = lasagne.layers.get_output(l_out)
mean_loss = lasagne.objectives.categorical_crossentropy(predictions=train_prediction, targets=y).mean()
trainable = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(mean_loss, trainable, learning_rate=learning_rate)
prediction_det = lasagne.layers.get_output(l_out, deterministic=True)

# Compile the Theano functions and report how long compilation took
start = timer.perf_counter()
train_fn = theano.function([x, y], mean_loss, updates=updates)
output_fn = theano.function([x], prediction_det)
end = timer.perf_counter()
print('>>> Theano function compilation took {:.1f} seconds'.format(end - start))

# Check for correct sizes
assert (l_out.input_shape == (None, rnn_size))  # final projection input size (batch_size x rnn_size)
assert (l_out.W.eval().shape == (rnn_size, classes))  # final projection kernel size (rnn_size x classes)
output = output_fn(bX)
output_fn.sync_shared()
assert (output.shape == (batch_size, classes))  # output size

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()  # start of training
for _ in range(batches):
    batch_start = timer.perf_counter()  # start of batch
    batch_cost = train_fn(bX, bY)
    train_fn.sync_shared()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()  # end of batch
    batch_time.append(batch_end - batch_start)
    batch_loss.append(batch_cost)
train_end = timer.perf_counter()  # end of training

# Results handling
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
75 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_pytorch_LSTMCell-basic.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import lib_pytorchLSTM as libLSTM
5 | import numpy as np
6 | import torch
7 | import torch.nn as nn
8 | import torch.optim as optim
9 | from torch.autograd import Variable
10 |
11 | from support import toy_batch, default_params, write_results, print_results, check_results
12 |
# Benchmark script: trains a single-layer LSTM classifier built from the project's own
# LSTMCell (lib_pytorchLSTM) on one fixed toy batch and records the wall-clock time of
# every training step.

# Experiment type
bench = 'pytorch_LSTMCell-basic'
version = torch.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data and the hyper-parameters shared by all benchmark scripts
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# PyTorch compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))


# Create Network
class Net(nn.Module):
    """LSTM cell unrolled manually over time, followed by a bias-free linear projection."""

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = libLSTM.LSTMCell(input_size=inp_dims, hidden_size=rnn_size, bias=True)
        self.fc = nn.Linear(rnn_size, classes, bias=False)

    def forward(self, x):
        """Run the cell over every time step of x (time-major) and project the last output."""
        max_len, batch_size, _ = x.size()
        # Allocate the zero initial states from the input tensor so they already have its
        # type and device. The previous code built them on the CPU and copied them to the
        # GPU on every forward pass, which added a host-to-device transfer to each timed step.
        h_lstm = Variable(x.data.new(batch_size, rnn_size).zero_())
        c_lstm = Variable(x.data.new(batch_size, rnn_size).zero_())
        output = []
        for i in range(max_len):
            h_lstm, c_lstm = self.lstm(x[i], (h_lstm, c_lstm))
            output.append(h_lstm)
        h1 = torch.stack(output)  # all hidden states, stacked along a new leading (time) axis
        h2 = h1[-1, :, :]  # hidden state of the last time step
        h3 = self.fc(h2)
        return h3


net = Net()
net.cuda()

# Print parameter count
params = sum(param.numel() for param in net.parameters())
print('# network parameters: ' + str(params))

# Create optimizer
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # loss definition

# Check for correct sizes
assert (net.fc.in_features == rnn_size)  # final projection input size (rnn_size)
assert (net.fc.weight.cpu().data.numpy().shape == (
    classes, rnn_size))  # final projection weight size (classes, rnn_size)
bXt = Variable(torch.from_numpy(bX).cuda())
torch.cuda.synchronize()
output = net(bXt)
output_numpy = output.data.cpu().numpy()
assert (output_numpy.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for i in range(batches):
    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_start = timer.perf_counter()

    # The host-to-device copy of the batch is part of the timed region.
    bXt = Variable(torch.from_numpy(bX).cuda())
    bYt = Variable(torch.from_numpy(bY).cuda())

    optimizer.zero_grad()
    output = net(bXt)
    loss = criterion(output, bYt.long())
    loss.backward()
    optimizer.step()

    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(float(loss.data.cpu().numpy()))
train_end = timer.perf_counter()  # end of training

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
101 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_pytorch_LSTMCell-fused.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.autograd import Variable
9 |
10 | from support import toy_batch, default_params, write_results, print_results, check_results
11 |
# Benchmark script: trains a single-layer LSTM classifier built from torch.nn.LSTMCell on
# one fixed toy batch and records the wall-clock time of every training step.

# Experiment type
bench = 'pytorch_LSTMCell-fused'
version = torch.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data and the hyper-parameters shared by all benchmark scripts
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# PyTorch compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))


# Create Network
class Net(nn.Module):
    """nn.LSTMCell unrolled manually over time, followed by a bias-free linear projection."""

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTMCell(input_size=inp_dims, hidden_size=rnn_size, bias=True)
        self.fc = nn.Linear(rnn_size, classes, bias=False)

    def forward(self, x):
        """Run the cell over every time step of x (time-major) and project the last output."""
        max_len, batch_size, _ = x.size()
        # Allocate the zero initial states from the input tensor so they already have its
        # type and device. The previous code built them on the CPU and copied them to the
        # GPU on every forward pass, which added a host-to-device transfer to each timed step.
        h_lstm = Variable(x.data.new(batch_size, rnn_size).zero_())
        c_lstm = Variable(x.data.new(batch_size, rnn_size).zero_())

        output = []
        for i in range(max_len):
            h_lstm, c_lstm = self.lstm(x[i], (h_lstm, c_lstm))
            output.append(h_lstm)

        h1 = torch.stack(output)  # all hidden states, stacked along a new leading (time) axis
        h2 = h1[-1, :, :]  # hidden state of the last time step
        h3 = self.fc(h2)
        return h3


net = Net()
net.cuda()

# Print parameter count
params = sum(param.numel() for param in net.parameters())
print('# network parameters: ' + str(params))

# Create optimizer
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # loss definition

# Check for correct sizes
assert (net.fc.in_features == rnn_size)  # final projection input size (rnn_size)
assert (net.fc.weight.cpu().data.numpy().shape == (
    classes, rnn_size))  # final projection weight size (classes, rnn_size)
bXt = Variable(torch.from_numpy(bX).cuda())
torch.cuda.synchronize()
output = net(bXt)
output_numpy = output.data.cpu().numpy()
assert (output_numpy.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for i in range(batches):
    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_start = timer.perf_counter()

    # The host-to-device copy of the batch is part of the timed region.
    bXt = Variable(torch.from_numpy(bX).cuda())
    bYt = Variable(torch.from_numpy(bY).cuda())

    optimizer.zero_grad()
    output = net(bXt)
    loss = criterion(output, bYt.long())
    loss.backward()
    optimizer.step()

    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(float(loss.data.cpu().numpy()))
train_end = timer.perf_counter()  # end of training

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
103 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_pytorch_cudnnLSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.autograd import Variable
9 |
10 | from support import toy_batch, default_params, write_results, print_results, check_results
11 |
# Benchmark script: trains a single-layer torch.nn.LSTM classifier on one fixed toy batch
# and records the wall-clock time of every training step.

# Experiment type
bench = 'pytorch_cudnnLSTM'
version = torch.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data and the hyper-parameters shared by all benchmark scripts
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# PyTorch compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))


# Create Network
class Net(nn.Module):
    """Single unidirectional nn.LSTM layer followed by a bias-free linear projection."""

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size=inp_dims, hidden_size=rnn_size, num_layers=1, bias=True, bidirectional=False)
        self.fc = nn.Linear(rnn_size, classes, bias=False)

    def forward(self, x):
        """Project the LSTM output at the last index of the leading (time) axis."""
        all_steps, _ = self.lstm(x)
        last_step = all_steps[-1, :, :]
        return self.fc(last_step)


net = Net()
net.cuda()

# Print parameter count
params = sum(weight.numel() for weight in net.parameters())
print('# network parameters: ' + str(params))

# Create optimizer
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # loss definition

# Check for correct sizes
assert (net.fc.in_features == rnn_size)  # final projection input size (rnn_size)
fc_weight_shape = net.fc.weight.cpu().data.numpy().shape
assert (fc_weight_shape == (classes, rnn_size))  # final projection weight size (classes, rnn_size)
bXt = Variable(torch.from_numpy(bX).cuda())
torch.cuda.synchronize()
output = net(bXt)
output_numpy = output.data.cpu().numpy()
assert (output_numpy.shape == (batch_size, classes))

# Start training
batch_time = []
batch_loss = []
train_start = timer.perf_counter()
for _ in range(batches):
    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_start = timer.perf_counter()

    # The host-to-device copy of the batch is part of the timed region.
    bXt = Variable(torch.from_numpy(bX).cuda())
    bYt = Variable(torch.from_numpy(bY).cuda())

    optimizer.zero_grad()
    output = net(bXt)
    loss = criterion(output, bYt.long())
    loss.backward()
    optimizer.step()

    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    batch_loss.append(float(loss.data.cpu().numpy()))
train_end = timer.perf_counter()  # end of training

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
              run_time=batch_time, version=version)
93 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_tensorflow_LSTMBlockCell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import tensorflow as tf
5 |
6 | from support import toy_batch, default_params, write_results, print_results, check_results
7 |
# Experiment type: labels handed to write_results together with the timings
bench = 'tensorflow_LSTMBlockCell'
version = tf.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data: one toy batch that is fed unchanged on every training step
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (batch-major input; batch and time dimensions are left dynamic)
x = tf.placeholder(tf.float32, [None, None, inp_dims])
seq_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network: one LSTM layer followed by a bias-free projection of the last time step
fw_cell = tf.contrib.rnn.LSTMBlockCell(rnn_size)
h1, _ = tf.nn.dynamic_rnn(cell=fw_cell, inputs=x, sequence_length=seq_len, dtype=tf.float32)
# NOTE(review): with sequence_length set, dynamic_rnn zero-fills outputs past each
# sequence's length, so index -1 is only meaningful for full-length sequences --
# confirm that toy_batch always yields sequences of length max_len.
h2 = h1[:, -1, :]
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = False  # dynamic allocation of VRAM

# Print parameter count (TensorShape.num_elements() is the product of all dimensions)
params = sum(variable.shape.num_elements() for variable in tf.trainable_variables())
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)
    # Check for correct sizes, using the public shape API
    assert h2.shape.as_list() == [None, rnn_size]  # final projection input size (rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert dense_kernel.shape.as_list() == [rnn_size, classes]  # final projection kernel size (rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX})
    assert output.shape == (batch_size, classes)

    # Start training: each sess.run call (one optimizer step plus loss fetch) is timed separately
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(batches):
        batch_start = timer.perf_counter()
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
75 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_tensorflow_LSTMBlockFusedCell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 |
7 | from support import toy_batch, default_params, write_results, print_results, check_results
8 |
# Experiment type: labels handed to write_results together with the timings
bench = 'tensorflow_LSTMBlockFusedCell'
version = tf.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data: one toy batch that is fed unchanged on every training step
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (first two input dimensions are left dynamic)
x = tf.placeholder(tf.float32, [None, None, inp_dims])
seq_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# fusedcell compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))

# Create network: one fused LSTM layer followed by a bias-free projection of the last time step
fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(rnn_size)
h1, _ = fw_cell(x, sequence_length=seq_len, dtype=tf.float32)
# NOTE(review): index -1 on the time axis presumably only carries a real output for
# sequences of full length when sequence_length is given -- confirm that toy_batch
# always yields sequences of length max_len.
h2 = h1[-1, :, :]
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = False  # dynamic allocation of VRAM

# Print parameter count (TensorShape.num_elements() is the product of all dimensions)
params = sum(variable.shape.num_elements() for variable in tf.trainable_variables())
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)
    # Check for correct sizes, using the public shape API
    assert h2.shape.as_list() == [None, rnn_size]  # final projection input size (rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert dense_kernel.shape.as_list() == [rnn_size, classes]  # final projection kernel size (rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX})
    assert output.shape == (batch_size, classes)

    # Start training: each sess.run call (one optimizer step plus loss fetch) is timed separately
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(batches):
        batch_start = timer.perf_counter()
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
79 |
--------------------------------------------------------------------------------
/1x320-LSTM/bench_tensorflow_LSTMCell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import tensorflow as tf
5 |
6 | from support import toy_batch, default_params, write_results, print_results, check_results
7 |
# Experiment type: labels handed to write_results together with the timings
bench = 'tensorflow_LSTMCell'
version = tf.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data: one toy batch that is fed unchanged on every training step
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (batch-major input; batch and time dimensions are left dynamic)
x = tf.placeholder(tf.float32, [None, None, inp_dims])
seq_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network: one LSTM layer followed by a bias-free projection of the last time step
fw_cell = tf.nn.rnn_cell.LSTMCell(rnn_size)
h1, _ = tf.nn.dynamic_rnn(cell=fw_cell, inputs=x, sequence_length=seq_len, dtype=tf.float32)
# NOTE(review): with sequence_length set, dynamic_rnn zero-fills outputs past each
# sequence's length, so index -1 is only meaningful for full-length sequences --
# confirm that toy_batch always yields sequences of length max_len.
h2 = h1[:, -1, :]
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = False  # dynamic allocation of VRAM

# Print parameter count (TensorShape.num_elements() is the product of all dimensions)
params = sum(variable.shape.num_elements() for variable in tf.trainable_variables())
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)
    # Check for correct sizes, using the public shape API
    assert h2.shape.as_list() == [None, rnn_size]  # final projection input size (rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert dense_kernel.shape.as_list() == [rnn_size, classes]  # final projection kernel size (rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX})
    assert output.shape == (batch_size, classes)

    # Start training: each sess.run call (one optimizer step plus loss fetch) is timed separately
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(batches):
        batch_start = timer.perf_counter()
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
--------------------------------------------------------------------------------
/1x320-LSTM/bench_tensorflow_cudnnLSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 |
7 | from support import toy_batch, default_params, write_results, print_results, check_results
8 |
# Experiment type: labels handed to write_results together with the timings
bench = 'tensorflow_cudnnLSTM'
version = tf.__version__
experiment = '1x320-LSTM_cross-entropy'

# Get data: one toy batch that is fed unchanged on every training step
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# cudnn compatibility: time first, batch second
bX = np.transpose(bX, (1, 0, 2))

# Create symbolic vars (seq_len is fed below but not consumed by the cuDNN layer)
x = tf.placeholder(tf.float32, [None, None, inp_dims])
seq_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network: one cuDNN LSTM layer followed by a bias-free projection of the last time step
cudnn_lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=1, num_units=rnn_size)
h1, _ = cudnn_lstm(x)
h2 = h1[-1, :, :]
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = False  # dynamic allocation of VRAM

# Print parameter count
params = 0
for variable in tf.trainable_variables():
    if 'cudnn_lstm' in str(variable):
        # The cuDNN variable is counted through the layer's canonical weight/bias
        # shapes rather than through the variable's own shape.
        biases = cudnn_lstm.canonical_bias_shapes
        weights = cudnn_lstm.canonical_weight_shapes
        all_biases = np.sum(biases)
        all_weights = np.sum([t[0] * t[1] for t in weights])
        params += all_biases
        params += all_weights
    else:
        # TensorShape.num_elements() is the product of all dimensions
        params += variable.shape.num_elements()
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)
    # Check for correct sizes, using the public shape API
    assert h2.shape.as_list() == [None, rnn_size]  # final projection input size (rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert dense_kernel.shape.as_list() == [rnn_size, classes]  # final projection kernel size (rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, seq_len: b_lenX})
    assert output.shape == (batch_size, classes)

    # Start training: each sess.run call (one optimizer step plus loss fetch) is timed separately
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(batches):
        batch_start = timer.perf_counter()
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, seq_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
--------------------------------------------------------------------------------
/1x320-LSTM/lib_pytorchLSTM.py:
--------------------------------------------------------------------------------
1 | "copied together from pytorch/nn/modules/rnn.py, pytorch/nn/_functions/rnn.py"
2 |
3 | import math
4 |
5 | import torch
6 | import torch.nn.functional as F
7 | from torch.nn import Parameter
8 |
9 |
class RNNCellBase(torch.nn.Module):
    """Shared helpers for recurrent cells: repr formatting and shape validation.

    The helpers read ``input_size`` and ``hidden_size`` (and, for the repr,
    optionally ``bias`` / ``nonlinearity``) from the instance, so subclasses
    must set them as plain attributes.
    """

    def extra_repr(self):
        """Return the argument summary that ``repr(module)`` shows in parentheses."""
        s = '{input_size}, {hidden_size}'
        # Only mention settings that are present and differ from True / "tanh".
        if 'bias' in self.__dict__ and self.bias is not True:
            s += ', bias={bias}'
        if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
            s += ', nonlinearity={nonlinearity}'
        return s.format(**self.__dict__)

    def check_forward_input(self, input):
        """Raise ``RuntimeError`` unless ``input.size(1)`` equals ``self.input_size``."""
        if input.size(1) != self.input_size:
            raise RuntimeError(
                "input has inconsistent input_size: got {}, expected {}".format(
                    input.size(1), self.input_size))

    def check_forward_hidden(self, input, hx, hidden_label=''):
        """Raise ``RuntimeError`` unless ``hx`` is ``(input.size(0), self.hidden_size)``.

        ``hidden_label`` is inserted into the error message to tell the caller
        which state tensor was rejected.
        """
        if input.size(0) != hx.size(0):
            raise RuntimeError(
                "Input batch size {} doesn't match hidden{} batch size {}".format(
                    input.size(0), hidden_label, hx.size(0)))

        if hx.size(1) != self.hidden_size:
            raise RuntimeError(
                "hidden{} has inconsistent hidden_size: got {}, expected {}".format(
                    hidden_label, hx.size(1), self.hidden_size))


class LSTMCell(RNNCellBase):
    r"""A long short-term memory (LSTM) cell.

    .. math::

        \begin{array}{ll}
        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f * c + i * g \\
        h' = o \tanh(c') \\
        \end{array}

    where :math:`\sigma` is the sigmoid function.

    Args:
        input_size: The number of expected features in the input `x`
        hidden_size: The number of features in the hidden state `h`
        bias: If `False`, then the layer does not use bias weights `b_ih` and
            `b_hh`. Default: ``True``

    Inputs: input, (h_0, c_0)
        - **input** of shape `(batch, input_size)`: tensor containing input features
        - **h_0** of shape `(batch, hidden_size)`: tensor containing the initial hidden
          state for each element in the batch.
        - **c_0** of shape `(batch, hidden_size)`: tensor containing the initial cell state
          for each element in the batch.

        If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.

    Outputs: h_1, c_1
        - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state
          for each element in the batch
        - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state
          for each element in the batch

    Attributes:
        weight_ih: the learnable input-hidden weights, of shape
            `(4*hidden_size x input_size)`
        weight_hh: the learnable hidden-hidden weights, of shape
            `(4*hidden_size x hidden_size)`
        bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`

    Examples::

        >>> rnn = LSTMCell(10, 20)
        >>> input = torch.randn(6, 3, 10)
        >>> hx = torch.randn(3, 20)
        >>> cx = torch.randn(3, 20)
        >>> output = []
        >>> for i in range(6):
        ...     hx, cx = rnn(input[i], (hx, cx))
        ...     output.append(hx)
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super(LSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        # The four gates (in, forget, cell, out) are stacked along dim 0.
        self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.Tensor(4 * hidden_size))
            self.bias_hh = Parameter(torch.Tensor(4 * hidden_size))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw every parameter uniformly from ``[-1/sqrt(hidden_size), 1/sqrt(hidden_size)]``."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, input, hx=None):
        """Advance the cell by one step.

        Args:
            input: tensor of shape ``(batch, input_size)``.
            hx: optional ``(h_0, c_0)`` pair, each of shape ``(batch, hidden_size)``.
                When omitted, both states start at zero with the dtype and
                device of ``input``, as the class docstring promises.

        Returns:
            The ``(h_1, c_1)`` pair of next hidden and cell states.

        Raises:
            RuntimeError: if the feature size of ``input`` or the shape of a
                state tensor does not match the cell's configuration.
        """
        self.check_forward_input(input)
        if hx is None:
            # The zero state is never modified in place, so one tensor can serve as h_0 and c_0.
            zeros = input.new_zeros(input.size(0), self.hidden_size)
            hx = (zeros, zeros)
        self.check_forward_hidden(input, hx[0], '[0]')
        self.check_forward_hidden(input, hx[1], '[1]')
        return self.LSTMCell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )

    def LSTMCell(self, input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
        """Compute one LSTM step from explicit weights and return ``(hy, cy)``."""
        hx, cx = hidden
        gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

        # Split the stacked pre-activations back into the four gates.
        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh aliases.
        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)

        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * torch.tanh(cy)

        return hy, cy
141 |
--------------------------------------------------------------------------------
/4x320-LSTM/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/4x320-LSTM/__init__.py
--------------------------------------------------------------------------------
/4x320-LSTM/bench_lasagne_LSTMLayer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import lasagne
5 | import theano
6 | import theano.tensor as T
7 |
8 | from support import toy_batch, default_params, write_results, print_results, check_results
9 |
# Experiment type: identifiers later passed to write_results
bench = 'lasagne_LSTMLayer'
version = lasagne.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Toy batch and default hyper-parameters
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Symbolic inputs: float 3-tensor for the features, int vector for the labels
x, y = T.ftensor3('x'), T.ivector('y')
23 |
24 |
25 | # Create network
def get_bench_net_lstm(input_var, inp_dim, rnn_size, num_layers=4):
    """Build the stacked bidirectional LSTM classifier used by this benchmark.

    Args:
        input_var: symbolic input bound to the InputLayer, declared with shape
            (None, None, inp_dim) -- presumably (batch, time, features); confirm
            against toy_batch.
        inp_dim: size of the last input axis.
        rnn_size: number of units of each LSTM direction.
        num_layers: number of bidirectional layers to stack. Defaults to 4, the
            depth of this benchmark.

    Returns:
        The final DenseLayer: a bias-free softmax projection onto the
        module-level ``classes``, applied to the slice at index -1 along axis 1
        of the last bidirectional layer.
    """
    # Input layer; the first two dimensions are left unspecified to allow arbitrary sizes
    net = lasagne.layers.InputLayer(shape=(None, None, inp_dim), input_var=input_var)

    # RNN layers: forward and backward LSTMs read the same input and are
    # concatenated along the feature axis, giving 2 * rnn_size features per step.
    for _ in range(num_layers):
        forward = lasagne.layers.LSTMLayer(net, num_units=rnn_size, hid_init=lasagne.init.GlorotUniform())
        backward = lasagne.layers.LSTMLayer(net, num_units=rnn_size, hid_init=lasagne.init.GlorotUniform(),
                                            backwards=True)
        net = lasagne.layers.ConcatLayer([forward, backward], axis=2)

    # Keep only index -1 along axis 1 and project it onto the classes
    last_step = lasagne.layers.SliceLayer(net, -1, axis=1)
    return lasagne.layers.DenseLayer(last_step, num_units=classes, nonlinearity=lasagne.nonlinearities.softmax,
                                     b=None)
54 |
55 |
# Build the network on the symbolic input
network = get_bench_net_lstm(x, inp_dims, rnn_size)

# Report the number of network parameters
params = lasagne.layers.count_params(network)
print('>>> # network parameters: ' + str(params))

# Loss: categorical cross-entropy of the network output, averaged over the batch
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(predictions=prediction, targets=y).mean()

# Adam updates for all trainable parameters
update_params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(loss, update_params, learning_rate=learning_rate)

fn_inputs = [x, y]

# Compile the training and the deterministic inference function, timing the compilation
start = timer.perf_counter()
train_fn = theano.function(fn_inputs, loss, updates=updates)
prediction_det = lasagne.layers.get_output(network, deterministic=True)
output_fn = theano.function([x], prediction_det)
end = timer.perf_counter()
print('>>> Theano function compilation took {:.1f} seconds'.format(end - start))

# Sanity checks on the final projection and on the output of one forward pass
assert network.input_shape == (None, 2 * rnn_size)  # final projection input size (batch_size x 2*rnn_size)
assert network.W.eval().shape == (2 * rnn_size, classes)  # final projection kernel size (2*rnn_size x classes)
output = output_fn(bX)
output_fn.sync_shared()
assert output.shape == (batch_size, classes)  # output size

# Training: the same batch is used for every step and each step is timed on its own
batch_time, batch_loss = [], []
train_start = timer.perf_counter()  # start of training
for _ in range(batches):
    batch_start = timer.perf_counter()  # start of batch
    loss = train_fn(bX, bY)
    train_fn.sync_shared()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()  # end of batch
    batch_time.append(batch_end - batch_start)
    batch_loss.append(loss)
train_end = timer.perf_counter()  # end of training

# Results handling
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(
    script_name=os.path.basename(__file__),
    bench=bench,
    experiment=experiment,
    parameters=params,
    run_time=batch_time,
    version=version,
)
105 |
--------------------------------------------------------------------------------
/4x320-LSTM/bench_pytorch_cudnnLSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.autograd import Variable
9 |
10 | from support import toy_batch, default_params, write_results, print_results, check_results
11 |
# Experiment type: identifiers later passed to write_results
bench = 'pytorch_cudnnLSTM'
version = torch.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Toy batch and default hyper-parameters
bX, b_lenX, bY, classes = toy_batch()
batch_size, seq_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# PyTorch compatibility: swap the first two axes so that time comes first, batch second
bX = np.transpose(bX, axes=(1, 0, 2))
24 |
25 |
26 | # Create Network
class Net(nn.Module):
    """Four-layer bidirectional LSTM followed by a bias-free linear classifier.

    Layer sizes come from the module-level ``inp_dims``, ``rnn_size`` and ``classes``.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=inp_dims,
            hidden_size=rnn_size,
            num_layers=4,
            bias=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence 2 * rnn_size input features.
        self.fc = nn.Linear(rnn_size * 2, classes, bias=False)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the slice at index -1 of the first axis."""
        lstm_out, _ = self.lstm(x)
        return self.fc(lstm_out[-1, :, :])
38 |
39 |
# Instantiate the model and move it to the GPU
net = Net()
net.cuda()

# Print parameter count: total number of elements over all parameter tensors
params = sum(param.numel() for param in net.parameters())
print('# network parameters: ' + str(params))

# Create optimizer and loss definition
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Check for correct sizes
assert net.fc.in_features == 2 * rnn_size  # final projection input size (2*rnn_size)
fc_kernel_shape = net.fc.weight.cpu().data.numpy().shape
assert fc_kernel_shape == (classes, 2 * rnn_size)  # final projection kernel size (classes, 2*rnn_size)
bXt = Variable(torch.from_numpy(bX).cuda())
torch.cuda.synchronize()
output = net(bXt)
output_numpy = output.data.cpu().numpy()
assert output_numpy.shape == (batch_size, classes)

# Training: the host-to-device copies of the batch are part of every timed step
batch_time, batch_loss = [], []
train_start = timer.perf_counter()
for _ in range(batches):
    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_start = timer.perf_counter()

    bXt = Variable(torch.from_numpy(bX).cuda())
    bYt = Variable(torch.from_numpy(bY).cuda())

    optimizer.zero_grad()
    output = net(bXt)
    loss = criterion(output, bYt.long())
    loss.backward()
    optimizer.step()

    torch.cuda.synchronize()  # synchronize function call for precise time measurement
    batch_end = timer.perf_counter()
    batch_time.append(batch_end - batch_start)
    # The loss is copied back to the host only after the timer has been read
    batch_loss.append(float(loss.data.cpu().numpy()))
train_end = timer.perf_counter()  # end of training

# Write results
print_results(batch_time)
check_results(batch_loss, batch_time, train_start, train_end)
write_results(
    script_name=os.path.basename(__file__),
    bench=bench,
    experiment=experiment,
    parameters=params,
    run_time=batch_time,
    version=version,
)
94 |
--------------------------------------------------------------------------------
/4x320-LSTM/bench_tensorflow_LSTMBlockCell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import tensorflow as tf
5 |
6 | from support import toy_batch, default_params, write_results, print_results, check_results
7 |
# Experiment type: labels handed to write_results together with the timings
bench = 'tensorflow_LSTMBlockCell'
version = tf.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Get data: one toy batch that is fed unchanged on every training step
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (batch-major input; batch and time dimensions are left dynamic)
x = tf.placeholder(tf.float32, [None, None, inp_dims])
x_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network: four stacked bidirectional layers, one cell per layer and direction
fw_cell = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)]
bw_cell = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)]

h1, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw=fw_cell, cells_bw=bw_cell,
                                                          inputs=x, sequence_length=x_len, dtype=tf.float32)
# NOTE(review): index -1 on the time axis presumably only carries real outputs for
# sequences of full length when sequence_length is given -- confirm that toy_batch
# always yields sequences of length max_len.
h2 = h1[:, -1, :]
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

# Print parameter count (TensorShape.num_elements() is the product of all dimensions)
params = sum(variable.shape.num_elements() for variable in tf.trainable_variables())
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)
    # Check for correct sizes, using the public shape API
    assert h2.shape.as_list() == [None, 2 * rnn_size]  # final projection input size (2*rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert dense_kernel.shape.as_list() == [2 * rnn_size, classes]  # final projection kernel size (2*rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, x_len: b_lenX})
    assert output.shape == (batch_size, classes)

    # Start training: each sess.run call (one optimizer step plus loss fetch) is timed separately
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(batches):
        batch_start = timer.perf_counter()
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
79 |
--------------------------------------------------------------------------------
/4x320-LSTM/bench_tensorflow_LSTMCell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import tensorflow as tf
5 |
6 | from support import toy_batch, default_params, write_results, print_results, check_results
7 |
# Experiment type: labels handed to write_results together with the timings
bench = 'tensorflow_LSTMCell'
version = tf.__version__
experiment = '4x320-BIDIR-LSTM_cross-entropy'

# Get data: one toy batch that is fed unchanged on every training step
bX, b_lenX, bY, classes = toy_batch()
batch_size, max_len, inp_dims = bX.shape
rnn_size, learning_rate, batches = default_params()

# Create symbolic vars (batch-major input; batch and time dimensions are left dynamic)
x = tf.placeholder(tf.float32, [None, None, inp_dims])
x_len = tf.placeholder(tf.int32, [None])
y = tf.placeholder(tf.int32, [None])

# Create network: four stacked bidirectional layers, one cell per layer and direction
fw_cell = [tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)]
bw_cell = [tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)]

h1, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(cells_fw=fw_cell, cells_bw=bw_cell,
                                                          inputs=x, sequence_length=x_len, dtype=tf.float32)
# NOTE(review): index -1 on the time axis presumably only carries real outputs for
# sequences of full length when sequence_length is given -- confirm that toy_batch
# always yields sequences of length max_len.
h2 = h1[:, -1, :]
h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)

# Create loss, optimizer and train function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

train_step = optimizer.minimize(loss)

# Initialize session
init = tf.global_variables_initializer()
config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

# Print parameter count (TensorShape.num_elements() is the product of all dimensions)
params = sum(variable.shape.num_elements() for variable in tf.trainable_variables())
print('# network parameters: ' + str(params))

with tf.Session(config=config) as sess:
    sess.run(init)
    # Check for correct sizes, using the public shape API
    assert h2.shape.as_list() == [None, 2 * rnn_size]  # final projection input size (2*rnn_size)
    dense_kernel = tf.trainable_variables(scope='dense/kernel:0')[0]
    assert dense_kernel.shape.as_list() == [2 * rnn_size, classes]  # final projection kernel size (2*rnn_size, classes)
    output = sess.run(h3, feed_dict={x: bX, y: bY, x_len: b_lenX})
    assert output.shape == (batch_size, classes)

    # Start training: each sess.run call (one optimizer step plus loss fetch) is timed separately
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(batches):
        batch_start = timer.perf_counter()
        _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss_val)
    train_end = timer.perf_counter()

    # Results handling
    print_results(batch_time)
    check_results(batch_loss, batch_time, train_start, train_end)
    write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
                  run_time=batch_time, version=version)
79 |
--------------------------------------------------------------------------------
/4x320-LSTM/bench_tensorflow_cudnnLSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import numpy as np
5 | import tensorflow as tf
6 |
7 | from support import toy_batch, default_params, write_results, print_results, check_results
8 |
9 | # Experiment_type
10 | bench = 'tensorflow_cudnnLSTM'
11 | version = tf.__version__
12 | experiment = '4x320-BIDIR-LSTM_cross-entropy'
13 |
14 | # Get data
15 | bX, b_lenX, bY, classes = toy_batch()
16 | batch_size, max_len, inp_dims = bX.shape
17 | rnn_size, learning_rate, batches = default_params()
18 |
19 | # cudnn compatibility: time first, batch second
20 | bX = np.transpose(bX, (1, 0, 2))
21 |
22 | # Create symbolic vars
23 | x = tf.placeholder(tf.float32, [None, None, inp_dims])
24 | x_len = tf.placeholder(tf.int32, [None])
25 | y = tf.placeholder(tf.int32, [None])
26 |
27 | # Create network
28 | cudnn_lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=4, num_units=rnn_size, direction='bidirectional')
29 | h1, _ = cudnn_lstm(x)
30 | h2 = h1[-1, :, :]
31 | h3 = tf.layers.dense(h2, units=classes, activation=None, use_bias=False)
32 |
33 | # Create loss, optimizer and train function
34 | loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=h3, labels=y))
35 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
36 |
37 | train_step = optimizer.minimize(loss)
38 |
39 | # Initialize session
40 | init = tf.global_variables_initializer()
41 | config = tf.ConfigProto()
42 | # config.gpu_options.allow_growth = True
43 |
44 | # Print parameter count
45 | params = 0
46 | for variable in tf.trainable_variables():
47 | # shape is an array of tf.Dimension
48 | if 'cudnn_lstm' in str(variable):
49 | biases = cudnn_lstm.canonical_bias_shapes
50 | weights = cudnn_lstm.canonical_weight_shapes
51 | all_biases = np.sum(biases)
52 | all_weights = np.sum([t[0] * t[1] for t in weights])
53 | params += all_biases
54 | params += all_weights
55 | else:
56 | shape = variable.get_shape()
57 | variable_parametes = 1
58 | for dim in shape:
59 | variable_parametes *= dim.value
60 | params += variable_parametes
61 | print('# network parameters: ' + str(params))
62 |
63 | with tf.Session(config=config) as sess:
64 | sess.run(init)
65 | # Check for correct sizes
66 | assert (h2._shape_as_list() == [None, 2*rnn_size]) # final projection input size (rnn_size)
67 | assert (tf.trainable_variables(scope='dense/kernel:0')[0].shape.as_list() == [2*rnn_size, classes]) # final projection output size (rnn_size, classes)
68 | output = sess.run(h3, feed_dict={x: bX, y: bY, x_len: b_lenX})
69 | assert (output.shape == (batch_size, classes))
70 |
71 | # Start training
72 | batch_time = []
73 | batch_loss = []
74 | train_start=timer.perf_counter()
75 | for i in range(batches):
76 | batch_start = timer.perf_counter()
77 | _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
78 | batch_end = timer.perf_counter()
79 | batch_time.append(batch_end - batch_start)
80 | batch_loss.append(loss_val)
81 | train_end = timer.perf_counter()
82 |
83 | # Results handling
84 | print_results(batch_time)
85 | check_results(batch_loss, batch_time, train_start, train_end)
86 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
87 | run_time=batch_time, version=version)
88 |
--------------------------------------------------------------------------------
/4x320-LSTM_ctc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/4x320-LSTM_ctc/__init__.py
--------------------------------------------------------------------------------
/4x320-LSTM_ctc/bench_lasagne_LSTMLayer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import lasagne
5 | import theano
6 | import theano.tensor as T
7 | from theano.tensor.nnet.ctc import (ctc)
8 |
9 | from support import toy_batch_ctc, default_params, write_results, print_results, check_results
10 |
11 | # Experiment_type
12 | bench = 'lasagne_LSTMLayer'
13 | version = lasagne.__version__
14 | experiment = '4x320-BIDIR-LSTM_CTC'
15 |
16 | # Get data
17 | bX, b_lenX, maskX, bY, b_lenY, classes = toy_batch_ctc()
18 | bY = bY.reshape(-1, b_lenY.max()) # compatibility with theano ctc interface
19 | batch_size, seq_len, inp_dims = bX.shape
20 | rnn_size, learning_rate, batches = default_params()
21 |
22 | # Create symbolic vars
23 | input_var = T.ftensor3('bX')
24 | input_var_lens = T.ivector('b_lenX')
25 | mask_var = T.matrix('maskX')
26 | target_var = T.imatrix('bY')
27 |
28 |
29 | # Create network
def get_bench_net_lstm(input_var, mask_var, inp_dim, rnn_size, classes):
    """Assemble the benchmark network: 4 stacked bidirectional LSTM layers plus a linear projection.

    :param input_var: symbolic input; unpacked below as (batch, time, features)
    :param mask_var: symbolic mask handed to every LSTM layer
    :param inp_dim: size of the feature axis of the input layer
    :param rnn_size: number of units per LSTM direction
    :param classes: number of units of the final projection
    :return: output layer whose first two axes are swapped by the (1, 0, 2) dimshuffle
    """
    layers = lasagne.layers

    # Entry points for the features and the mask
    features = layers.InputLayer(shape=(None, None, inp_dim), input_var=input_var)
    mask = layers.InputLayer(shape=(None, None), input_var=mask_var)

    # Symbolic sizes, so arbitrary batch sizes / sequence lengths are supported
    n_batch, n_steps, _ = input_var.shape

    # Four bidirectional layers: forward and backward LSTM on the same input, concatenated on the feature axis
    stacked = features
    for _depth in range(4):
        forward = layers.LSTMLayer(stacked, num_units=rnn_size, mask_input=mask,
                                   hid_init=lasagne.init.GlorotUniform())
        backward = layers.LSTMLayer(stacked, num_units=rnn_size, mask_input=mask,
                                    hid_init=lasagne.init.GlorotUniform(), backwards=True)
        stacked = layers.ConcatLayer([forward, backward], axis=2)

    # Per-timestep projection onto the classes
    projected = non_flattening_dense(stacked, batch_size=n_batch, seq_len=n_steps, num_units=classes,
                                     nonlinearity=lasagne.nonlinearities.linear)

    return layers.DimshuffleLayer(projected, (1, 0, 2))
67 |
68 |
def non_flattening_dense(l_in, batch_size, seq_len, *args, **kwargs):
    """Apply a bias-free dense layer to every position of a 3D layer output.

    The first two axes are merged, a ``DenseLayer`` (``b=None``) is applied, and the
    result is reshaped to ``(batch_size, seq_len, units)``. Extra positional and
    keyword arguments are forwarded to ``DenseLayer``.
    """
    # Collapse everything except the feature axis ([2] keeps input dimension 2)
    collapsed = lasagne.layers.ReshapeLayer(l_in, (-1, [2]))
    # Dense layer without bias on the collapsed input
    projected = lasagne.layers.DenseLayer(collapsed, *args, b=None, **kwargs)
    # Restore the two leading axes
    restored_shape = (batch_size, seq_len, projected.output_shape[1])
    return lasagne.layers.ReshapeLayer(projected, restored_shape)
77 |
78 |
79 | # Create network
80 | network = get_bench_net_lstm(input_var=input_var, mask_var=mask_var, inp_dim=inp_dims, rnn_size=rnn_size,
81 | classes=classes)
82 |
83 | # Create loss, optimizer and train function
84 | prediction = lasagne.layers.get_output(network)
85 | loss = T.mean(ctc(prediction, target_var, input_var_lens))
86 | params = lasagne.layers.get_all_params(network, trainable=True)
87 | updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate)
88 | fn_inputs = [input_var, input_var_lens, mask_var, target_var]
89 |
90 | start = timer.perf_counter()
91 | train_fn = theano.function(fn_inputs, loss, updates=updates)
92 | output_fn = theano.function([input_var, mask_var], prediction)
93 | end = timer.perf_counter()
94 | print('>>> Theano function compilation took {:.1f} seconds'.format(end - start))
95 |
96 | # Print parameter count
97 | params = lasagne.layers.count_params(network)
98 | print('# network parameters: ' + str(params))
99 |
100 | # Check for correct sizes
101 | output_layer = network.input_layer.input_layer
102 | assert (output_layer.input_shape == (None, 2*rnn_size)) # final projection input size (Batch_size x rnn_size)
103 | assert (output_layer.W.eval().shape == (2*rnn_size, classes)) # final projection kernel size (rnn_size x classes)
104 | output = output_fn(bX, maskX)
105 | output_fn.sync_shared()
106 | assert (output.shape == (seq_len, batch_size, classes)) # output size
107 |
108 | # Start training
109 | batch_time = []
110 | batch_loss = []
111 | train_start = timer.perf_counter() # start of training
112 | for i in range(batches):
113 | batch_start = timer.perf_counter() # start of batch
114 | loss = train_fn(bX, b_lenX, maskX, bY)
115 | train_fn.sync_shared() # synchronize function call for precise time measurement
116 | batch_end = timer.perf_counter() # end of batch
117 | batch_time.append(batch_end - batch_start)
118 | batch_loss.append(loss)
119 | train_end = timer.perf_counter() # end of training
120 |
121 | # Results handling
122 | print_results(batch_time)
123 | check_results(batch_loss, batch_time, train_start, train_end)
124 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
125 | run_time=batch_time, version=version)
--------------------------------------------------------------------------------
/4x320-LSTM_ctc/bench_pytorch_cudnnLSTM.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.autograd import Variable
9 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
10 | from warpctc_pytorch import CTCLoss
11 |
12 | from support import toy_batch_ctc, default_params, write_results, print_results, check_results
13 |
14 | # Experiment_type
15 | bench = 'pytorch_cudnnLSTM'
16 | version = torch.__version__
17 | experiment = '4x320-BIDIR-LSTM_CTC'
18 |
19 | # Get data
20 | bX, b_lenX, maskX, bY, b_lenY, classes = toy_batch_ctc()
21 | batch_size, seq_len, inp_dims = bX.shape
22 | rnn_size, learning_rate, batches = default_params()
23 |
24 | # PyTorch compatibility: time first, batch second
25 | bX = np.transpose(bX, (1, 0, 2))
26 |
27 |
28 | # Create Network
class Net(nn.Module):
    """Four-layer bidirectional LSTM followed by a bias-free linear projection.

    Layer sizes are taken from the module-level ``inp_dims``, ``rnn_size`` and ``classes``.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=inp_dims, hidden_size=rnn_size, num_layers=4, bias=True, bidirectional=True)
        # Both directions are concatenated, hence twice the hidden size as input
        self.fc = nn.Linear(2 * rnn_size, classes, bias=False)

    def forward(self, x):
        """Run the LSTM on a packed sequence, pad the result back and project it."""
        packed_out, _ = self.lstm(x)
        padded_out, _ = pad_packed_sequence(packed_out)
        return self.fc(padded_out)
40 |
41 |
42 | net = Net()
43 | net.cuda()
44 |
45 | # Print parameter count
46 | params = 0
47 | for param in list(net.parameters()):
48 | sizes = 1
49 | for el in param.size():
50 | sizes = sizes * el
51 | params += sizes
52 | print('# network parameters: ' + str(params))
53 |
54 | # Create optimizer
55 | optimizer = optim.Adam(net.parameters(), lr=learning_rate)
56 | criterion = CTCLoss()
57 |
58 | # Check for correct sizes
59 | assert (net.fc.in_features == 2*rnn_size) # final projection input size (rnn_size)
60 | assert (net.fc.weight.cpu().data.numpy().shape == (
61 | classes, 2*rnn_size)) # final projection kernel size (classes, rnn_size)
62 | bXt = Variable(torch.from_numpy(bX).cuda())
63 | bXt = pack_padded_sequence(bXt, b_lenX[::-1])
64 | torch.cuda.synchronize()
65 | output = net(bXt)
66 | output_numpy = output.data.cpu().numpy()
67 | assert (output_numpy.shape == (seq_len, batch_size, classes))
68 |
69 | # Start training
70 | batch_time = []
71 | batch_loss = []
72 | train_start = timer.perf_counter()
73 | for i in range(batches):
74 | torch.cuda.synchronize() # synchronize function call for precise time measurement
75 | batch_start = timer.perf_counter()
76 |
77 | bXt = Variable(torch.from_numpy(bX).cuda())
78 | bXt = pack_padded_sequence(bXt, b_lenX[::-1]) # Pack those sequences for masking, plz
79 | b_lenXt = Variable(torch.from_numpy(b_lenX))
80 | bYt = Variable(torch.from_numpy(bY))
81 | b_lenYt = Variable(torch.from_numpy(b_lenY))
82 |
83 | optimizer.zero_grad()
84 | output = net(bXt)
85 | loss = criterion(output, bYt, b_lenXt, b_lenYt)
86 | loss.backward()
87 | optimizer.step()
88 |
89 | torch.cuda.synchronize() # synchronize function call for precise time measurement
90 | batch_end = timer.perf_counter()
91 | batch_time.append(batch_end - batch_start)
92 | batch_loss.append(float(loss.data.cpu().numpy()))
93 | train_end = timer.perf_counter() # end of training
94 |
95 | # Write results
96 | print_results(batch_time)
97 | check_results(batch_loss, batch_time, train_start, train_end)
98 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
99 | run_time=batch_time, version=version)
100 |
--------------------------------------------------------------------------------
/4x320-LSTM_ctc/bench_tensorflow_LSTMBlockCell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import tensorflow as tf
5 |
6 | from support import toy_batch_ctc, default_params, write_results, print_results, target_converter, \
7 | sparse_tuple_from, check_results
8 |
9 | # Experiment_type
10 | bench = 'tensorflow_LSTMBlockCell'
11 | version = tf.__version__
12 | experiment = '4x320-BIDIR-LSTM_CTC'
13 |
14 | # Get data
15 | bX, b_lenX, maskX, bY, b_lenY, classes = toy_batch_ctc()
16 | batch_size, seq_len, inp_dims = bX.shape
17 | rnn_size, learning_rate, batches = default_params()
18 |
19 | # Create symbolic vars
20 | x = tf.placeholder(tf.float32, [None, None, inp_dims])
21 | x_len = tf.placeholder(tf.int32, [None])
22 | y = tf.sparse_placeholder(tf.int32)
23 |
24 | weights = {'out': tf.Variable(tf.truncated_normal(shape=[2 * rnn_size, classes], stddev=0.1), name='W_out')}
25 |
26 | # Create network
def get_EESEN(x, rnn_size, weights, x_len, classes):
    """Build 4 stacked bidirectional LSTMBlockCell layers and a bias-free output projection.

    :param x: input tensor; its first two dynamic dimensions are read as (batch, time)
    :param rnn_size: number of units per LSTM direction
    :param weights: dict whose 'out' entry is the projection matrix
    :param x_len: sequence lengths passed to the dynamic RNN
    :param classes: size of the last axis of the returned logits
    :return: (logits transposed with (1, 0, 2), output of the last bidirectional layer)
    """
    dynamic_shape = tf.shape(x)
    n_batch = dynamic_shape[0]
    n_steps = dynamic_shape[1]

    with tf.name_scope('MultiLSTM'):
        forward_cells = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)]
        backward_cells = [tf.contrib.rnn.LSTMBlockCell(rnn_size) for _ in range(4)]

        rnn_out, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=forward_cells, cells_bw=backward_cells, inputs=x, sequence_length=x_len, dtype=tf.float32)

    with tf.name_scope('Affine'):
        # Merge batch and time so the projection is a single matmul
        flat = tf.reshape(rnn_out, [-1, 2 * rnn_size])
        projected = tf.matmul(flat, weights['out'])
        unflattened = tf.reshape(projected, [n_batch, n_steps, classes])
        time_major = tf.transpose(unflattened, (1, 0, 2))

    return time_major, rnn_out
45 |
46 |
47 | pred, h1 = get_EESEN(x=x, rnn_size=rnn_size, weights=weights, x_len=x_len, classes=classes)
48 |
49 | # Create loss, optimizer and train function
50 | loss = tf.reduce_mean(tf.nn.ctc_loss(inputs=pred, labels=y, sequence_length=x_len, time_major=True))
51 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
52 |
53 | train_step = optimizer.minimize(loss)
54 |
55 | # Initialize session
56 | init = tf.global_variables_initializer()
57 | config = tf.ConfigProto()
58 | # config.gpu_options.allow_growth = True
59 |
60 | # Print parameter count
61 | params = 0
62 | for variable in tf.trainable_variables():
63 | # shape is an array of tf.Dimension
64 | shape = variable.get_shape()
65 | variable_parametes = 1
66 | for dim in shape:
67 | variable_parametes *= dim.value
68 | params += variable_parametes
69 | print('# network parameters: ' + str(params))
70 |
71 | with tf.Session(config=config) as sess:
72 | sess.run(init)
73 | bY = target_converter(bY, b_lenY)
74 | bY = sparse_tuple_from(bY)
75 |
76 | # Check for correct sizes
77 | assert (h1._shape_as_list() == [None, None, 2*rnn_size]) # final projection input size (rnn_size)
78 | assert (weights['out'].shape.as_list() == [2*rnn_size, classes]) # final projection kernel size (rnn_size, classes)
79 | output = sess.run(pred, feed_dict={x: bX, y: bY, x_len: b_lenX})
80 | assert (output.shape == (seq_len, batch_size, classes))
81 |
82 | # Start training
83 | batch_time = []
84 | batch_loss = []
85 | train_start=timer.perf_counter()
86 | for i in range(batches):
87 | batch_start = timer.perf_counter()
88 | _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
89 | batch_end = timer.perf_counter()
90 | batch_time.append(batch_end - batch_start)
91 | batch_loss.append(loss_val)
92 | train_end = timer.perf_counter()
93 |
94 | # Results handling
95 | print_results(batch_time)
96 | check_results(batch_loss, batch_time, train_start, train_end)
97 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
98 | run_time=batch_time, version=version)
99 |
--------------------------------------------------------------------------------
/4x320-LSTM_ctc/bench_tensorflow_LSTMCell.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time as timer
3 |
4 | import tensorflow as tf
5 |
6 | from support import toy_batch_ctc, default_params, write_results, print_results, target_converter, \
7 | sparse_tuple_from, check_results
8 |
9 | # Experiment_type
10 | bench = 'tensorflow_LSTMCell'
11 | version = tf.__version__
12 | experiment = '4x320-BIDIR-LSTM_CTC'
13 |
14 | # Get data
15 | bX, b_lenX, maskX, bY, b_lenY, classes = toy_batch_ctc()
16 | batch_size, seq_len, inp_dims = bX.shape
17 | rnn_size, learning_rate, batches = default_params()
18 |
19 | # Create symbolic vars
20 | x = tf.placeholder(tf.float32, [None, None, inp_dims])
21 | x_len = tf.placeholder(tf.int32, [None])
22 | y = tf.sparse_placeholder(tf.int32)
23 |
24 | print(bX.shape)
25 | print(b_lenX.shape)
26 | print(bY.shape)
27 |
28 | weights = {'out': tf.Variable(tf.truncated_normal(shape=[2 * rnn_size, classes], stddev=0.1), name='W_out')}
29 | biases = {'out': tf.Variable(tf.zeros([classes]), name='b_out')}
30 |
31 |
32 | # Create network
def get_EESEN(x, rnn_size, weights, biases, x_len, classes):
    """Build 4 stacked bidirectional LSTMCell layers and an affine output projection.

    :param x: input tensor; its first two dynamic dimensions are read as (batch, time)
    :param rnn_size: number of units per LSTM direction
    :param weights: dict whose 'out' entry is the projection matrix
    :param biases: dict whose 'out' entry is added after the projection
    :param x_len: sequence lengths passed to the dynamic RNN
    :param classes: size of the last axis of the returned logits
    :return: (logits transposed with (1, 0, 2), output of the last bidirectional layer)
    """
    dynamic_shape = tf.shape(x)
    n_batch = dynamic_shape[0]
    n_steps = dynamic_shape[1]

    with tf.name_scope('MultiLSTM'):
        forward_cells = [tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)]
        backward_cells = [tf.nn.rnn_cell.LSTMCell(rnn_size) for _ in range(4)]

        rnn_out, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
            cells_fw=forward_cells, cells_bw=backward_cells, inputs=x, sequence_length=x_len, dtype=tf.float32)

    with tf.name_scope('Affine'):
        # Merge batch and time so the projection is a single matmul
        flat = tf.reshape(rnn_out, [-1, 2 * rnn_size])
        projected = tf.matmul(flat, weights['out']) + biases['out']
        unflattened = tf.reshape(projected, [n_batch, n_steps, classes])
        time_major = tf.transpose(unflattened, (1, 0, 2))

    return time_major, rnn_out
51 |
52 |
53 | pred, h1 = get_EESEN(x=x, rnn_size=rnn_size, weights=weights, biases=biases, x_len=x_len, classes=classes)
54 |
55 | # Create loss, optimizer and train function
56 | loss = tf.reduce_mean(tf.nn.ctc_loss(inputs=pred, labels=y, sequence_length=x_len, time_major=True))
57 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
58 |
59 | train_step = optimizer.minimize(loss)
60 |
61 | # Initialize session
62 | init = tf.global_variables_initializer()
63 | config = tf.ConfigProto()
64 | # config.gpu_options.allow_growth = True
65 |
66 | # Print parameter count
67 | params = 0
68 | for variable in tf.trainable_variables():
69 | # shape is an array of tf.Dimension
70 | shape = variable.get_shape()
71 | variable_parametes = 1
72 | for dim in shape:
73 | variable_parametes *= dim.value
74 | params += variable_parametes
75 | print('# network parameters: ' + str(params))
76 |
77 | with tf.Session(config=config) as sess:
78 | sess.run(init)
79 | bY = target_converter(bY, b_lenY)
80 | bY = sparse_tuple_from(bY)
81 |
82 | # Check for correct sizes
83 | assert (h1._shape_as_list() == [None, None, 2*rnn_size]) # final projection input size (rnn_size)
84 | assert (weights['out'].shape.as_list() == [2*rnn_size, classes]) # final projection kernel size (rnn_size, classes)
85 | output = sess.run(pred, feed_dict={x: bX, y: bY, x_len: b_lenX})
86 | assert (output.shape == (seq_len, batch_size, classes))
87 |
88 | # Start training
89 | batch_time = []
90 | batch_loss = []
91 | train_start=timer.perf_counter()
92 | for i in range(batches):
93 | batch_start = timer.perf_counter()
94 | _, loss_val = sess.run([train_step, loss], feed_dict={x: bX, y: bY, x_len: b_lenX})
95 | batch_end = timer.perf_counter()
96 | batch_time.append(batch_end - batch_start)
97 | batch_loss.append(loss_val)
98 | train_end = timer.perf_counter()
99 |
100 | # Results handling
101 | print_results(batch_time)
102 | check_results(batch_loss, batch_time, train_start, train_end)
103 | write_results(script_name=os.path.basename(__file__), bench=bench, experiment=experiment, parameters=params,
104 | run_time=batch_time, version=version)
105 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rnn_benchmarks
2 | Welcome to the rnn_benchmarks repository! We offer:
3 | - A training speed comparison of different LSTM implementations across deep learning frameworks
4 | - Common input sizes, network configurations and cost functions from automatic speech recognition
5 | - Best-practice scripts to learn coding up a network, optimizers, loss functions etc.
6 |
7 | ## Update June 4th 2018
8 | - Arxiv paper: [LSTM Benchmarks for Deep Learning Frameworks](https://arxiv.org/abs/1806.01818)
9 | - [LSTM benchmarks between PyTorch 0.4, TensorFlow 1.8, Keras 2.1.6 and latest Lasagne](https://github.com/stefbraun/rnn_benchmarks/tree/master/results/10/framework_comparison)
10 |
11 |
12 | - [LSTM benchmarks between PyTorch versions 0.1.12 to 0.4.0](https://github.com/stefbraun/rnn_benchmarks/tree/master/results/10/pytorch_comparison)
13 |
14 |
15 | ## Run the benchmarks
16 | Go to the folder 'main' and execute the 'main.py' script in the corresponding benchmark folder. Before running 'main.py', you need to set the paths to the Python environments that contain the corresponding frameworks. The 'main.py' script creates a 'commands.sh' script that will execute the benchmarks. The measured execution times will be written to 'results/results.csv'. The toy data and default parameters are provided by 'support.py', to make sure every script uses the same hyperparameters.
17 |
18 |
--------------------------------------------------------------------------------
/main/framework_comparison/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import sys
4 | from pathlib import Path
5 |
6 | # Parameters
7 | cuda_device = 1
8 | dry = 1 # Run benches or not
9 | python_path = Path(__file__).resolve().parents[2]
10 |
11 | command_list = ['echo {} > {}'.format(os.path.join(python_path, 'results', 'framework_comparison'),
12 | os.path.join(python_path, 'results', 'conf'))]
13 |
14 | # write path to dataframe to config file
15 | with open(os.path.join(python_path, 'results', 'conf'), 'w') as f:
16 | f.write(os.path.join(python_path, 'results', 'framework_comparison'))
17 |
18 | # Please define your virtual environments for testing
19 | interpreter = {}
20 | interpreter[
21 | 'lasagne'] = 'MKL_THREADING_LAYER=GNU LIBRARY_PATH=/usr/local/cuda-9.0/lib64 /home/brauns/anaconda3/envs/theano/bin/python'
22 | interpreter[
23 | 'keras-theano'] = 'MKL_THREADING_LAYER=GNU LIBRARY_PATH=/usr/local/cuda-9.0/lib64 /home/brauns/anaconda3/envs/theano/bin/python'
24 | interpreter['tensorflow'] = '/home/brauns/anaconda3/envs/tensorflow/bin/python'
25 | interpreter['keras-tensorflow'] = '/home/brauns/anaconda3/envs/tensorflow/bin/python'
26 | interpreter['pytorch'] = '/home/brauns/anaconda3/envs/pt4/bin/python'
27 |
28 | # Experiments
29 | all_experiments = ['1x320-LSTM', '4x320-LSTM', '4x320-LSTM_ctc']
30 |
# Run benches
# For every experiment folder, build one shell command per benchmark script, record it in
# command_list and, unless this is a dry run (dry != 0), execute it right away.
for experiment in all_experiments:
    experiment_folder = os.path.join(python_path, experiment)
    # os.listdir returns entries in arbitrary order; sort so the generated commands are reproducible
    all_benches = sorted(script for script in os.listdir(experiment_folder) if 'bench' in script)

    for bench in all_benches:
        print('=' * 100)
        # Script names follow 'bench_<framework>_<cell>.py'
        _, framework, _cell = bench.split('_')
        interpreter_path = interpreter[framework]
        script_path = os.path.join(experiment_folder, bench)

        if 'keras' not in framework:
            command = 'CUDA_VISIBLE_DEVICES={} PYTHONPATH={} {} {}'.format(cuda_device, python_path, interpreter_path,
                                                                           script_path)
        else:
            # Keras frameworks are named 'keras-<backend>'; the backend is selected via KERAS_BACKEND
            backend = framework.split('-')[1]
            command = 'CUDA_VISIBLE_DEVICES={} KERAS_BACKEND={} PYTHONPATH={} {} {}'.format(cuda_device, backend,
                                                                                            python_path,
                                                                                            interpreter_path,
                                                                                            script_path)
        print(command)
        command_list.append(command)
        if dry == 0:
            proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
            # communicate() drains stdout while waiting for the child; wait() alone can deadlock
            # once the child has written enough output to fill the OS pipe buffer.
            proc.communicate()
58 |
59 | command_list = map(lambda x: x + '\n', command_list)
60 | with open(os.path.join(sys.path[0], 'commands.sh'), 'w') as f:
61 | f.writelines(command_list)
62 |
--------------------------------------------------------------------------------
/main/framework_comparison/plot.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | import seaborn as sns
8 | from matplotlib.lines import Line2D
9 | from matplotlib.ticker import MaxNLocator
10 |
11 | sns.set_style('darkgrid')
12 |
13 | import matplotlib.pylab as pylab
14 |
15 | lparams = ['legend.fontsize', 'axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize']
16 | fontsize = 11.5
17 | params = {key: fontsize for key in lparams}
18 | pylab.rcParams.update(params)
19 |
def match_case(row):
    """Rewrite the framework / cell names inside ``row['bench']`` with their official capitalisation."""
    spellings = (
        ('pytorch', 'PyTorch'),
        ('tensorflow', 'TensorFlow'),
        ('lasagne', 'Lasagne'),
        ('keras', 'Keras'),
        ('theano', 'Theano'),
        ('cudnnLSTM', 'cuDNNLSTM'),
    )
    label = row['bench']
    for lowercase, official in spellings:
        label = label.replace(lowercase, official)
    row['bench'] = label
    return row
25 |
26 |
def linebreak(row):
    """Turn every underscore in ``row['bench']`` into a newline."""
    row['bench'] = row['bench'].replace('_', '\n')
    return row
30 |
31 |
def framework(row):
    """Store the framework name of ``row['bench']`` in ``row['framework']``.

    Any Keras bench is labelled 'keras' / 'Keras' regardless of its backend; every
    other bench uses the part of its name before the first underscore.
    """
    bench_name = row['bench']
    if 'keras' in bench_name:
        label = 'keras'
    elif 'Keras' in bench_name:
        label = 'Keras'
    else:
        label = bench_name.split('_')[0]
    row['framework'] = label
    return row
39 |
40 |
def get_color_palette(unique_benchs):
    """Return one colour per bench name, chosen by the framework mentioned in the name.

    The framework checks run in a fixed order and the last match wins, so a name such as
    'keras-tensorflow' gets the Keras colour. A name that mentions none of the known
    frameworks gets a neutral fallback colour; previously such a name either raised
    ``UnboundLocalError`` (first entry) or silently reused the previous entry's colour.

    :param unique_benchs: iterable of bench names (either spelling, e.g. 'pytorch' or 'PyTorch')
    :return: list of hex colour strings, in the same order as ``unique_benchs``
    """
    # (accepted spellings, colour); order matters because later matches override earlier ones
    framework_colors = (
        (('tensorflow', 'TensorFlow'), '#4c72b0'),
        (('pytorch', 'PyTorch'), '#de2d26'),
        (('lasagne', 'Lasagne'), '#696969'),
        (('keras', 'Keras'), '#55A868'),
    )
    fallback_color = '#999999'

    colors = []
    for bench in unique_benchs:
        c = fallback_color
        for spellings, color in framework_colors:
            if any(spelling in bench for spelling in spellings):
                c = color
        colors.append(c)
    return colors
58 |
59 |
60 | # Load file
61 | repo_path = Path(__file__).resolve().parents[2]
62 | logfile = os.path.join(repo_path, 'results', 'framework_comparison', 'results.csv')
63 | df = pd.read_csv(logfile)
64 |
65 | # Parameters
66 | experiments = list(df['experiment'].unique())
67 |
68 | # for exp, ax in zip(experiments, axs.reshape(-1)):
69 | for exp in experiments:
70 |
71 | dfp = df[df['experiment'] == exp]
72 | dfp = dfp.apply(match_case, axis=1)
73 | dfp = dfp.apply(framework, axis=1)
74 | dfp = dfp.apply(linebreak, axis=1)
75 | dfp = dfp.groupby('bench').tail(400)
76 | dfp['mean'] = dfp.groupby('bench').transform('mean')['runtime']
77 | dfp = dfp.sort_values(['mean'], ascending=True)
78 | dfp['runtime'] = dfp['runtime'] * 1000
79 |
80 | # Uber-plotting skillz: ax control
81 | fig_width = 8
82 | ax_height = len(dfp['bench'].unique()) * 0.5
83 |
84 | left_inch = 1.75
85 | left_rel = left_inch / fig_width
86 | ax_width_rel = 1 - left_rel - 0.005
87 |
88 | bottom_inch = 0.45
89 | top_inch = 0.2
90 | fig_height = ax_height + bottom_inch + top_inch
91 |
92 | bottom_rel = bottom_inch / fig_height
93 | ax_height_rel = (fig_height - bottom_inch - top_inch - 0.01) / fig_height
94 |
95 | fig = plt.figure(figsize=(fig_width, fig_height))
96 | ax_pad = (left_rel, bottom_rel, ax_width_rel, ax_height_rel)
97 |
98 | ax = fig.add_axes((ax_pad))
99 |
100 | # Start plotting
101 | colors = get_color_palette(dfp['bench'].unique())
102 | sns.set_palette(colors)
103 |
104 | custom_lines = [Line2D([0], [0], color=c, lw=4) for c in list(pd.unique(colors))]
105 | ax.legend(custom_lines, list(dfp['framework'].unique()))
106 |
107 | # dfp = dfp.apply(unbreak, axis=1)
108 | sns.barplot(ax=ax, data=dfp, y='bench', x='runtime', ci='sd')
109 | ax.set_title(exp.replace('_', '-'))
110 | ax.set_xlabel('Time per batch [milliseconds]')
111 |
112 | min_width = 1e6
113 | max_width = 0
114 | for p, c in zip(ax.patches, colors):
115 | min_width = np.min([min_width, p.get_width()])
116 | max_width = np.max([max_width, p.get_width()])
117 |
118 | ax.set_xlim((0, 1.6 * max_width))
119 | max_x = np.max(ax.get_xlim())
120 |
121 | for p, c in zip(ax.patches, colors):
122 | # print(max_x)
123 | if min_width > 10:
124 | ax.text(p.get_width() + max_x / 8, p.get_y() + p.get_height() / 1.3,
125 | '{:4.0f}ms ::: {:3.1f}x'.format(p.get_width(), p.get_width() / min_width),
126 | fontsize=fontsize+1.5, fontweight='bold', color=c, ha='center', va='bottom')
127 | else:
128 | ax.text(p.get_width() + max_x / 8, p.get_y() + p.get_height() / 1.3,
129 | '{:4.1f}ms ::: {:3.1f}x'.format(p.get_width(), p.get_width() / min_width),
130 | fontsize=fontsize+1.5, fontweight='bold', color=c, ha='center', va='bottom')
131 |
132 | ax.set_ylabel('')
133 | ax.xaxis.set_major_locator(MaxNLocator(prune='upper'))
134 | plt.setp(ax.get_xticklabels()[-1], visible=False)
135 |
136 | output_file = os.path.join(repo_path, 'results/framework_comparison/{}'.format(exp))
137 |
138 | fig.savefig(output_file, dpi=300)
139 | fig.savefig(output_file + '.pdf', dpi=300)
140 |
--------------------------------------------------------------------------------
/main/pytorch_comparison/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import sys
4 | from pathlib import Path
5 |
6 | # Parameters
7 | cuda_device = 1
8 | python_path = Path(__file__).resolve().parents[2]
9 | dry = 1
10 |
11 | command_list = ['echo {} > {}'.format(os.path.join(python_path, 'results', 'pytorch_comparison'),
12 | os.path.join(python_path, 'results', 'conf'))]
13 |
14 | # write path to dataframe to config file
15 | with open(os.path.join(python_path, 'results', 'conf'), 'w') as f:
16 | f.write(os.path.join(python_path, 'results', 'pytorch_comparison'))
17 |
18 | # Please define your virtual environments for testing
19 | all_interpreters = ['/home/brauns/anaconda3/envs/pt{}/bin/python'.format(i) for i in range(1, 5)]
20 |
21 | # Experiments
22 | all_experiments = ['1x320-LSTM', '4x320-LSTM', '4x320-LSTM_ctc']
23 |
# Run benches
# For every experiment folder, build one shell command per (interpreter, PyTorch benchmark script)
# pair, record it in command_list and, unless this is a dry run (dry != 0), execute it right away.
for experiment in all_experiments:
    experiment_folder = os.path.join(python_path, experiment)
    # os.listdir returns entries in arbitrary order; sort so the generated commands are reproducible
    all_benches = sorted(script for script in os.listdir(experiment_folder)
                         if 'bench' in script and 'pytorch' in script)

    for interpreter in all_interpreters:
        for bench in all_benches:
            print('=' * 100)

            script_path = os.path.join(experiment_folder, bench)
            command = 'CUDA_VISIBLE_DEVICES={} PYTHONPATH={} {} {}'.format(cuda_device, python_path, interpreter,
                                                                           script_path)

            print(command)
            command_list.append(command)
            if dry == 0:
                proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
                # communicate() drains stdout while waiting for the child; wait() alone can deadlock
                # once the child has written enough output to fill the OS pipe buffer.
                proc.communicate()
44 |
45 | command_list = map(lambda x: x + '\n', command_list)
46 | with open(os.path.join(sys.path[0], 'commands.sh'), 'w') as f:
47 | f.writelines(command_list)
48 |
--------------------------------------------------------------------------------
/main/pytorch_comparison/plot.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | import seaborn as sns
8 |
9 | sns.set_style('darkgrid')
10 |
11 | import matplotlib.pylab as pylab
12 | from matplotlib.ticker import MaxNLocator
13 |
14 |
# Apply one common font size to the legend, axis labels, titles and tick labels.
fontsize = 11.5
lparams = ['legend.fontsize', 'axes.labelsize', 'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize']
params = dict.fromkeys(lparams, fontsize)
pylab.rcParams.update(params)
19 |
def match_case(row):
    """Rewrite ``row['bench']`` so framework names use their mixed-case spelling.

    The substitutions are applied one after another, in the order listed.
    ``row`` is modified in place and also returned, which lets the function be
    used with ``DataFrame.apply(..., axis=1)``.
    """
    spellings = (
        ('pytorch', 'PyTorch'),
        ('tensorflow', 'TensorFlow'),
        ('lasagne', 'Lasagne'),
        ('keras', 'Keras'),
        ('theano', 'Theano'),
        ('cudnnLSTM', 'cuDNNLSTM'),
    )
    label = row['bench']
    for lowercase, mixed_case in spellings:
        label = label.replace(lowercase, mixed_case)
    row['bench'] = label
    return row
25 |
def linebreak(row):
    """Replace every underscore in ``row['bench']`` with a newline.

    ``row`` is modified in place and returned.
    """
    row['bench'] = row['bench'].replace('_', '\n')
    return row
29 |
30 |
def get_color_palette():
    """Return the four bar colors as hex strings, from '#de2d26' to '#fcbba1'."""
    # Alternatives that were tried before:
    # colors = list(['#C44E52', '#ec8386', '#fdb6b8', '#fed6d7'])
    # colors=sns.color_palette('Reds')
    return ['#de2d26', '#fb6a4a', '#fc9272', '#fcbba1']
39 |
40 |
# Load file
repo_path = Path(__file__).resolve().parents[2]
logfile = os.path.join(repo_path, 'results', 'pytorch_comparison', 'results.csv')
df = pd.read_csv(logfile)

# Parameters
experiments = list(df['experiment'].unique())

# One horizontal bar chart per experiment: one bar per (bench, version) pair,
# annotated with its runtime and its slowdown relative to the fastest bar.
for exp in experiments:

    dfp = df[df['experiment'] == exp]
    dfp = dfp.apply(match_case, axis=1)
    dfp = dfp.apply(linebreak, axis=1)
    # Keep only the last 400 measurements of every (bench, version) group.
    # copy() so the column assignments below work on an independent frame.
    dfp = dfp.groupby(['bench', 'version']).tail(400).copy()
    # Select 'runtime' before transforming: averaging every column fails on the
    # non-numeric ones with recent pandas releases.
    dfp['mean'] = dfp.groupby('bench')['runtime'].transform('mean')
    dfp = dfp.sort_values(['mean', 'version', 'bench'], ascending=[True, False, False])
    dfp['runtime'] = dfp['runtime'] * 1000  # the x axis is labelled in milliseconds

    # Uber-plotting skillz: ax control
    fig_width = 8
    # Half an inch per bar, capped at 11 bars.
    ax_height = np.min([len(dfp['version'].unique()) * len(dfp['bench'].unique()), 11]) * 0.5

    left_inch = 1.75
    left_rel = left_inch / fig_width
    ax_width_rel = 1 - left_rel - 0.005
    bottom_inch = 0.45
    top_inch = 0.2
    fig_height = ax_height + bottom_inch + top_inch

    bottom_rel = bottom_inch / fig_height
    ax_height_rel = (fig_height - bottom_inch - top_inch - 0.01) / fig_height

    fig = plt.figure(figsize=(fig_width, fig_height))
    ax_pad = (left_rel, bottom_rel, ax_width_rel, ax_height_rel)

    ax = fig.add_axes(ax_pad)

    # Start plotting
    colors = get_color_palette()
    sns.set_palette(colors)

    sns.barplot(ax=ax, data=dfp, y='bench', x='runtime', hue='version', ci='sd')
    ax.set_title(exp)
    ax.set_xlabel('Time per batch [milliseconds]')

    all_width = [p.get_width() for p in ax.patches]
    min_width = np.min(all_width)
    max_width = np.max(all_width)

    # Leave room to the right of the longest bar for the text annotations.
    ax.set_xlim((0, 1.6 * max_width))
    max_x = np.max(ax.get_xlim())

    # Drop the decimal digit (and use a darker grey) when even the fastest bar exceeds 10 ms.
    if min_width > 10:
        label_format, label_color = '{:4.0f}ms ::: {:3.1f}x', 'dimgrey'
    else:
        label_format, label_color = '{:4.1f}ms ::: {:3.1f}x', 'grey'

    for p in ax.patches:
        ax.text(p.get_width() + max_x / 8, p.get_y() + p.get_height() / 1.3,
                label_format.format(p.get_width(), p.get_width() / min_width),
                fontsize=fontsize + 1.5, fontweight='bold', color=label_color, ha='center', va='bottom')

    ax.set_ylabel('')
    ax.legend(loc=1)

    ax.xaxis.set_major_locator(MaxNLocator(prune='upper'))
    plt.setp(ax.get_xticklabels()[-1], visible=False)

    output_file = os.path.join(repo_path, 'results/pytorch_comparison/{}'.format(exp))

    fig.savefig(output_file, dpi=300)
    fig.savefig(output_file + '.pdf')

    # Release the figure; pyplot otherwise keeps every figure of the loop alive.
    plt.close(fig)
115 |
--------------------------------------------------------------------------------
/main/pytorch_comparison/unifier.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | import pandas as pd
5 |
# Replaces, in the PyTorch comparison results, all rows of the PyTorch version
# that was measured in the framework comparison with the framework comparison's
# own PyTorch rows, then rewrites the PyTorch comparison CSV in place.
repo_path = Path(__file__).resolve().parents[2]
framework_csv = os.path.join(repo_path, 'results', 'framework_comparison', 'results.csv')
pytorch_csv = os.path.join(repo_path, 'results', 'pytorch_comparison', 'results.csv')

# Get frames
df1 = pd.read_csv(framework_csv)
df2 = pd.read_csv(pytorch_csv)

# Get version in framework comparison
df1pt = df1[df1['bench'].str.contains('pytorch')]
if df1pt.empty:
    # Without this guard the lookup below fails with a bare IndexError.
    raise SystemExit('!!! No pytorch rows found in {}'.format(framework_csv))
pytorch_version = df1pt['version'].unique()[0]
print('Replacing pytorch version {}'.format(pytorch_version))

# Prepare pytorch comparison dataframe: drop the rows that are about to be replaced
df2pt = df2[df2['version'] != pytorch_version]

# Prepare framework comparison dataframe
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index renumbers the rows of the combined frame.
df2pt = pd.concat([df2pt, df1pt], ignore_index=True, sort=False)

# save csv
df2pt.to_csv(pytorch_csv, index=False)
28 |
--------------------------------------------------------------------------------
/results/10/framework_comparison/1x320-LSTM_cross-entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/1x320-LSTM_cross-entropy.png
--------------------------------------------------------------------------------
/results/10/framework_comparison/1x320-LSTM_cross-entropy_100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/1x320-LSTM_cross-entropy_100.png
--------------------------------------------------------------------------------
/results/10/framework_comparison/4x320-BIDIR-LSTM_CTC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/4x320-BIDIR-LSTM_CTC.png
--------------------------------------------------------------------------------
/results/10/framework_comparison/4x320-BIDIR-LSTM_cross-entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/framework_comparison/4x320-BIDIR-LSTM_cross-entropy.png
--------------------------------------------------------------------------------
/results/10/framework_comparison/readme.md:
--------------------------------------------------------------------------------
1 | # Framework comparison
2 | - PyTorch, TensorFlow, Lasagne, Keras/TensorFlow and Keras/Theano benchmarks
3 | - LSTM implementations by cuDNN, fused kernels and naive approaches
4 | - Fixed sequence-length data with cross-entropy loss and variable sequence-length data with CTC loss
5 | - Input sizes 64x100x123 and 32x1000x123 (batch size x time steps x channels)
6 | - Network sizes 1x320 and 4x320 (number of layers x number of LSTM units)
7 |
8 | ## Framework versions
9 | Framework | Version | Release |Backend | cuda | cuDNN
10 | -|-|-|-|-|-
11 | PyTorch | 0.4.0 | [April 2018](https://github.com/PyTorch/PyTorch/releases/tag/v0.4.0) | - | 9.0 | 7102
12 | TensorFlow | 1.8.0 | [April 2018](https://github.com/TensorFlow/TensorFlow/releases/tag/v1.8.0) |- | 9.0 | 7005
13 | Lasagne | 0.2.1dev | [April 2018 ](https://github.com/Lasagne/Lasagne/commit/7992faa80fa5233a786e2582a605e854cea7d1cf) | [Theano 1.0.1](https://github.com/Theano/Theano/releases/tag/rel-1.0.1) | 9.0 | 7005
14 | Keras | 2.1.6 | [April 2018](https://github.com/Keras-team/Keras/releases/tag/2.1.6) |[Theano 1.0.1](https://github.com/Theano/Theano/releases/tag/rel-1.0.1), [TensorFlow 1.8.0](https://github.com/TensorFlow/TensorFlow/releases/tag/v1.8.0)| 9.0 | 7005
15 |
16 | ## LSTM implementations
17 |
18 | Library | Name | Details
19 | -|-|-
20 | PyTorch | [`LSTMCell-basic`](https://github.com/stefbraun/rnn_benchmarks/blob/master/1x320-LSTM/bench_pytorch_LSTMCell-basic.py) | Custom code, pure PyTorch implementation, easy to modify. Loop over time with Python `for` loop
21 | PyTorch | [`LSTMCell-fused`](http://PyTorch.org/docs/stable/nn.html?highlight=lstmcell#torch.nn.LSTMCell) | LSTM with optimized kernel for single time steps. Loop over time with Python `for` loop
22 | PyTorch |[`cuDNNLSTM`](http://PyTorch.org/docs/stable/nn.html?highlight=lstm#torch.nn.LSTM) | Wrapper to cuDNN LSTM implementation
23 | TensorFlow | [`LSTMCell`](https://www.TensorFlow.org/versions/r1.8/api_docs/python/tf/contrib/rnn/LSTMCell)| Pure TensorFlow implementation, easy to modify. Loop over time with `tf.while_loop`. Uses `dynamic_rnn`
24 | TensorFlow | [`LSTMBlockCell`](https://www.TensorFlow.org/versions/r1.8/api_docs/python/tf/contrib/rnn/LSTMBlockCell)| Optimized LSTM with single operation per time-step. Loop over time with `tf.while_loop`. Uses `dynamic_rnn`
25 | TensorFlow | [`LSTMBlockFusedCell`](https://www.TensorFlow.org/versions/r1.8/api_docs/python/tf/contrib/rnn/LSTMBlockFusedCell)| Optimized LSTM with single operation over all time steps. Loop over time is part of the operation.
26 | TensorFlow | [`cuDNNLSTM`](https://www.tensorflow.org/api_docs/python/tf/contrib/cudnn_rnn/CudnnLSTM)| Wrapper to cuDNN LSTM implementation
27 | Lasagne | [`LSTMLayer`](http://Lasagne.readthedocs.io/en/latest/modules/layers/recurrent.html?highlight=gru#Lasagne.layers.LSTMLayer)| Pure Theano implementation, easy to modify. Loop over time with `theano.scan`
28 | Keras | [`cuDNNLSTM`](https://Keras.io/layers/recurrent/#cuDNNlstm) | Wrapper to cuDNN LSTM implementation
29 | Keras | [`LSTM`](https://Keras.io/layers/recurrent/#lstm)| Pure Theano/TensorFlow implementation, easy to modify. Loop over time with `theano.scan` or `tf.while_loop`
30 |
31 | ## Loss functions and input data
32 | The loss functions are varied with the input data:
33 | 1. Cross-entropy for fixed sequence length data
34 | - default implementation from each framework
35 | 2. Connectionist Temporal Classification (CTC) for variable sequence length data
36 | - warp_ctc for [Theano+Lasagne](http://deeplearning.net/software/Theano/library/tensor/nnet/ctc.html?highlight=ctc#module-Theano.tensor.nnet.ctc) and [PyTorch](https://github.com/SeanNaren/warp-ctc)
37 | - TensorFlow default [CTC implementation](https://www.TensorFlow.org/api_docs/python/tf/nn/ctc_loss)
38 |
39 |
40 | Benchmark name | Layers x LSTM units | # Classes & output units | Loss | Input size [NxTxC] <sup>1</sup> | Sequence length | Labels per sample | Benchmark scenario
41 | -|-|-|-|-|-|-|-
42 | 1x320/CE-short | 1x320 unidirectional | 10 Dense | cross entropy | 64x100x123 | fixed<br>[100] | 1 | Real-world<sup>2</sup>
43 | 1x320/CE-long | 1x320 unidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed<br>[1000] | 1 | Synthetic
44 | 4x320/CE-long | 4x320 bidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed<br>[1000] | 1 | Synthetic
45 | 4x320/CTC-long | 4x320 bidirectional | 59 Dense | CTC | 32x1000x123 | variable<br>[500..1000] | 100 | Real-world<sup>3</sup>
46 |
52 | <sup>1</sup> N=number of samples, T=time-steps, C=feature channels<br>
53 | <sup>2</sup> ASR task on TIDIGITS/isolated digit recognition, default training set (0.7 hours of speech):
54 | 123-dimensional filterbank features with 100fps, average sequence length of 98, alphabet size of 10 digits and
55 | 1 label per sample<br>
56 | <sup>3</sup> ASR task on WSJ/continuous speech recognition, pre-processing with [EESEN](https://github.com/srvk/eesen) on training subset
57 | si-284 (81h of speech): 123-dimensional filterbank features with 100fps, average sequence length 783, alphabet
58 | size of 59 characters and average number of characters per sample 102
59 | ## Results
60 | - Xeon W-2195 CPU, GTX 1080 Founders Edition, Ubuntu 16.04
61 | - The results reflect the mean time to fully process a batch (forward + backward pass).
62 | - The measurements are taken over 500 runs, and the first 100 are discarded as warm-up.
63 |
64 | Benchmark | Results
65 | -|-
66 | 1x320/CE-short
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 64x100x123
fixed sequence length
---
433k parameters
|
67 | 1x320/CE-long
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
576k parameters
|
68 | 4x320/CE-long
---
L1-4: 4x320 bidir LSTM
L5: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
8.5M parameters
|
69 | 4x320/CTC-long
L1-4: 4x320 bidir LSTM
L5: 59 Dense
---
CTC loss
input size 32x1000x123
variable sequence length
---
8.5M parameters
|
70 |
71 | Remarks:
72 | - The benchmark scripts are carefully written, but not optimized to squeeze that last bit of
73 | performance out of them. They should reflect typical day-to-day research applications.
74 | - Due to time constraints, only the 1x320 LSTM benchmark covers all considered frameworks.
75 | For the multi-layer 4x320 networks, only implementations that provided helper functions to
76 | create stacked bidirectional networks were evaluated. An exception to this rule was made
77 | for Lasagne, in order to include a Theano-based contender for this scenario.
78 | - The TensorFlow benchmarks use the `feed_dict` input method that is simple to implement,
79 | but slower than the [`tf.data` API](https://www.tensorflow.org/performance/performance_guide#input_pipeline_optimization). Implementing a high performance input pipeline in TensorFlow is not trivial, and only the feed_dict approach allowed for a similar implementation complexity as in the PyTorch and Lasagne cases.
80 | - The TensorFlow `cuDNNLSTM` was not tested with variable length data as it does not support
81 | such input (see [issue 6633](https://github.com/TensorFlow/TensorFlow/issues/6633)).
82 | - The TensorFlow benchmark uses the integrated `tf.nn.ctc_loss` instead of the
83 | warp-ctc library, even though there is a TensorFlow binding available ([Link](https://github.com/baidu-research/warp-ctc)). The performance
84 | difference has not been measured.
85 | - PyTorch 0.4.0 merged the Tensor and Variable classes and does not need the Variable
86 | wrapper anymore. The Variable wrapper has a negligible performance impact on version
87 | 0.4.0, but is required for older PyTorch releases in the PyTorch version comparison.
88 | - The CTC benchmark was not carried out on PyTorch 0.1.12_2 as the compilation process was too complex. The packed sequence implementation has a large impact on performance for v0.2.0_4 (see [pull request 4512](https://github.com/PyTorch/PyTorch/pull/4512)).
89 |
--------------------------------------------------------------------------------
/results/10/pytorch_comparison/1x320-LSTM_cross-entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/1x320-LSTM_cross-entropy.png
--------------------------------------------------------------------------------
/results/10/pytorch_comparison/1x320-LSTM_cross-entropy_100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/1x320-LSTM_cross-entropy_100.png
--------------------------------------------------------------------------------
/results/10/pytorch_comparison/4x320-BIDIR-LSTM_CTC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/4x320-BIDIR-LSTM_CTC.png
--------------------------------------------------------------------------------
/results/10/pytorch_comparison/4x320-BIDIR-LSTM_cross-entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefbraun/rnn_benchmarks/eb6358a67c944c6cbb64a9d73e8ccd18de2567e4/results/10/pytorch_comparison/4x320-BIDIR-LSTM_cross-entropy.png
--------------------------------------------------------------------------------
/results/10/pytorch_comparison/readme.md:
--------------------------------------------------------------------------------
1 | # PyTorch comparison
2 | - PyTorch 0.1 to 0.4 version comparison
3 | - LSTM implementations by cuDNN, fused kernels and naive approaches
4 | - Fixed sequence-length data with cross-entropy loss and variable sequence-length data with CTC loss
5 | - Input sizes 64x100x123 and 32x1000x123 (batch size x time steps x channels)
6 | - Network sizes 1x320 and 4x320 (number of layers x number of LSTM units)
7 |
8 | ## Framework versions
9 | Framework | Version | Release |Backend | cuda | cuDNN
10 | -|-|-|-|-|-
11 | PyTorch | 0.4.0 | [April 2018](https://github.com/PyTorch/PyTorch/releases/tag/v0.4.0) | - | 9.0 | 7102
12 | PyTorch | 0.3.1post2 | [February 2018](https://github.com/PyTorch/PyTorch/releases/tag/v0.3.1) | - | 8.0 | 7005
13 | PyTorch | 0.2.0_4 | [August 2017](https://github.com/PyTorch/PyTorch/releases/tag/v0.2.0) | - | 8.0 | 6021
14 | PyTorch | 0.1.12_2 | [May 2017](https://github.com/PyTorch/PyTorch/releases/tag/v0.1.12) | - | 8.0 | 6021
15 |
16 | ## LSTM implementations
17 |
18 | Library | Name | Details
19 | -|-|-
20 | PyTorch | [`LSTMCell-basic`](https://github.com/stefbraun/rnn_benchmarks/blob/master/1x320-LSTM/bench_pytorch_LSTMCell-basic.py) | Custom code, pure PyTorch implementation, easy to modify. Loop over time with Python `for` loop
21 | PyTorch | [`LSTMCell-fused`](http://PyTorch.org/docs/stable/nn.html?highlight=lstmcell#torch.nn.LSTMCell) | LSTM with optimized kernel for single time steps. Loop over time with Python `for` loop
22 | PyTorch |[`cuDNNLSTM`](http://PyTorch.org/docs/stable/nn.html?highlight=lstm#torch.nn.LSTM) | Wrapper to cuDNN LSTM implementation
23 |
24 | ## Loss functions and input data
25 | The loss functions are varied with the input data:
26 | 1. Cross-entropy for fixed sequence length data
27 | - default implementation from PyTorch
28 | 2. Connectionist Temporal Classification (CTC) for variable sequence length data
29 | - warp_ctc for [PyTorch](https://github.com/SeanNaren/warp-ctc)
30 |
31 | Benchmark name | Layers x LSTM units | # Classes & output units | Loss | Input size [NxTxC] <sup>1</sup> | Sequence length | Labels per sample | Benchmark scenario
32 | -|-|-|-|-|-|-|-
33 | 1x320/CE-short | 1x320 unidirectional | 10 Dense | cross entropy | 64x100x123 | fixed<br>[100] | 1 | Real-world<sup>2</sup>
34 | 1x320/CE-long | 1x320 unidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed<br>[1000] | 1 | Synthetic
35 | 4x320/CE-long | 4x320 bidirectional | 10 Dense | cross entropy | 32x1000x123 | fixed<br>[1000] | 1 | Synthetic
36 | 4x320/CTC-long | 4x320 bidirectional | 59 Dense | CTC | 32x1000x123 | variable<br>[500..1000] | 100 | Real-world<sup>3</sup>
37 |
43 | <sup>1</sup> N=number of samples, T=time-steps, C=feature channels<br>
44 | <sup>2</sup> ASR task on TIDIGITS/isolated digit recognition, default training set (0.7 hours of speech):
45 | 123-dimensional filterbank features with 100fps, average sequence length of 98, alphabet size of 10 digits and
46 | 1 label per sample<br>
47 | <sup>3</sup> ASR task on WSJ/continuous speech recognition, pre-processing with [EESEN](https://github.com/srvk/eesen) on training subset
48 | si-284 (81h of speech): 123-dimensional filterbank features with 100fps, average sequence length 783, alphabet
49 | size of 59 characters and average number of characters per sample 102
50 | ## Results
51 | - Xeon W-2195 CPU, GTX 1080 Founders Edition, Ubuntu 16.04
52 | - The results reflect the mean time to fully process a batch (forward + backward pass).
53 | - The measurements are taken over 500 runs, and the first 100 are discarded as warm-up.
54 |
55 | Benchmark | Results
56 | -|-
57 | 1x320/CE-short
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 64x100x123
fixed sequence length
---
433k parameters
|
58 | 1x320/CE-long
---
L1: 1x320 unidir LSTM
L2: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
576k parameters
|
59 | 4x320/CE-long
---
L1-4: 4x320 bidir LSTM
L5: 10 Dense
---
cross-entropy loss
input size 32x1000x123
fixed sequence length
---
8.5M parameters
|
60 | 4x320/CTC-long
L1-4: 4x320 bidir LSTM
L5: 59 Dense
---
CTC loss
input size 32x1000x123
variable sequence length
---
8.5M parameters
|
61 |
62 | Remarks:
63 | - The benchmark scripts are carefully written, but not optimized to squeeze that last bit of
64 | performance out of them. They should reflect typical day-to-day research applications.
65 | - Due to time constraints, only the 1x320 LSTM benchmark covers all considered frameworks.
66 | For the multi-layer 4x320 networks, only implementations that provided helper functions to
67 | create stacked bidirectional networks were evaluated.
68 | - PyTorch 0.4.0 merged the Tensor and Variable classes and does not need the Variable
69 | wrapper anymore. The Variable wrapper has a negligible performance impact on version
70 | 0.4.0, but is required for older PyTorch releases in the PyTorch version comparison.
71 | - The CTC benchmark was not carried out on PyTorch 0.1.12_2 as the compilation process was too complex. The packed sequence implementation has a large impact on performance for v0.2.0_4 (see [pull request 4512](https://github.com/PyTorch/PyTorch/pull/4512)).
72 |
--------------------------------------------------------------------------------
/support.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import OrderedDict
3 |
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import os.path
7 | import pandas as pd
8 |
9 |
def default_params():
    """Return the shared benchmark defaults as ``(rnn_size, learning_rate, batches)``."""
    return 320, 1e-3, 500
15 |
16 |
def toy_batch(seed=11, shape=(64, 100, 123), classes=10):
    """Create a reproducible random batch of fixed-length sequences.

    Args:
        seed: value passed to ``np.random.seed`` before any number is drawn.
        shape: ``(batch_size, max_len, features)`` of the sample tensor.
        classes: number of classes; returned unchanged.

    Returns:
        ``(bX, b_lenX, bY, classes)`` where ``bX`` is a float32 array of
        ``shape`` drawn uniformly from [-1, 1), ``b_lenX`` is an int32 vector
        holding ``max_len`` for every sample and ``bY`` is an int32 vector with
        one target per sample. ``np.random.randint`` excludes ``high``, so the
        targets lie in ``[0, classes - 2]``.
    """
    n_samples, n_steps, _ = shape
    np.random.seed(seed)

    # Samples: every sequence uses the full length
    samples = np.random.uniform(-1, 1, shape).astype(np.float32)
    lengths = np.full(n_samples, n_steps, dtype=np.int32)

    # Targets: drawn with the default integer type and cast afterwards
    targets = np.random.randint(low=0, high=classes - 1, size=n_samples).astype(np.int32)

    return samples, lengths, targets, classes
29 |
30 |
def toy_batch_ctc(seed=11, shape=(32, 1000, 123), classes=59):
    """Create a reproducible random batch of variable-length sequences for CTC.

    Args:
        seed: value passed to ``np.random.seed`` before any number is drawn.
        shape: ``(batch_size, max_len, features)`` of the sample tensor.
        classes: number of classes; returned unchanged.

    Returns:
        ``(bX, b_lenX, maskX, bY, b_lenY, classes)``:
        ``bX`` float32 samples of ``shape`` drawn uniformly from [-1, 1);
        ``b_lenX`` int32 sample lengths spaced linearly from ``max_len / 2`` to ``max_len``;
        ``maskX`` float32 ``(batch_size, max_len)`` mask, 1 inside each sample's length and 0 after it;
        ``bY`` flat int32 vector with 100 labels per sample drawn from ``[1, classes - 2]``;
        ``b_lenY`` int32 vector holding 100 for every sample.
    """
    n_samples, n_steps, _ = shape
    np.random.seed(seed)

    # Samples
    samples = np.random.uniform(-1, 1, shape).astype(np.float32)
    lengths = np.linspace(n_steps / 2, n_steps, n_samples).astype(np.int32)
    # Position t of sample i is valid exactly when t < lengths[i]
    mask = (np.arange(n_steps) < lengths[:, np.newaxis]).astype('float32')

    # Targets
    # remember warp-ctc: 0 is the blank label, tensorflow-ctc: -1 is the blank label
    labels = np.random.randint(low=1, high=classes - 1, size=n_samples * 100).astype(np.int32)
    label_lengths = np.full(n_samples, 100, dtype=np.int32)  # labels per sample comes from WSJ-si84

    return samples, lengths, mask, labels, label_lengths, classes
49 |
50 |
def check_results(batch_loss_list, batch_time_list, train_start, train_end):
    """Sanity-check a finished benchmark run and abort the process on failure.

    Three checks are printed one after another:
      1. every loss value is finite,
      2. the summed loss differences are negative (loss decreased overall),
      3. the wall-clock time of the whole loop (``train_end - train_start``)
         deviates by less than 1% from the sum of the per-batch times.

    Raises:
        SystemExit: via ``sys.exit`` when at least one check fails.
    """
    failed = False

    # 0. Check if loss is numeric (not NAN and not inf)
    finite_flags = [np.isfinite(loss) for loss in batch_loss_list]
    if all(finite_flags):
        print('>>> Loss check 1/2 passed: loss is finite {}'.format(np.unique(finite_flags)))
    else:
        print('!!! Loss check 1/2 failed: loss is NOT finite {}'.format(np.unique(finite_flags)))
        failed = True

    # 1. Check if loss is decreasing
    loss_steps = np.diff(batch_loss_list)
    if np.sum(loss_steps) < 0:
        print('>>> Loss check 2/2 passed: loss is globally decreasing')
    else:
        print('!!! Loss check 2/2 failed: loss is NOT globally decreasing')
        failed = True

    # 2. Check deviation between the full loop time and the sum of individual batches
    loop_time = train_end - train_start
    batch_time_sum = np.sum(batch_time_list)
    deviation = np.abs((1 - loop_time / batch_time_sum) * 100)

    if deviation < 1:  # Less than 1% deviation
        message = ('>>> Timing check passed - < 1% deviation between loop time and sum of batches'
                   ' ::: Loop time {:.3f} ::: Sum of batch times {:.3f} ::: Deviation [%] {:.3f}')
        print(message.format(loop_time, batch_time_sum, deviation))
    else:
        message = ('!!! Timing check failed - Deviation > 1% ::: Loop time {:.3f}'
                   ' ::: Sum of batch times {:.3f} ::: Deviation [%] {:.3f}')
        print(message.format(loop_time, batch_time_sum, deviation))
        failed = True

    if failed:
        sys.exit('!!! Abort benchmark.')
    print('=' * 100)
90 |
91 |
def write_results(script_name, bench, experiment, parameters, run_time, version=None,
                  logfile=None):
    """Append one row per measured batch time to the results CSV.

    Args:
        script_name: stored in the ``name`` column.
        bench: stored in the ``bench`` column.
        experiment: stored in the ``experiment`` column.
        parameters: stored in the ``parameters`` column.
        run_time: iterable of run times; each element becomes one row's ``runtime``.
        version: stored in the ``version`` column.
        logfile: path of the CSV file. When ``None``, the first line of
            ``results/conf`` (next to this module) is read and the file
            ``results/<that line>/results.csv`` is used.

    The CSV is created with a header row if it does not exist yet. Existing
    rows are kept and the new rows are added below them.
    """
    columns = ['name', 'bench', 'version', 'experiment', 'parameters', 'runtime']

    if logfile is None:
        # Get path
        repo_path = os.path.dirname(os.path.realpath(__file__))

        with open(os.path.join(repo_path, 'results', 'conf')) as f:
            mode = f.readline().strip()

        logfile = os.path.join(repo_path, 'results', mode, 'results.csv')

    # Prepare header
    if not os.path.isfile(logfile):
        pd.DataFrame(index=None, columns=columns).to_csv(logfile, index=False)

    # Prepare new results
    row_list = []
    for rt in run_time:
        row = OrderedDict()
        row['experiment'] = experiment
        row['bench'] = bench
        row['version'] = version
        row['name'] = script_name
        row['parameters'] = parameters
        row['runtime'] = rt
        row_list.append(row)

    dfa = pd.DataFrame(row_list)

    # Append new results
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # The column order of the existing file is kept because it is listed first.
    df = pd.read_csv(logfile)
    df = pd.concat([df, dfa], ignore_index=True, sort=False)
    df.to_csv(logfile, index=False)
128 |
129 |
def print_results(run_time):
    """Print summary statistics of the per-batch run times in milliseconds.

    The first 100 entries are treated as warm-up and dropped. When there are
    100 entries or fewer, a warning is printed and all entries are used.
    """
    warmup = 100
    if len(run_time) <= warmup:
        print('!!! First 100 batches are considered as warm-up. Please run more batches')
    else:
        run_time = run_time[warmup:]

    times_ms = np.asarray(run_time) * 1000
    summary = (np.mean(times_ms), np.std(times_ms), np.median(times_ms),
               np.percentile(times_ms, 99), np.min(times_ms), np.max(times_ms))
    template = ('>>> Time per batch [ms] ::: Mean {:.1f} ::: Std {:.1f} ::: Median {:.1f}'
                ' ::: 99Percentile {:.1f} ::: Min {:.1f} ::: Max {:.1f}')
    print(template.format(*summary))
140 |
def plot_results(time):
    """Scatter-plot each entry of ``time`` against its position and return ``(fig, ax)``."""
    batch_numbers = range(len(time))

    fig, ax = plt.subplots()
    ax.scatter(batch_numbers, time)
    ax.set_xlabel('Batch #')
    ax.set_ylabel('Time per Batch [sec]')
    ax.grid()
    return fig, ax
148 |
149 |
150 | # Helper functions for label conversion from warp-ctc to tf-ctc format:-(
def target_converter(bY, b_lenY):
    """Split the flat label vector ``bY`` into one array per sample.

    ``b_lenY`` holds the number of labels of each sample; the split points are
    the running totals of those lengths, without the final total.
    """
    running_totals = np.cumsum(b_lenY)
    return np.split(bY, running_totals[:-1])
155 |
156 |
def sparse_tuple_from(sequences, dtype=np.int32):
    """Build a sparse (coordinate) representation of a list of sequences.

    Args:
        sequences: a list of lists of type dtype where each element is a sequence
        dtype: dtype of the returned ``values`` array
    Returns:
        A tuple ``(indices, values, shape)``: ``indices`` is an int64 array of
        ``(sequence number, position)`` pairs, ``values`` holds the matching
        elements and ``shape`` is ``[number of sequences, longest length]``.
    """
    coordinates = [(row, col)
                   for row, seq in enumerate(sequences)
                   for col in range(len(seq))]
    flat_values = [item for seq in sequences for item in seq]

    indices = np.asarray(coordinates, dtype=np.int64)
    values = np.asarray(flat_values, dtype=dtype)

    # The largest position index plus one is the length of the longest sequence.
    longest = indices.max(0)[1] + 1
    shape = np.asarray([len(sequences), longest], dtype=np.int64)

    return indices, values, shape
176 |
--------------------------------------------------------------------------------
/utils/analyse_pandas.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import pandas as pd
5 |
# Prints, for every (experiment, bench) pair of a results CSV, the mean and the
# standard deviation of the last 400 runtimes in milliseconds, fastest first.
parser = argparse.ArgumentParser(description='Process results dataframe')
# Required: with the former default of None the script crashed when building the path.
parser.add_argument('--file', required=True,
                    help='Dataframe to process.')
args = parser.parse_args()

# Load file
logfile = args.file
df = pd.read_csv(logfile)

# assert (int(df.groupby(['experiment', 'bench', 'version']).count()['runtime'].unique()) == 500)
df['runtime'] = df['runtime'] * 1000  # report in milliseconds

# Keep the last 400 rows per group; copy() so new columns go to an independent frame.
df = df.groupby(['experiment', 'bench']).tail(400).copy()

# Select 'runtime' before transforming: aggregating every column fails on the
# non-numeric ones with recent pandas releases.
runtime_per_bench = df.groupby(['experiment', 'bench'])['runtime']
df['mean'] = runtime_per_bench.transform('mean')
df['std'] = runtime_per_bench.transform('std')

df = df.sort_values(['mean'], ascending=True)
# One representative row per group is enough: mean/std are identical within a group.
grp = df.groupby(['experiment', 'bench'], as_index=False).tail(1).round(1)
print(grp.to_string())
24 |
--------------------------------------------------------------------------------
/utils/disable_cores.sh:
--------------------------------------------------------------------------------
# Take CPUs offline so that only the first $num_cores cores stay online.
# NOTE(review): the bounds assume 18 physical cores numbered cpu0..cpu17 with
# their hyper-threading siblings at cpu18..cpu35 - confirm for the target machine.
# Writing to the sysfs "online" files requires root privileges.
num_cores=4
num_cores_ht=$((18 + num_cores))
echo "$num_cores_ht"

# Physical cores beyond the first $num_cores
for i in $(seq "$num_cores" 1 17);
do echo 0 > "/sys/devices/system/cpu/cpu$i/online";
done

# Matching hyper-threading siblings; the last sibling of an 18-core layout is cpu35
for i in $(seq "$num_cores_ht" 1 35);
do echo 0 > "/sys/devices/system/cpu/cpu$i/online";
done
11 |
12 |
--------------------------------------------------------------------------------
/utils/enable_cores.sh:
--------------------------------------------------------------------------------
# Bring every CPU that exposes an "online" control file back online.
# Iterating over the existing sysfs entries avoids hard-coding the CPU count
# (the former loop depended on an undefined $END variable).
# Writing to the sysfs "online" files requires root privileges.
for f in /sys/devices/system/cpu/cpu[0-9]*/online;
do
    # Skip the unexpanded pattern when nothing matches
    [ -e "$f" ] || continue
    echo 1 > "$f";
done
4 |
--------------------------------------------------------------------------------
/utils/plot_all.sh:
--------------------------------------------------------------------------------
# Regenerate all plots. Run from the repository root.

# Pass 1: results_100.zip - plot, then keep the 1x320 cross-entropy figures under a "_100" name
unzip -o results/results_100.zip -d results/
for comparison in framework_comparison pytorch_comparison; do
    python main/$comparison/plot.py
    for ext in pdf png; do
        mv results/$comparison/1x320-LSTM_cross-entropy.$ext results/$comparison/1x320-LSTM_cross-entropy_100.$ext
    done
done

# Pass 2: results_1k.zip - plot again
unzip -o results/results_1k.zip -d results/
for comparison in framework_comparison pytorch_comparison; do
    python main/$comparison/plot.py
done
13 |
--------------------------------------------------------------------------------
/utils/rm_results.sh:
--------------------------------------------------------------------------------
# Delete the collected result tables of both comparisons. Run from the repository root.
rm \
    results/framework_comparison/results.csv \
    results/pytorch_comparison/results.csv
2 |
3 |
4 |
--------------------------------------------------------------------------------