├── README.md ├── a_stream_pe1x_gold.txt ├── block_fun ├── block_test ├── block_test.cpp ├── block_top ├── block_top.cpp ├── block_top.h ├── config.h ├── config_test.h ├── diffusion-lib.h ├── gloden_out.txt ├── load_param ├── load_param.h ├── mm_bias_160.bin ├── mm_fm_160.bin ├── mm_fm_trans_160.bin ├── mm_out_160.bin ├── mm_out_short_160.bin ├── mm_w_160.bin ├── param_sa.h ├── result_verify_norm_1.txt ├── result_verify_shortcut_1.txt ├── sa_tools ├── sa_tools.h ├── stream_tools.h ├── test.h ├── test_DSP ├── true_A.txt ├── true_W.txt └── yg_script.tcl /README.md: -------------------------------------------------------------------------------- 1 | # SDA_code 2 | SDA: Low-Bit Stable Diffusion Acceleration on Edge FPGAs 3 | -------------------------------------------------------------------------------- /a_stream_pe1x_gold.txt: -------------------------------------------------------------------------------- 1 | -14 2 | -66 3 | 59 4 | 121 5 | -92 6 | 100 7 | 7 8 | 122 9 | 76 10 | -91 11 | 16 12 | 33 13 | 123 14 | 94 15 | 104 16 | 10 17 | -73 18 | -93 19 | -68 20 | 53 21 | -17 22 | 98 23 | -60 24 | 118 25 | 87 26 | 85 27 | -50 28 | 23 29 | -21 30 | -3 31 | -79 32 | -105 33 | 36 34 | 45 35 | 99 36 | -90 37 | -61 38 | 51 39 | -43 40 | -65 41 | 13 42 | 1 43 | 32 44 | 95 45 | -63 46 | 90 47 | 81 48 | -28 49 | -109 50 | -25 51 | -1 52 | -94 53 | 83 54 | 47 55 | -13 56 | -82 57 | 102 58 | -52 59 | -107 60 | -70 61 | 39 62 | 19 63 | 46 64 | -125 65 | 51 66 | 97 67 | 17 68 | -47 69 | -6 70 | 73 71 | 6 72 | 108 73 | -53 74 | 113 75 | -114 76 | -87 77 | -84 78 | 53 79 | 3 80 | -57 81 | -20 82 | -90 83 | -64 84 | 102 85 | -58 86 | -128 87 | -22 88 | 90 89 | 116 90 | -39 91 | 124 92 | 22 93 | -57 94 | 101 95 | -27 96 | 45 97 | 59 98 | -62 99 | -15 100 | 91 101 | -36 102 | -114 103 | 59 104 | 86 105 | 27 106 | -109 107 | 50 108 | -54 109 | -87 110 | -116 111 | 7 112 | -27 113 | 55 114 | -7 115 | -44 116 | 8 117 | 123 118 | 96 119 | 40 120 | -58 121 | 62 122 | 117 123 | -118 124 | 37 125 | 79 126 | 116 127 | -2 128 | -94 129 | 101 130 | 79 131 | -64 132 | 67 133 | -28 134 | -60 135 | -24 136 | -35 137 | -59 138 | -106 139 | -57 140 | 30 141 | 77 142 | 56 143 | 1 144 | 85 145 | -108 146 | 79 147 | -127 148 | -85 149 | -108 150 | -65 151 | -2 152 | -24 153 | 79 154 | 77 155 | -43 156 | 15 157 | -9 158 | 94 159 | 41 160 | -7 161 | -62 162 | -82 163 | -99 164 | -116 165 | -28 166 | -45 167 | -90 168 | -55 169 | -99 170 | 17 171 | 21 172 | -116 173 | 58 174 | -22 175 | 112 176 | 124 177 | -20 178 | 62 179 | -34 180 | -23 181 | -101 182 | -103 183 | 80 184 | 49 185 | 100 186 | -34 187 | -119 188 | -89 189 | -7 190 | -15 191 | -73 192 | -22 193 | -102 194 | -38 195 | 96 196 | 17 197 | -40 198 | -32 199 | 38 200 | 23 201 | 114 202 | -64 203 | 96 204 | 58 205 | 81 206 | -124 207 | 123 208 | -28 209 | -63 210 | -35 211 | -68 212 | -65 213 | 52 214 | -99 215 | 70 216 | 90 217 | 50 218 | 46 219 | -15 220 | -74 221 | -128 222 | -95 223 | -27 224 | -67 225 | -4 226 | 22 227 | -105 228 | 41 229 | 74 230 | -89 231 | 73 232 | -25 233 | -9 234 | -29 235 | 104 236 | 18 237 | 121 238 | -42 239 | -122 240 | 41 241 | 59 242 | -62 243 | -105 244 | 40 245 | 79 246 | -42 247 | 5 248 | -38 249 | -85 250 | -125 251 | -84 252 | 36 253 | -97 254 | 48 255 | 69 256 | -96 257 | -41 258 | -81 259 | 110 260 | 61 261 | -102 262 | 9 263 | -35 264 | -66 265 | 97 266 | 68 267 | -39 268 | 83 269 | 41 270 | -60 271 | 21 272 | -51 273 | -95 274 | 100 275 | -46 276 | -27 277 | -120 278 | 118 279 | 113 280 | 63 281 | -49 282 | 79 283 | 93 284 | 30 285 | -80 286 | 69 287 | 9 288 | -58 289 | 127 290 | -121 291 | -89 292 | 74 293 | 45 294 | 39 295 | -87 296 | 124 297 | -53 298 | -41 299 | 104 300 | -123 301 | -30 302 | 75 303 | 87 304 | 34 305 | 40 306 | 69 307 | 115 308 | -24 309 | 2 310 | -58 311 | -30 312 | -75 313 | 2 314 | -117 315 | 63 316 | -107 317 | -30 318 | -111 319 | 56 320 | -61 321 | -90 322 | -93 323 | -45 324 | -71 325 | 17 326 | -66 327 | 7 328 | -93 329 | -17 330 | 83 331 | 50 332 | 19 333 | 126 334 | -105 335 | 102 336 | 109 337 | 110 338 | -126 339 | -82 340 | -32 341 | -34 342 | 43 343 | -98 344 | 80 345 | -113 346 | -6 347 | 81 348 | 104 349 | -122 350 | 66 351 | 8 352 | 61 353 | -103 354 | 91 355 | 93 356 | -98 357 | 96 358 | 121 359 | 42 360 | -48 361 | 118 362 | 32 363 | 109 364 | 30 365 | -90 366 | 81 367 | -9 368 | 93 369 | -43 370 | -49 371 | -124 372 | 24 373 | -105 374 | 39 375 | 123 376 | 57 377 | 83 378 | -68 379 | -16 380 | -44 381 | 73 382 | -100 383 | 4 384 | -30 385 | -45 386 | 6 387 | -116 388 | 16 389 | -58 390 | -64 391 | -126 392 | 74 393 | 18 394 | -60 395 | 47 396 | -103 397 | -63 398 | -12 399 | -51 400 | -23 401 | -52 402 | 51 403 | 73 404 | -66 405 | 114 406 | 103 407 | -120 408 | 123 409 | -111 410 | -103 411 | -98 412 | 33 413 | 113 414 | 95 415 | -68 416 | 44 417 | 3 418 | -126 419 | 119 420 | -28 421 | -3 422 | 28 423 | 3 424 | 35 425 | -25 426 | -106 427 | -56 428 | -10 429 | 89 430 | -78 431 | 62 432 | 77 433 | -26 434 | -33 435 | -121 436 | 62 437 | 75 438 | 2 439 | 28 440 | 106 441 | -120 442 | 6 443 | -75 444 | -49 445 | 92 446 | -114 447 | 61 448 | -54 449 | -72 450 | 53 451 | 38 452 | 82 453 | -84 454 | -37 455 | -59 456 | 49 457 | 92 458 | 49 459 | 59 460 | -5 461 | 34 462 | 6 463 | -37 464 | -57 465 | -1 466 | 82 467 | 95 468 | 43 469 | 115 470 | -68 471 | 38 472 | -18 473 | 54 474 | 32 475 | -12 476 | 40 477 | 75 478 | -74 479 | -17 480 | -67 481 | -28 482 | -95 483 | -71 484 | 61 485 | 62 486 | -124 487 | 34 488 | -18 489 | 11 490 | -107 491 | 81 492 | -103 493 | -44 494 | 49 495 | 53 496 | 127 497 | -114 498 | 100 499 | 30 500 | -42 501 | 15 502 | 63 503 | -4 504 | -28 505 | 32 506 | -128 507 | -43 508 | 13 509 | 33 510 | -1 511 | 111 512 | 33 513 | -39 514 | -39 515 | -109 516 | 29 517 | -91 518 | -51 519 | -77 520 | 19 521 | 17 522 | -12 523 | -35 524 | -3 525 | 48 526 | 13 527 | -71 528 | -89 529 | -91 530 | 39 531 | -65 532 | -110 533 | 28 534 | 64 535 | -7 536 | -58 537 | -85 538 | -125 539 | -90 540 | -11 541 | -109 542 | -99 543 | -48 544 | 34 545 | 60 546 | 117 547 | -123 548 | -44 549 | 23 550 | 74 551 | -110 552 | -93 553 | 60 554 | -109 555 | 13 556 | 99 557 | -126 558 | 29 559 | 42 560 | 17 561 | -99 562 | -121 563 | -36 564 | -86 565 | -91 566 | 25 567 | -11 568 | -121 569 | -93 570 | 67 571 | 28 572 | 29 573 | -80 574 | -108 575 | -93 576 | 86 577 | 79 578 | 127 579 | -49 580 | 73 581 | 95 582 | 16 583 | -113 584 | 9 585 | -47 586 | -116 587 | 6 588 | 68 589 | 98 590 | 117 591 | 123 592 | -20 593 | 26 594 | -8 595 | -71 596 | -60 597 | 44 598 | 6 599 | 15 600 | -83 601 | 73 602 | -56 603 | 6 604 | 41 605 | 24 606 | 10 607 | -25 608 | 100 609 | -47 610 | -88 611 | -50 612 | 84 613 | 23 614 | -103 615 | -54 616 | 93 617 | -7 618 | -57 619 | 61 620 | -50 621 | 0 622 | -10 623 | 91 624 | 58 625 | 15 626 | 79 627 | -40 628 | -1 629 | -62 630 | 73 631 | 10 632 | -98 633 | 18 634 | 117 635 | 6 636 | 109 637 | -71 638 | 70 639 | -110 640 | 9 641 | -30 642 | -52 643 | 113 644 | 21 645 | -2 646 | 17 647 | 59 648 | 122 649 | 73 650 | -17 651 | 10 652 | -94 653 | 40 654 | -21 655 | -104 656 | 67 657 | 122 658 | 78 659 | -64 660 | -21 661 | -25 662 | -84 663 | 25 664 | 3 665 | -8 666 | -26 667 | -128 668 | -43 669 | -106 670 | 90 671 | -18 672 | -64 673 | 41 674 | 9 675 | 21 676 | 123 677 | -124 678 | -88 679 | -64 680 | 49 681 | 122 682 | 108 683 | -4 684 | -29 685 | -40 686 | -56 687 | 99 688 | -122 689 | -109 690 | 2 691 | 48 692 | -54 693 | 52 694 | 44 695 | -106 696 | -45 697 | -107 698 | -63 699 | 29 700 | -117 701 | -108 702 | -20 703 | -92 704 | 65 705 | 105 706 | 31 707 | -28 708 | 17 709 | 9 710 | -80 711 | -70 712 | 94 713 | 109 714 | 22 715 | 36 716 | 116 717 | 73 718 | 103 719 | -106 720 | -13 721 | -84 722 | 106 723 | 86 724 | 23 725 | 15 726 | -119 727 | 10 728 | -88 729 | 2 730 | -110 731 | -48 732 | 102 733 | 49 734 | 125 735 | 82 736 | 116 737 | 45 738 | 99 739 | -121 740 | 100 741 | -39 742 | -58 743 | -73 744 | 47 745 | -10 746 | 3 747 | -117 748 | -70 749 | 98 750 | -100 751 | 27 752 | -44 753 | -24 754 | 105 755 | -19 756 | -99 757 | 92 758 | -44 759 | -100 760 | 61 761 | 29 762 | 120 763 | -12 764 | -26 765 | -32 766 | 74 767 | -24 768 | -109 769 | 124 770 | -34 771 | 66 772 | -62 773 | -80 774 | -70 775 | -100 776 | -127 777 | -45 778 | -108 779 | -26 780 | 40 781 | 72 782 | -24 783 | 68 784 | 55 785 | 114 786 | -26 787 | 40 788 | 68 789 | 40 790 | 27 791 | 45 792 | 33 793 | 48 794 | 36 795 | -108 796 | -119 797 | 103 798 | 96 799 | 29 800 | 29 801 | -20 802 | 20 803 | 56 804 | -29 805 | 87 806 | -49 807 | 66 808 | -117 809 | -4 810 | -97 811 | 96 812 | -108 813 | 41 814 | -82 815 | 116 816 | -39 817 | -43 818 | -96 819 | 105 820 | 79 821 | 97 822 | -98 823 | 66 824 | 78 825 | -73 826 | 120 827 | -95 828 | 48 829 | 83 830 | -13 831 | 1 832 | 23 833 | 27 834 | 42 835 | -37 836 | -45 837 | 14 838 | 93 839 | -101 840 | 92 841 | 68 842 | 17 843 | 77 844 | -124 845 | -96 846 | -69 847 | 7 848 | 84 849 | 94 850 | 121 851 | 113 852 | 22 853 | 7 854 | 20 855 | 79 856 | -41 857 | -118 858 | 57 859 | -36 860 | 99 861 | -83 862 | 84 863 | 28 864 | 50 865 | 22 866 | 115 867 | 57 868 | 120 869 | 21 870 | 100 871 | 45 872 | -50 873 | -47 874 | -59 875 | -58 876 | 53 877 | 61 878 | -83 879 | -27 880 | -38 881 | 62 882 | 45 883 | 91 884 | 60 885 | 10 886 | 83 887 | 2 888 | -51 889 | -102 890 | 26 891 | 114 892 | -72 893 | -70 894 | -86 895 | 15 896 | -19 897 | -122 898 | -16 899 | -12 900 | -17 901 | 29 902 | 57 903 | -13 904 | -55 905 | -99 906 | 14 907 | 94 908 | 125 909 | -121 910 | -119 911 | 117 912 | -32 913 | -103 914 | 72 915 | 10 916 | -8 917 | 122 918 | 46 919 | 55 920 | 40 921 | -23 922 | -11 923 | 90 924 | -72 925 | 56 926 | -54 927 | 115 928 | -66 929 | -78 930 | 10 931 | 88 932 | 14 933 | -119 934 | 106 935 | -66 936 | 65 937 | 5 938 | -41 939 | -1 940 | 50 941 | 94 942 | 22 943 | 81 944 | 22 945 | 67 946 | 45 947 | -103 948 | -6 949 | 102 950 | -44 951 | -48 952 | -92 953 | 121 954 | 1 955 | -14 956 | 105 957 | -108 958 | 0 959 | 110 960 | -122 961 | 111 962 | 54 963 | -121 964 | 2 965 | 28 966 | 94 967 | -48 968 | -77 969 | -110 970 | -11 971 | -42 972 | 65 973 | 44 974 | 118 975 | 72 976 | 51 977 | 42 978 | -92 979 | 69 980 | 89 981 | 21 982 | -13 983 | -37 984 | -109 985 | -71 986 | -83 987 | 100 988 | 35 989 | 61 990 | -46 991 | -33 992 | -21 993 | 118 994 | 69 995 | 53 996 | -95 997 | 55 998 | -16 999 | 63 1000 | -55 1001 | 12 1002 | -59 1003 | 40 1004 | 57 1005 | 36 1006 | 75 1007 | 100 1008 | -97 1009 | -70 1010 | 80 1011 | -45 1012 | 92 1013 | 114 1014 | 48 1015 | -103 1016 | -73 1017 | -101 1018 | -73 1019 | 104 1020 | -109 1021 | 42 1022 | -106 1023 | -124 1024 | -111 1025 | -83 1026 | 108 1027 | 50 1028 | 127 1029 | -67 1030 | -122 1031 | -81 1032 | 66 1033 | -121 1034 | 82 1035 | 1 1036 | -121 1037 | 70 1038 | -25 1039 | -8 1040 | -116 1041 | 14 1042 | 31 1043 | -49 1044 | 122 1045 | 121 1046 | -41 1047 | -85 1048 | 45 1049 | 101 1050 | 11 1051 | -20 1052 | 24 1053 | 90 1054 | 62 1055 | 47 1056 | 124 1057 | 41 1058 | -37 1059 | -20 1060 | 6 1061 | -46 1062 | 65 1063 | -80 1064 | -84 1065 | 112 1066 | -62 1067 | 3 1068 | 48 1069 | 22 1070 | 89 1071 | -72 1072 | -32 1073 | -58 1074 | -92 1075 | -90 1076 | -57 1077 | -18 1078 | -117 1079 | 83 1080 | 10 1081 | 99 1082 | -77 1083 | -31 1084 | 54 1085 | -81 1086 | -53 1087 | -59 1088 | -101 1089 | 112 1090 | -85 1091 | -66 1092 | 7 1093 | 74 1094 | -24 1095 | -102 1096 | -128 1097 | 32 1098 | -26 1099 | 113 1100 | 84 1101 | -105 1102 | 107 1103 | 91 1104 | -73 1105 | -80 1106 | -114 1107 | -100 1108 | -118 1109 | 81 1110 | 113 1111 | -16 1112 | -82 1113 | 46 1114 | -74 1115 | 114 1116 | 22 1117 | -31 1118 | -106 1119 | -75 1120 | 38 1121 | -26 1122 | 82 1123 | -110 1124 | 46 1125 | -30 1126 | -32 1127 | -21 1128 | -28 1129 | -56 1130 | 93 1131 | 56 1132 | 5 1133 | -122 1134 | 108 1135 | -68 1136 | -40 1137 | -109 1138 | 57 1139 | -45 1140 | 18 1141 | 70 1142 | -10 1143 | -104 1144 | -49 1145 | -109 1146 | -93 1147 | 81 1148 | 45 1149 | 65 1150 | -106 1151 | -60 1152 | 122 1153 | 122 1154 | 127 1155 | 88 1156 | 33 1157 | -126 1158 | 86 1159 | 94 1160 | 36 1161 | -96 1162 | 61 1163 | -88 1164 | 124 1165 | 39 1166 | -27 1167 | 90 1168 | -94 1169 | -69 1170 | -74 1171 | -98 1172 | -55 1173 | -17 1174 | 115 1175 | 118 1176 | 88 1177 | 66 1178 | 125 1179 | -9 1180 | 79 1181 | -50 1182 | -39 1183 | -86 1184 | -6 1185 | 18 1186 | 114 1187 | -92 1188 | -73 1189 | 69 1190 | 33 1191 | -35 1192 | 123 1193 | -105 1194 | 5 1195 | 9 1196 | 7 1197 | -23 1198 | 74 1199 | 121 1200 | 7 1201 | -112 1202 | 93 1203 | -91 1204 | -84 1205 | 78 1206 | 105 1207 | 2 1208 | -26 1209 | -29 1210 | -118 1211 | 18 1212 | -7 1213 | -77 1214 | -1 1215 | -7 1216 | -69 1217 | -66 1218 | 14 1219 | 88 1220 | 56 1221 | -98 1222 | 77 1223 | -25 1224 | -115 1225 | -70 1226 | -86 1227 | -14 1228 | 58 1229 | 84 1230 | 60 1231 | 70 1232 | -12 1233 | -126 1234 | 18 1235 | 99 1236 | 105 1237 | 6 1238 | -91 1239 | 96 1240 | 96 1241 | 93 1242 | -124 1243 | -103 1244 | -110 1245 | 45 1246 | 76 1247 | -44 1248 | -13 1249 | 46 1250 | -66 1251 | -17 1252 | -119 1253 | 97 1254 | -33 1255 | 1 1256 | 92 1257 | -57 1258 | -87 1259 | -66 1260 | 125 1261 | 94 1262 | -40 1263 | 0 1264 | 126 1265 | -17 1266 | 18 1267 | -80 1268 | 99 1269 | -55 1270 | 53 1271 | -34 1272 | -26 1273 | -127 1274 | 28 1275 | 53 1276 | 60 1277 | 8 1278 | -60 1279 | -102 1280 | -61 1281 | -------------------------------------------------------------------------------- /block_fun: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Michaela1224/SDA_code/5cc239f6bd1b48a8e1b689b0cb8af0d9dec511cf/block_fun -------------------------------------------------------------------------------- /block_test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Michaela1224/SDA_code/5cc239f6bd1b48a8e1b689b0cb8af0d9dec511cf/block_test -------------------------------------------------------------------------------- /block_test.cpp: -------------------------------------------------------------------------------- 1 | // #include "/tools/Xilinx/Vitis_HLS/2020.2/include/gmp.h" 2 | // #define __gmp_const const 3 | 4 | #include 5 | #include 6 | #include 7 | #include "block_top.h" 8 | #include "test.h" 9 | #include "config_test.h" 10 | // using namespace std; 11 | #define GENERATE_BIN 12 | 13 | /* 14 | int main(void){ 15 | 16 | // conv 17 | ap_uint<128> *conv3_ddr_a; 18 | 19 | conv3_ddr_a = (ap_uint<128>*)malloc((2*CONV_R*CONV_C*CONV_M/(MAX_OUP*2))*sizeof(ap_uint<128>)); //typedef ap_int<32> ADT4; 20 | 21 | 22 | FILE* fp_QAin = fopen("conv_in_oup.bin", "rb"); 23 | 24 | fread(conv3_ddr_a,sizeof(ap_uint<128>),(2*CONV_R*CONV_C*CONV_M/(MAX_OUP*2)),fp_QAin); 25 | // fread(conv3_ddr_shortcut,sizeof(ap_uint<256>),(CONV_R*CONV_C*CONV_N/(MAX_INP * PACK_NUM)),fp_QAin); 26 | 27 | fclose(fp_QAin); 28 | 29 | 30 | ap_uint<128> *conv3_ddr_w; 31 | 32 | conv3_ddr_w =(ap_uint<128>*)malloc(((CONV_K*CONV_N)/MAX_INP)*(CONV_M)*2*sizeof(ap_uint<128>)); //typedef ap_int<32> ADT4; 33 | 34 | FILE* fp_QWin = fopen("conv3_w.bin", "rb"); 35 | 36 | fread(conv3_ddr_w,sizeof(ap_uint<128>),2*((CONV_K*CONV_N)/MAX_INP)*CONV_M,fp_QWin); 37 | 38 | cout< *conv3_ddr_fc_shortcut; 44 | // conv3_ddr_fc_shortcut = (ap_uint<128>*)malloc((CONV_M/(MAX_OUP))*2*sizeof(ap_uint<128>)); //typedef ap_int<32> ADT4; 45 | 46 | // FILE* fp_QFC = fopen("conv3_fcvu_short.bin", "rb"); 47 | 48 | // fread(conv3_ddr_fc_shortcut,sizeof(ap_uint<128>),(CONV_M/(MAX_OUP))*2,fp_QFC); 49 | // fclose(fp_QFC); 50 | 51 | ap_uint<128> *conv3_ddr_shortcut; 52 | cout<<"Byte: "<)<*)malloc((CONV_R*CONV_C*CONV_M/(2 * MAX_OUP))*2*sizeof(ap_uint<128>)); // 8192 54 | 55 | FILE* fpa_shortcut = fopen("conv3_fm_short.bin", "wb"); 56 | 57 | fwrite(conv3_ddr_shortcut,sizeof(ap_uint<128>),(CONV_R*CONV_C*CONV_M/(2 * MAX_OUP))*2,fpa_shortcut); 58 | 59 | fclose(fpa_shortcut); 60 | 61 | 62 | 63 | 64 | // bia parameter////////////////// 65 | 66 | // 这里 待修改 67 | 68 | unsigned bias_num=CONV_M/(MAX_OUP/2); 69 | 70 | unsigned factor_num=1; 71 | 72 | const unsigned PACKING_MAX_NORM_PE_NUM=128/(MAX_NORM_PE*16*2); 73 | unsigned norm_gamma_beta_num=CONV_M/(PACKING_MAX_NORM_PE_NUM*MAX_NORM_PE); 74 | const unsigned PACKING_MAX_NORM_PE_PTF_FACTOR_NUM=2; // attention: 当Y不为20要做修改 512/(2*) 75 | unsigned norm_ptf_num=CONV_M/(PACKING_MAX_NORM_PE_PTF_FACTOR_NUM*MAX_OUP); 76 | 77 | unsigned total_num=bias_num+factor_num+norm_gamma_beta_num+norm_ptf_num; 78 | 79 | 80 | 81 | ap_uint<128> *input_ln_parameter = (ap_uint<128> *)malloc((total_num)*sizeof(ap_uint<128>)); 82 | 83 | 84 | 85 | FILE* fp_QBin = fopen("conv3_bias.bin", "rb"); 86 | 87 | fread(input_ln_parameter,sizeof(ap_uint<128>),total_num,fp_QBin); 88 | 89 | fclose(fp_QBin); 90 | 91 | 92 | 93 | 94 | 95 | unsigned layer_bias_offset=0; 96 | 97 | // unsigned layermode=0; 98 | 99 | 100 | 101 | 102 | ap_uint<128>* ddr_fm_back; 103 | ap_uint<128>* ddr_fm_shortcut_back; 104 | 105 | ddr_fm_back = (ap_uint<128>*)malloc((2*CONV_R*CONV_C*CONV_M/(MAX_OUP*2))*sizeof(ap_uint<128>)); 106 | ddr_fm_shortcut_back = (ap_uint<128>*)malloc((2*CONV_R*CONV_C*CONV_M/(MAX_OUP*2))*sizeof(ap_uint<128>)); 107 | 108 | 109 | 110 | 111 | unsigned which_path; 112 | which_path=2; 113 | 114 | bool CONV1_TO_MM_EN; 115 | 116 | CONV1_TO_MM_EN=true; 117 | 118 | do_compute_top(conv3_ddr_a, conv3_ddr_w ,input_ln_parameter,conv3_ddr_shortcut, ddr_fm_back, ddr_fm_shortcut_back, layer_bias_offset, CONV_R, CONV_C, CONV_N, CONV_M,CONV_D, 119 | which_path,CONV1_TO_MM_EN); 120 | 121 | 122 | 123 | // #ifdef GENERATE_BIN 124 | // FILE* fpout_short = fopen("conv_out_short.bin", "wb"); 125 | 126 | // fwrite(ddr_fm_shortcut_back,sizeof(ap_uint<128>),(2*CONV_R*CONV_C*CONV_M/(MAX_OUP*2)),fpout_short); 127 | 128 | // fclose(fpout_short); 129 | // #endif 130 | 131 | 132 | #ifdef GENERATE_BIN 133 | FILE* fpout = fopen("conv_out.bin", "wb"); 134 | 135 | fwrite(ddr_fm_back,sizeof(ap_uint<128>),(2*CONV_R*CONV_C*CONV_M/(MAX_OUP*2)),fpout); 136 | 137 | fclose(fpout); 138 | #endif 139 | 140 | 141 | 142 | ap_uint<128> temp_128b; 143 | ap_uint temp; 144 | 145 | ap_uint temp_8b0; 146 | ap_uint temp_8b1; 147 | 148 | FILE* fp1 = fopen("result_verify_norm_1.txt", "wb"); 149 | 150 | for(int i=0; i<2*CONV_R*CONV_C*CONV_M/(MAX_OUP*2);i++){ 151 | #pragma HLS PIPELINE II=1 152 | temp_128b=ddr_fm_back[i]; 153 | temp=temp_128b; 154 | for(int j=0; j A[R][N]; 203 | ap_int W[N][M]; 204 | float O_golden[R][M]; 205 | 206 | 207 | generate_mm_a(A); 208 | generate_mm_w(W); 209 | generate_mm_output(A,W,O_golden); 210 | 211 | 212 | 213 | ap_uint<128> *DDR_A; 214 | // cout<<"Byte: "<)<*)malloc((R*M*2/(MAX_OUP*2))*sizeof(ap_uint<128>)); //typedef ap_int<32> ADT4; 216 | 217 | // host_DDR_A_128b(A, DDR_A); 218 | // // cout <<"The Value of Var_p: \t" <),(R*M*2/(MAX_OUP*2)),fp_QAin); 224 | // fread(conv3_ddr_shortcut,sizeof(ap_uint<256>),(CONV_R*CONV_C*CONV_N/(MAX_INP * PACK_NUM)),fp_QAin); 225 | 226 | fclose(fp_QAin); 227 | 228 | 229 | 230 | 231 | // ap_uint<128> *DDR_A_shortcut; 232 | 233 | // DDR_A_shortcut = (ap_uint<128>*)malloc((2*R*M/(MAX_OUP * PACK_NUM))*sizeof(ap_uint<128>)); //typedef ap_int<32> ADT4; 234 | // generate_mm_shortcut_128btest(DDR_A_shortcut); 235 | 236 | // FILE* fp_QASHORT = fopen("mm_fm_trans_160.bin", "rb"); 237 | 238 | // fread(DDR_A_shortcut,sizeof(ap_uint<128>),(R*M*2/(MAX_OUP*2)),fp_QASHORT); 239 | 240 | 241 | // fclose(fp_QASHORT); 242 | 243 | 244 | 245 | // #ifdef GENERATE_BIN 246 | // FILE* fpa_short = fopen("mm_fm_160_short.bin", "wb"); 247 | 248 | // fwrite(DDR_A_shortcut,sizeof(ap_uint<128>),(2*R*M/(MAX_OUP * PACK_NUM)),fpa_short); 249 | 250 | // fclose(fpa_short); 251 | // #endif 252 | 253 | 254 | // ap_uint<128> *DDR_W; 255 | 256 | // DDR_W = (ap_uint<128>*)malloc((2*N*M/(MAX_OUP * PACK_NUM))*sizeof(ap_uint<128>)); //typedef ap_int<32> ADT4; 257 | 258 | // host_DDR_W_Softmax_128b(W, DDR_W); 259 | 260 | // #ifdef GENERATE_BIN 261 | // FILE* fpw = fopen("mm_w_160.bin", "wb"); 262 | 263 | // fwrite(DDR_W,sizeof(ap_uint<128>),(2*N*M/(MAX_OUP * PACK_NUM)),fpw); 264 | 265 | // fclose(fpw); 266 | // #endif 267 | 268 | ap_uint<128> *DDR_W; 269 | 270 | DDR_W = (ap_uint<128>*)malloc((N*M/(MAX_OUP * PACK_NUM))*sizeof(ap_uint<128>)); //typedef ap_int<32> ADT4; 271 | 272 | host_DDR_W_128b(W, DDR_W); 273 | 274 | #ifdef GENERATE_BIN 275 | FILE* fpw = fopen("mm_w_160.bin", "wb"); 276 | 277 | fwrite(DDR_W,sizeof(ap_uint<128>),(N*M/(MAX_OUP * PACK_NUM)),fpw); 278 | 279 | fclose(fpw); 280 | #endif 281 | 282 | 283 | 284 | unsigned bias_num=M/(MAX_OUP/2); 285 | 286 | unsigned factor_num=1; 287 | 288 | // const unsigned PACKING_MAX_NORM_PE_NUM=128/(MAX_NORM_PE*16*2); 289 | // unsigned norm_gamma_beta_num=M/(PACKING_MAX_NORM_PE_NUM*MAX_NORM_PE); 290 | // const unsigned PACKING_MAX_NORM_PE_PTF_FACTOR_NUM=2; // attention: 当Y不为20要做修改 512/(2*) 291 | // unsigned norm_ptf_num=M/(PACKING_MAX_NORM_PE_PTF_FACTOR_NUM*MAX_OUP); 292 | 293 | // unsigned total_num=bias_num+factor_num+norm_gamma_beta_num+norm_ptf_num; 294 | 295 | unsigned total_num=bias_num+factor_num; 296 | 297 | ap_uint<128> *input_ln_parameter = (ap_uint<128> *)malloc((total_num)*sizeof(ap_uint<128>)); 298 | 299 | 300 | generate_conv_allbias_128btest(input_ln_parameter,bias_num,total_num); 301 | 302 | 303 | #ifdef GENERATE_BIN 304 | FILE* fpb = fopen("mm_bias_160.bin", "wb"); 305 | 306 | fwrite(input_ln_parameter,sizeof(ap_uint<128>),total_num,fpb); 307 | 308 | fclose(fpb); 309 | #endif 310 | 311 | 312 | 313 | 314 | 315 | ap_uint<128>* ddr_fm_back; 316 | ap_uint<128>* ddr_fm_shortcut_back; 317 | 318 | ddr_fm_back = (ap_uint<128>*)malloc((R*M*2/(MAX_OUP*2))*sizeof(ap_uint<128>)); 319 | ddr_fm_shortcut_back = (ap_uint<128>*)malloc((R*M*2/(MAX_OUP*2))*sizeof(ap_uint<128>)); 320 | 321 | 322 | 323 | 324 | unsigned which_path; 325 | which_path=10; 326 | 327 | bool CONV1_TO_MM_EN; 328 | 329 | CONV1_TO_MM_EN=false; 330 | 331 | do_compute_top(DDR_A, DDR_W,input_ln_parameter,0, ddr_fm_back, ddr_fm_shortcut_back, 0,0, R, 0, N, M,0, 332 | which_path,CONV1_TO_MM_EN); 333 | 334 | 335 | 336 | #ifdef GENERATE_BIN 337 | FILE* fpout_short = fopen("mm_out_short_160.bin", "wb"); 338 | 339 | fwrite(ddr_fm_shortcut_back,sizeof(ap_uint<128>),R*M*2/(MAX_OUP*2),fpout_short); 340 | 341 | fclose(fpout_short); 342 | #endif 343 | 344 | 345 | #ifdef GENERATE_BIN 346 | FILE* fpout = fopen("mm_out_160.bin", "wb"); 347 | 348 | fwrite(ddr_fm_back,sizeof(ap_uint<128>),R*M*2/(MAX_OUP*2),fpout); 349 | 350 | fclose(fpout); 351 | #endif 352 | 353 | 354 | ap_uint<128> temp_128b; 355 | ap_uint temp; 356 | 357 | ap_uint temp_8b0; 358 | ap_uint temp_8b1; 359 | 360 | FILE* fp1 = fopen("result_verify_norm_1.txt", "wb"); 361 | 362 | for(int i=0; i 5 | #include 6 | using namespace hls; 7 | 8 | 9 | 10 | 11 | void do_compute_top(ap_uint<128>* img_conv3_mm, 12 | // conv3的权重输入 13 | ap_uint<128> *weight_conv3_mm, 14 | 15 | // ap_uint<512>*fc_weight, 16 | // bias+scalefactor 17 | ap_uint<128>* ddr_bias_scale_factor, // BIAS_BIT*16 18 | // 19 | ap_uint<128>* ddr_fm_shortcut, 20 | 21 | ap_uint<128>* ddr_fm_back, 22 | ap_uint<128>* ddr_fm_shortcut_back, 23 | // stream > fifo_C_deQua[16], 24 | 25 | const unsigned layer_bias_offset, 26 | const unsigned layer_weight_offset, 27 | // const ap_uint<4> ENCODE_MODE, 28 | const unsigned R, 29 | const unsigned C, 30 | const unsigned N, 31 | const unsigned M, 32 | const unsigned D, 33 | const unsigned WhichPath, 34 | const bool CONV1_TO_MM_EN); 35 | 36 | 37 | -------------------------------------------------------------------------------- /config.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #define CONV_K 3 5 | 6 | 7 | #define MAX_INP 20 8 | #define MAX_OUP 10 9 | 10 | #define SA_INP 5 11 | #define SA_OUP 5 12 | 13 | #define MAX_A_ROW MAX_INP/SA_INP 14 | #define MAX_A_COL MAX_OUP/SA_OUP 15 | 16 | // 假定 S2三个处理的并行度都相同 17 | #define MAX_NORM_PE 5 // Norm单元的并行度 18 | #define MAX_SOFTMAX_STAGE1_PE 5 19 | #define MAX_SOFTMAX_STAGE2_PE 5 20 | #define MAX_GELU_PE 5 21 | 22 | 23 | 24 | #define FC_INP 2 25 | #define FC_OUP 10 // 02-24-setting true 26 | #define LINREAR_N 1280 27 | 28 | 29 | #define IN_BIT 8 30 | #define W_BIT 4 31 | #define PACK_NUM 2 32 | #define PACK_CONV_NUM 3 33 | #define PACK_OUT_NUM 4 34 | #define ACC_BIT 36 // 02-23-setting true 35 | #define BIAS_BIT 16 36 | typedef ap_fixed<16, 8> LN_BIAS_DB; // LN gamma、beta 37 | #define OUT_BIT 8 38 | 39 | #define SHORTCUT_BIT 8 40 | #define DEQUAN_BIT 16 41 | #define DEQUAN_INTEGER_BIT 8 42 | #define Shift_Num 17 43 | 44 | typedef ap_fixed De_Quan_DB; // LN gamma、beta 45 | 46 | #define QUAN_FACTOR_BIT 16 47 | #define QUAN_FACTOR_INTEGER_BIT 8 48 | typedef ap_fixed Quan_Factor_DB; // LN gamma、beta 49 | 50 | #define SILU_BIT 16 51 | #define SILU_INTEGER_BIT 8 52 | typedef ap_fixed SILU_DB; // LN gamma、beta 53 | 54 | 55 | 56 | 57 | #define ILN_WIDTH 8 58 | #define LN_PWF_FACTOR_BIT 2 // LN PWF FACTOR 59 | #define ILN_N_MEAN_WIDTH 27// LN的输入 8+3+17 约为 60 | #define ILN_N_VAR_WIDTH 35// LN的输入 8+4+6 +17= 35 61 | #define ILN_MEAN_WIDTH 11// LN的输入 8+3 62 | #define ILN_VAR_WIDTH 18// LN的输入 8+4+6 63 | #define ILN_OUT_WIDTH 18 // LN的输入 64 | #define ILN_OUT_INTEGER_WIDTH 12 // LN的输入 65 | typedef ap_fixed LN_OUT_DB; 66 | 67 | 68 | 69 | #define SOFTMAX_OUT_WIDTH 16 70 | #define SOFTMAX_OUT_INTEGER_WIDTH 2 // LN的输入 71 | 72 | 73 | 74 | #define MAX_SOFTMAX_M_LENGTH 4100 // 02-24-setting true 75 | typedef ap_fixed<24, 8> SOFTMAX_SUM_DB; 76 | 77 | 78 | 79 | #define GELU_OUT_WIDTH 16 // LN的输入 80 | #define GELU_OUT_INTEGER_WIDTH 8 // LN的输入 81 | 82 | 83 | #define MAX_SOFTMAX_INBUF_LENGTH 8240 // 02-23-setting true 84 | #define MAX_GELU_INBUF_LENGTH 8240 // 02-23-setting true 85 | #define MAX_GELU_ROW_INBUF_LENGTH 5120 // 02-23-setting true 86 | #define MAX_NORM_INBUF_LENGTH 8240 // 02-23-setting true 87 | #define MAX_SHORTCUT_NORM_INBUF_LENGTH 2560 // 02-23-setting true 88 | 89 | #define AXI_BIAS_BIT 512 90 | 91 | // #define MAX_M 320 92 | // #define MAX_N 320 93 | 94 | #define MAX_CONV3_WEIGHT_LENGTH 7680 // 02-23-setting true 95 | #define MAX_CONV3_BIAS_LENGTH 410 // 02-23-setting true 10240/10 96 | #define MAX_SCALE_FACTOR_LENGTH 6 //quan/dequan number 97 | #define MAX_MM_FM_LENGTH 5120 // 02-23-setting true 98 | #define MAX_NORM_BIAS_LENGTH 256 // 02-23-setting true 99 | #define MAX_NORM_PWF_FACTOR_LENGTH 128 // 02-23-setting true 100 | 101 | // #define MAX_FC1D_WEIGHT_LENGTH 1600 // (1280/(FC_INP*2))*(1280/FC_OUP) 02-24-setting true 102 | // #define MAX_FC1D_INPUT_LENGTH 320 // 02-23-setting true 103 | // #define MAX_FC_BIAS_LENGTH 640 // 02-23-setting true 104 | -------------------------------------------------------------------------------- /config_test.h: -------------------------------------------------------------------------------- 1 | // MM 2 | #define R 160 3 | #define N 160 4 | #define M 160 5 | 6 | // WS 7 | 8 | 9 | #define CONV_R 32 10 | #define CONV_C 32 11 | #define CONV_N 80 12 | #define CONV_M 80 13 | #define CONV_D 20 14 | 15 | #define layer1_W_offset 0 16 | #define layer2_W_offset ((CONV_K*CONV_N)/MAX_INP)*CONV_M*2 17 | 18 | #define layer1_BIAS_offset 0 19 | #define layer2_BIAS_offset (CONV_M/(MAX_OUP/2))+1+(CONV_M/(2*MAX_OUP)) 20 | -------------------------------------------------------------------------------- /diffusion-lib.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "/tools/Xilinx/Vitis_HLS/2020.2/include/gmp.h" 3 | #define __gmp_const const 4 | 5 | #include "sa_tools.h" 6 | #include "stream_tools.h" 7 | #include "load_param.h" 8 | #include -------------------------------------------------------------------------------- /load_param: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Michaela1224/SDA_code/5cc239f6bd1b48a8e1b689b0cb8af0d9dec511cf/load_param -------------------------------------------------------------------------------- /load_param.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "/tools/Xilinx/Vitis_HLS/2020.2/include/gmp.h" 3 | #define __gmp_const const 4 | 5 | #include "config.h" 6 | #include 7 | using namespace hls; 8 | using namespace std; 9 | 10 | // #define OUTPUTW_DEBUG 11 | 12 | 13 | void WriteMMFMParam_MMTRANSFER(ap_uint<128>* in, ap_uint< MAX_INP * IN_BIT * PACK_NUM> mm_a_buf[MAX_MM_FM_LENGTH], 14 | const unsigned MM_N){ 15 | #pragma HLS INLINE OFF 16 | 17 | 18 | 19 | 20 | ap_uint BUFA_80b[MAX_INP]; 21 | ap_uint BUFB_80b[MAX_INP]; 22 | #pragma HLS ARRAY_PARTITION variable=BUFA_80b complete dim=0 23 | #pragma HLS ARRAY_PARTITION variable=BUFB_80b complete dim=0 24 | 25 | ap_uint<128> temp_128b; 26 | ap_uint<80> temp_80b; 27 | ap_uint< MAX_INP * IN_BIT * PACK_NUM > temp_oup; 28 | 29 | bool arb = 0; 30 | unsigned int max_inp_cnt=0; 31 | unsigned int cnt_write=0; 32 | 33 | for(unsigned i=0; i=MAX_INP&&max_inp_cnt<(MAX_OUP/2)){ 48 | for(unsigned m=0;m* in, ap_uint< MAX_INP * IN_BIT * PACK_NUM> mm_a_buf[MAX_MM_FM_LENGTH], 95 | const unsigned MM_N){ 96 | #pragma HLS INLINE OFF 97 | 98 | ap_uint<128> weight_in[3]; 99 | ap_uint<384> weight_in_384b; 100 | 101 | unsigned int bitIdx=0; 102 | unsigned int colIdx=0; 103 | ap_uint< MAX_INP * IN_BIT * PACK_NUM > temp; // 20*8*2=320b 104 | for(unsigned i=0; i > &fifo_out, 133 | ap_uint<128>* ddr_fm_result, 134 | const unsigned NumLines){ 135 | 136 | 137 | ap_uint temp; 138 | ap_uint<128> temp_128b; 139 | 140 | 141 | for (unsigned rep = 0; rep < NumLines; rep++) { 142 | #pragma HLS PIPELINE II=1 143 | temp=fifo_out.read(); 144 | temp_128b=temp; 145 | ddr_fm_result[rep]=temp_128b; 146 | 147 | } 148 | } 149 | 150 | 151 | void Write_Shortcut_conv3_to_conv1(stream > &fifo_out, 152 | ap_uint<128>* ddr_fm_result, 153 | const unsigned OUT_W, 154 | const unsigned D, 155 | const unsigned OUT_H, 156 | const unsigned M_div_D, 157 | const unsigned NumLines){ 158 | 159 | unsigned int loop0,loop1,loop2,loop3,loop4; 160 | 161 | unsigned int loop0_cnt=0; 162 | unsigned int loop1_cnt=0; 163 | unsigned int loop2_cnt=0; 164 | unsigned int loop3_cnt=0; 165 | unsigned int loop4_cnt=0; 166 | // unsigned int loop5_cnt=0; 167 | 168 | 169 | ap_uint temp; 170 | ap_uint<128> temp_128b; 171 | unsigned index; 172 | 173 | 174 | loop0=2; // MAX_OUP本身拆成的2个 175 | loop1=OUT_W/2; 176 | loop2=OUT_H; 177 | loop3=D/MAX_OUP; 178 | loop4=M_div_D; 179 | 180 | for (unsigned rep = 0; rep < NumLines; rep++) { 181 | #pragma HLS PIPELINE II=1 182 | temp=fifo_out.read(); 183 | temp_128b=temp; 184 | 185 | // 存数按照数据下次用的顺序 186 | index=loop4_cnt*loop3*loop2*loop1*loop0+loop3_cnt*loop2*loop1*loop0+loop2_cnt*loop1*loop0+loop1_cnt*loop0+loop0_cnt; 187 | 188 | ddr_fm_result[rep]=temp_128b; 189 | 190 | 191 | // 取数按照数据来的顺序 192 | if(loop0_cnt==loop0-1){ 193 | loop0_cnt=0; 194 | if(loop1_cnt==loop1-1){ 195 | loop1_cnt=0; 196 | if(loop3_cnt==loop3-1){ 197 | loop3_cnt=0; 198 | if(loop2_cnt==loop2-1){ 199 | loop2_cnt=0; 200 | if(loop4_cnt==loop4-1){ 201 | loop4_cnt=0; 202 | } 203 | else{ 204 | loop4_cnt++; 205 | } 206 | } 207 | else{ 208 | loop2_cnt++; 209 | } 210 | } 211 | else{ 212 | loop3_cnt++; 213 | } 214 | } 215 | else{ 216 | loop1_cnt++; 217 | } 218 | } 219 | else{ 220 | loop0_cnt++; 221 | } 222 | } 223 | 224 | } 225 | 226 | void Write_Out_to_DDR_DIRECT(stream > &fifo_out, 227 | ap_uint<128>* ddr_fm_result, 228 | const unsigned NumLines 229 | ){ 230 | 231 | 232 | 233 | ap_uint temp; 234 | ap_uint<128> temp_128b; 235 | 236 | 237 | for (unsigned rep = 0; rep < NumLines; rep++) { 238 | #pragma HLS PIPELINE II=1 239 | temp=fifo_out.read(); 240 | temp_128b=temp; 241 | ddr_fm_result[rep]=temp_128b; 242 | 243 | } 244 | 245 | 246 | } 247 | 248 | 249 | 250 | void Write_to_DDR_NORM_MM_FM_SOFTMAX_GELU(stream > &fifo_out, 251 | ap_uint<128>* ddr_fm_result, 252 | const unsigned PENUM, 253 | const unsigned R_div_2INP, // Groups 254 | const unsigned NumLines){ 255 | 256 | 257 | unsigned int loop0,loop1,loop2,loop3; 258 | 259 | unsigned int loop0_cnt=0; 260 | unsigned int loop1_cnt=0; 261 | unsigned int loop2_cnt=0; 262 | unsigned int loop3_cnt=0; 263 | 264 | loop0=MAX_INP; // MAX_OUP本身拆成的2个 265 | loop1=4; // 除了MAXOUP是2个, 还有MAX_OUP拆成两个 266 | loop2=PENUM/2; 267 | loop3=R_div_2INP; 268 | 269 | 270 | 271 | 272 | 273 | ap_uint temp; 274 | ap_uint<128> temp_128b; 275 | unsigned index; 276 | 277 | 278 | for (unsigned rep = 0; rep < NumLines; rep++) { 279 | #pragma HLS PIPELINE II=1 280 | temp=fifo_out.read(); 281 | temp_128b=temp; 282 | 283 | // 存数按照数据下次用的顺序 284 | index=loop3_cnt*loop2*loop1*loop0+loop2_cnt*loop1*loop0+loop1_cnt*loop0+loop0_cnt; 285 | 286 | ddr_fm_result[rep]=temp_128b; 287 | 288 | 289 | // 取数按照数据来的顺序 290 | if(loop1_cnt==loop1-1){ 291 | loop1_cnt=0; 292 | if(loop2_cnt==loop2-1){ 293 | loop2_cnt=0; 294 | if(loop0_cnt==loop0-1){ 295 | loop0_cnt=0; 296 | if(loop3_cnt==loop3-1){ 297 | loop3_cnt=0; 298 | } 299 | else{ 300 | loop3_cnt++; 301 | } 302 | } 303 | else{ 304 | loop0_cnt++; 305 | } 306 | } 307 | else{ 308 | loop2_cnt++; 309 | } 310 | } 311 | else{ 312 | loop1_cnt++; 313 | } 314 | } 315 | 316 | } 317 | 318 | 319 | 320 | void Write_to_DDR_NORM_MM_FM(stream > &fifo_out, 321 | ap_uint<128>* ddr_fm_result, 322 | const unsigned PENUM, 323 | const unsigned R_div_2INP, // Groups 324 | const unsigned NumLines){ 325 | 326 | 327 | unsigned int loop0,loop1,loop2,loop3; 328 | 329 | unsigned int loop0_cnt=0; 330 | unsigned int loop1_cnt=0; 331 | unsigned int loop2_cnt=0; 332 | unsigned int loop3_cnt=0; 333 | 334 | loop0=MAX_INP; // MAX_OUP本身拆成的2个 335 | loop1=4; // 除了MAXOUP是2个, 还有MAX_OUP拆成两个 336 | loop2=PENUM; 337 | loop3=R_div_2INP; 338 | 339 | 340 | 341 | 342 | 343 | ap_uint temp; 344 | ap_uint<128> temp_128b; 345 | unsigned index; 346 | 347 | 348 | for (unsigned rep = 0; rep < NumLines; rep++) { 349 | #pragma HLS PIPELINE II=1 350 | temp=fifo_out.read(); 351 | temp_128b=temp; 352 | 353 | // 存数按照数据下次用的顺序 354 | index=loop3_cnt*loop2*loop1*loop0+loop2_cnt*loop1*loop0+loop1_cnt*loop0+loop0_cnt; 355 | 356 | ddr_fm_result[rep]=temp_128b; 357 | 358 | 359 | // 取数按照数据来的顺序 360 | if(loop1_cnt==loop1-1){ 361 | loop1_cnt=0; 362 | if(loop0_cnt==loop0-1){ 363 | loop0_cnt=0; 364 | if(loop2_cnt==loop2-1){ 365 | loop2_cnt=0; 366 | if(loop3_cnt==loop3-1){ 367 | loop3_cnt=0; 368 | } 369 | else{ 370 | loop3_cnt++; 371 | } 372 | } 373 | else{ 374 | loop2_cnt++; 375 | } 376 | } 377 | else{ 378 | loop0_cnt++; 379 | } 380 | } 381 | else{ 382 | loop1_cnt++; 383 | } 384 | } 385 | 386 | } 387 | 388 | 389 | 390 | void Write_to_DDR_NORM_MM_Tranpose(stream > &fifo_out, 391 | ap_uint<128>* ddr_fm_result, 392 | const unsigned PENUM, 393 | const unsigned R_div_2INP, // Groups 394 | const unsigned NumLines){ 395 | 396 | 397 | unsigned int loop0,loop1,loop2,loop3; 398 | 399 | unsigned int loop0_cnt=0; 400 | unsigned int loop1_cnt=0; 401 | unsigned int loop2_cnt=0; 402 | unsigned int loop3_cnt=0; 403 | 404 | loop0=4; // 除了MAXOUP是2个, 还有MAX_OUP拆成两个 405 | loop1=MAX_INP; 406 | loop2=R_div_2INP; 407 | loop3=PENUM; 408 | 409 | 410 | 411 | 412 | 413 | ap_uint temp; 414 | ap_uint<128> temp_128b; 415 | unsigned index; 416 | 417 | 418 | for (unsigned rep = 0; rep < NumLines; rep++) { 419 | #pragma HLS PIPELINE II=1 420 | temp=fifo_out.read(); 421 | temp_128b=temp; 422 | 423 | // 存数按照数据下次用的顺序 424 | index=loop3_cnt*loop2*loop1*loop0+loop2_cnt*loop1*loop0+loop1_cnt*loop0+loop0_cnt; 425 | 426 | ddr_fm_result[rep]=temp_128b; 427 | 428 | 429 | // 取数按照数据来的顺序 430 | if(loop0_cnt==loop0-1){ 431 | loop0_cnt=0; 432 | if(loop1_cnt==loop1-1){ 433 | loop0_cnt=0; 434 | if(loop3_cnt==loop3-1){ 435 | loop3_cnt=0; 436 | if(loop2_cnt==loop2-1){ 437 | loop2_cnt=0; 438 | } 439 | else{ 440 | loop2_cnt++; 441 | } 442 | } 443 | else{ 444 | loop3_cnt++; 445 | } 446 | } 447 | else{ 448 | loop1_cnt++; 449 | } 450 | } 451 | else{ 452 | loop0_cnt++; 453 | } 454 | } 455 | 456 | } 457 | 458 | void Write_Out_to_DDR_NORM_CONV(stream > &fifo_out, 459 | ap_uint<128>* ddr_fm_result, 460 | const unsigned OUT_W, 461 | const unsigned D, 462 | const unsigned OUT_H, 463 | const unsigned M_div_D, 464 | const unsigned NumLines){ 465 | 466 | 467 | unsigned int loop0,loop1,loop2,loop3,loop4,loop5; 468 | 469 | unsigned int loop0_cnt=0; 470 | unsigned int loop1_cnt=0; 471 | unsigned int loop2_cnt=0; 472 | unsigned int loop3_cnt=0; 473 | unsigned int loop4_cnt=0; 474 | unsigned int loop5_cnt=0; 475 | 476 | 477 | loop0=2; // MAX_OUP本身拆成的2个 478 | loop1=MAX_INP/MAX_OUP; 479 | loop2=OUT_W/2; 480 | loop3=D/MAX_INP; 481 | loop4=M_div_D; 482 | loop5=OUT_H; 483 | 484 | 485 | 486 | 487 | ap_uint temp; 488 | ap_uint<128> temp_128b; 489 | unsigned index; 490 | 491 | 492 | for (unsigned rep = 0; rep < NumLines; rep++) { 493 | #pragma HLS PIPELINE II=1 494 | temp=fifo_out.read(); 495 | temp_128b=temp; 496 | 497 | // 存数按照数据下次用的顺序 498 | index=loop5_cnt*loop4*loop3*loop2*loop1*loop0+loop4_cnt*loop3*loop2*loop1*loop0+loop3_cnt*loop2*loop1*loop0+loop2_cnt*loop1*loop0+loop1_cnt*loop0+loop0_cnt; 499 | 500 | ddr_fm_result[rep]=temp_128b; 501 | 502 | 503 | // 取数按照数据来的顺序 504 | if(loop0_cnt==loop0-1){ 505 | loop0_cnt=0; 506 | if(loop2_cnt==loop2-1){ 507 | loop2_cnt=0; 508 | if(loop1_cnt==loop1-1){ 509 | loop1_cnt=0; 510 | if(loop3_cnt==loop3-1){ 511 | loop3_cnt=0; 512 | if(loop5_cnt==loop5-1){ 513 | loop5_cnt=0; 514 | if(loop4_cnt==loop4-1){ 515 | loop4_cnt=0; 516 | } 517 | else{ 518 | loop4_cnt++; 519 | } 520 | } 521 | else{ 522 | loop5_cnt++; 523 | } 524 | } 525 | else{ 526 | loop3_cnt++; 527 | } 528 | } 529 | else{ 530 | loop1_cnt++; 531 | } 532 | } 533 | else{ 534 | loop2_cnt++; 535 | } 536 | } 537 | else{ 538 | loop0_cnt++; 539 | } 540 | } 541 | 542 | } 543 | 544 | 545 | void Write_Out_to_DDR_NORM(stream > &fifo_out, 546 | ap_uint<128>* ddr_fm_result, 547 | const unsigned PENUM, 548 | const unsigned OUT_W, 549 | const unsigned D, 550 | const unsigned OUT_H, 551 | const unsigned M_div_D, 552 | const unsigned NumLines, 553 | const unsigned which_path, 554 | const bool skip_mode 555 | ){ 556 | 557 | if(skip_mode==false){ 558 | return; 559 | } 560 | 561 | 562 | 563 | 564 | if(which_path==0||which_path==1||which_path==2){ 565 | Write_Out_to_DDR_NORM_CONV(fifo_out,ddr_fm_result,OUT_W, D,OUT_H,M_div_D, NumLines); 566 | } 567 | else if(which_path==4||which_path==5||which_path==9){ 568 | Write_to_DDR_NORM_MM_FM(fifo_out,ddr_fm_result,PENUM, M_div_D, NumLines); 569 | } 570 | else if(which_path==7||which_path==10){ 571 | Write_to_DDR_NORM_MM_FM_SOFTMAX_GELU(fifo_out,ddr_fm_result,PENUM, M_div_D, NumLines); 572 | } 573 | else if(which_path==6){ 574 | Write_to_DDR_NORM_MM_Tranpose(fifo_out,ddr_fm_result,PENUM, M_div_D, NumLines); 575 | } 576 | else{ 577 | Write_Out_to_DDR_DIRECT(fifo_out,ddr_fm_result,NumLines); 578 | 579 | } 580 | 581 | 582 | 583 | 584 | 585 | 586 | } 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | void ExtractPixels_AXI_CONV_DIRECT( 595 | ap_uint<128>* in, 596 | stream >& out_conv, 597 | const unsigned NumLines, 598 | const unsigned conv3_group){ 599 | 600 | 601 | ap_uint<128> act_in; 602 | ap_uint<384> act_in_384b; 603 | 604 | unsigned int bitIdx=0; 605 | unsigned int colIdx=0; 606 | 607 | ap_uint< MAX_INP * IN_BIT * PACK_NUM > temp; 608 | 609 | for(unsigned g = 0; g < conv3_group; g++){ 610 | for (unsigned rep = 0; rep < NumLines*3; rep++) { 611 | #pragma HLS PIPELINE II=1 612 | act_in_384b=act_in_384b>>128; 613 | 614 | act_in_384b(384-1, 256) = in[rep]; 615 | 616 | if(bitIdx==3-1){ 617 | temp=act_in_384b; 618 | out_conv.write(temp); 619 | } 620 | 621 | // cout <<"The Value of Var_a: \t" <* in, 640 | stream >& out_conv, 641 | const unsigned NumLines, 642 | const unsigned conv3_group){ 643 | 644 | 645 | ap_uint<128> act_in_128b; 646 | 647 | ap_uint act_in_80b; 648 | 649 | unsigned int loop0; 650 | unsigned int bitIdx=0; 651 | 652 | loop0=2*(MAX_INP/MAX_OUP); 653 | 654 | ap_uint< MAX_INP * IN_BIT * PACK_NUM > temp; 655 | 656 | for(unsigned g = 0; g < conv3_group; g++){ 657 | for (unsigned rep = 0; rep < NumLines*4; rep++) { 658 | #pragma HLS PIPELINE II=1 659 | 660 | temp=temp>>(IN_BIT * PACK_NUM*MAX_OUP/2); 661 | act_in_128b = in[rep]; 662 | 663 | act_in_80b=act_in_128b; 664 | temp(320-1,240)=act_in_80b; 665 | 666 | if(bitIdx==loop0-1){ 667 | out_conv.write(temp); 668 | } 669 | 670 | // cout <<"The Value of Var_a: \t" <* ddr_conv3_bias_scale, 690 | ap_int conv3_bias[MAX_OUP][MAX_CONV3_BIAS_LENGTH], 691 | ap_int scale_factor[MAX_SCALE_FACTOR_LENGTH], 692 | unsigned NumLines 693 | ){ 694 | 695 | unsigned int bitIdx=0; 696 | unsigned int colIdx=0; 697 | unsigned int depthIdx=0; 698 | 699 | ap_uint temp_axi_data; 700 | 701 | for(unsigned i=0; i(ddr_conv3_bias_scale[NumLines-1]); 727 | 728 | #ifdef OUTPUT_DEBUG 729 | FILE* fp_bias = fopen("bias_fpga.txt", "wb"); 730 | for(int j=0;j tmp=conv3_bias[i][j]; 733 | fprintf(fp_bias, "%d\t", (int)tmp); 734 | } 735 | fprintf(fp_bias, "\n"); 736 | } 737 | fclose(fp_bias); 738 | #endif 739 | 740 | } 741 | 742 | 743 | 744 | void WriteConv3WeightParam(ap_uint<128> *conv3_weight, 745 | ap_uint conv3_w_buffer[MAX_A_COL][MAX_CONV3_WEIGHT_LENGTH], 746 | unsigned NumLines 747 | ){ 748 | #pragma HLS INLINE OFF 749 | ap_uint<128> temp_w_128b[2]; 750 | ap_uint<256> temp_w_256b; 751 | ap_uint temp_w; 752 | unsigned int bitIdx=0; 753 | unsigned int colIdx=0; 754 | unsigned int depthIdx=0; 755 | 756 | for(unsigned i=0; i test_w; 770 | // test_w=(temp_w[2],temp_w[1],temp_w[0]); 771 | cout< conv3_weight_buf[MAX_A_COL][MAX_CONV3_WEIGHT_LENGTH], 805 | stream > fifo_W_in[MAX_A_COL], 806 | const unsigned NumLines, 807 | const unsigned OUT_H, 808 | bool tran_en){ 809 | #pragma HLS INLINE OFF 810 | if (!tran_en) return; 811 | 812 | ap_uint< MAX_INP * CONV_K *W_BIT > temp; 813 | 814 | for(unsigned j=0; j test=temp((t+1)*4-1,t*4); 824 | // cout<<"test: "< SHORTCUT_IN_buffer0[MAX_SHORTCUT_NORM_INBUF_LENGTH]; 22 | ap_uint SHORTCUT_IN_buffer1[MAX_SHORTCUT_NORM_INBUF_LENGTH]; 23 | 24 | // 1D linear 25 | 26 | // static ap_uint linear1d_input[MAX_FC1D_INPUT_LENGTH]; 27 | // static ap_uint linear1d_weight[FC_OUP][MAX_FC1D_WEIGHT_LENGTH]; 28 | 29 | 30 | // static ap_int linear1d_bias_buffer[FC_OUP][MAX_FC_BIAS_LENGTH]; 31 | //#pragma HLS ARRAY_PARTITION variable=conv3_bias dim=1 complete 32 | 33 | //static ap_fixed linear1d_out_buffer[FC_OUP][MAX_FC_BIAS_LENGTH]; 34 | //#pragma HLS ARRAY_PARTITION variable=conv3_bias dim=1 complete 35 | 36 | // static ap_int linear1d_out_buffer[MAX_OUP][MAX_CONV3_BIAS_LENGTH]; 37 | // //#pragma HLS ARRAY_PARTITION variable=conv3_bias dim=1 complete 38 | 39 | 40 | 41 | 42 | 43 | // 处理resnet block的放置 44 | static ap_uint conv3_w_buffer0[MAX_A_COL][MAX_CONV3_WEIGHT_LENGTH]; // 尽可能一次加载所有权重 45 | //#pragma HLS ARRAY_PARTITION variable=conv3_w_buffer dim=1 complete 46 | 47 | static ap_uint conv3_w_buffer1[MAX_A_COL][MAX_CONV3_WEIGHT_LENGTH]; // 尽可能一次加载所有权重 48 | //#pragma HLS ARRAY_PARTITION variable=conv3_w_buffer dim=1 complete 49 | 50 | 51 | static ap_uint mm_a_buffer0[MAX_MM_FM_LENGTH]; 52 | static ap_uint mm_a_buffer1[MAX_MM_FM_LENGTH]; 53 | 54 | // ap_uint mm_w_buffer[MAX_MM_FM_LENGTH]; 55 | // ap_uint mm_w_buffer1[MAX_MM_FM_LENGTH]; 56 | 57 | 58 | static ap_int conv3_mm_bias_buffer[MAX_OUP][MAX_CONV3_BIAS_LENGTH]; 59 | //#pragma HLS ARRAY_PARTITION variable=conv3_bias dim=1 complete 60 | 61 | 62 | 63 | static ap_uint<128> scale_factor_buffer; // [0]:conv3 [1] fc 64 | 65 | 66 | // scale_factor_buffer[0]=temp_scale_factor(15,0); 67 | // scale_factor_buffer[1]=temp_scale_factor(31,16); 68 | // scale_factor_buffer[2]=temp_scale_factor(47,32); 69 | // scale_factor_buffer[3]=temp_scale_factor(63,48); 70 | // scale_factor_buffer[4]=temp_scale_factor(79,64); 71 | // scale_factor_buffer[5]=temp_scale_factor(95,80); 72 | 73 | 74 | 75 | 76 | static LN_BIAS_DB ln_gamma_buffer[MAX_NORM_PE][MAX_NORM_BIAS_LENGTH]; 77 | 78 | static LN_BIAS_DB ln_beta_buffer[MAX_NORM_PE][MAX_NORM_BIAS_LENGTH]; 79 | 80 | 81 | // mm: MAX_M_LENGTH: M/MAX_OUP 82 | // conv: MAX_M_LENGTH: M/(MAX_OUP*2) 83 | static ap_uint ln_ptf_factor_buffer0[MAX_OUP][MAX_NORM_PWF_FACTOR_LENGTH]; 84 | static ap_uint ln_ptf_factor_buffer1[MAX_OUP][MAX_NORM_PWF_FACTOR_LENGTH]; 85 | 86 | static ap_uint LN_IN_buffer0[MAX_OUP][MAX_NORM_INBUF_LENGTH]; 87 | static ap_uint LN_IN_buffer1[MAX_OUP][MAX_NORM_INBUF_LENGTH]; 88 | 89 | 90 | -------------------------------------------------------------------------------- /sa_tools: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Michaela1224/SDA_code/5cc239f6bd1b48a8e1b689b0cb8af0d9dec511cf/sa_tools -------------------------------------------------------------------------------- /sa_tools.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | using namespace hls; 5 | using namespace std; 6 | 7 | //#define STREAM_DEBUG 8 | 9 | // #define TEST_DEBUG 10 | // #define DEQUAN_DEBUG 11 | 12 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) /* \brief Maximum value between x and y*/ 13 | #define MAX_BUF_LENGTH 1536 // 02-23-setting true 14 | #define MAX_W 32 // 02-23-setting true 实际为W/2 15 | 16 | 17 | 18 | 19 | template 25 | void SAMEPAD_DSPopt_SA_UP_DOWN( 26 | stream >& in, 27 | stream >& out, 28 | const unsigned Din_H, 29 | const unsigned Din_W_TRUE, 30 | const unsigned Cin, 31 | const unsigned conv3_groups, 32 | const bool skip_mode 33 | ){ 34 | 35 | if(skip_mode==false){ 36 | return; 37 | } 38 | 39 | ap_uint outData; 40 | ap_uint inData; 41 | 42 | for(unsigned int g=0;g=Din_H-PaddingDown){ 49 | outData = 0; 50 | } 51 | else{ 52 | outData=in.read(); 53 | } 54 | out.write(outData); 55 | } 56 | } 57 | } 58 | } 59 | } 60 | 61 | 62 | template // 注意这里的IN_H是padding后的 63 | void conv3padding_opt_SA(stream > &in, 64 | stream > &out, 65 | const unsigned IN_H, 66 | const unsigned IN_W, 67 | const unsigned OUT_H, 68 | const unsigned IN_CH, 69 | const unsigned OUTPENUM, 70 | const unsigned GROUPS, 71 | const bool skip_mode) { 72 | 73 | if(skip_mode==false){ 74 | return; 75 | } 76 | 77 | 78 | const unsigned int multiplying_factor = IN_CH/SIMD; 79 | const unsigned int number_blocks = K + 1 ; 80 | 81 | ap_uint row_buffer[4][MAX_BUF_LENGTH]; 82 | #pragma HLS ARRAY_PARTITION variable=row_buffer complete dim=1 83 | #pragma HLS BIND_STORAGE variable=row_buffer type=ram_s2p 84 | 85 | const unsigned int cycles_write_block = OUTPENUM * (IN_W/2) * K *multiplying_factor; // 一次读一行的三个 86 | const unsigned int cycles_read_block = (IN_W/2)*multiplying_factor; 87 | const unsigned int max_cycles = MAX(cycles_write_block,cycles_read_block); 88 | const unsigned int baseIter = (IN_W/2) * K *multiplying_factor // Initial buffer 89 | + OUT_H * MAX(cycles_write_block,cycles_read_block); 90 | 91 | unsigned int inp = 0, ofm_y = 0, ofm_x = 0, k_y = 0, k_x = 0, wMat =0,count_simd=0; 92 | unsigned int counter_internal_block = 0; 93 | ap_uint<2> current_block_write = 0; 94 | ap_uint<2> current_block_read = 0; 95 | 96 | ap_uint<2> block_read_K; 97 | 98 | unsigned int current_line = 0; 99 | unsigned int current_line_w = 0; 100 | unsigned int current_line_simd = 0; 101 | unsigned int read_block = 0; 102 | unsigned int current_line_in_block; 103 | // unsigned int flag = 0; 104 | ap_uint inElem; 105 | ap_uint<2 * SIMD * IN_BIT> data; 106 | #ifdef INPAD_DEBUG 107 | unsigned int m=0; 108 | #endif 109 | for(unsigned int g=0;g 223 | void MM_to_Out( 224 | stream > in[A_Row][A_Col][SA_PE], 225 | stream > out[PE], 226 | const unsigned NumLines, 227 | const unsigned skip_mode){ 228 | 229 | 230 | if(skip_mode==1){ 231 | return; 232 | } 233 | 234 | ap_uint result; 235 | ap_uint tmp; 236 | 237 | for (unsigned long long rep = 0; rep < NumLines; rep++) { 238 | #pragma HLS PIPELINE II=1 239 | for(unsigned int c = 0; c < A_Col; c++){ 240 | for(unsigned int y = 0; y < SA_PE; y++){ 241 | for(unsigned int r = 0; r < A_Row; r++){ 242 | result=result>>ACC_BIT*PACK_OUT_NUM; 243 | tmp=in[r][c][y].read(); 244 | result(ACC_BIT*PACK_OUT_NUM*A_Row*A_Col-1,ACC_BIT*PACK_OUT_NUM*A_Row*A_Col-ACC_BIT*PACK_OUT_NUM)=tmp; 245 | } 246 | out[c*SA_PE+y].write(result); 247 | } 248 | } 249 | } 250 | 251 | } 252 | 253 | 254 | template 256 | void MM_Parallel_to_Serial_Out( 257 | stream > in[A_Row][A_Col][SA_PE], 258 | stream > out[PE], 259 | const unsigned NumLines, 260 | const bool skip_mode){ 261 | 262 | 263 | if(skip_mode==true){ 264 | return; 265 | } 266 | 267 | // ap_uint result; 268 | ap_uint tmp; 269 | 270 | for (unsigned long long rep = 0; rep < NumLines; rep++) { 271 | for(unsigned int r = 0; r < A_Row; r++){ 272 | #pragma HLS PIPELINE II=1 273 | for(unsigned int c = 0; c < A_Col; c++){ 274 | for(unsigned int y = 0; y < SA_PE; y++){ 275 | // result=result>>ACC_BIT*PACK_OUT_NUM; 276 | tmp=in[r][c][y].read(); 277 | out[c*SA_PE+y].write(tmp); 278 | // result(ACC_BIT*PACK_OUT_NUM*A_Row*A_Col-1,ACC_BIT*PACK_OUT_NUM*A_Row*A_Col-ACC_BIT*PACK_OUT_NUM)=tmp; 279 | } 280 | 281 | } 282 | } 283 | } 284 | 285 | } 286 | 287 | 288 | 289 | template 291 | void W_conv3_array( 292 | stream > fifo_W_in[A_Col], 293 | stream > fifo_W_local_out[A_Row][A_Col], 294 | const unsigned OUT_H, 295 | const unsigned NumLines, 296 | const unsigned GROUPS, 297 | const bool skip_mode) { 298 | #pragma HLS INLINE OFF 299 | 300 | 301 | if(skip_mode==false){ 302 | return; 303 | } 304 | 305 | ap_uint w; 306 | ap_uint temp; 307 | 308 | #ifdef DEBUG 309 | FILE* fp_win0 = fopen("W3_reorg_SIMD_all_in.txt", "wb"); 310 | #endif 311 | 312 | 313 | for (unsigned int h = 0; h < GROUPS; h++) { // 40 314 | for (unsigned int peIdx = 0; peIdx < OUT_H*NumLines; peIdx++) { 315 | #pragma HLS PIPELINE II=1 316 | for (unsigned int c = 0; c < A_Col; c++) { 317 | w=fifo_W_in[c].read(); 318 | for(unsigned int r = 0; r< A_Row; r++) { 319 | temp=w((r+1)*SA_SIMD * CONV_K *W_BIT-1,r*SA_SIMD*CONV_K *W_BIT); 320 | fifo_W_local_out[r][c].write(temp); 321 | } 322 | } 323 | } 324 | } 325 | 326 | 327 | #ifdef DEBUG 328 | fclose(fp_win0); 329 | #endif 330 | 331 | } 332 | 333 | 334 | 335 | template 337 | void A_to_array( 338 | stream > & in, 339 | stream > out[A_Row][A_Col], 340 | const unsigned NumLines){ 341 | ap_uint temp_row; 342 | 343 | 344 | #ifdef STREAM_DEBUG 345 | FILE* fp_asimd= fopen("conv3_stream_A.txt", "wb"); 346 | #endif 347 | 348 | for (unsigned long long rep = 0; rep < NumLines; rep++) { 349 | #pragma HLS loop_tripcount min=NumLines max=NumLines 350 | #pragma HLS PIPELINE II=1 351 | ap_uint temp = in.read(); 352 | 353 | #ifdef STREAM_DEBUG 354 | for(int i=0;i<8;i++){ 355 | ap_uint<4> test=temp((i+1)*4-1,i*4); 356 | fprintf(fp_asimd, "%d\n", (int)test); 357 | } 358 | #endif 359 | 360 | for(unsigned int r = 0; r < A_Row; r++){ 361 | temp_row=temp((r+1)*OutStreamW-1,r*OutStreamW); 362 | for(unsigned int c = 0; c < A_Col; c++){ 363 | out[r][c].write(temp_row); 364 | } 365 | } 366 | } 367 | 368 | 369 | #ifdef STREAM_DEBUG 370 | fclose(fp_asimd); 371 | #endif 372 | 373 | } 374 | 375 | 376 | 377 | template 379 | void W_mm_to_array( 380 | stream > & in, 381 | stream > out[A_Row][A_Col], 382 | const unsigned NumLines, 383 | const bool skip_mode){ 384 | 385 | if(skip_mode==true){ 386 | return; 387 | } 388 | 389 | 390 | ap_uint temp_row; 391 | 392 | 393 | #ifdef STREAM_DEBUG 394 | FILE* fp_asimd= fopen("conv3_stream_A.txt", "wb"); 395 | #endif 396 | 397 | for (unsigned long long rep = 0; rep < NumLines; rep++) { 398 | #pragma HLS loop_tripcount min=NumLines max=NumLines 399 | #pragma HLS PIPELINE II=1 400 | ap_uint temp = in.read(); 401 | 402 | #ifdef STREAM_DEBUG 403 | for(int i=0;i<8;i++){ 404 | ap_uint<4> test=temp((i+1)*4-1,i*4); 405 | fprintf(fp_asimd, "%d\n", (int)test); 406 | } 407 | #endif 408 | for(unsigned int c = 0; c < A_Col; c++){ 409 | temp_row=temp((c+1)*OutStreamW-1,c*OutStreamW); 410 | for(unsigned int r = 0; r < A_Row; r++){ 411 | out[r][c].write(temp_row); 412 | } 413 | } 414 | } 415 | #ifdef STREAM_DEBUG 416 | fclose(fp_asimd); 417 | #endif 418 | 419 | } 420 | 421 | 422 | 423 | ap_uint<44> correct_fun(ap_uint<4> w0,ap_uint<4> w1,ap_uint<4> w2, ap_uint<4> a0,ap_uint<4> a1){ 424 | #pragma HLS INLINE 425 | 426 | ap_uint<44> C_port; 427 | ap_uint<1> signw0=w0.range(3,3); 428 | ap_uint<1> signw1=w1.range(3,3); 429 | ap_uint<1> signw2=w2.range(3,3); 430 | ap_uint<1> signa0=a0.range(3,3); 431 | ap_uint<1> signa1=a1.range(3,3); 432 | 433 | ap_uint<4> sign4w0=(signw0,signw0,signw0,signw0); 434 | ap_uint<4> sign4w1=(signw1,signw1,signw1,signw1); 435 | ap_uint<4> sign4w2=(signw2,signw2,signw2,signw2); 436 | ap_uint<4> sign4a0=(signa0,signa0,signa0,signa0); 437 | ap_uint<4> sign4a1=(signa1,signa1,signa1,signa1); 438 | 439 | ap_uint<4> out0_L0=(sign4w2&a0)+(sign4a0&w2); 440 | ap_uint<8> res0_correct=ap_uint<8>(out0_L0)<<4; 441 | 442 | #ifdef DEBUG 443 | std::cout <<"The Value of res0_correct: \t" < out0_L10=(sign4w1&a0)+(sign4a0&w1); 447 | 448 | ap_uint<4> out0_L11=(sign4w2&a1)+(sign4a1&w2); 449 | 450 | ap_uint<9> res1_correct=ap_uint<9> (out0_L10+out0_L11)<<4; 451 | 452 | #ifdef DEBUG 453 | std::cout <<"The Value of res1_correct: \t" < out0_L20=(sign4w0&a0)+(sign4a0&w0); 457 | ap_uint<4> out0_L21=(sign4w1&a1)+(sign4a1&w1); 458 | 459 | 460 | ap_uint<9> res2_correct=ap_uint<9> (out0_L20+out0_L21)<<4; 461 | 462 | #ifdef DEBUG 463 | std::cout <<"The Value of res2_correct: \t" < out0_L3=(sign4w0&a1)+(sign4a1&w0); 468 | ap_uint<8> res3_correct=ap_uint<8> (out0_L3)<<4; 469 | 470 | #ifdef DEBUG 471 | std::cout <<"The Value of res3_correct: \t" <)res3_correct<<33)+((ap_uint<33> )res2_correct<<22)+((ap_uint<22>)res1_correct<<11)+res0_correct; 475 | 476 | return C_port; 477 | 478 | } 479 | 480 | 481 | void RMPacking_4b_SignedA(ap_uint<4> A0,ap_uint<4> A1,ap_uint<4> W0,ap_uint<4> W1,ap_uint<4> W2, ap_int<8> result[4]){ 482 | 483 | ap_uint<15> B_port= ((ap_uint<15>)A1<<11)+A0; 484 | ap_uint<26> A_port=((ap_uint<26>)W0<<22)+((ap_uint<22>)W1<<11); 485 | ap_uint<4> D_port=W2; 486 | 487 | // result correction 488 | ap_uint<44> C_port=correct_fun(W0,W1,W2,A0,A1); 489 | 490 | // DSP computation 491 | ap_uint<44> P_port=(A_port+D_port)*B_port-C_port; 492 | 493 | ap_uint<11> out[4]; 494 | 495 | // 44-bit data split 496 | out[0]=P_port(11-1,0); // w2a0 for conv | w1a0 for MM 497 | out[1]=P_port(22-1,11-1); // w1a0+w2a1 for conv | w1a1 for MM 498 | out[2]=P_port(33-1,22-1); // w0a0+w1*a1 for conv | w0a0 for MM 499 | out[3]=P_port(44-1,33-1); // w0a1 for conv | w0a1 for MM 500 | 501 | result[0]=ap_int<8>(out[0]); 502 | for(int x=1;x<4;x++){ 503 | out[x]=(out[x]>>1)+(out[x]&1); 504 | result[x]=ap_int<8>(out[x]); 505 | } 506 | 507 | 508 | #ifdef INPUT_DEBUG 509 | ap_int<4> test_A0,test_A1, test_W0, test_W1,test_W2; 510 | test_A0=(ap_int<4>)A0; 511 | test_A1=(ap_int<4>)A1; 512 | test_W0=(ap_int<4>)W0; 513 | test_W1=(ap_int<4>)W1; 514 | test_W2=(ap_int<4>)W2; 515 | cout <<"The Value of A0: \t" < test_out0,test_out1,test_out2,test_out3; 521 | test_out0=test_W2*test_A0; 522 | test_out1=test_W1*test_A0+test_W2*test_A1; 523 | test_out2=test_W0*test_A0+test_W1*test_A1; 524 | test_out3=test_W0*test_A1; 525 | cout <<"test_out0: \t" <(result[0])<< endl; 526 | cout <<"test_out1: \t" <(result[1])<< endl; 527 | cout <<"test_out2: \t" <(result[2])<< endl; 528 | cout <<"test_out3: \t" <(result[3])<< endl; 529 | 530 | if((ap_int<8>(out[0])!=test_out0)||(ap_int<8>(out[1])!=test_out1)||(ap_int<8>(out[2])!=test_out2)||(ap_int<8>(out[3])!=test_out3)){ 531 | cout<<"debug error"; 532 | } 533 | 534 | #endif 535 | 536 | 537 | 538 | } 539 | 540 | 541 | 542 | void RMPacking_4b_USignedA(ap_uint<4> A0,ap_uint<4> A1,ap_int<4> W0,ap_int<4> W1,ap_int<4> W2, ap_int<8> result[4]){ 543 | 544 | 545 | 546 | ap_uint<15> B_port=(A1(4-1,0), (ap_uint<11 - 4>)0,A0(4-1,0)); 547 | ap_int<26> D_port=W0*(1<<(22))+W1*(1<<11)+W2; 548 | 549 | 550 | // DSP computation 551 | ap_int<44> P_port=D_port*B_port; 552 | 553 | ap_int<11> out0; 554 | ap_int<12> out1; 555 | ap_int<12> out2; 556 | ap_int<12> out3; 557 | 558 | // 44-bit data split 559 | out0=P_port(11-1,0); // w2a0 for conv | w1a0 for MM 560 | out1=P_port(22-1,11-1); // w1a0+w2a1 for conv | w1a1 for MM 561 | out2=P_port(33-1,22-1); // w0a0+w1*a1 for conv | w0a0 for MM 562 | out3=P_port(44-1,33-1); // w0a1 for conv | w0a1 for MM 563 | 564 | result[0]=ap_int<11>(out0); 565 | result[1]=(out1 >> 1) + (out1 & 1); 566 | result[2]=(out2 >> 1) + (out2 & 1); 567 | result[3]=(out3 >> 1) + (out3 & 1); 568 | 569 | 570 | #ifdef INPUT_DEBUG 571 | ap_uint<4> test_A0,test_A1; 572 | ap_int<9> test_W0, test_W1,test_W2; 573 | test_A0=(ap_uint<4>)A0; 574 | test_A1=(ap_uint<4>)A1; 575 | test_W0=(ap_int<4>)W0; 576 | test_W1=(ap_int<4>)W1; 577 | test_W2=(ap_int<4>)W2; 578 | cout <<"The Value of A0: \t" < test_out0,test_out1,test_out2,test_out3; 584 | test_out0=test_W2*test_A0; 585 | test_out1=test_W1*test_A0+test_W2*test_A1; 586 | test_out2=test_W0*test_A0+test_W1*test_A1; 587 | test_out3=test_W0*test_A1; 588 | cout <<"test_out0: \t" < 610 | void PE_wrapper(int idr, int idc, stream > &fifo_A_in, 611 | stream > &fifo_W_in, 612 | stream > fifo_C_out[PE], 613 | const unsigned NWnum, 614 | const unsigned NumLines, 615 | const bool mode 616 | ) { 617 | // mode=0 OS stationary for matrix 618 | // mode=1 WS stationary for conv3 619 | 620 | #pragma HLS INLINE OFF 621 | #pragma HLS ARRAY_PARTITION variable=fifo_C_out dim=1 complete 622 | 623 | ap_uint A_simd_reg[SIMD]; 624 | #pragma HLS ARRAY_PARTITION variable=A_simd_reg dim=1 complete 625 | 626 | ap_uint data_A_reg[SIMD][PE]; 627 | #pragma HLS ARRAY_PARTITION variable=data_A_reg dim=1 complete 628 | #pragma HLS ARRAY_PARTITION variable=data_A_reg dim=2 complete 629 | 630 | 631 | ap_uint W_pe_reg[PE]; 632 | #pragma HLS ARRAY_PARTITION variable=W_pe_reg dim=1 complete 633 | 634 | ap_uint data_W_reg[SIMD][PE]; 635 | #pragma HLS ARRAY_PARTITION variable=data_W_reg dim=1 complete 636 | #pragma HLS ARRAY_PARTITION variable=data_W_reg dim=2 complete 637 | 638 | 639 | // ap_int OS_ACC_reg[SIMD][PE][PACK_OUT_NUM]; 640 | // #pragma HLS ARRAY_PARTITION variable=OS_ACC_reg dim=1 complete 641 | // #pragma HLS ARRAY_PARTITION variable=OS_ACC_reg dim=2 complete 642 | // #pragma HLS ARRAY_PARTITION variable=OS_ACC_reg dim=3 complete 643 | 644 | ap_uint res_C_reg[PE]; 645 | #pragma HLS ARRAY_PARTITION variable=res_C_reg dim=1 complete 646 | 647 | 648 | ap_int data_C_reg[SIMD][PE][PACK_OUT_NUM]; 649 | #pragma HLS ARRAY_PARTITION variable=data_C_reg dim=1 complete 650 | #pragma HLS ARRAY_PARTITION variable=data_C_reg dim=2 complete 651 | #pragma HLS ARRAY_PARTITION variable=data_C_reg dim=3 complete 652 | 653 | #ifdef STREAM_DEBUG 654 | // FILE* fp_pe00_a = fopen("a_stream_pe00.txt", "wb"); 655 | // FILE* fp_pe00_w= fopen("w_stream_pe00.txt", "wb"); 656 | // FILE* fp_pe01_a = fopen("a_stream_pe11.txt", "wb"); 657 | // FILE* fp_pe01_w= fopen("w_stream_pe11.txt", "wb"); 658 | FILE* fp_pe00_res= fopen("res_stream_pex3.txt", "wb"); 659 | 660 | #endif 661 | 662 | ap_int acc_tmp[SIMD][PE][4]; 663 | #pragma HLS ARRAY_PARTITION variable=acc_tmp dim=1 complete 664 | #pragma HLS ARRAY_PARTITION variable=acc_tmp dim=2 complete 665 | #pragma HLS ARRAY_PARTITION variable=acc_tmp dim=3 complete 666 | 667 | int rn_index=0; 668 | int out_flag=0; 669 | // int flag=0; 670 | // int cascade_index=0; 671 | 672 | for(unsigned i=0; i=NumLines){ 731 | W_pe_reg[0]=0; 732 | } 733 | 734 | 735 | // A fetcher WS/ OS mode share 736 | for(unsigned m=0; m temp_a; 739 | temp_a=A_simd_reg[m](PACK_NUM*IN_BIT-1,0); 740 | data_A_reg[m][0]=temp_a; 741 | A_simd_reg[m]=A_simd_reg[m]>>(PACK_NUM*IN_BIT); 742 | } 743 | 744 | // W fetcher 745 | // OS mode: write W to the top row PE00-PE03 every cycle 746 | // WS mode: in each first PE+SIMD-1 cycle, write W to PE00-PE33 747 | for(unsigned k=0; k temp_w; 750 | temp_w=W_pe_reg[k](PACK_CONV_NUM*W_BIT-1,0); 751 | if(mode==false){ 752 | data_W_reg[0][k]=temp_w; 753 | } 754 | else if((mode==true) &&(rn_index=0) && (rn_index-k<=SIMD-1)){ 756 | data_W_reg[rn_index-k][k]=temp_w; 757 | } 758 | else if( (NWnum=0) && (rn_index+PE-k<=SIMD-1) ){ 759 | data_W_reg[rn_index-k+PE][k]=temp_w; 760 | } 761 | } 762 | // cout <>(3*W_BIT); 764 | } 765 | 766 | 767 | // A fetcher /W fetcher down sliding 768 | for(unsigned m=SIMD-1; m>0;m--){ 769 | #pragma HLS UNROLL 770 | A_simd_reg[m]=A_simd_reg[m-1]; 771 | } 772 | 773 | for(unsigned m=PE-1; m>0;m--){ 774 | #pragma HLS UNROLL 775 | W_pe_reg[m]=W_pe_reg[m-1]; 776 | } 777 | 778 | 779 | for (int j=PE-1; j>=0;j--) { // PE 780 | #pragma HLS UNROLL 781 | for (int i=SIMD-1; i>=0;i--){ // SIMD 782 | #pragma HLS UNROLL 783 | 784 | // Read A,W 785 | ap_uint<2*IN_BIT> data_A_tmp; 786 | data_A_tmp= data_A_reg[i][j]; 787 | 788 | ap_uint<3*W_BIT> data_W_tmp; 789 | data_W_tmp= data_W_reg[i][j]; 790 | 791 | 792 | 793 | for(int x=0;x<4;x++){ 794 | if(mode==false){ // OS mode: read acc result 795 | if(rep>=NWnum&&i+j==rn_index){ 796 | acc_tmp[i][j][x]= 0; 797 | } 798 | else{ 799 | acc_tmp[i][j][x]= data_C_reg[i][j][x]; 800 | } 801 | 802 | } 803 | else if(mode==true){ // WS mode: read top (i-1,j) acc result 804 | if(i==0){ 805 | acc_tmp[i][j][x]= 0; 806 | } 807 | else{ 808 | acc_tmp[i][j][x]= data_C_reg[i-1][j][x]; 809 | } 810 | } 811 | } 812 | 813 | 814 | ap_uint A0,A1; 815 | ap_uint W0,W1,W2; 816 | (A1, A0) = data_A_tmp; 817 | ap_uint<4> A00,A01,A10,A11; 818 | 819 | (A01, A00) = A0; (A11, A10) = A1; 820 | (W2, W1, W0) = data_W_tmp; 821 | 822 | #ifdef TEST_DEBUG 823 | 824 | std::cout <<"The Value of A0: \t" <<(ap_int)A0 << "\t Binary format: \t" <)A1 << "\t Binary format: \t" <)W0 << "\t Binary format: \t" <)W1 << "\t Binary format: \t" <)W2 << "\t Binary format: \t" < out_L[4], out_H[4]; 832 | ap_int<13> out_T[4]; 833 | 834 | RMPacking_4b_USignedA(A00,A10, W0,W1,W2, out_L); 835 | RMPacking_4b_SignedA(A01,A11, W0,W1,W2, out_H); 836 | 837 | 838 | // 数据累加 839 | for(int x=0;x<4;x++){ 840 | #pragma HLS UNROLL 841 | 842 | // cout<<"out_L:"<<(ap_int<8>)out_L[x]<)out_H[x]<)out_L[x])+(((ap_int<13>)out_H[x])<<4); 846 | 847 | 848 | // cout<<"out_T:"<<(ap_int<13>)out_T[x]< out_tmp=(data_C_reg[i][j][1],data_C_reg[i][j][0],data_C_reg[i][j][3],data_C_reg[i][j][2]); 867 | res_C_reg[j]=out_tmp; 868 | } 869 | 870 | 871 | } 872 | } 873 | // OS mode: output final result 874 | if(mode==false){ 875 | if(rep>=NWnum-1&&out_flag=(out_flag-PE+1)&&out_flag>=PE)){ 878 | fifo_C_out[d_j].write(res_C_reg[d_j]); 879 | } 880 | } 881 | } 882 | } 883 | // WS mode: output final result 884 | else if(mode==true){ 885 | for(int d_j=0;d_j(rep-NumLines)) && (rep>=NumLines)) ){ // 这个条件再重新写一下 888 | 889 | // cout<<"data_C_reg[SIMD-1][d_j][3]:"< out_tmp=(data_C_reg[SIMD-1][d_j][3],data_C_reg[SIMD-1][d_j][2],data_C_reg[SIMD-1][d_j][1],data_C_reg[SIMD-1][d_j][0]); 898 | fifo_C_out[d_j].write(out_tmp); 899 | } 900 | } 901 | } 902 | 903 | 904 | 905 | if(rn_index==NWnum-1){ 906 | rn_index=0; 907 | } 908 | else{ 909 | rn_index++; 910 | } 911 | 912 | if(rn_index==NWnum-1){ 913 | out_flag=0; 914 | } 915 | else if(out_flag==PE+SIMD-1){ 916 | out_flag=out_flag; 917 | } 918 | else{ 919 | out_flag++; 920 | } 921 | 922 | 923 | } 924 | 925 | #ifdef STREAM_DEBUG 926 | // fclose(fp_pe01_a); 927 | // fclose(fp_pe01_w); 928 | fclose(fp_pe00_res); 929 | #endif 930 | 931 | } 932 | 933 | 934 | 935 | 936 | 937 | template 938 | void arrar_acc_to_Res( stream > fifo_C_in[MAX_A_ROW][MAX_A_COL][SA_PE], stream > fifo_C_out[PE], 939 | const unsigned numlines, 940 | const bool skip_mode){ 941 | 942 | 943 | if(skip_mode==false){ 944 | return; 945 | } 946 | 947 | ap_uint temp_4m; 948 | ap_uint psum_4m; 949 | 950 | ap_int temp; 951 | ap_int res; 952 | 953 | for (unsigned int h = 0; h < numlines; h++) { // 40 954 | //#pragma HLS loop_tripcount min=OUT_H max=OUT_H 955 | #pragma HLS PIPELINE II=1 956 | for(unsigned int c = 0; c < MAX_A_COL; c++){ 957 | for(unsigned int m = 0; m < SA_PE; m++){ 958 | for(unsigned int r = 0; r < MAX_A_ROW; r++){ 959 | if(r==0){ 960 | psum_4m=fifo_C_in[r][c][m].read(); 961 | } 962 | else{ 963 | temp_4m=fifo_C_in[r][c][m].read(); 964 | for(unsigned int x=0; x(temp_4m((x+1)*M_BIT-1,x*M_BIT)); 966 | res=ap_int(psum_4m((x+1)*M_BIT-1,x*M_BIT))+temp; 967 | psum_4m((x+1)*M_BIT-1,x*M_BIT)=res; 968 | } 969 | } 970 | } 971 | // 972 | // ap_int test0,test1,test2,test3; 973 | 974 | // (test0,test1,test2,test3)=psum_4m; 975 | // cout<<"test0:" < 995 | void PE_DSP_ACC(stream > fifo_C_in[PE], 996 | stream > fifo_C_res[PE], 997 | const unsigned OUT_H, 998 | const unsigned OUT_W, 999 | const unsigned PENUM, 1000 | const unsigned SIMDNUM, 1001 | const unsigned GROUPS, 1002 | const unsigned skip_mode) { 1003 | #pragma HLS INLINE OFF 1004 | 1005 | if(skip_mode==0){ 1006 | return; 1007 | } 1008 | 1009 | ap_int ACC_P2_prev[PE]; 1010 | #pragma HLS ARRAY_PARTITION variable=ACC_P2_prev dim=1 complete 1011 | 1012 | ap_int ACC_P3_prev[PE]; 1013 | #pragma HLS ARRAY_PARTITION variable=ACC_P3_prev dim=1 complete 1014 | 1015 | ap_int out0=0; 1016 | ap_int out1=0; 1017 | unsigned int Iter_NUM=K*SIMDNUM; 1018 | 1019 | #ifdef IN_DEBUG 1020 | FILE* fpw = fopen("w_40_test.txt", "wb"); 1021 | FILE* fpa = fopen("a_40_test.txt", "wb"); 1022 | 1023 | #endif 1024 | 1025 | for (unsigned int h = 0; h < OUT_H*GROUPS; h++) { // 40 1026 | #pragma HLS loop_tripcount min=OUT_H max=OUT_H 1027 | for (unsigned int peIdx = 0; peIdx < PENUM; peIdx++) { 1028 | for (unsigned int iter = 0; iter < Iter_NUM; iter++) { 1029 | for (unsigned int w = 0; w < OUT_W /2; w++) { // OUT_W / 2 80/2 1030 | #pragma HLS PIPELINE II=1 1031 | for(unsigned int m=0; m < PE; m++ ){ 1032 | bool m_clear = (w == 0); 1033 | // read FM-A 1034 | ap_int fifo_data_C; 1035 | fifo_data_C= fifo_C_in[m].read(); 1036 | 1037 | ap_int S0, S1,S2,S3; 1038 | (S3,S2,S1,S0)=fifo_data_C; 1039 | 1040 | if (m_clear){ // 1 1 1041 | out0=ACC_P2_prev[m]; 1042 | out1=S1; 1043 | } 1044 | else{// 0 1 1045 | out0=S0+ACC_P2_prev[m]; 1046 | out1=S1+ACC_P3_prev[m]; 1047 | } 1048 | ACC_P2_prev[m]=S2; 1049 | ACC_P3_prev[m]=S3; 1050 | 1051 | 1052 | // cout<<"out0:" < 1090 | ap_uint DeQuan_Bias_Unit( 1091 | 1092 | ap_int acc_in, 1093 | ap_int Bias, 1094 | ap_int Layer_Scale){ 1095 | 1096 | #pragma HLS inline off 1097 | ap_int<25> qy= acc_in+Bias; 1098 | 1099 | #ifdef DEQUAN_DEBUG 1100 | cout<<"acc_in: "< fixp_out; 1106 | 1107 | fixp_out=(ap_fixed<48, 40>(qy*Layer_Scale))>>Shift_Factor; 1108 | 1109 | #ifdef DEQUAN_DEBUG 1110 | cout<<"fixp_out: "< out; 1113 | out(DeQuan_BIT-1,0)=fixp_out(DeQuan_BIT-1,0); 1114 | 1115 | #ifdef DEQUAN_DEBUG 1116 | cout<<"out: "< 1127 | void Inter_Reorg_acc_to_Res( stream > fifo_C_in[MAX_OUP], 1128 | stream > fifo_C_out[MAX_OUP], 1129 | const unsigned OUT_H, 1130 | const unsigned OUT_W, 1131 | const unsigned PENUM, 1132 | const unsigned SIMDNUM, 1133 | const unsigned GROUPS, 1134 | const bool skip_mode) { 1135 | 1136 | 1137 | if(skip_mode==false){ 1138 | return; 1139 | } 1140 | 1141 | unsigned int total_num=(GROUPS)*OUT_H*PENUM*K*SIMDNUM*(OUT_W/2); 1142 | 1143 | ap_uint data0, data1; 1144 | 1145 | ap_uint reg[MAX_OUP]; 1146 | #pragma HLS ARRAY_PARTITION variable=reg dim=1 complete 1147 | ap_uint data_in; 1148 | ap_uint data_acc; 1149 | ap_uint res_out; 1150 | 1151 | 1152 | 1153 | 1154 | ap_uint<2*M_BIT> row_buf[MAX_OUP][MAX_W]; 1155 | #pragma HLS ARRAY_PARTITION variable=row_buf dim=1 complete 1156 | 1157 | // ap_uint<2*M_BIT> temp_2m; 1158 | // ap_uint<2*M_BIT> res_2m; 1159 | ap_int temp0,temp1; 1160 | ap_int res0,res1; 1161 | ap_int res_buf0,res_buf1; 1162 | 1163 | 1164 | unsigned int w=0; 1165 | unsigned int infoldIdx=0; 1166 | unsigned int outfoldIdx=0; 1167 | for(unsigned int i=0; i< MAX_OUP;i++){ 1168 | #pragma HLS UNROLL 1169 | (data1, data0) = fifo_C_in[i].read(); 1170 | reg[i]=data1; 1171 | } 1172 | 1173 | // for(unsigned int i=0;i 1253 | ap_uint<12> compute_mean_var(ap_uint temp_x){ 1254 | 1255 | ap_uint<12> var_x; 1256 | // dynamic compress 1257 | ap_uint<1> s; 1258 | ap_uint<4> x0_4b; 1259 | ap_uint<12> x0_12b; 1260 | ap_uint<2> contrl_reg; 1261 | ap_uint<2> shift_value; 1262 | 1263 | contrl_reg=temp_x[7,6]; 1264 | 1265 | if((contrl_reg!=0)){ 1266 | s=1; 1267 | shift_value=4; 1268 | } 1269 | else{ 1270 | s=0; 1271 | shift_value=2; 1272 | } 1273 | temp_x=temp_x>>shift_value; 1274 | 1275 | x0_4b=temp_x(3,0); 1276 | #ifdef RESULT_DEBUG 1277 | cout<<"x0_4b: "<(x0_4b*x0_4b))<<(4*s); 1282 | 1283 | #ifdef RESULT_DEBUG 1284 | cout<<"var_x: "< 1297 | ap_fixed compute_silu(ap_fixed input){ 1298 | 1299 | const ap_fixed<16, 2> onedivsixth = 0.16666666; 1300 | 1301 | #ifdef SILU_DEBUG 1302 | cout<<"input: "< temp=input+3; 1306 | 1307 | #ifdef SILU_DEBUG 1308 | cout<<"temp: "< relu6_temp; 1312 | if(temp>=6){ 1313 | relu6_temp=6; 1314 | } 1315 | else if(temp<=0){ 1316 | relu6_temp=0; 1317 | } 1318 | else{ 1319 | relu6_temp=temp; 1320 | } 1321 | 1322 | #ifdef SILU_DEBUG 1323 | cout<<"relu6_temp: "< temp_div_6; 1327 | temp_div_6=relu6_temp*onedivsixth; 1328 | 1329 | #ifdef SILU_DEBUG 1330 | cout<<"temp_div_6: "< silu_out; 1334 | 1335 | silu_out=temp_div_6*input; 1336 | 1337 | #ifdef SILU_DEBUG 1338 | cout<<"silu_out: "< 1350 | ap_fixed compute_gelu(ap_fixed input){ 1351 | #pragma HLS INLINE OFF 1352 | const ap_fixed<16, 2> onedivsixth = 0.16666666; 1353 | const ap_fixed<16, 2> constant_factor= 1.702; 1354 | 1355 | #ifdef GELU_DEBUG 1356 | cout<<"input: "< temp0=input*constant_factor; 1360 | 1361 | #ifdef GELU_DEBUG 1362 | cout<<"temp0: "< temp=temp0+3; 1367 | 1368 | #ifdef GELU_DEBUG 1369 | cout<<"temp: "< relu6_temp; 1373 | if(temp>=6){ 1374 | relu6_temp=6; 1375 | } 1376 | else if(temp<=0){ 1377 | relu6_temp=0; 1378 | } 1379 | else{ 1380 | relu6_temp=temp; 1381 | } 1382 | 1383 | #ifdef GELU_DEBUG 1384 | cout<<"relu6_temp: "< temp_div_6; 1388 | temp_div_6=relu6_temp*onedivsixth; 1389 | 1390 | #ifdef GELU_DEBUG 1391 | cout<<"temp_div_6: "< silu_out; 1395 | 1396 | silu_out=temp_div_6*input; 1397 | 1398 | #ifdef GELU_DEBUG 1399 | cout<<"silu_out: "< 1410 | // void SiLU_Unit(stream > in[MAX_NORM_PE], 1411 | // stream > out[MAX_NORM_PE], 1412 | // const unsigned NumLines, 1413 | // const bool SA_MODE, 1414 | // const bool NORM_MODE, 1415 | // const bool QUAN_MODE 1416 | // ){ 1417 | 1418 | // if(QUAN_MODE==false){ 1419 | // return; 1420 | // } 1421 | 1422 | // ap_uint x0,x1; 1423 | // ap_fixed fixp_x0,fixp_x1; 1424 | 1425 | // ap_fixed out0,out1; 1426 | // ap_uint<2*SILU_BIT> res; 1427 | 1428 | // const ap_fixed<16, 2> onedivsixth = 0.16666666; 1429 | 1430 | 1431 | // for (unsigned i = 0; i < NumLines; i++) { 1432 | // #pragma HLS PIPELINE II=1 1433 | // for(unsigned int c = 0; c < MAX_NORM_PE; c++){ 1434 | // ap_uint temp = in[c].read(); 1435 | 1436 | // if(NORM_MODE==true&&SA_MODE==true){ 1437 | // (x1,x0)=temp; 1438 | // fixp_x0(ILN_OUT_WIDTH-1,0)=x0(ILN_OUT_WIDTH-1,0); 1439 | // fixp_x1(ILN_OUT_WIDTH-1,0)=x1(ILN_OUT_WIDTH-1,0); 1440 | // out0=compute_silu(fixp_x0); 1441 | // out1=compute_silu(fixp_x1); 1442 | 1443 | // res=(out1(SILU_BIT-1,0),out0(SILU_BIT-1,0)); 1444 | 1445 | // } 1446 | // else{ 1447 | // res=temp; 1448 | 1449 | // } 1450 | // out[c].write(res); 1451 | // } 1452 | // } 1453 | 1454 | // } 1455 | 1456 | 1457 | 1458 | template < unsigned Wbit, 1459 | unsigned Ibit, 1460 | unsigned Mbit, 1461 | unsigned PACKNUM, 1462 | unsigned P> 1463 | ap_int DOT_NPacking( 1464 | ap_uint weights, 1465 | ap_uint in) 1466 | { 1467 | ap_int accumulation = 0; 1468 | 1469 | for (unsigned p = 0; p < P; p++) { 1470 | #pragma HLS UNROLL 1471 | ap_int result; 1472 | ap_uint W1,W0; 1473 | ap_uint A1,A0; 1474 | 1475 | (W1,W0)= weights( (p+1)*PACKNUM*Wbit-1, p*PACKNUM*Wbit ); 1476 | (A1,A0) = in( (p+1)*PACKNUM*Ibit-1, p*PACKNUM*Ibit ); 1477 | 1478 | // cout<<"W1:"<(W1)<<" W0:"<(W0)<(A1)<<" A0:"<(A0)< test=(ap_int(W1)*ap_int(A1))+(ap_int(W0)*ap_int(A0)); 1481 | 1482 | ap_uint<18> B_port=((ap_uint<18>)W1<<14)+W0; 1483 | ap_uint<22> A_port=((ap_uint<22>)A0<<14); 1484 | ap_uint D_port=A1; 1485 | ap_uint<42> P_port=(A_port+D_port)*B_port; 1486 | 1487 | 1488 | 1489 | ap_uint<1> signw0=W0.range(Wbit-1,Wbit-1); 1490 | ap_uint<1> signw1=W1.range(Wbit-1,Wbit-1); 1491 | ap_uint<1> signa0=A0.range(Ibit-1,Ibit-1); 1492 | ap_uint<1> signa1=A1.range(Ibit-1,Ibit-1); 1493 | 1494 | ap_uint<8> sign8w0=(signw0,signw0,signw0,signw0,signw0,signw0,signw0,signw0); 1495 | ap_uint<8> sign8w1=(signw1,signw1,signw1,signw1,signw1,signw1,signw1,signw1); 1496 | ap_uint<4> sign4a0=(signa0,signa0,signa0,signa0); 1497 | ap_uint<4> sign4a1=(signa1,signa1,signa1,signa1); 1498 | 1499 | ap_uint<9> out0_L0=(sign8w0&A0)+(sign8w1&A1); 1500 | ap_uint<5> out0_L1=(sign4a0&W0)+(sign4a1&W1); 1501 | ap_uint<12> res_correct=(ap_uint<13>(out0_L0)<<4)+(ap_uint<13>(out0_L1)<<8); 1502 | 1503 | ap_uint<13> out=P_port(26,14)-res_correct; 1504 | // cout<<"out:"< result_correct=out; 1506 | // cout<<"result_correct:"< 1524 | void SOFTMAX_WriteBUF(stream > in[MAX_OUP], 1525 | ap_uint ROW_T_buf[MAX_OUP][MAX_SOFTMAX_INBUF_LENGTH], 1526 | const unsigned PENUM 1527 | ){ 1528 | #pragma HLS INLINE OFF 1529 | 1530 | 1531 | unsigned int numlines; 1532 | 1533 | unsigned int outdIdx=0; 1534 | unsigned int w=0; 1535 | unsigned int h=0; 1536 | 1537 | 1538 | numlines= PENUM*MAX_INP*2; 1539 | ap_uint< DEQUAN_BIT*2> temp; 1540 | 1541 | for(unsigned m=0; m temp_x0, temp_x1; 1549 | ap_fixed temp_x0_fixp, temp_x1_fixp; 1550 | (temp_x1,temp_x0)=temp; 1551 | 1552 | temp_x0_fixp(DEQUAN_BIT-1,0)=temp_x0(DEQUAN_BIT-1,0); 1553 | temp_x1_fixp(DEQUAN_BIT-1,0)=temp_x1(DEQUAN_BIT-1,0); 1554 | 1555 | 1556 | cout<<"temp_x0_fixp:"< 1592 | void FIND_MAX_VALUE( 1593 | ap_fixed OUP_TempBuf[MAX_OUP], 1594 | ap_fixed &MAX_Temp 1595 | ){ 1596 | //#pragma HLS INLINE OFF 1597 | //#pragma HLS latency max=1 1598 | 1599 | 1600 | ap_fixed temp[MAX_OUP/2]; 1601 | 1602 | ap_fixed temp_temp[2]; 1603 | 1604 | ap_fixed temp_temp_temp; 1605 | 1606 | ap_fixed temp_temp_temp_temp; 1607 | 1608 | for(unsigned i=0; iOUP_TempBuf[2*i+1]?OUP_TempBuf[2*i]:OUP_TempBuf[2*i+1]; 1611 | } 1612 | 1613 | temp_temp[0]=temp[0]>temp[1]?temp[0]:temp[1]; 1614 | temp_temp[1]=temp[2]>temp[3]?temp[2]:temp[3]; 1615 | 1616 | temp_temp_temp=temp_temp[0]>temp_temp[1]?temp_temp[0]:temp_temp[1]; 1617 | temp_temp_temp_temp=temp_temp_temp>temp[4]?temp_temp_temp:temp[4]; 1618 | 1619 | 1620 | MAX_Temp=MAX_Temp>temp_temp_temp_temp?MAX_Temp:temp_temp_temp_temp; 1621 | 1622 | } 1623 | 1624 | 1625 | template < unsigned DEQUAN_BIT, 1626 | unsigned DEQUAN_INTEGER_BIT, 1627 | unsigned MAX_INP, 1628 | unsigned MAX_OUP, 1629 | unsigned MAX_SOFTMAX_INBUF_LENGTH> 1630 | void SOFTMAX_WriteBUF_ADDBUF(stream > in[MAX_OUP], 1631 | ap_uint ROW_T_buf[MAX_OUP][MAX_SOFTMAX_INBUF_LENGTH], 1632 | stream > NO_SOFTMAX_OUT[MAX_OUP], 1633 | ap_fixed tmax_M[MAX_INP][2], 1634 | const unsigned PENUM, 1635 | const bool EBMULT_MODE 1636 | ){ 1637 | #pragma HLS INLINE OFF 1638 | 1639 | // unsigned int loop0,loop1,loop2; 1640 | unsigned int numlines; 1641 | 1642 | unsigned int outdIdx=0; 1643 | unsigned int w=0; 1644 | unsigned int h=0; 1645 | unsigned int index; 1646 | 1647 | // loop0=2; 1648 | // loop1=MAX_INP; 1649 | // loop2=PENUM; // M/(MAX_OUP*2) 1650 | numlines= PENUM*MAX_INP*2; 1651 | ap_uint< DEQUAN_BIT*2> temp; 1652 | ap_uint temp_x0, temp_x1; 1653 | ap_fixed temp_x0_fixp, temp_x1_fixp; 1654 | 1655 | ap_uint< DEQUAN_BIT*2> first_temp; 1656 | ap_uint first_temp_x0, first_temp_x1; 1657 | ap_fixed first_temp_x0_fixp, first_temp_x1_fixp; 1658 | 1659 | ap_fixed first_temp_out0_fixp, first_temp_out1_fixp; 1660 | 1661 | ap_fixed MAX_TempBuf[MAX_INP][2]; 1662 | #pragma HLS ARRAY_PARTITION variable=MAX_TempBuf complete dim=2 1663 | 1664 | ap_fixed OUP_TempBuf[2][MAX_OUP]; 1665 | #pragma HLS ARRAY_PARTITION variable=OUP_TempBuf complete dim=0 1666 | 1667 | ap_fixed OUP_out_TempBuf[2][MAX_OUP]; 1668 | #pragma HLS ARRAY_PARTITION variable=OUP_out_TempBuf complete dim=0 1669 | 1670 | 1671 | ap_fixed max_before_temp0,max_after_temp0; 1672 | ap_fixed max_before_temp1,max_after_temp1; 1673 | 1674 | 1675 | for(unsigned j=0; j(OUP_TempBuf[0],MAX_TempBuf[outdIdx][0]); 1740 | FIND_MAX_VALUE(OUP_TempBuf[1],MAX_TempBuf[outdIdx][1]); 1741 | 1742 | // cout<<"MAX_TempBuf["< 5 | #include 6 | #include 7 | using namespace hls; 8 | using namespace std; 9 | 10 | //#define WINPUT_DEBUG 11 | 12 | template 13 | void DemuxStream2 ( 14 | stream >& in, 15 | stream >& out1, 16 | stream >& out2, 17 | const unsigned mode, 18 | const unsigned NumLines) 19 | { 20 | for (unsigned i = 0; i < NumLines; i++) { 21 | #pragma HLS PIPELINE II=1 22 | ap_uint temp = in.read(); 23 | if (mode == 0) 24 | out1.write(temp); // to_mm 25 | else 26 | out2.write(temp); // to_conv3 27 | } 28 | } 29 | 30 | 31 | template 32 | void MuxStream2( 33 | stream >& in1, 34 | stream >& in2, 35 | stream >& out, 36 | const bool mode, 37 | const unsigned NumLines) 38 | { 39 | for (unsigned i = 0; i < NumLines; i++) { 40 | #pragma HLS PIPELINE II=1 41 | ap_uint temp; 42 | if (mode == false) 43 | temp = in1.read(); 44 | else 45 | temp = in2.read(); 46 | out.write(temp); 47 | } 48 | } 49 | 50 | 51 | 52 | template 53 | void MuxStream2_P( 54 | stream > in1[MAX_PE], 55 | stream > in2[MAX_PE], 56 | stream > out[MAX_PE], 57 | const unsigned NumLines, 58 | const bool mode) 59 | { 60 | 61 | 62 | for (unsigned i = 0; i < NumLines; i++) { 63 | #pragma HLS PIPELINE II=1 64 | ap_uint temp; 65 | for(unsigned j=0; j 78 | void MuxStream2_P_BRANCH( 79 | stream > in1[MAX_PE], 80 | stream > in2[MAX_PE], 81 | stream > out[MAX_PE], 82 | const unsigned NumLines, 83 | const bool mode, 84 | const bool mode_en) 85 | { 86 | if(mode_en==false){ 87 | return; 88 | } 89 | 90 | for (unsigned i = 0; i < NumLines; i++) { 91 | #pragma HLS PIPELINE II=1 92 | ap_uint temp; 93 | for(unsigned j=0; j 107 | void MuxStream3_P_BRANCH( 108 | stream > in_shortcut[MAX_PE], 109 | stream > in_nonorm[MAX_PE], 110 | stream > in_emult[MAX_PE], 111 | stream > out[MAX_PE], 112 | const unsigned NumLines, 113 | const bool SHORCUT_ADD_MODE, 114 | const bool EBMULT_MODE, 115 | const bool SHORCUT_QUAN_MODE) 116 | { 117 | if(SHORCUT_QUAN_MODE==false){ 118 | return; 119 | } 120 | 121 | for (unsigned i = 0; i < NumLines; i++) { 122 | #pragma HLS PIPELINE II=1 123 | ap_uint temp; 124 | for(unsigned j=0; j 143 | void MuxStream3_P( 144 | stream > norm_in[MAX_PE], 145 | stream > softmax_in[MAX_PE], 146 | stream > gelu_in[MAX_PE], 147 | stream > out[MAX_PE], 148 | const unsigned NumLines, 149 | const bool NORM_MODE, 150 | const bool SOFTMAX_MODE, 151 | const bool GELU_MODE, 152 | const bool TRANSPOSE_MODE 153 | ) 154 | { 155 | 156 | if(NORM_MODE==false&&SOFTMAX_MODE==false&&GELU_MODE==false&&TRANSPOSE_MODE==false){ 157 | return; 158 | } 159 | 160 | for (unsigned i = 0; i < NumLines; i++) { 161 | #pragma HLS PIPELINE II=1 162 | ap_uint temp; 163 | for(unsigned j=0; j 178 | void MuxStream2_RC( 179 | stream > in1[A_ROW][A_COL], 180 | stream > in2[A_ROW][A_COL], 181 | stream > out[A_ROW][A_COL], 182 | const unsigned NumLines, 183 | const bool mode) 184 | { 185 | for (unsigned i = 0; i < NumLines; i++) { 186 | #pragma HLS PIPELINE II=1 187 | for (unsigned x = 0; x < A_ROW; x++) { 188 | for (unsigned y = 0; y < A_COL; y++) { 189 | ap_uint temp; 190 | if (mode == false) 191 | temp = in1[x][y].read(); 192 | else 193 | temp = in2[x][y].read(); 194 | out[x][y].write(temp); 195 | } 196 | } 197 | } 198 | } 199 | 200 | 201 | template 203 | void MM_to_CONV3_Stream( 204 | stream >& in, 205 | stream >& out, 206 | const unsigned NumLines, 207 | const bool skip_mode) 208 | { 209 | 210 | 211 | 212 | if(skip_mode==true){ 213 | return; 214 | } 215 | 216 | #ifdef WINPUT_DEBUG 217 | // FILE* fpa = fopen("a_stream_pe00_gold.txt", "wb"); 218 | FILE* fpw = fopen("w_stream.txt", "wb"); 219 | #endif 220 | 221 | ap_uint temp_in; 222 | ap_uint temp_w0; 223 | ap_uint temp_w1; 224 | ap_uint temp_w_exp; 225 | ap_uint temp_out; 226 | 227 | for (unsigned i = 0; i < NumLines; i++) { 228 | #pragma HLS PIPELINE II=1 229 | temp_in = in.read(); 230 | 231 | for(unsigned j=0;j test_w0=temp_w0; 236 | ap_int test_w1=temp_w1; 237 | fprintf(fpw, "%d\n", (int)test_w0); 238 | fprintf(fpw, "%d\n", (int)test_w1); 239 | #endif 240 | temp_out=temp_out>>(W_BIT*PACK_CONV_NUM); 241 | temp_w_exp=((ap_uint)temp_w1<<8)+temp_w0; 242 | temp_out(MAX_OUP * PACK_CONV_NUM * W_BIT-1,((MAX_OUP-1) * PACK_CONV_NUM) * W_BIT)=temp_w_exp; 243 | } 244 | out.write(temp_out); 245 | 246 | } 247 | 248 | #ifdef WINPUT_DEBUG 249 | fclose(fpw); 250 | #endif 251 | 252 | } 253 | 254 | 255 | 256 | template < unsigned InStreamW, 257 | unsigned OutStreamW, 258 | unsigned MAX_PE> 259 | void ExpandWidth_P( 260 | stream > in[MAX_PE], 261 | stream > out[MAX_PE], 262 | const unsigned NumLines, 263 | const unsigned skip_mode 264 | ){ 265 | 266 | if(skip_mode==0){ 267 | return; 268 | } 269 | 270 | const unsigned parts = OutStreamW/InStreamW; 271 | ap_uint buffer[MAX_PE]; 272 | #pragma HLS ARRAY_PARTITION variable=temp_in dim=1 complete 273 | int index=0; 274 | 275 | 276 | for (unsigned rep = 0; rep < NumLines*parts; rep++) { //400*400 277 | #pragma HLS loop_tripcount min=NumLines max=NumLines 278 | 279 | #pragma HLS PIPELINE II=1 280 | for (unsigned i = 0; i < MAX_PE; i++) { 281 | ap_uint temp = in[i].read(); 282 | buffer[i]( (index+1)*InStreamW-1, index*InStreamW ) = temp; 283 | } 284 | 285 | if(index==parts-1){ 286 | for (unsigned i = 0; i < MAX_PE; i++) { 287 | out[i].write(buffer[i]); 288 | } 289 | } 290 | 291 | 292 | 293 | if(index==parts-1){ 294 | index=0; 295 | } 296 | else{ 297 | index++; 298 | } 299 | 300 | } 301 | 302 | 303 | } 304 | 305 | 306 | 307 | 308 | template < unsigned InStreamW, 309 | unsigned IN_PE, 310 | unsigned OU_PE> 311 | void ExpandWidth_OUP( 312 | stream > in[IN_PE], 313 | stream > out[OU_PE], 314 | const unsigned NumLines, 315 | const bool skip_mode 316 | ){ 317 | 318 | if(skip_mode==false){ 319 | return; 320 | } 321 | 322 | const unsigned parts = OU_PE/IN_PE; 323 | 324 | int index=0; 325 | 326 | 327 | for (unsigned rep = 0; rep < NumLines; rep++) { //400*400 328 | #pragma HLS loop_tripcount min=NumLines max=NumLines 329 | #pragma HLS PIPELINE II=1 330 | 331 | for (unsigned i = 0; i < IN_PE; i++) { 332 | ap_uint temp = in[i].read(); 333 | out[index*IN_PE+i].write(temp); 334 | } 335 | 336 | if(index==parts-1){ 337 | index=0; 338 | } 339 | else{ 340 | index++; 341 | } 342 | 343 | } 344 | 345 | 346 | } 347 | 348 | 349 | 350 | template < unsigned InStreamW, 351 | unsigned OutStreamW, 352 | unsigned MAX_PE> 353 | void ReduceWidth_P( 354 | stream > in[MAX_PE], 355 | stream > out[MAX_PE], 356 | const unsigned NumLines, 357 | const bool skip_mode){ 358 | 359 | if(skip_mode==true){ 360 | return; 361 | } 362 | 363 | const unsigned parts = InStreamW/OutStreamW; 364 | ap_uint temp_in[MAX_PE]; 365 | #pragma HLS ARRAY_PARTITION variable=temp_in dim=1 complete 366 | 367 | for (unsigned rep = 0; rep < NumLines; rep++) { //400*400*3*3 368 | #pragma HLS loop_tripcount min=NumLines max=NumLines 369 | #pragma HLS PIPELINE II=1 370 | for (unsigned i = 0; i < MAX_PE; i++) { 371 | temp_in[i] = in[i].read(); 372 | for (unsigned p = 0; p < parts; p++) { 373 | 374 | ap_uint temp_out = temp_in[i](OutStreamW-1, 0); 375 | out[i].write(temp_out); 376 | temp_in[i] = temp_in[i] >> OutStreamW; 377 | } 378 | } 379 | } 380 | } 381 | 382 | 383 | 384 | 385 | 386 | template < unsigned InStreamW, 387 | unsigned OutStreamW> 388 | void ReduceWidth( 389 | stream > & in, 390 | stream > & out, 391 | const unsigned NumLines) 392 | { 393 | const unsigned parts = InStreamW/OutStreamW; 394 | 395 | for (unsigned rep = 0; rep < NumLines; rep++) { //400*400*3*3 396 | #pragma HLS loop_tripcount min=NumLines max=NumLines 397 | #pragma HLS PIPELINE II=InStreamW/OutStreamW 398 | 399 | ap_uint temp_in = in.read(); 400 | for (unsigned p = 0; p < parts; p++) { 401 | 402 | ap_uint temp_out = temp_in(OutStreamW-1, 0); 403 | out.write( temp_out ); 404 | temp_in = temp_in >> OutStreamW; 405 | } 406 | } 407 | } 408 | 409 | 410 | template < unsigned InStreamW, 411 | unsigned OutStreamW> 412 | void ReduceWidth_EN( 413 | stream > & in, 414 | stream > & out, 415 | const unsigned NumLines, 416 | const bool skip_mode) 417 | { 418 | if(skip_mode==false){ 419 | return; 420 | } 421 | const unsigned parts = InStreamW/OutStreamW; 422 | 423 | for (unsigned rep = 0; rep < NumLines; rep++) { //400*400*3*3 424 | #pragma HLS loop_tripcount min=NumLines max=NumLines 425 | #pragma HLS PIPELINE II=InStreamW/OutStreamW 426 | 427 | ap_uint temp_in = in.read(); 428 | for (unsigned p = 0; p < parts; p++) { 429 | 430 | ap_uint temp_out = temp_in(OutStreamW-1, 0); 431 | out.write( temp_out ); 432 | temp_in = temp_in >> OutStreamW; 433 | } 434 | } 435 | } -------------------------------------------------------------------------------- /test.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "config.h" 4 | #include "config_test.h" 5 | #define GLODEN_DEBUG 6 | using namespace std; 7 | 8 | 9 | 10 | void generate_mm_shortcut_128btest(ap_uint<128> *mm_ddr_shortcut){ 11 | 12 | unsigned seed=0; 13 | srand(seed); 14 | 15 | ap_uint a; 16 | ap_uint<16> a_16b; 17 | ap_uint temp_80b0,temp_80b1; 18 | ap_uint<128> temp_128b0,temp_128b1; 19 | int cnt=0; 20 | for(int i=0; i>16; 25 | a_16b=rand(); 26 | // cout <<"The Value of Var_p: \t" < *mm_ddr_shortcut){ 50 | 51 | ap_uint a; 52 | ap_uint<16> a_16b; 53 | for(int i=0; i>16; 57 | a_16b=rand(); 58 | // cout <<"The Value of Var_p: \t" <)a; 64 | a=0; 65 | 66 | 67 | } 68 | 69 | } 70 | 71 | // conv-shortcut-output 72 | void generate_conv_shortcut_512btest(ap_uint<512> *conv_ddr_fcvu){ 73 | 74 | ap_uint a; 75 | ap_uint<16> a_16b; 76 | for(int i=0; i>16; 80 | a_16b=rand(); 81 | // cout <<"The Value of Var_p: \t" <)a; 87 | a=0; 88 | 89 | 90 | } 91 | 92 | } 93 | 94 | 95 | // conv-fcvu-output 96 | 97 | void generate_conv_fcvu_512btest(ap_uint<512> *conv_ddr_fcvu){ 98 | 99 | ap_uint a; 100 | ap_uint<16> a_16b; 101 | for(int i=0; i>16; 105 | a_16b=rand(); 106 | // cout <<"The Value of Var_p: \t" <)a; 112 | a=0; 113 | 114 | 115 | } 116 | 117 | } 118 | 119 | 120 | // ln------parameter 121 | 122 | 123 | void generate_ln_bias(LN_BIAS_DB *INPUT_GaMMA, LN_BIAS_DB *INPUT_BeTa, int NUM){ 124 | unsigned seed=0; 125 | srand(seed); 126 | 127 | FILE* fp_true_res0 = fopen("ln_gama.txt", "wb"); 128 | FILE* fp_true_res1 = fopen("ln_beta.txt", "wb"); 129 | LN_BIAS_DB fp_temp0,fp_temp1; 130 | for(int i=0; i ptf_factor[CONV_M]){ 150 | 151 | unsigned seed=0; 152 | srand(seed); 153 | 154 | FILE* fp_true_A = fopen("ln_conv_ptf_factor.txt", "wb"); 155 | ap_uint<2> temp_2bit; 156 | ap_uint<8> temp_8bit; 157 | 158 | for (int j = 0; j < CONV_M/4; j++) { 159 | for (int r = 0; r < 4; r++) { 160 | 161 | temp_2bit = (ap_uint)rand(); 162 | temp_8bit((r+1)*2-1,r*2)=temp_2bit; 163 | // std::cout <<"The Value of A["< ptf_factor[M]){ 175 | 176 | unsigned seed=0; 177 | srand(seed); 178 | 179 | FILE* fp_true_A = fopen("ln_mm_ptf_factor.txt", "wb"); 180 | ap_uint<2> temp_2bit; 181 | ap_uint<8> temp_8bit; 182 | 183 | for (int j = 0; j < M/4; j++) { 184 | for (int r = 0; r < 4; r++) { 185 | 186 | temp_2bit = (ap_uint)rand(); 187 | temp_8bit((r+1)*2-1,r*2)=temp_2bit; 188 | // std::cout <<"The Value of A["< bias[CONV_M/CONV_D][CONV_D/MAX_OUP][MAX_OUP]){ 200 | 201 | unsigned seed=0; 202 | srand(seed); 203 | int cnt=0; 204 | FILE* fp_true_bias = fopen("true_bias.txt", "wb"); 205 | for (int n=0; n)rand()-(ap_int)rand(); 209 | bias[n][k][m] = cnt; 210 | // std::cout <<"The Value of W["< bias[M/(2*MAX_OUP)][2][MAX_OUP]){ 224 | 225 | unsigned seed=0; 226 | srand(seed); 227 | int cnt=0; 228 | FILE* fp_true_bias = fopen("true_mm_bias.txt", "wb"); 229 | for (int n=0; n)rand()-(ap_int)rand(); 233 | bias[n][k][m] = cnt; 234 | // std::cout <<"The Value of W["< *input_ln_parameter){ 248 | 249 | ap_int conv3_bias[CONV_M/CONV_D][CONV_D/MAX_OUP][MAX_OUP]; 250 | 251 | generate_conv_bias(conv3_bias); 252 | 253 | ap_int conv3_scale_factor=ap_int(1234); 254 | 255 | 256 | LN_BIAS_DB *input_ln_gamma = (LN_BIAS_DB *)malloc(CONV_M*sizeof(LN_BIAS_DB) ); 257 | // LN_BIAS_DB input_gamma[CONV_M]; 258 | 259 | LN_BIAS_DB *input_ln_beta = (LN_BIAS_DB *)malloc(CONV_M*sizeof(LN_BIAS_DB) ); 260 | 261 | generate_ln_bias(input_ln_gamma,input_ln_beta,CONV_M); 262 | 263 | 264 | ap_uint<8> conv_ln_ptf_factor[CONV_M/4]; 265 | 266 | 267 | generate_conv_ln_ptf_factor(conv_ln_ptf_factor); 268 | 269 | 270 | 271 | 272 | 273 | memcpy(input_ln_parameter, conv3_bias, (CONV_M)*2); 274 | 275 | input_ln_parameter[CONV_M/16]=conv3_scale_factor; 276 | cout< *mm_ddr_bias){ 294 | 295 | ap_uint<256> a; 296 | ap_uint<16> a_16b; 297 | for(int i=0; i<23; i++){ 298 | 299 | for(int j=0;j<16;j++){ 300 | a=a>>16; 301 | a_16b=rand(); 302 | // cout <<"The Value of Var_p: \t" < *mm_ddr_bias, unsigned num0, unsigned numlines){ 318 | 319 | 320 | unsigned seed=0; 321 | srand(seed); 322 | 323 | ap_uint<128> a; 324 | ap_uint<16> a_16b; 325 | for(int i=0; i>16; 329 | a_16b=rand(); 330 | // cout <<"The Value of Var_p: \t" <)a; 336 | a=0; 337 | } 338 | 339 | 340 | ap_uint conv3_scale_factor=ap_uint(275); 341 | ap_uint integer_conv3_scale_factor; 342 | integer_conv3_scale_factor(BIAS_BIT-1,0)=conv3_scale_factor(BIAS_BIT-1,0); 343 | 344 | 345 | ap_fixed<16, 8> quan_factor=41.2345; 346 | ap_uint integer_quan_factor; 347 | integer_quan_factor(BIAS_BIT-1,0)=quan_factor(BIAS_BIT-1,0); 348 | 349 | cout<<"quan_factor:"< short_quan_factor=32.2345; 352 | ap_uint integer_short_quan_factor; 353 | integer_short_quan_factor(BIAS_BIT-1,0)=short_quan_factor(BIAS_BIT-1,0); 354 | 355 | cout<<"short_quan_factor:"< fc_scale_factor=ap_uint(200); 358 | ap_uint integer_fc_scale_factor; 359 | integer_fc_scale_factor(BIAS_BIT-1,0)=fc_scale_factor(BIAS_BIT-1,0); 360 | 361 | ap_fixed<16, 8> short_dequan_factor=32.2345; 362 | ap_uint integer_short_dequan_factor; 363 | integer_short_dequan_factor(BIAS_BIT-1,0)=short_dequan_factor(BIAS_BIT-1,0); 364 | 365 | ap_uint integer_factor; 366 | integer_factor=(integer_short_dequan_factor, fc_scale_factor, integer_short_quan_factor,integer_quan_factor,integer_conv3_scale_factor); 367 | 368 | mm_ddr_bias[num0]=(ap_uint<128>)integer_factor; 369 | 370 | 371 | for(int i=0; i>16; 375 | a_16b=rand(); 376 | // cout <<"The Value of Var_p: \t" <)a; 382 | a=0; 383 | } 384 | 385 | 386 | 387 | } 388 | 389 | 390 | void generate_conv_allbias_512btest(ap_uint *mm_ddr_bias, unsigned num0, unsigned numlines){ 391 | 392 | 393 | unsigned seed=0; 394 | srand(seed); 395 | 396 | ap_uint<512> a; 397 | ap_uint<16> a_16b; 398 | for(int i=0; i>16; 402 | a_16b=rand(); 403 | // cout <<"The Value of Var_p: \t" <)a; 409 | a=0; 410 | } 411 | 412 | 413 | ap_uint conv3_scale_factor=ap_uint(275); 414 | ap_uint integer_conv3_scale_factor; 415 | integer_conv3_scale_factor(BIAS_BIT-1,0)=conv3_scale_factor(BIAS_BIT-1,0); 416 | 417 | 418 | ap_fixed<16, 8> quan_factor=41.2345; 419 | ap_uint integer_quan_factor; 420 | integer_quan_factor(BIAS_BIT-1,0)=quan_factor(BIAS_BIT-1,0); 421 | 422 | cout<<"quan_factor:"< short_quan_factor=32.2345; 425 | ap_uint integer_short_quan_factor; 426 | integer_short_quan_factor(BIAS_BIT-1,0)=short_quan_factor(BIAS_BIT-1,0); 427 | 428 | cout<<"short_quan_factor:"< fc_scale_factor=ap_uint(200); 431 | ap_uint integer_fc_scale_factor; 432 | integer_fc_scale_factor(BIAS_BIT-1,0)=fc_scale_factor(BIAS_BIT-1,0); 433 | 434 | ap_fixed<16, 8> short_dequan_factor=32.2345; 435 | ap_uint integer_short_dequan_factor; 436 | integer_short_dequan_factor(BIAS_BIT-1,0)=short_dequan_factor(BIAS_BIT-1,0); 437 | 438 | ap_uint integer_factor; 439 | integer_factor=(integer_short_dequan_factor, fc_scale_factor, integer_short_quan_factor,integer_quan_factor,integer_conv3_scale_factor); 440 | 441 | mm_ddr_bias[num0]=(ap_uint<512>)integer_factor; 442 | 443 | 444 | for(int i=0; i>16; 448 | a_16b=rand(); 449 | // cout <<"The Value of Var_p: \t" <)a; 455 | a=0; 456 | } 457 | 458 | 459 | 460 | } 461 | 462 | 463 | void generate_mm_allbias_512btest(ap_uint *mm_ddr_bias){ 464 | 465 | 466 | unsigned seed=0; 467 | srand(seed); 468 | 469 | ap_uint<320> a; 470 | ap_uint<16> a_16b; 471 | for(int i=0; i<23; i++){ 472 | 473 | for(int j=0;j<20;j++){ 474 | a=a>>16; 475 | a_16b=rand(); 476 | // cout <<"The Value of Var_p: \t" <)a; 482 | a=0; 483 | 484 | 485 | } 486 | 487 | } 488 | 489 | 490 | 491 | 492 | //////////////////////////// 493 | 494 | void generate_mm_a(ap_int A[R][N]){ 495 | 496 | unsigned seed=0; 497 | srand(seed); 498 | 499 | FILE* fp_true_A = fopen("true_A.txt", "wb"); 500 | 501 | for (int r = 0; r < R; r++) { 502 | for(int n=0; n)rand()-(ap_int)rand(); 504 | // std::cout <<"The Value of A["< W[N][M]){ 516 | 517 | unsigned seed=0; 518 | srand(seed); 519 | 520 | FILE* fp_true_W = fopen("true_W.txt", "wb"); 521 | for (int n=0; n)rand()-(ap_int)rand(); 524 | // std::cout <<"The Value of W["< conv3_A[CONV_R][CONV_C][CONV_N]){ 537 | 538 | unsigned seed=0; 539 | srand(seed); 540 | 541 | FILE* fp_true_A = fopen("true_conv3_A.txt", "wb"); 542 | for (int r=0; r)rand()-(ap_int)rand(); 546 | // std::cout <<"The Value of W["< conv3_weight[CONV_K][CONV_K][CONV_N][CONV_M]){ 558 | 559 | unsigned seed=0; 560 | srand(seed); 561 | 562 | FILE* fp_true_W = fopen("true_conv3_W.txt", "wb"); 563 | for (int kr=0; kr)rand()-(ap_int)rand(); 568 | // std::cout <<"The Value of W["< conv3_A[CONV_R][CONV_C][CONV_N], 582 | ap_uint *conv3_ddr_a){ 583 | 584 | int cnt=0; 585 | ap_int temp; 586 | for (int r=0; r>IN_BIT; 592 | temp(MAX_INP * PACK_NUM * IN_BIT-1,(MAX_INP * PACK_NUM-1) * IN_BIT)=conv3_A[r][c*PACK_NUM+s][n*MAX_INP+x]; 593 | } 594 | } 595 | conv3_ddr_a[cnt]=temp; 596 | cnt++; 597 | } 598 | } 599 | } 600 | 601 | } 602 | 603 | // 604 | void reorg_conv3_weight(ap_int conv3_weight[CONV_K][CONV_K][CONV_N][CONV_M], 605 | ap_uint<256> packing_conv3_weight[((CONV_K*CONV_N)/MAX_INP)*(CONV_M/(MAX_A_COL))][MAX_A_COL]){ 606 | 607 | ap_uint packing_conv3_tmp[((CONV_K*CONV_N)/MAX_INP)*(CONV_M/MAX_OUP)][MAX_OUP]; 608 | ap_uint tmp; 609 | ap_uint<256> tmp_256; 610 | 611 | // cout<<((CONV_K*CONV_N)/MAX_INP)*(CONV_M/MAX_OUP)<>W_BIT; 620 | tmp(MAX_INP * CONV_K * W_BIT-1,(MAX_INP * CONV_K-1) * W_BIT)=conv3_weight[kr][kc][n*MAX_INP+x][m*MAX_OUP+y]; 621 | } 622 | } 623 | packing_conv3_tmp[m*CONV_K*(CONV_N/MAX_INP)+kr*(CONV_N/MAX_INP)+n][y]=tmp; 624 | // cout<<"d1: "< A[R][N], ap_int W[N][M],float O_golden[R][M]){ 657 | 658 | FILE* fp_true_res = fopen("gloden_out.txt", "wb"); 659 | for (int r = 0; r < R; r++){ 660 | for (int m=0; m *conv3_ddr_to){ 674 | 675 | FILE* fp1 = fopen(name1, "rb"); 676 | int i = 0; 677 | int j = 0; 678 | ap_int<4> tmp; //输入为3bit 679 | ap_uint in_data; 680 | 681 | int rep; 682 | int temp; 683 | FILE* fp_input = fopen("input_verfy.txt", "wb"); 684 | if ((fp1 == NULL)) { 685 | std::cout << "Load Error!" << std::endl; 686 | return false; 687 | } 688 | 689 | 690 | for (i = 0; i)temp; //数据转化为输入bit 695 | fprintf(fp_input, "%d\n", int(tmp)); 696 | // cout << "start:" << tmp < A_from[R][N], ap_uint *DDR_A_to){ 711 | /* Variable Declaration */ 712 | ap_uint packing_in; 713 | /* Variable Declaration */ 714 | 715 | #ifdef GLODEN_DEBUG 716 | // FILE* fpa = fopen("a_stream_pe00_gold.txt", "wb"); 717 | FILE* fpa = fopen("a_stream_pe1x_gold.txt", "wb"); 718 | #endif 719 | int cnt=0; 720 | for(int r=0;r>IN_BIT; 725 | packing_in(MAX_INP * PACK_NUM * IN_BIT-1,(MAX_INP * PACK_NUM-1) * IN_BIT)=A_from[r*MAX_INP*PACK_NUM+x*PACK_NUM+sr][n]; 726 | 727 | #ifdef GLODEN_DEBUG 728 | if(x==1){ 729 | ap_int a0=A_from[r*MAX_INP*PACK_NUM+x*PACK_NUM+sr][n]; 730 | 731 | fprintf(fpa, "%d\n", (int)a0); 732 | } 733 | 734 | #endif 735 | } 736 | } 737 | DDR_A_to[cnt]=packing_in; 738 | cnt++; 739 | } 740 | } 741 | 742 | #ifdef GLODEN_DEBUG 743 | fclose(fpa); 744 | #endif 745 | 746 | } 747 | 748 | 749 | 750 | void host_DDR_A_512b(ap_int A_from[R][N], ap_uint<512> *DDR_A_to){ 751 | /* Variable Declaration */ 752 | ap_uint packing_in; 753 | ap_uint<512> packing_in_512b; 754 | /* Variable Declaration */ 755 | 756 | #ifdef GLODEN_DEBUG 757 | // FILE* fpa = fopen("a_stream_pe00_gold.txt", "wb"); 758 | FILE* fpa = fopen("a_stream_pe1x_gold.txt", "wb"); 759 | #endif 760 | int cnt=0; 761 | for(int r=0;r>IN_BIT; 766 | packing_in(MAX_INP * PACK_NUM * IN_BIT-1,(MAX_INP * PACK_NUM-1) * IN_BIT)=A_from[r*MAX_INP*PACK_NUM+x*PACK_NUM+sr][n]; 767 | 768 | #ifdef GLODEN_DEBUG 769 | if(x==1){ 770 | ap_int a0=A_from[r*MAX_INP*PACK_NUM+x*PACK_NUM+sr][n]; 771 | 772 | fprintf(fpa, "%d\n", (int)a0); 773 | } 774 | 775 | #endif 776 | } 777 | } 778 | packing_in_512b=((ap_uint<512 - MAX_INP * PACK_NUM * IN_BIT>)0,packing_in); 779 | DDR_A_to[cnt]=packing_in_512b; 780 | cnt++; 781 | } 782 | } 783 | 784 | #ifdef GLODEN_DEBUG 785 | fclose(fpa); 786 | #endif 787 | 788 | } 789 | 790 | 791 | void host_DDR_A_128b(ap_int A_from[R][N], ap_uint<128> *DDR_A_to){ 792 | /* Variable Declaration */ 793 | ap_uint packing_in; 794 | ap_uint<384> packing_in_384b; 795 | ap_uint<128> packing_in_128b0,packing_in_128b1,packing_in_128b2; 796 | 797 | /* Variable Declaration */ 798 | 799 | #ifdef GLODEN_DEBUG 800 | // FILE* fpa = fopen("a_stream_pe00_gold.txt", "wb"); 801 | FILE* fpa = fopen("a_stream_pe1x_gold.txt", "wb"); 802 | #endif 803 | int cnt=0; 804 | for(int r=0;r>IN_BIT; 809 | packing_in(MAX_INP * PACK_NUM * IN_BIT-1,(MAX_INP * PACK_NUM-1) * IN_BIT)=A_from[r*MAX_INP*PACK_NUM+x*PACK_NUM+sr][n]; 810 | 811 | #ifdef GLODEN_DEBUG 812 | if(x==1){ 813 | ap_int a0=A_from[r*MAX_INP*PACK_NUM+x*PACK_NUM+sr][n]; 814 | 815 | fprintf(fpa, "%d\n", (int)a0); 816 | } 817 | 818 | #endif 819 | } 820 | } 821 | packing_in_384b=((ap_uint<384 - MAX_INP * PACK_NUM * IN_BIT>)0,packing_in); 822 | (packing_in_128b2,packing_in_128b1,packing_in_128b0)=packing_in_384b; 823 | 824 | DDR_A_to[cnt]=packing_in_128b0; 825 | cnt++; 826 | 827 | DDR_A_to[cnt]=packing_in_128b1; 828 | cnt++; 829 | 830 | DDR_A_to[cnt]=packing_in_128b2; 831 | cnt++; 832 | 833 | } 834 | } 835 | 836 | #ifdef GLODEN_DEBUG 837 | fclose(fpa); 838 | #endif 839 | 840 | } 841 | 842 | 843 | 844 | void host_DDR_W_Softmax(ap_int W_from[N][M], ap_uint*DDR_W_to){ 845 | /* Variable Declaration */ 846 | ap_uint packing_in; 847 | /* Variable Declaration */ 848 | 849 | #ifdef GLODEN_DEBUG 850 | FILE* fpw = fopen("w_stream_pex1_gold.txt", "wb"); 851 | #endif 852 | int cnt=0; 853 | 854 | for(int rep=0;rep<2;rep++){ 855 | for(int m=0;m>W_BIT; 860 | packing_in(MAX_INP * PACK_NUM * W_BIT-1,(MAX_INP * PACK_NUM-1) * W_BIT)=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 861 | 862 | 863 | #ifdef GLODEN_DEBUG 864 | if(y==1){ 865 | ap_int w0=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 866 | 867 | fprintf(fpw, "%d\n", (int)w0); 868 | } 869 | #endif 870 | 871 | } 872 | } 873 | DDR_W_to[cnt]=packing_in; 874 | cnt++; 875 | } 876 | } 877 | } 878 | 879 | 880 | #ifdef GLODEN_DEBUG 881 | fclose(fpw); 882 | #endif 883 | 884 | } 885 | 886 | 887 | void host_DDR_W_Softmax_128b(ap_int W_from[N][M], ap_uint<128> *DDR_W_to){ 888 | /* Variable Declaration */ 889 | ap_uint packing_in; 890 | ap_uint<128> packing_in_256b; 891 | /* Variable Declaration */ 892 | 893 | #ifdef GLODEN_DEBUG 894 | FILE* fpw = fopen("w_stream_pex1_gold.txt", "wb"); 895 | #endif 896 | int cnt=0; 897 | 898 | for(int rep=0;rep<2;rep++){ 899 | for(int m=0;m>W_BIT; 904 | packing_in(MAX_OUP * PACK_NUM * W_BIT-1,(MAX_OUP * PACK_NUM-1) * W_BIT)=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 905 | 906 | 907 | #ifdef GLODEN_DEBUG 908 | if(y==1){ 909 | ap_int w0=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 910 | 911 | fprintf(fpw, "%d\n", (int)w0); 912 | } 913 | #endif 914 | 915 | } 916 | } 917 | packing_in_256b=((ap_uint<128 - MAX_OUP * PACK_NUM * W_BIT>)0, packing_in); 918 | DDR_W_to[cnt]=packing_in; 919 | cnt++; 920 | } 921 | } 922 | } 923 | 924 | 925 | #ifdef GLODEN_DEBUG 926 | fclose(fpw); 927 | #endif 928 | 929 | } 930 | 931 | void host_DDR_W_Softmax_256b(ap_int W_from[N][M], ap_uint<256> *DDR_W_to){ 932 | /* Variable Declaration */ 933 | ap_uint packing_in; 934 | ap_uint<256> packing_in_256b; 935 | /* Variable Declaration */ 936 | 937 | #ifdef GLODEN_DEBUG 938 | FILE* fpw = fopen("w_stream_pex1_gold.txt", "wb"); 939 | #endif 940 | int cnt=0; 941 | 942 | for(int rep=0;rep<2;rep++){ 943 | for(int m=0;m>W_BIT; 948 | packing_in(MAX_INP * PACK_NUM * W_BIT-1,(MAX_INP * PACK_NUM-1) * W_BIT)=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 949 | 950 | 951 | #ifdef GLODEN_DEBUG 952 | if(y==1){ 953 | ap_int w0=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 954 | 955 | fprintf(fpw, "%d\n", (int)w0); 956 | } 957 | #endif 958 | 959 | } 960 | } 961 | packing_in_256b=((ap_uint<256 - MAX_INP * PACK_NUM * W_BIT>)0, packing_in); 962 | DDR_W_to[cnt]=packing_in; 963 | cnt++; 964 | } 965 | } 966 | } 967 | 968 | 969 | #ifdef GLODEN_DEBUG 970 | fclose(fpw); 971 | #endif 972 | 973 | } 974 | 975 | 976 | void host_DDR_W(ap_int W_from[N][M], ap_uint*DDR_W_to){ 977 | /* Variable Declaration */ 978 | ap_uint packing_in; 979 | /* Variable Declaration */ 980 | 981 | #ifdef GLODEN_DEBUG 982 | FILE* fpw = fopen("w_stream_pex1_gold.txt", "wb"); 983 | #endif 984 | int cnt=0; 985 | 986 | for(int m=0;m>W_BIT; 991 | packing_in(MAX_INP * PACK_NUM * W_BIT-1,(MAX_INP * PACK_NUM-1) * W_BIT)=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 992 | 993 | 994 | #ifdef GLODEN_DEBUG 995 | if(y==1){ 996 | ap_int w0=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 997 | 998 | fprintf(fpw, "%d\n", (int)w0); 999 | } 1000 | #endif 1001 | 1002 | } 1003 | } 1004 | DDR_W_to[cnt]=packing_in; 1005 | cnt++; 1006 | } 1007 | } 1008 | 1009 | 1010 | #ifdef GLODEN_DEBUG 1011 | fclose(fpw); 1012 | #endif 1013 | 1014 | } 1015 | 1016 | 1017 | void host_DDR_W_256b(ap_int W_from[N][M], ap_uint<256>*DDR_W_to){ 1018 | /* Variable Declaration */ 1019 | ap_uint packing_in; 1020 | ap_uint<256> packing_in_256b; 1021 | /* Variable Declaration */ 1022 | 1023 | #ifdef GLODEN_DEBUG 1024 | FILE* fpw = fopen("w_stream_pex1_gold.txt", "wb"); 1025 | #endif 1026 | int cnt=0; 1027 | 1028 | for(int m=0;m>W_BIT; 1033 | packing_in(MAX_OUP * PACK_NUM * W_BIT-1,(MAX_OUP * PACK_NUM-1) * W_BIT)=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 1034 | 1035 | 1036 | #ifdef GLODEN_DEBUG 1037 | if(y==1){ 1038 | ap_int w0=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 1039 | 1040 | fprintf(fpw, "%d\n", (int)w0); 1041 | } 1042 | #endif 1043 | 1044 | } 1045 | } 1046 | packing_in_256b=((ap_uint<256 - MAX_OUP * PACK_NUM * W_BIT>)0, packing_in); 1047 | DDR_W_to[cnt]=packing_in; 1048 | cnt++; 1049 | } 1050 | } 1051 | 1052 | 1053 | #ifdef GLODEN_DEBUG 1054 | fclose(fpw); 1055 | #endif 1056 | 1057 | } 1058 | 1059 | 1060 | void host_DDR_W_128b(ap_int W_from[N][M], ap_uint<128>*DDR_W_to){ 1061 | /* Variable Declaration */ 1062 | ap_uint packing_in; 1063 | ap_uint<128> packing_in_128b; 1064 | /* Variable Declaration */ 1065 | 1066 | #ifdef GLODEN_DEBUG 1067 | FILE* fpw = fopen("w_stream_pex1_gold.txt", "wb"); 1068 | #endif 1069 | int cnt=0; 1070 | 1071 | for(int m=0;m>W_BIT; 1076 | packing_in(MAX_OUP * PACK_NUM * W_BIT-1,(MAX_OUP * PACK_NUM-1) * W_BIT)=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 1077 | 1078 | 1079 | #ifdef GLODEN_DEBUG 1080 | if(y==1){ 1081 | ap_int w0=W_from[n][m*MAX_OUP*PACK_NUM+y*PACK_NUM+sm]; 1082 | 1083 | fprintf(fpw, "%d\n", (int)w0); 1084 | } 1085 | #endif 1086 | 1087 | } 1088 | } 1089 | packing_in_128b=((ap_uint<128 - MAX_OUP * PACK_NUM * W_BIT>)0, packing_in); 1090 | DDR_W_to[cnt]=packing_in; 1091 | cnt++; 1092 | } 1093 | } 1094 | 1095 | 1096 | #ifdef GLODEN_DEBUG 1097 | fclose(fpw); 1098 | #endif 1099 | 1100 | } 1101 | 1102 | 1103 | void print_pe_out(float O_golden[R][M]){ 1104 | char fp_name[100]; 1105 | 1106 | for(int x=0; x< MAX_INP; x++){ 1107 | for(int y=0; y< MAX_OUP; y++){ 1108 | sprintf(fp_name,"gloden_out_pe%d%d.txt",x,y); 1109 | FILE* fp_true_pe00 = fopen(fp_name, "wb"); 1110 | cout<<"begine pe"<