The following table lists benchmark results for the Matrix Multiply function across a wide range of supported parameter configurations. The parameters are defined in Matrix Multiply Configuration Parameters.
Library Element | T_DATA_A | T_DATA_B | P_DIM_A | P_DIM_AB | P_DIM_B | P_ADD_TILING_A | P_ADD_TILING_B | P_ADD_DETILING_OUT | P_INPUT_WINDOW_VSIZE_A | P_INPUT_WINDOW_VSIZE_B | P_CASC_LEN | UUT_SSR | AIE_VARIANT | Dynamic Power | Latency | Throughput | NUM_BANKS | NUM_AIE | DATA_MEMORY | PROGRAM_MEMORY |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 1 | 0.797 W | 4622 ns | 137 MSa/s | 7 | 1 | 20617 | 2198 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 1 | 1.213 W | 540 ns | 1706 MSa/s | 42 | 8 | 30762 | 1788 1820 1820 1916 1788 1820 1820 1916 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 2 | 0.512 W | 859 ns | 500 MSa/s | 7 | 1 | 11406 | 1536 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 2 | 0.677 W | 2173 ns | 411 MSa/s | 11 | 3 | 24986 | 1744 1536 1952 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 2 | 0.644 W | 1353 ns | 500 MSa/s | 14 | 2 | 17692 | 1504 1504 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 2 | 0.651 W | 534 ns | 1000 MSa/s | 11 | 2 | 13587 | 1456 1584 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 2 | 1.591 W | 457 ns | 2000 MSa/s | 42 | 8 | 30778 | 1216 1440 1440 1552 1216 1440 1440 1552 |
matrix_mult | int16 | int16 | 8 | 64 | 4 | 0 | 0 | 0 | 512 | 256 | 4 | 1 | 1 | 0.885 W | 389 ns | 372 MSa/s | 22 | 4 | 11925 | 1410 1410 1410 1746 |
matrix_mult | int16 | int32 | 8 | 64 | 4 | 0 | 0 | 0 | 512 | 256 | 4 | 1 | 1 | 0.859 W | 419 ns | 316 MSa/s | 19 | 4 | 13082 | 1650 1682 1682 1770 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 1 | 0.792 W | 2049 ns | 250 MSa/s | 7 | 1 | 16521 | 2000 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 1 | 0.821 W | 8688 ns | 117 MSa/s | 12 | 3 | 35220 | 3394 2000 4054 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 1 | 0.912 W | 1131 ns | 484 MSa/s | 14 | 2 | 22802 | 1932 1932 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 1 | 0.852 W | 1473 ns | 410 MSa/s | 12 | 2 | 18703 | 1866 1984 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 1 | 1.269 W | 738 ns | 1084 MSa/s | 40 | 8 | 35894 | 1824 1840 1840 1916 1824 1840 1840 1916 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 1 | 0.841 W | 914 ns | 725 MSa/s | 12 | 2 | 13579 | 1828 1984 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 2 | 0.522 W | 1613 ns | 250 MSa/s | 6 | 1 | 16526 | 1776 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 2 | 0.701 W | 843 ns | 500 MSa/s | 12 | 2 | 22812 | 1728 1728 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 2 | 0.689 W | 944 ns | 500 MSa/s | 11 | 2 | 18707 | 1648 1824 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 2 | 1.674 W | 482 ns | 1815 MSa/s | 42 | 8 | 35898 | 1616 1648 1648 1776 1616 1648 1648 1776 |
matrix_mult | int32 | int16 | 8 | 64 | 4 | 0 | 0 | 0 | 512 | 256 | 4 | 1 | 1 | 0.898 W | 367 ns | 250 MSa/s | 22 | 4 | 14107 | 1720 1736 1736 1808 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 1 | 0.795 W | 4622 ns | 137 MSa/s | 7 | 1 | 20617 | 2198 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 1 | 0.861 W | 9214 ns | 121 MSa/s | 11 | 3 | 43412 | 3394 2182 3750 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 1 | 0.893 W | 2619 ns | 250 MSa/s | 14 | 2 | 30994 | 2130 2130 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 1 | 0.915 W | 2885 ns | 238 MSa/s | 12 | 2 | 22801 | 2140 2280 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 1 | 1.321 W | 1209 ns | 703 MSa/s | 41 | 8 | 44098 | 2098 2186 2186 2212 2098 2186 2186 2212 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 2 | 0.58 W | 2127 ns | 248 MSa/s | 6 | 1 | 20622 | 1984 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 2 | 0.768 W | 3223 ns | 250 MSa/s | 13 | 3 | 43417 | 1344 1536 1984 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 2 | 0.724 W | 2629 ns | 250 MSa/s | 14 | 2 | 31004 | 1936 1936 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 2 | 0.773 W | 1508 ns | 423 MSa/s | 11 | 2 | 22803 | 1824 2016 |
matrix_mult | int32 | int16 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 2 | 0.702 W | 2582 ns | 250 MSa/s | 13 | 3 | 35225 | 1344 1776 1776 |
matrix_mult | int32 | int32 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 2 | 1.658 W | 901 ns | 1000 MSa/s | 42 | 8 | 44090 | 1808 1856 1856 1968 1808 1856 1856 1968 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 1 | 0.844 W | 1339 ns | 500 MSa/s | 14 | 2 | 17676 | 1884 1884 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 1 | 0.772 W | 1177 ns | 471 MSa/s | 7 | 1 | 11398 | 1952 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 1 | 0.864 W | 9214 ns | 121 MSa/s | 11 | 3 | 43412 | 3394 2182 3750 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 1 | 0.907 W | 2619 ns | 250 MSa/s | 14 | 2 | 30994 | 2130 2130 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 1 | 0.9 W | 2885 ns | 238 MSa/s | 12 | 2 | 22801 | 2140 2280 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 1 | 1.325 W | 1209 ns | 703 MSa/s | 41 | 8 | 44098 | 2098 2186 2186 2212 2098 2186 2186 2212 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 1 | 2 | 0.58 W | 2047 ns | 250 MSa/s | 6 | 1 | 20622 | 1728 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 2 | 0.734 W | 2902 ns | 250 MSa/s | 13 | 3 | 43417 | 1088 1744 1728 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 1 | 2 | 2 | 0.725 W | 2629 ns | 250 MSa/s | 14 | 2 | 31004 | 1728 1728 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 2 | 1 | 2 | 0.771 W | 1421 ns | 426 MSa/s | 11 | 2 | 22803 | 1664 1792 |
matrix_mult | cint16 | cint16 | 16 | 64 | 16 | 0 | 0 | 0 | 1024 | 1024 | 4 | 2 | 2 | 1.733 W | 817 ns | 1000 MSa/s | 42 | 8 | 44090 | 1648 1664 1664 1792 1648 1664 1664 1792 |
matrix_mult | cint16 | cint16 | 4 | 64 | 8 | 0 | 0 | 0 | 256 | 512 | 4 | 1 | 1 | 0.922 W | 553 ns | 217 MSa/s | 20 | 4 | 15137 | 1730 1752 1752 1822 |
matrix_mult | cint16 | int16 | 8 | 64 | 4 | 0 | 0 | 0 | 512 | 256 | 4 | 1 | 1 | 0.887 W | 367 ns | 250 MSa/s | 22 | 4 | 14107 | 1720 1736 1736 1808 |
matrix_mult | cint32 | cint16 | 4 | 64 | 8 | 0 | 0 | 0 | 256 | 512 | 4 | 1 | 1 | 0.939 W | 712 ns | 160 MSa/s | 22 | 4 | 17435 | 1764 1780 1780 1856 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 1 | 1 | 1 | 0.802 W | 9936 ns | 70 MSa/s | 7 | 1 | 22662 | 2158 |
matrix_mult | int16 | int16 | 16 | 64 | 16 | 1 | 1 | 1 | 1024 | 1024 | 1 | 1 | 1 | 0.8 W | 7936 ns | 129 MSa/s | 13 | 3 | 24977 | 4240 1936 3746 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 1 | 1 | 1 | 512 | 512 | 1 | 1 | 1 | 0.841 W | 17245 ns | 70 MSa/s | 12 | 3 | 47505 | 1586 2160 1554 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 2 | 1 | 1 | 0.926 W | 5918 ns | 122 MSa/s | 12 | 2 | 24845 | 2072 2192 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 4 | 2 | 1 | 1.448 W | 2186 ns | 372 MSa/s | 41 | 8 | 46134 | 2088 2160 2160 2208 2088 2160 2160 2208 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 1 | 1 | 2 | 0.61 W | 4549 ns | 139 MSa/s | 7 | 1 | 22670 | 1968 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 1 | 1 | 1 | 512 | 512 | 1 | 1 | 2 | 0.76 W | 7781 ns | 139 MSa/s | 13 | 3 | 47513 | 1088 1776 1968 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 1 | 2 | 2 | 0.846 W | 2619 ns | 246 MSa/s | 12 | 2 | 33052 | 1968 1968 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 2 | 1 | 2 | 0.818 W | 2905 ns | 237 MSa/s | 10 | 2 | 24851 | 1840 2016 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 4 | 2 | 2 | 1.838 W | 1199 ns | 713 MSa/s | 42 | 8 | 46138 | 1840 1872 1872 2032 1840 1872 1872 2032 |
matrix_mult | cint32 | cint32 | 4 | 64 | 8 | 0 | 0 | 0 | 256 | 512 | 4 | 1 | 1 | 0.935 W | 1242 ns | 100 MSa/s | 21 | 4 | 21531 | 2082 2130 2130 2160 |
matrix_mult | float | cfloat | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 1 | 1 | 1 | 0.772 W | 11583 ns | 63 MSa/s | 7 | 1 | 18824 | 2582 |
matrix_mult | float | cfloat | 16 | 32 | 16 | 1 | 1 | 1 | 512 | 512 | 1 | 1 | 1 | 0.81 W | 19310 ns | 63 MSa/s | 13 | 3 | 40083 | 1586 2566 2910 |
matrix_mult | float | cfloat | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 1 | 2 | 1 | 0.863 W | 5921 ns | 125 MSa/s | 14 | 2 | 29456 | 2582 2582 |
matrix_mult | float | cfloat | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 2 | 1 | 1 | 0.871 W | 6978 ns | 109 MSa/s | 12 | 2 | 21262 | 2448 2646 |
matrix_mult | float | cfloat | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 4 | 2 | 1 | 1.352 W | 3158 ns | 257 MSa/s | 42 | 8 | 44084 | 2642 2514 2514 3138 2642 2514 2514 3138 |
matrix_mult | cint32 | cint32 | 16 | 32 | 16 | 0 | 0 | 0 | 512 | 512 | 1 | 2 | 1 | 0.943 W | 5095 ns | 138 MSa/s | 14 | 2 | 33036 | 2158 2158 |
matrix_mult | int32 | int32 | 8 | 64 | 4 | 0 | 0 | 0 | 512 | 256 | 4 | 1 | 1 | 0.869 W | 556 ns | 250 MSa/s | 18 | 4 | 15137 | 2006 2094 2094 2138 |