|
7 | 7 | #include <stdlib.h> |
8 | 8 | #include <inttypes.h> |
9 | 9 | #include <math.h> |
10 | | - |
11 | 10 | #if defined(HAVE_SME) |
12 | 11 |
|
13 | 12 | /* Function prototypes */ |
@@ -44,15 +43,31 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ |
44 | 43 | m_mod = ceil((double)M/(double)vl_elms) * vl_elms; |
45 | 44 |
|
46 | 45 | float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); |
47 | | - |
| 46 | + |
| 47 | + /* Clobber all predicate (p) and vector (z) registers so the compiler |
| 48 | + * reloads values from memory instead of reusing register contents. |
| 49 | + */ |
| 50 | + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", |
| 51 | + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", |
| 52 | + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", |
| 53 | + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", |
| 54 | + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", |
| 55 | + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); |
| 56 | + |
48 | 57 | /* Pre-process the left matrix to make it suitable for |
49 | 58 | matrix sum of outer-product calculation |
50 | 59 | */ |
51 | 60 | sgemm_direct_sme1_preprocess(M, K, A, A_mod); |
52 | 61 |
|
53 | 62 | /* Calculate C = A*B */ |
54 | 63 | sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); |
55 | | - |
| 64 | + |
| 65 | + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", |
| 66 | + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", |
| 67 | + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", |
| 68 | + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", |
| 69 | + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", |
| 70 | + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); |
56 | 71 | free(A_mod); |
57 | 72 | } |
58 | 73 |
|
|
0 commit comments