Trace for matmul_check
1. Preprocessing loop contracts
2. Function.inline_def [cFunDef "mm"];
3. List.iter tile [("i", 32); ("j", 32); ("k", 4)];
4. Loop.reorder_at ~order:["bi"; "bj"; "bk"; "i"; "k"; "j"] [cPlusEq ~lhs:[cVar "sum"] ()];
5. Loop.hoist_expr ~dest:[tBefore; cFor "bi"] "pB" ~indep:["bi"; "i"] [cArrayRead "B"];
6. Matrix.stack_copy ~var:"sum" ~copy_var:"s" ~copy_dims:1 [cFor ~body:[cPlusEq ~lhs:[cVar "sum"] ()] "k"];
7. Omp.simd [nbMulti; cFor ~body:[cPlusEq ~lhs:[cVar "s"] ()] "j"];
8. Omp.parallel_for [nbMulti; cFunBody ""; cStrict; cFor ""];
9. Loop.unroll ~simpl:Arith.do_nothing [cFor ~body:[cPlusEq ~lhs:[cVar "s"] ()] "k"];
10. postprocessing (); )
tmp/{before67496f.cpp → afterf8ff61.cpp} RENAMED
 @@ -1,33 +1,63 @@ 1 #include 2 3 // NOTE: using pretty matrix notation 4 5 - void mm(float* C, float* A, float* B, int m, int n, int p) { 6 - __modifies("C ~> Matrix2(m, n)"); 7 - __reads("A ~> Matrix2(m, p)"); 8 - __reads("B ~> Matrix2(p, n)"); 9 - for (int i = 0; i < m; i++) { 10 - __xmodifies("for j in 0..n -> &C[i][j] ~> Cell"); 11 - for (int j = 0; j < n; j++) { 12 - __xmodifies("&C[i][j] ~> Cell"); 13 - float sum = 0.f; 14 - for (int k = 0; k < p; k++) { 15 - const __ghost_fn focusA = 16 - __ghost_begin(matrix2_ro_focus, "M := A, i := i, j := k"); 17 - const __ghost_fn focusB = 18 - __ghost_begin(matrix2_ro_focus, "M := B, i := k, j := j"); 19 - sum += A[i][k] * B[k][j]; 20 - __ghost_end(focusA); 21 - __ghost_end(focusB); 22 - } 23 - C[i][j] = sum; 24 } 25 } 26 } 27 - 28 - void mm1024(float* C, float* A, float* B) { 29 - __modifies("C ~> Matrix2(1024, 1024)"); 30 - __reads("A ~> Matrix2(1024, 1024)"); 31 - __reads("B ~> Matrix2(1024, 1024)"); 32 - mm(C, A, B, 1024, 1024, 1024); 33 }
 1 #include 2 3 // NOTE: using pretty matrix notation 4 5 + void mm1024(float* C, float* A, float* B) { 6 + float* const pB = (float* const)malloc(sizeof(float[32][256][4][32])); 7 + #pragma omp parallel for 8 + for (int bj = 0; bj < 32; bj++) { 9 + for (int bk = 0; bk < 256; bk++) { 10 + for (int k = 0; k < 4; k++) { 11 + for (int j = 0; j < 32; j++) { 12 + pB[32768 * bj + 128 * bk + 32 * k + j] = 13 + B[1024 * (4 * bk + k) + 32 * bj + j]; 14 } 15 } 16 } 17 + } 18 + #pragma omp parallel for 19 + for (int bi = 0; bi < 32; bi++) { 20 + for (int bj = 0; bj < 32; bj++) { 21 + float* const sum = (float* const)malloc(sizeof(float[32][32])); 22 + for (int i = 0; i < 32; i++) { 23 + for (int j = 0; j < 32; j++) { 24 + sum[32 * i + j] = 0.f; 25 + } 26 + } 27 + for (int bk = 0; bk < 256; bk++) { 28 + for (int i = 0; i < 32; i++) { 29 + float* const s = ref[32] float(); 30 + MMEMCPY(s, 0, sum, 32 * i, 32, sizeof(float)); 31 + #pragma omp simd 32 + for (int j = 0; j < 32; j++) { 33 + s[j] += A[1024 * (32 * bi + i) + 4 * bk] * 34 + pB[32768 * bj + 128 * bk + j]; 35 + } 36 + #pragma omp simd 37 + for (int j = 0; j < 32; j++) { 38 + s[j] += A[1 + 1024 * (32 * bi + i) + 4 * bk] * 39 + pB[32 + 32768 * bj + 128 * bk + j]; 40 + } 41 + #pragma omp simd 42 + for (int j = 0; j < 32; j++) { 43 + s[j] += A[2 + 1024 * (32 * bi + i) + 4 * bk] * 44 + pB[64 + 32768 * bj + 128 * bk + j]; 45 + } 46 + #pragma omp simd 47 + for (int j = 0; j < 32; j++) { 48 + s[j] += A[3 + 1024 * (32 * bi + i) + 4 * bk] * 49 + pB[96 + 32768 * bj + 128 * bk + j]; 50 + } 51 + MMEMCPY(sum, 32 * i, s, 0, 32, sizeof(float)); 52 + } 53 + } 54 + for (int i = 0; i < 32; i++) { 55 + for (int j = 0; j < 32; j++) { 56 + C[1024 * (32 * bi + i) + 32 * bj + j] = sum[32 * i + j]; 57 + } 58 + } 59 + free(sum); 60 + } 61 + } 62 + free(pB); 63 }