for (int j = 0; j < NB; j += NU)
  for (int i = 0; i < NB; i += MU)
    load C[i..i+MU-1, j..j+NU-1] into registers
    for (int k = 0; k < NB; k++)
      // micro-kernel
      load A[i..i+MU-1, k] into registers
      load B[k, j..j+NU-1] into registers
      multiply A’s and B’s and add to C’s
    store C[i..i+MU-1, j..j+NU-1]