To demonstrate the effect of memory access patterns on execution time (specifically the effect of cache hits and misses) timed inner loops of ijk.c matrix multiply for(i=0;i<1000;i++){ for(j=0;j<1000;j++){ u00=0.0; for(k=0;k<1000;k++){ u00+=a[i][k]*b[k][j]; // # accesses and operations = O(1000^3) } c[i][j]=u00; // # accesses = O(1000^2) } } timed inner loops of ikj.c matrix multiply for(i=0;i<1000;i++){ for(j=0;j<1000;j++){ c[i][j]=0.0; // # accesses = O(1000^2) } } for(i=0;i<1000;i++){ for(k=0;k<1000;k++){ s00=a[i][k]; // # accesses = O(1000^2) for(j=0;j<1000;j++){ c[i][j]+=s00*b[k][j]; // # accesses and operations = O(1000^3) } } } appears as if second form of matrix multiply should take longer, but ... timing results for mm algorithm ijk utime is 158.43 secs for mm algorithm ikj utime is 44.95 secs => 3.5 times faster for mm algorithm jik utime is 158.85 secs for mm algorithm jki utime is 295.41 secs for mm algorithm kij utime is 53.17 secs for mm algorithm kji utime is 311.78 secs and with blocking for mm algorithm 8d8e8fijk utime is 26.26 secs for mm algorithm 8d8f8eikj utime is 23.21 secs +------------------------------------+ | | | ----- for the same algorithm ----- | | the fastest access pattern is | | 13.4 times faster than the slowest | | | +------------------------------------+