#include <stdio.h>
#include <sys/times.h>
#include <sys/types.h>
#include <time.h>
#define CACHE_MIN (1024) /* smallest cache size */
#define CACHE_MAX (4 * 1024 * 1024) /* largest cache size */
#define SAMPLE 10 /* to get a larger time sample */

#ifndef CLK_TCK
#define CLK_TCK 60 /* # clock ticks per second */
#endif

int x[CACHE_MAX]; /* array to stride through */


double get_seconds() { /* routine to read time */
  struct tms rusage;
  times(&rusage);
  return (double)(rusage.tms_utime)/CLK_TCK;
}


void main()
{
  int register i, index, stride, limit, temp;
  int steps, tsteps, csize;
  double sec0, sec;
  
  for(csize=CACHE_MIN; csize <= CACHE_MAX; csize = csize * 2){
    for(stride = 1; stride <= csize/2; stride = stride * 2){
      sec = 0;
      limit = csize - stride + 1; /* cache size this loop */

      steps = 0;
      do{
	sec0 = get_seconds(); /* start timer */
	/*
	 * Total number of cache accesses per while loop pass
	 * is (limit * SAMPLE) because we touch (limit/stride) elements each
	 * of (SAMPLE * stride) passes
	 */
	for(i=SAMPLE*stride; i!= 0; i = i-1){
	  for(index = 0; index < limit; index = index + stride){
	    x[index] = x[index] + 1; /* cache access */
	  }
	}
	steps = steps + 1; /* count while loop iterations */
	sec = sec + (get_seconds() - sec0); /* end timer */
      } while (sec < 1.0); 

      /* repeat empty loop to subtract loop overhead */
      tsteps = 0; /* used to match no. while iterations */
      do{ /* repeat same no. iterations as above */
	sec0 = get_seconds(); /* start timer */
	for(i = SAMPLE*stride; i!= 0; i = i-1){
	  for(index = 0; index < limit; index = index + stride){
	    temp = temp + index; /* dummy code */
	  }
	}
	tsteps = tsteps + 1; /* count while iterations */
	sec = sec - (get_seconds() - sec0);
      }while (tsteps < steps);


      printf("Size: %7d Stride %7d read+write: %14.13f ms\n",
	     csize * sizeof(int), stride * sizeof(int), 
	     (double)sec*1e3/(steps*SAMPLE*stride*((limit-1)/stride+1)));
    }
  }
}
