#include <stdio.h>
#include <stdlib.h>

#include "mpi.h"

/* Integer truth value. */
#define TRUE 1

/* Not referenced in this part of the file; presumably the largest vector
   length the benchmark driver is expected to request -- confirm against
   the caller. */
#define MAX_MESSAGE_LENGTH 1000000

/* Fallback definition only: the standard headers already provide NULL,
   and redefining it unconditionally triggers a macro-redefinition
   diagnostic. */
#ifndef NULL
#define NULL 0
#endif

/* BF( x ): yields x unchanged unless x compares equal to NULL, in which
   case it yields the non-null dummy address ( int * ) 1.  The dummy must
   never be dereferenced.  Note that x is evaluated twice, so do not pass
   an expression with side effects. */
#define BF( x ) ( (x) == NULL ? ( int * ) 1 : (x) )

/*
 * time_allreduce
 *
 * For every entry of lengths[] runs an integer MPI_SUM allreduce of that
 * many elements twice and records, in timings[ trial ], the maximum over
 * all processes in comm of the wall-clock time of the second run.
 *
 *   datatype  accepted for interface compatibility only; the buffers are
 *             always int and the reduction always uses MPI_INT
 *   ntrials   number of entries in lengths[] and timings[]; nothing is
 *             done when it is not positive
 *   lengths   vector lengths (in elements) to try, in any order
 *   timings   output, one time in seconds (as reported by MPI_Wtime) per
 *             trial
 *   comm      communicator over which the allreduce is performed
 *   version   0: MPI_Allreduce, 1: my_allreduce, 2: new_mpi_allreduce;
 *             any other value performs no reduction at all
 *
 * Unless NOCHECK is defined, the result is verified whenever the vector
 * length is a multiple of the number of processes.  A failed verification
 * or a failed allocation aborts the whole MPI job with a non-zero code.
 */
void time_allreduce( 
		     MPI_Datatype datatype,    /* datatype to time              */
		     int ntrials,              /* number of trials              */
		     int *lengths,             /* vector lengths to try         */
		     double *timings,          /* timings                       */
		     MPI_Comm comm,            /* communicator                  */
		     int version)              /* version */
{
  int
    nprocs, trial, pass, i, n, maxlen;
  int
    *data_in, *data_out;
  double
    elapsed = 0.0;

  ( void ) datatype;   /* only MPI_INT is timed, see header comment */

  /* Nothing to time; also avoids touching lengths[] when it is empty */
  if ( ntrials <= 0 )
    return;

  /* nprocs = number of processes in communicator */
  MPI_Comm_size( comm, &nprocs );

  /* Size the buffers for the longest vector instead of assuming that the
     last entry is the longest.  Starting at 1 keeps the allocation
     non-empty, so the buffers are valid pointers even for length 0. */
  maxlen = 1;
  for ( trial=0; trial<ntrials; trial++ )
    if ( lengths[ trial ] > maxlen ) maxlen = lengths[ trial ];

  /* size_t arithmetic: the byte count must not overflow an int */
  data_in  = malloc( ( size_t ) maxlen * sizeof *data_in );
  data_out = malloc( ( size_t ) maxlen * sizeof *data_out );
  if ( data_in == NULL || data_out == NULL ){
    fprintf( stderr, "malloc failed\n" );
    free( data_in );
    free( data_out );
    /* The other processes are about to enter collectives, so returning
       here would leave them hanging; take the whole job down instead. */
    MPI_Abort( comm, 1 );
    exit( EXIT_FAILURE );
  }

  for ( trial=0; trial<ntrials; trial++ ){
    n = lengths[ trial ];

    /* Execute each trial twice, timing the second one: elapsed is
       overwritten by the second pass */
    for ( pass=0; pass<2; pass++ ){
      /* Initialize data; -1 in the output marks "not yet written" */
      for ( i=0; i<n; i++ ){
        data_in[i]  = i;
        data_out[i] = -1;
      }

      /* start timing */
      MPI_Barrier( comm );
      elapsed = MPI_Wtime();

      switch ( version ) {
      case 0:
        MPI_Allreduce( data_in, data_out, n, MPI_INT, MPI_SUM, comm );
        break;
      case 1:
        my_allreduce( data_in, data_out, n, MPI_INT, MPI_SUM, comm );
        break;
      case 2:
        new_mpi_allreduce( data_in, data_out, n, MPI_INT, MPI_SUM, comm );
        break;
      default:
        /* unknown version: no reduction is performed */
        break;
      }

      /* stop timing */
      elapsed = MPI_Wtime() - elapsed;

      /* check if result is correct: every process contributed i at
         position i, so the sum must be i*nprocs.  The check is skipped
         unless n is a multiple of nprocs (presumably a restriction of
         the hand-written versions -- confirm). */
#ifndef NOCHECK      
      if ( n % nprocs == 0 )
        for ( i=0; i<n; i++ )
          if ( data_out[i] != i*nprocs ){
            fprintf( stderr, "error in allreduce\n" );
            free( data_in );
            free( data_out );
            MPI_Abort( comm, 1 );
            exit( EXIT_FAILURE );
          }
#endif
    }

    /* determine max time spent in allreduce over all processes */
    MPI_Allreduce( &elapsed, &timings[ trial ], 1, MPI_DOUBLE, MPI_MAX, 
		   comm );
  }

  free( data_in );
  free( data_out );

  return;
}
