#include <stdio.h>
#include <stdlib.h>

#include "mpi.h"

#define TRUE 1
#define MAX_MESSAGE_LENGTH 1000000

/*
 * time_reduce_scatter
 *
 * Times a sum reduce-scatter of ints over comm for every vector length in
 * lengths[0..ntrials-1].  For each trial the operation is executed twice
 * and only the second execution is reported; timings[trial] receives the
 * maximum, over all ranks of comm, of the difference between two
 * MPI_Wtime() readings taken around the call.
 *
 *   datatype : ignored -- the buffers and the MPI calls always use MPI_INT
 *   ntrials  : number of entries in lengths[] and timings[]; the function
 *              returns immediately when it is <= 0
 *   lengths  : vector lengths (in ints) to try; need not be sorted
 *   timings  : output, one entry per trial
 *   comm     : communicator to run on
 *   version  : 0 selects MPI_Reduce_scatter, any other value selects
 *              my_reduce_scatter
 *
 * Unless NOCHECK is defined, the received segment is spot-checked after
 * each execution; on a mismatch (or if the work buffers cannot be
 * allocated) a message is printed and the process exits with a failure
 * status.
 */
void time_reduce_scatter
               ( MPI_Datatype datatype,    /* datatype to time (ignored)    */
                 int ntrials,              /* number of trials              */
                 int *lengths,             /* vector lengths to try         */
                 double *timings,          /* timings                       */
                 MPI_Comm comm,            /* communicator                  */
                 int version )
{
  int
    me, nprocs, trial, rep, i, n, count, max_len;
  int
    *data_in, *data_out, *recv_counts;
  double
    elapsed = 0.0;

  (void) datatype;

  /* nothing to time; also avoids indexing lengths[] with no entries */
  if ( ntrials <= 0 ) return;

  /* me = this node's index in the communicator */
  MPI_Comm_rank( comm, &me );

  /* nprocs = number of processes in the communicator */
  MPI_Comm_size( comm, &nprocs );

  /* size the buffers for the longest vector requested, wherever it sits
     in lengths[]; keep at least one element so malloc never sees 0 */
  max_len = 1;
  for ( trial=0; trial<ntrials; trial++ )
    if ( lengths[ trial ] > max_len ) max_len = lengths[ trial ];

  data_in     = malloc( (size_t) max_len * sizeof *data_in );
  data_out    = malloc( (size_t) max_len * sizeof *data_out );
  recv_counts = malloc( (size_t) nprocs * sizeof *recv_counts );

  if ( data_in == NULL || data_out == NULL || recv_counts == NULL ){
    fprintf( stderr, "out of memory in time_reduce_scatter\n" );
    exit( EXIT_FAILURE );
  }

  for ( trial=0; trial<ntrials; trial++ ){
    n = lengths[ trial ];

    /* every rank receives the same number of elements; the n % nprocs
       trailing elements of the input take no part in the operation */
    count = n / nprocs;

    /* Execute each trial twice, timing the second one */
    for ( rep=0; rep<2; rep++ ){
      /* Initialize data */
      for ( i=0; i<n; i++ ){
        data_in[i]  = i;
        data_out[i] = -1;
      }

      /* initialize recv counts */
      for ( i=0; i<nprocs; i++ ) recv_counts[ i ] = count;

      /* start timing */
      MPI_Barrier( comm );
      elapsed = MPI_Wtime();

      if ( version == 0 )
        MPI_Reduce_scatter( data_in, data_out, recv_counts,
                            MPI_INT, MPI_SUM, comm );
      else
        my_reduce_scatter( data_in, data_out, recv_counts,
                           MPI_INT, MPI_SUM, comm );

      /* stop timing */
      elapsed = MPI_Wtime() - elapsed;

      /* check if result is correct */
#ifndef NOCHECK
      {
        /* This rank's segment starts after the me segments of count
           elements owned by the lower ranks.  Every rank contributes the
           value g at global index g, so the sum there is g * nprocs. */
        const int offset = me * count;

        /* only every nprocs-th element is examined, as before */
        for ( i=0; i<count; i+=nprocs )
          if ( data_out[i] != ( offset + i ) * nprocs ){
            printf( "error in reduce_scatter\n" );
            /* NOTE(review): consider MPI_Abort( comm, 1 ) here so that the
               remaining ranks are torn down as well */
            exit( EXIT_FAILURE );
          }
      }
#endif
    }

    /* determine max time spent in reduce_scatter over all nodes */
    MPI_Allreduce( &elapsed, &timings[ trial ], 1, MPI_DOUBLE, MPI_MAX,
                   comm );
  }

  free( data_in );
  free( data_out );
  free( recv_counts );

  return;
}
