#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mpi.h"

double sqrt( double );

void my_allgather2_x( char **, int , int, int, int, MPI_Comm );


/*
 * my_allgather2 - gather one equally sized block from every process of
 * comm into recv_buf on every process, using two ring passes.
 *
 * The nprocs processes are factored as nprocs1 x (nprocs/nprocs1), where
 * nprocs1 is the largest divisor of nprocs that does not exceed
 * sqrt(nprocs).  my_allgather2_x is then run twice:
 *   1. with stride 1 over the nprocs1 consecutive ranks that start at
 *      (me/nprocs1)*nprocs1;
 *   2. with stride nprocs1 over the nprocs/nprocs1 ranks that start at
 *      me%nprocs1.
 *
 * Parameters:
 *   send_buf       local contribution; recv_size elements are read from it
 *   send_size      ignored -- recv_size is used for the local copy as well
 *   send_datatype  must compare equal to recv_datatype
 *   recv_buf       destination; block i starts at byte offset
 *                  i * recv_size * (size of recv_datatype)
 *   recv_size      number of elements in each block
 *   recv_datatype  element type of the blocks
 *   comm           communicator whose processes take part
 *
 * On a datatype mismatch or an allocation failure a message is written to
 * stderr and MPI_Abort is called on comm; the function has no return value
 * through which an error could be reported.
 */
void my_allgather2( void *send_buf, int send_size, MPI_Datatype send_datatype,
	        void *recv_buf, int recv_size, MPI_Datatype recv_datatype,
                MPI_Comm comm )
{
  int
    me, nprocs, i, typesize, nprocs1;

  size_t
    block_bytes;

  char 
    **recv_location;

  (void) send_size;   /* kept for interface compatibility only */

  if ( send_datatype != recv_datatype ){
    fprintf( stderr, " send_datatype != recv_datatype not yet implemented\n" );
    /* exit( 0 ) would report success and leave the other ranks waiting */
    MPI_Abort( comm, EXIT_FAILURE );
    return;
  }

  MPI_Comm_rank( comm, &me );
  MPI_Comm_size( comm, &nprocs );

  MPI_Type_size( send_datatype, &typesize );

  /* widen before multiplying so that large blocks do not overflow int */
  block_bytes = (size_t) recv_size * (size_t) typesize;

  /* nprocs + 1 entries: the extra one marks the end of the last block, so
     that the length of any run of blocks is a difference of two entries */
  recv_location = malloc( ( (size_t) nprocs + 1 ) * sizeof *recv_location );
  if ( recv_location == NULL ){
    fprintf( stderr, " my_allgather2: out of memory\n" );
    MPI_Abort( comm, EXIT_FAILURE );
    return;
  }

  recv_location[ 0 ] = ( char * ) recv_buf;

  for ( i = 0; i<nprocs; i++ )
    recv_location[ i+1 ] = recv_location[ i ] + block_bytes;

  /* place the local contribution in its own slot; skip the copy when the
     caller already put it there, since memcpy must not be given
     overlapping regions */
  if ( ( char * ) send_buf != recv_location[ me ] )
    memcpy( recv_location[ me ], send_buf, block_bytes );

  /* start at floor(sqrt(nprocs)) and walk down to the nearest divisor;
     1 always divides, so the loop terminates */
  nprocs1 = (int) sqrt( (double) nprocs );

  while ( ( nprocs/nprocs1 )*nprocs1 != nprocs ) nprocs1--;

  /* pass 1: within the group of nprocs1 consecutive ranks */
  my_allgather2_x( recv_location, (me/nprocs1)*nprocs1, nprocs1, 1, me, comm );
  /* pass 2: across groups, among ranks with the same me%nprocs1 */
  my_allgather2_x( recv_location, me%nprocs1, nprocs/nprocs1, nprocs1, 
		   me, comm );

  free ( recv_location );

  return;
}



/*
 * my_allgather2_x - one ring pass of the allgather over the ranks
 * proc_first, proc_first+stride, ..., proc_first+(nprocs-1)*stride.
 *
 * In each of the nprocs-1 steps the caller sends a run of `stride`
 * consecutive blocks to its lower neighbour in the ring and receives the
 * next run of `stride` blocks from its upper neighbour.  The run received
 * in one step is the run sent in the following step.
 *
 * Parameters:
 *   recv_location  table of block start addresses; entries up to index
 *                  (current run start + stride) are read to obtain
 *                  buffer addresses and byte counts
 *   proc_first     lowest rank taking part in this ring
 *   nprocs         number of ranks in this ring
 *   stride         rank distance between ring neighbours, and the number
 *                  of blocks moved per message
 *   me             rank of the calling process
 *   comm           communicator used for the transfers
 */
void my_allgather2_x( char **recv_location, int proc_first, int nprocs,
		    int stride, int me, MPI_Comm comm )
{
  MPI_Request
    recv_req;
  MPI_Status
    recv_stat;
  int
    step, cur, nxt;

  /* highest rank that belongs to this ring */
  const int ring_end = proc_first + ( nprocs-1 ) * stride;

  /* ring neighbours, wrapping around at either end */
  const int dest = ( me-stride < proc_first ) ? ring_end   : me-stride;
  const int src  = ( me+stride > ring_end )   ? proc_first : me+stride;

  /* start of the run of blocks this process owns on entry */
  cur = ( me/stride )*stride;

  for ( step = 1; step < nprocs; ++step ) {
    /* run that arrives in this step; wrap to the first run past the end */
    nxt = cur + stride;
    if ( nxt > ring_end )
      nxt = ( proc_first/stride )*stride;

    /* post the receive before the blocking send */
    MPI_Irecv( recv_location[ nxt ],
	     recv_location[ nxt+stride ] - recv_location[ nxt ],
	     MPI_CHAR, src, MPI_ANY_TAG, comm, &recv_req );

    MPI_Send( recv_location[ cur ],
	     recv_location[ cur+stride ] - recv_location[ cur ],
	     MPI_CHAR, dest, 0, comm );

    MPI_Wait( &recv_req, &recv_stat );

    /* what was just received is forwarded in the next step */
    cur = nxt;
  }
}
