    /* Starting the RING stages */
    left = (comm_size + rank - 1) % comm_size;
    right = (rank + 1) % comm_size;
    j = rank;
    jnext = left;
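    /*
     * Ring bookkeeping: in each step this rank sends block 'j' to its right
     * neighbor and receives block 'jnext' from its left neighbor, with both
     * indices walking backwards around the ring.  For example, with
     * comm_size = 4 and rank = 1, (j, jnext) takes the values (1, 0), (0, 3)
     * and (3, 2) over the comm_size - 1 steps.
     *
     * First stage: the outgoing block still resides in the device recvbuf,
     * so it is sent directly from there, while the incoming block is
     * received into the host staging buffer (mv2_cuda_allgather_store_buf)
     * and copied to the device asynchronously on stream_h2d.
     */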

    mpi_errno = MPIC_Irecv(((char *)mv2_cuda_allgather_store_buf + jnext*recvcount*recvtype_extent),
                           recvcount*recvtype_extent, MPI_BYTE, left,
                           MPIR_ALLGATHER_TAG, comm_ptr, &recv_req);
    mpi_errno = MPIC_Isend(((char *)recvbuf + j*recvcount*recvtype_extent),
                           recvcount*recvtype_extent, MPI_BYTE, right,
                           MPIR_ALLGATHER_TAG, comm_ptr, &send_req, errflag);
    mpi_errno = MPIC_Waitall(1, &recv_req, &status, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
        MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    MPIU_Memcpy_CUDA_Async((void *)((char *)recvbuf + jnext*recvcount*recvtype_extent),
                           (void *)((char *)mv2_cuda_allgather_store_buf + jnext*recvcount*recvtype_extent),
                           recvcount*recvtype_extent, cudaMemcpyHostToDevice, stream_h2d);

    mpi_errno = MPIC_Waitall(1, &send_req, &status, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
        MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    j = jnext;
    jnext = (comm_size + jnext - 1) % comm_size;
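    /*
     * Subsequent steps forward each block out of the host staging buffer, so
     * the send to the right neighbor does not depend on the asynchronous
     * host-to-device copy of that block into recvbuf and the two can overlap.
     */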

    /* Intermediate steps of communication */
    for (i = 2; i < comm_size - 1; i++) {
        mpi_errno = MPIC_Irecv(((char *)mv2_cuda_allgather_store_buf + jnext*recvcount*recvtype_extent),
                               recvcount, recvtype, left,
                               MPIR_ALLGATHER_TAG, comm_ptr, &recv_req);
        mpi_errno = MPIC_Isend(((char *)mv2_cuda_allgather_store_buf + j*recvcount*recvtype_extent),
                               recvcount, recvtype, right,
                               MPIR_ALLGATHER_TAG, comm_ptr, &send_req, errflag);
        mpi_errno = MPIC_Waitall(1, &recv_req, &status, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }

        MPIU_Memcpy_CUDA_Async((void *)((char *)recvbuf + jnext*recvcount*recvtype_extent),
                               (void *)((char *)mv2_cuda_allgather_store_buf + jnext*recvcount*recvtype_extent),
                               recvcount*recvtype_extent, cudaMemcpyHostToDevice, stream_h2d);

        mpi_errno = MPIC_Waitall(1, &send_req, &status, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }

        j = jnext;
        jnext = (comm_size + jnext - 1) % comm_size;
    }

    /* Last stage of communication - copy directly to device */
    if (i < comm_size) {
        mpi_errno = MPIC_Irecv(((char *)recvbuf + jnext*recvcount*recvtype_extent),
                               recvcount, recvtype, left,
                               MPIR_ALLGATHER_TAG, comm_ptr, &recv_req);
        mpi_errno = MPIC_Isend(((char *)mv2_cuda_allgather_store_buf + j*recvcount*recvtype_extent),
                               recvcount, recvtype, right,
                               MPIR_ALLGATHER_TAG, comm_ptr, &send_req, errflag);
        mpi_errno = MPIC_Waitall(1, &recv_req, &status, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
        mpi_errno = MPIC_Waitall(1, &send_req, &status, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }

    }

} else {
    /* Recursive Doubling */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype,
                                   ((char *) recvbuf + rank * recvcount * recvtype_extent),
                                   recvcount, recvtype);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }
    /* This synchronization is needed because MPIR_Localcopy calls cudaMemcpy
     * on the default stream (0), but subsequent MPI_Isend/Irecv calls access
     * GPU buffers using non-default streams, which don't wait for the initial
     * local copy to complete. */
    if (rdma_enable_cuda && cuda_initialized && rdma_cuda_nonblocking_streams) {
        CUDA_CHECK(cudaEventRecord(cuda_nbstream_sync_event, 0));
        CUDA_CHECK(cudaStreamWaitEvent(stream_d2h, cuda_nbstream_sync_event, 0));
    }
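    /*
     * Recursive doubling: in step k (mask = 2^k) each rank exchanges all of
     * the data it has gathered so far with partner (rank ^ mask), so the
     * amount of gathered data doubles every step; curr_cnt tracks how many
     * elements this rank currently holds.
     */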

    curr_cnt = recvcount;

    mask = 0x1;
    i = 0;

    dst = rank ^ mask;
    dst_tree_root = dst >> i;
    dst_tree_root <<= i;

    my_tree_root = rank >> i;
    my_tree_root <<= i;

    /* FIXME: saving an MPI_Aint into an int */
    send_offset = my_tree_root * recvcount * recvtype_extent;
    recv_offset = dst_tree_root * recvcount * recvtype_extent;
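    /*
     * First exchange (mask = 1), peeled out of the loop below: the rank's own
     * block is first copied synchronously from the device into the host
     * staging buffer so that the send can be issued from host memory.
     */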

    if (dst < comm_size) {
        MPIU_Memcpy_CUDA((void *)((char *)mv2_cuda_allgather_store_buf + rank*recvcount*recvtype_extent),
                         (void *)((char *)recvbuf + rank*recvcount*recvtype_extent),
                         recvcount * recvtype_extent, cudaMemcpyDeviceToHost);

        mpi_errno = MPIC_Irecv(((char *)mv2_cuda_allgather_store_buf + recv_offset),
                               (mask)*recvcount, recvtype, dst,
                               MPIR_ALLGATHER_TAG, comm_ptr, &recv_req);
        mpi_errno = MPIC_Isend(((char *)mv2_cuda_allgather_store_buf + send_offset),
                               curr_cnt, recvtype, dst,
                               MPIR_ALLGATHER_TAG, comm_ptr, &send_req, errflag);

        mpi_errno = MPIC_Waitall(1, &recv_req, &status, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }

        MPIU_Memcpy_CUDA_Async((void *)((char *)recvbuf + recv_offset),
                               (void *)((char *)mv2_cuda_allgather_store_buf + recv_offset),
                               (mask)*recvcount*recvtype_extent, cudaMemcpyHostToDevice, stream_h2d);

        mpi_errno = MPIC_Waitall(1, &send_req, &status, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }

        curr_cnt += mask*recvcount;
    }
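    /*
     * After this exchange the rank holds 2*recvcount elements (its own block
     * plus its partner's); for power-of-two communicator sizes curr_cnt then
     * doubles on every pass through the loop below.
     */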

    mask <<= 1;
    i++;

    while (mask < comm_size) {
        dst = rank ^ mask;

        /* Find offset into send and recv buffers. Zero out the least
         * significant "i" bits of rank and dst to find root of src and
         * dst subtrees. Use ranks of roots as index to send from and
         * recv into buffer. */

        dst_tree_root = dst >> i;
        dst_tree_root <<= i;

        my_tree_root = rank >> i;
        my_tree_root <<= i;

        /* FIXME: saving an MPI_Aint into an int */
        send_offset = my_tree_root * recvcount * recvtype_extent;
        recv_offset = dst_tree_root * recvcount * recvtype_extent;
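        /*
         * On the last step (mask == comm_size/2 for power-of-two communicator
         * sizes) the incoming half of the result is received straight into
         * the device recvbuf, so no host-to-device staging copy is needed;
         * earlier steps receive into the host staging buffer and copy the new
         * blocks to the device asynchronously.
         */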

        if (dst < comm_size) {
            if (mask == comm_size/2) {
                mpi_errno = MPIC_Irecv(((char *)recvbuf + recv_offset),
                                       (mask)*recvcount, recvtype, dst,
                                       MPIR_ALLGATHER_TAG, comm_ptr, &recv_req);
            } else {
                mpi_errno = MPIC_Irecv(((char *)mv2_cuda_allgather_store_buf + recv_offset),
                                       (mask)*recvcount, recvtype, dst,
                                       MPIR_ALLGATHER_TAG, comm_ptr, &recv_req);
            }
            mpi_errno = MPIC_Isend(((char *)mv2_cuda_allgather_store_buf + send_offset),
                                   curr_cnt, recvtype, dst,
                                   MPIR_ALLGATHER_TAG, comm_ptr, &send_req, errflag);
            mpi_errno = MPIC_Waitall(1, &recv_req, &status, errflag);
            if (mpi_errno) {
                /* for communication errors, just record the error but continue */
                *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
                MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
            }

            if (mask < comm_size/2) {
                MPIU_Memcpy_CUDA_Async((void *)((char *)recvbuf + recv_offset),
                                       (void *)((char *)mv2_cuda_allgather_store_buf + recv_offset),
                                       (mask)*recvcount*recvtype_extent, cudaMemcpyHostToDevice, stream_h2d);
            }
            mpi_errno = MPIC_Waitall(1, &send_req, &status, errflag);
            if (mpi_errno) {
                /* for communication errors, just record the error but continue */
                *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
                MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail");
                MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
            }

            curr_cnt += mask*recvcount;
        }

        mask <<= 1;
        i++;
    }
}

/* wait for the receive copies into the device to complete */
cudaerr = cudaEventRecord(cuda_nbstream_sync_event, stream_h2d);
if (cudaerr != cudaSuccess) {
    mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                     MPI_ERR_OTHER, "**cudaEventRecord", 0);
    return mpi_errno;
}
cudaEventSynchronize(cuda_nbstream_sync_event);
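/*
 * The event synchronization above blocks the host until every copy queued on
 * stream_h2d has completed, so recvbuf is fully populated on the device
 * before the collective returns.
 */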

/* check if multiple threads are calling this collective function */
MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT(comm_ptr);

fn_fail:
    return (mpi_errno);
}
/* end:nested */
#endif /* #if defined(CHANNEL_MRAIL) || defined(CHANNEL_PSM) */
#endif /* #ifdef _ENABLE_CUDA_ */