diff --git a/ompi/mca/coll/han/coll_han_alltoall.c b/ompi/mca/coll/han/coll_han_alltoall.c
index 6437ea7a6af..14b8cad51bf 100644
--- a/ompi/mca/coll/han/coll_han_alltoall.c
+++ b/ompi/mca/coll/han/coll_han_alltoall.c
@@ -69,6 +69,16 @@ int mca_coll_han_alltoall_using_smsc(
 {
     mca_coll_han_module_t *han_module = (mca_coll_han_module_t *)module;
 
+    opal_convertor_t convertor;
+    int send_needs_bounce, have_device_buffer;
+    size_t packed_size = 0;
+    enum {
+        BOUNCE_NOT_INITIALIZED = 0,
+        BOUNCE_IS_FROM_RBUF = 1,
+        BOUNCE_IS_FROM_FREELIST = 2,
+        BOUNCE_IS_FROM_MALLOC = 3,
+    };
+
     OPAL_OUTPUT_VERBOSE((90, mca_coll_han_component.han_output,
                          "Entering mca_coll_han_alltoall_using_smsc\n"));
 
@@ -82,6 +92,44 @@ int mca_coll_han_alltoall_using_smsc(
                                              comm, han_module->previous_alltoall_module);
     }
 
+    if (sbuf == MPI_IN_PLACE) {
+        /* This is not an in-place algorithm */
+        return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
+                                             comm, han_module->previous_alltoall_module);
+    }
+
+    OBJ_CONSTRUCT( &convertor, opal_convertor_t );
+    send_needs_bounce = 0;
+    have_device_buffer = 0;
+    /* get converter for copying to one of the leader ranks, and get packed size: */
+    opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
+    have_device_buffer |= opal_convertor_on_device(&convertor);
+    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
+    opal_convertor_cleanup(&convertor);
+
+    opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
+    have_device_buffer |= opal_convertor_on_device(&convertor);
+    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
+    opal_convertor_get_packed_size( &convertor, &packed_size );
+    opal_convertor_cleanup(&convertor);
+
+    if (have_device_buffer) {
+        /*
+           Although this algorithm is functional for device buffers, it requires an
+           extra copy through the bounce buffer, which makes it inefficient.
+           Prefer another algorithm instead.
+
+           Note that Open MPI assumes that if one rank uses a device
+           buffer in a collective, then all ranks will use device buffers, so there
+           is no need to communicate before taking this branch.
+        */
+        OBJ_DESTRUCT(&convertor);
+        return han_module->previous_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
+                                             comm, han_module->previous_alltoall_module);
+    }
+
+
+
     /* Create the subcommunicators */
     if( OMPI_SUCCESS != mca_coll_han_comm_create_new(comm, han_module) ) {
         opal_output_verbose(1, mca_coll_han_component.han_output,
@@ -107,12 +155,11 @@ int mca_coll_han_alltoall_using_smsc(
                                              comm, han_module->previous_alltoall_module);
     }
 
-    int rc, send_needs_bounce, ii_push_data;
+    int rc, ii_push_data;
     size_t sndsize;
     MPI_Aint sextent, rextent, lb;
-    char *send_bounce;
-    opal_convertor_t convertor;
-    size_t packed_size = 0, packed_size_tmp;
+    char *send_bounce = NULL;
+    size_t packed_size_tmp;
     int use_isend;
     void *gather_buf_in[4];
     int up_rank;
@@ -140,22 +187,6 @@ int mca_coll_han_alltoall_using_smsc(
     }
     if (fanout > up_size) { fanout = up_size; }
 
-    OBJ_CONSTRUCT( &convertor, opal_convertor_t );
-
-
-    send_needs_bounce = 0;
-    /* get converter for copying to one of the leader ranks, and get packed size: */
-    opal_convertor_copy_and_prepare_for_send(ompi_mpi_local_convertor, &sdtype->super, scount, sbuf, 0, &convertor);
-    send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
-    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
-    opal_convertor_cleanup(&convertor);
-
-    opal_convertor_copy_and_prepare_for_recv(ompi_mpi_local_convertor, &rdtype->super, rcount, rbuf, 0, &convertor);
-    send_needs_bounce |= 0 != opal_convertor_on_device(&convertor);
-    send_needs_bounce |= opal_convertor_need_buffers(&convertor);
-    opal_convertor_get_packed_size( &convertor, &packed_size );
-    opal_convertor_cleanup(&convertor);
-
     /* Because push-mode needs extra synchronizations, we'd like to avoid it,
        however it might be necessary:
 
@@ -166,7 +197,7 @@ int mca_coll_han_alltoall_using_smsc(
 
        If the application buffer is device memory, we'll also need to
        exchange in push mode so that the process which has device registrations can
-       perform the reads.
+       perform the reads. (this mode has been disabled)
 
       In both of these cases, we'll need to use the bounce buffer too.
     */
@@ -186,19 +217,30 @@ int mca_coll_han_alltoall_using_smsc(
     inter_recv_reqs = malloc(sizeof(*inter_recv_reqs) * up_size );
 
     char **low_bufs = malloc(low_size * sizeof(*low_bufs));
     void **sbuf_map_ctx = malloc(low_size * sizeof(&sbuf_map_ctx));
+    opal_free_list_item_t *send_fl_item = NULL;
 
     const int nptrs_gather = 3;
     void **gather_buf_out = calloc(low_size*nptrs_gather, sizeof(void*));
-    bool send_bounce_is_allocated = false;
+    int send_bounce_status = BOUNCE_NOT_INITIALIZED;
 
     do {
 start_allgather:
         if ( 0 == send_needs_bounce ) {
             send_bounce = (char*)rbuf + up_rank*send_bytes_per_fan;
+            send_bounce_status = BOUNCE_IS_FROM_RBUF;
         } else {
-            if (!send_bounce_is_allocated) {
-                send_bounce = malloc(send_bytes_per_fan * fanout);
-                send_bounce_is_allocated = true;
+            if (send_bounce_status == BOUNCE_NOT_INITIALIZED || send_bounce_status == BOUNCE_IS_FROM_RBUF) {
+                if (send_bytes_per_fan * fanout < mca_coll_han_component.han_packbuf_bytes) {
+                    send_fl_item = opal_free_list_get(&mca_coll_han_component.pack_buffers);
+                    if (send_fl_item) {
+                        send_bounce_status = BOUNCE_IS_FROM_FREELIST;
+                        send_bounce = send_fl_item->ptr;
+                    }
+                }
+                if (!send_fl_item) {
+                    send_bounce = malloc(send_bytes_per_fan * fanout);
+                    send_bounce_status = BOUNCE_IS_FROM_MALLOC;
+                }
             }
         }
@@ -384,7 +426,11 @@ int mca_coll_han_alltoall_using_smsc(
         }
     }
     OBJ_DESTRUCT(&convertor);
-    if (send_bounce_is_allocated) free(send_bounce);
+    if (send_bounce_status == BOUNCE_IS_FROM_FREELIST) {
+        opal_free_list_return(&mca_coll_han_component.pack_buffers, send_fl_item);
+    } else if (send_bounce_status == BOUNCE_IS_FROM_MALLOC) {
+        free(send_bounce);
+    }
     free(inter_send_reqs);
     free(inter_recv_reqs);
     free(sbuf_map_ctx);
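
Editor's note (commentary only, not part of the patch): the hunks above replace the single send_bounce_is_allocated flag with a provenance enum, so the cleanup path knows whether the bounce buffer aliases the receive buffer, was taken from the component's pack-buffer free list, or was malloc'ed, and releases it accordingly. The following is a minimal standalone sketch of that acquire/track/release pattern. It deliberately does not use the OPAL free-list API; the toy_pool type, toy_pool_get, and toy_pool_return helpers, as well as the pool capacity, are invented for illustration and merely stand in for mca_coll_han_component.pack_buffers and han_packbuf_bytes.

#include <stdio.h>
#include <stdlib.h>

/* Provenance of the bounce buffer, mirroring the enum the patch introduces. */
enum bounce_origin {
    BOUNCE_NOT_INITIALIZED = 0,
    BOUNCE_IS_FROM_RBUF,
    BOUNCE_IS_FROM_FREELIST,
    BOUNCE_IS_FROM_MALLOC,
};

/* Toy fixed-size pool standing in for the component's pack-buffer free list. */
struct toy_pool {
    char storage[1 << 16];    /* capacity analogous to han_packbuf_bytes */
    int in_use;
};

static char *toy_pool_get(struct toy_pool *pool, size_t bytes)
{
    if (pool->in_use || bytes > sizeof(pool->storage)) {
        return NULL;          /* pool exhausted or request too large */
    }
    pool->in_use = 1;
    return pool->storage;
}

static void toy_pool_return(struct toy_pool *pool)
{
    pool->in_use = 0;
}

int main(void)
{
    struct toy_pool pool = { .in_use = 0 };
    size_t needed = 4096;
    enum bounce_origin origin = BOUNCE_NOT_INITIALIZED;
    char *bounce;

    /* Acquire: prefer the pooled buffer when the request fits, else malloc. */
    bounce = toy_pool_get(&pool, needed);
    if (NULL != bounce) {
        origin = BOUNCE_IS_FROM_FREELIST;
    } else {
        bounce = malloc(needed);
        if (NULL == bounce) {
            return 1;         /* allocation failure */
        }
        origin = BOUNCE_IS_FROM_MALLOC;
    }

    /* ... the collective would pack into and send from 'bounce' here ... */
    printf("bounce buffer came from %s\n",
           BOUNCE_IS_FROM_FREELIST == origin ? "the pool" : "malloc");

    /* Release according to provenance, the same shape as the patched cleanup. */
    if (BOUNCE_IS_FROM_FREELIST == origin) {
        toy_pool_return(&pool);
    } else if (BOUNCE_IS_FROM_MALLOC == origin) {
        free(bounce);
    }
    return 0;
}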