From 50249337decc3ff7a148cb1d52ef2546788b5143 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Mon, 24 Feb 2025 12:18:21 -0700
Subject: [PATCH 1/3] Ensure we correctly identify local vs non-local peers

PMIX_LOCALITY is a value that is computed by OMPI when we do connect/accept -
it is computed in opal_hwloc_compute_relative_locality and the value is
locally stored on each proc. The reason is that PMIX_LOCALITY provides the
location of a process relative to you - it isn't an absolute value
representing the location of the process on the node. The absolute location
of the proc is provided by the runtime in PMIX_LOCALITY_STRING. This is what
was retrieved in dpm.c - and then used to compute the relative locality of
that proc, which is then stored as PMIX_LOCALITY.

So the reason procs from two unconnected jobs aren't able to get each other's
PMIX_LOCALITY values is simply that (a) they didn't go through connect/accept,
and therefore (b) they never computed and saved those values.

Second, the runtime provides PMIX_LOCALITY_STRING only for those procs that
have a defined location - i.e., procs that are BOUND. If a process is not
bound, then it has no fixed location on the node, and so the runtime doesn't
provide a locality string for it. Thus, getting "not found" for a modex
retrieval of PMIX_LOCALITY_STRING is NOT a definitive indicator that the proc
is on a different node.

The only way to determine that a proc is on a different node is to get the
list (or array) of procs on the node and see if the proc is on it. We do this
in the dpm, but that step was missing from the comm code.

So what I've done here is create a new function, ompi_dpm_set_locality, that
both connect/accept and get_rprocs can use since the required functionality
is identical. This will hopefully avoid similar mistakes in the future.

Signed-off-by: Ralph Castain
---
 .gitmodules              |   2 +-
 3rd-party/openpmix       |   2 +-
 3rd-party/prrte          |   2 +-
 ompi/communicator/comm.c |  23 ++----
 ompi/dpm/dpm.c           | 173 ++++++++++++++++++++++++---------------
 ompi/dpm/dpm.h           |   6 ++
 6 files changed, 125 insertions(+), 83 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 6914849f45a..4915e205831 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "prrte"]
     path = 3rd-party/prrte
-    url = ../../open-mpi/prrte
+    url = ../../openpmix/prrte.git
     branch = master
 [submodule "openpmix"]
     path = 3rd-party/openpmix
diff --git a/3rd-party/openpmix b/3rd-party/openpmix
index 08e41ed5629..ce3f5a464c5 160000
--- a/3rd-party/openpmix
+++ b/3rd-party/openpmix
@@ -1 +1 @@
-Subproject commit 08e41ed5629b51832f5708181af6d89218c7a74e
+Subproject commit ce3f5a464c5565cd2432682180e9a2daff5ad22d
diff --git a/3rd-party/prrte b/3rd-party/prrte
index 30cadc6746e..5733f223637 160000
--- a/3rd-party/prrte
+++ b/3rd-party/prrte
@@ -1 +1 @@
-Subproject commit 30cadc6746ebddd69ea42ca78b964398f782e4e3
+Subproject commit 5733f223637336a63760774228a08d9827cda727
diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c
index e7cdc68de63..f555490738f 100644
--- a/ompi/communicator/comm.c
+++ b/ompi/communicator/comm.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
  * Copyright (c) 2015      Mellanox Technologies. All rights reserved.
  * Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
- * Copyright (c) 2021      Nanook Consulting. All rights reserved.
+ * Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
  * Copyright (c) 2018-2024 Triad National Security, LLC. All rights
  *                         reserved.
 * Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
@@ -1137,7 +1137,7 @@ static int ompi_comm_split_unguided(ompi_communicator_t *comm, int split_type, i
     if (new_size < original_size) {
         /* If a valid info object was passed, set the selected topology */
         if (NULL != info) {
-            opal_info_set(info, "mpi_hw_resource_type", 
+            opal_info_set(info, "mpi_hw_resource_type",
                           ompi_comm_split_type_hw_guided_support[i].info_value);
         }
         ompi_comm_free(&unguided_comm);
@@ -1166,7 +1166,7 @@ static int ompi_comm_split_unguided(ompi_communicator_t *comm, int split_type, i
  * info(in/out)    : Info guiding the split operation
  * newcomm(out)    : Pointer to the newly created communicator, or pointer to MPI_COMM_NULL
  *                   if no communicator created.
- */ 
+ */
 int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key, opal_info_t *info,
                           ompi_communicator_t **newcomm)
 {
@@ -1632,7 +1632,7 @@ int ompi_comm_create_from_group (ompi_group_t *group, const char *tag, opal_info
     newcomp->instance = group->grp_instance;
 
     /*
-     * setup predefined keyvals - see MPI Standard for predefined keyvals cached on 
+     * setup predefined keyvals - see MPI Standard for predefined keyvals cached on
      * communicators created via MPI_Comm_create_from_group or MPI_Intercomm_create_from_groups
      */
     ompi_attr_hash_init(&newcomp->c_keyhash);
@@ -2392,18 +2392,9 @@ int ompi_comm_get_rprocs (ompi_communicator_t *local_comm, ompi_communicator_t *
         goto err_exit;
     }
 
-    /* set the locality of the remote procs */
-    for (i=0; i < rsize; i++) {
-        /* get the locality information - all RTEs are required
-         * to provide this information at startup */
-        uint16_t *u16ptr, u16;
-        u16ptr = &u16;
-        OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY, &rprocs[i]->super.proc_name, &u16ptr, PMIX_UINT16);
-        if (OPAL_SUCCESS == rc) {
-            rprocs[i]->super.proc_flags = u16;
-        } else {
-            rprocs[i]->super.proc_flags = OPAL_PROC_NON_LOCAL;
-        }
+    rc = ompi_dpm_set_locality(rprocs, rsize);
+    if (OMPI_SUCCESS != rc) {
+        goto err_exit;
     }
 
     /* And now add the information into the database */
diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c
index 4b5dbf623e1..8e675b5437a 100644
--- a/ompi/dpm/dpm.c
+++ b/ompi/dpm/dpm.c
@@ -20,7 +20,7 @@
  * Copyright (c) 2014-2020 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates. All Rights reserved.
- * Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
+ * Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
  * Copyright (c) 2018-2022 Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2022      IBM Corporation. All rights reserved.
@@ -110,7 +110,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
     opal_process_name_t pname;
     opal_list_t ilist, mlist, rlist;
     pmix_info_t info, tinfo;
-    pmix_value_t pval;
     pmix_pdata_t pdat;
     pmix_proc_t *procs, pxproc;
     size_t nprocs, n;
@@ -394,86 +393,45 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
         goto exit;
     }
     if (!opal_list_is_empty(&ilist)) {
-        int prn, nprn = 0;
-        char *val;
         opal_process_name_t wildcard_rank;
-        i = 0; /* start from the begining */
         /* convert the list of new procs to a proc_t array */
-        new_proc_list = (ompi_proc_t**)calloc(opal_list_get_size(&ilist),
-                                              sizeof(ompi_proc_t *));
-        /* Extract the modex info for the first proc on the ilist, and then
-         * remove all processors in the same jobid from the list by getting
-         * their connection information and moving them into the proc array.
-         */
+        size = opal_list_get_size(&ilist);
+        new_proc_list = (ompi_proc_t**)calloc(size, sizeof(ompi_proc_t *));
+        // put the procs in the array, but order them by jobid so that
+        // all members of the same jobid are sequential
+        i = 0;
         do {
-            uint32_t *local_ranks_in_jobid = NULL;
             ompi_dpm_proct_caddy_t* next = NULL;
             cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist);
             proc = cd->p;
             wildcard_rank.jobid = proc->super.proc_name.jobid;
-            wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
-            /* retrieve the local peers for the specified jobid */
-            OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCAL_PEERS,
-                                            &wildcard_rank, &val, PMIX_STRING);
-            if (OPAL_SUCCESS == rc && NULL != val) {
-                char **peers = opal_argv_split(val, ',');
-                free(val);
-                nprn = opal_argv_count(peers);
-                local_ranks_in_jobid = (uint32_t*)calloc(nprn, sizeof(uint32_t));
-                for (prn = 0; NULL != peers[prn]; prn++) {
-                    local_ranks_in_jobid[prn] = strtoul(peers[prn], NULL, 10);
-                }
-                opal_argv_free(peers);
-            }
-
             OPAL_LIST_FOREACH_SAFE(cd, next, &ilist, ompi_dpm_proct_caddy_t) {
                 proc = cd->p;
-                if( proc->super.proc_name.jobid != wildcard_rank.jobid )
+                if (proc->super.proc_name.jobid != wildcard_rank.jobid) {
                     continue; /* not a proc from this jobid */
-
+                }
+                // check name setup and set arch
+                ompi_proc_complete_init_single(proc);
                 new_proc_list[i] = proc;
+                ++i;
                 opal_list_remove_item(&ilist, (opal_list_item_t*)cd);
                 // TODO: do we need to release cd ?
                 OBJ_RELEASE(cd);
-                /* ompi_proc_complete_init_single() initializes and optionally retrieves
-                 * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
-                 * them, we are just fine */
-                ompi_proc_complete_init_single(proc);
-                /* if this proc is local, then get its locality */
-                if (NULL != local_ranks_in_jobid) {
-                    uint16_t u16;
-                    for (prn=0; prn < nprn; prn++) {
-                        if (local_ranks_in_jobid[prn] == proc->super.proc_name.vpid) {
-                            /* get their locality string */
-                            val = NULL;
-                            OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
-                                                            &proc->super.proc_name, &val, PMIX_STRING);
-                            if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
-                                u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
-                                free(val);
-                            } else {
-                                /* all we can say is that it shares our node */
-                                u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
-                            }
-                            proc->super.proc_flags = u16;
-                            /* save the locality for later */
-                            OPAL_PMIX_CONVERT_NAME(&pxproc, &proc->super.proc_name);
-                            pval.type = PMIX_UINT16;
-                            pval.data.uint16 = proc->super.proc_flags;
-                            PMIx_Store_internal(&pxproc, PMIX_LOCALITY, &pval);
-                            break;
-                        }
-                    }
-                }
-                ++i;
-            }
-            if (NULL != local_ranks_in_jobid) {
-                free(local_ranks_in_jobid);
             }
         } while (!opal_list_is_empty(&ilist));
 
+        // set locality for each proc
+        rc = ompi_dpm_set_locality(new_proc_list, size);
+        if (OPAL_SUCCESS != rc) {
+            OMPI_ERROR_LOG(rc);
+            free(new_proc_list);
+            new_proc_list = NULL;
+            OPAL_LIST_DESTRUCT(&ilist);
+            goto exit;
+        }
+
         /* call add_procs on the new ones */
-        rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist)));
+        rc = MCA_PML_CALL(add_procs(new_proc_list, size));
         free(new_proc_list);
         new_proc_list = NULL;
         if (OMPI_SUCCESS != rc) {
@@ -561,6 +519,93 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
     return rc;
 }
 
+int ompi_dpm_set_locality(ompi_proc_t **procs, int rsize)
+{
+    pmix_nspace_t nspace;
+    pmix_proc_t *local_procs = NULL;
+    size_t nlocalprocs = 0;
+    pmix_status_t rc;
+    int i, ret;
+    pmix_proc_t pproc;
+    bool local;
+    size_t m;
+    uint16_t u16, *u16ptr = &u16;
+    char *val;
+    pmix_value_t pval;
+
+    // lazy-execute the resolve - we may not need to do it and
+    // it is an expensive operation since it must go to the
+    // local server if we aren't a singleton
+
+    /* set the locality of the remote procs */
+    for (i=0; i < rsize; i++) {
+        OPAL_PMIX_CONVERT_NAME(&pproc, &procs[i]->super.proc_name);
+
+        // first check to see if the locality is available - do
+        // this as an "optional" check. It could be we previously
+        // computed locality for this proc, and the check is fast
+        // since it only is done locally.
+        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_LOCALITY,
+                                       &procs[i]->super.proc_name, &u16ptr, PMIX_UINT16);
+        if (OPAL_SUCCESS == ret) {
+            procs[i]->super.proc_flags = u16;
+            continue;
+        }
+
+        // if we didn't find it, then we have to actually compute
+        // the locality for this proc. check to see if we have
+        // already resolved the peers - if not, then do so
+        if (NULL == local_procs) {
+            /* get the local procs - need all local procs since
+             * we may have multiple namespaces involved */
+            PMIx_Load_nspace(nspace, NULL);
+            rc = PMIx_Resolve_peers(NULL, nspace, &local_procs, &nlocalprocs);
+            if (PMIX_SUCCESS != rc) {
+                return OMPI_ERROR;
+            }
+        }
+
+        /* see if this process is local to this node */
+        local = false;
+        for (m=0; m < nlocalprocs; m++) {
+            if (PMIX_CHECK_PROCID(&local_procs[m], &pproc)) {
+                // this is a local process
+                local = true;
+                break;
+            }
+        }
+        if (!local) {
+            // this proc is not on the same node as us
+            procs[i]->super.proc_flags = OPAL_PROC_NON_LOCAL;
+            continue;
+        }
+
+        /* get the locality information - all RTEs are required
+         * to provide this information at startup. However, note
+         * that locality is ONLY defined for procs that are BOUND
+         * as it requires that a proc be in a known location! */
+        val = NULL;
+        OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
+                                        &procs[i]->super.proc_name, &val, PMIX_STRING);
+        if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
+            u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
+            free(val);
+        } else {
+            /* all we can say is that it shares our node */
+            u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
+        }
+        procs[i]->super.proc_flags = u16;
+        /* save the locality for later */
+        pval.type = PMIX_UINT16;
+        pval.data.uint16 = procs[i]->super.proc_flags;
+        PMIx_Store_internal(&pproc, PMIX_LOCALITY, &pval);
+    }
+    if (NULL != local_procs) {
+        PMIX_PROC_FREE(local_procs, nlocalprocs);
+    }
+    return OMPI_SUCCESS;
+}
+
 static int construct_peers(ompi_group_t *group, opal_list_t *peers)
 {
     int i;
diff --git a/ompi/dpm/dpm.h b/ompi/dpm/dpm.h
index f954f141ac2..b12684bdaa2 100644
--- a/ompi/dpm/dpm.h
+++ b/ompi/dpm/dpm.h
@@ -15,6 +15,7 @@
  *                         reserved.
  * Copyright (c) 2018      Triad National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2025      Nanook Consulting All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
@@ -98,6 +99,11 @@ int ompi_dpm_open_port(char *port_name);
  */
 int ompi_dpm_close_port(const char *port_name);
 
+/*
+ * Compute locality for array of procs
+ */
+int ompi_dpm_set_locality(ompi_proc_t **procs, int rsize);
+
 END_C_DECLS
 
 #endif /* OMPI_DPM_H */

From 168ad7c7084f8896c95ef3a04d810dbe3b02e50d Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Tue, 4 Mar 2025 06:57:33 -0700
Subject: [PATCH 2/3] Sync to head of PMIx/PRRTE master

Signed-off-by: Ralph Castain
---
 3rd-party/openpmix | 2 +-
 3rd-party/prrte    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/3rd-party/openpmix b/3rd-party/openpmix
index ce3f5a464c5..c6cf013cfe9 160000
--- a/3rd-party/openpmix
+++ b/3rd-party/openpmix
@@ -1 +1 @@
-Subproject commit ce3f5a464c5565cd2432682180e9a2daff5ad22d
+Subproject commit c6cf013cfe960a0992d9b70503dd0bba7f492544
diff --git a/3rd-party/prrte b/3rd-party/prrte
index 5733f223637..509ce3949c0 160000
--- a/3rd-party/prrte
+++ b/3rd-party/prrte
@@ -1 +1 @@
-Subproject commit 5733f223637336a63760774228a08d9827cda727
+Subproject commit 509ce3949c0d78f21129e5787738f45ab1b9b9bd

From 0cbce7b79c569a728bb98ef44396cb9712c6d7a0 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Thu, 13 Mar 2025 15:28:08 -0600
Subject: [PATCH 3/3] Update PMIx/PRRTE submodules to track HEAD of upstream master branches

Signed-off-by: Ralph Castain
---
 3rd-party/openpmix | 2 +-
 3rd-party/prrte    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/3rd-party/openpmix b/3rd-party/openpmix
index c6cf013cfe9..d8d8b69da85 160000
--- a/3rd-party/openpmix
+++ b/3rd-party/openpmix
@@ -1 +1 @@
-Subproject commit c6cf013cfe960a0992d9b70503dd0bba7f492544
+Subproject commit d8d8b69da85cc54e450373f61be1066e5b6345a2
diff --git a/3rd-party/prrte b/3rd-party/prrte
index 509ce3949c0..845d0e25f9f 160000
--- a/3rd-party/prrte
+++ b/3rd-party/prrte
@@ -1 +1 @@
-Subproject commit 509ce3949c0d78f21129e5787738f45ab1b9b9bd
+Subproject commit 845d0e25f9f24b5a427067f72c152e5ca43a265f
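
Note (not part of the patches above): the node-locality test that the new
ompi_dpm_set_locality() relies on boils down to two public PMIx calls,
PMIx_Resolve_peers() and the PMIX_CHECK_PROCID macro. The following is a
minimal sketch of that check under the same assumptions the patch makes
(NULL nodename queries the local node, an empty nspace covers all
namespaces); the helper name proc_is_on_my_node() is hypothetical and the
error handling is deliberately thin.

    #include <stdbool.h>
    #include <pmix.h>

    /* Return true if "peer" is hosted on the same node as the caller.
     * Mirrors the resolve-then-scan step performed by the patch. */
    static bool proc_is_on_my_node(const pmix_proc_t *peer)
    {
        pmix_proc_t *local_procs = NULL;
        size_t nlocal = 0, m;
        bool local = false;

        /* ask PMIx for every process running on this node */
        if (PMIX_SUCCESS != PMIx_Resolve_peers(NULL, "", &local_procs, &nlocal)) {
            return false;   /* cannot resolve - treat as non-local */
        }
        for (m = 0; m < nlocal; m++) {
            if (PMIX_CHECK_PROCID(&local_procs[m], peer)) {
                local = true;   /* peer appears in the local peer list */
                break;
            }
        }
        PMIX_PROC_FREE(local_procs, nlocal);
        return local;
    }

Only when this test succeeds does ompi_dpm_set_locality() go on to fetch
PMIX_LOCALITY_STRING and compute the relative locality; otherwise the peer
is simply marked OPAL_PROC_NON_LOCAL.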