From koop at mvapich.cse.ohio-state.edu Thu Jun 5 12:57:08 2008 From: koop at mvapich.cse.ohio-state.edu (koop@mvapich.cse.ohio-state.edu) Date: Thu Jun 5 12:57:25 2008 Subject: [mvapich-commit] r2650 - mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200806051657.m55Gv8pD013284@mvapich.cse.ohio-state.edu> Author: koop Date: 2008-06-05 12:57:07 -0400 (Thu, 05 Jun 2008) New Revision: 2650 Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_param.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h Log: * Add additional synchronization so pthread_cancel() does not kill the thread before it has a chance to 'ack' any outstanding events Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c 2008-06-05 06:15:04 UTC (rev 2649) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c 2008-06-05 16:57:07 UTC (rev 2650) @@ -786,6 +786,14 @@ fprintf(stderr, "Error getting event!\n"); } + for(i = 0; i < rdma_num_hcas; i++) { + if(MPIDI_CH3I_RDMA_Process.nic_context[i] == context) { + hca_num = i; + } + } + + pthread_mutex_lock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[hca_num]); + switch (event.event_type) { /* Fatal */ case IBV_EVENT_CQ_ERR: @@ -837,11 +845,6 @@ pthread_spin_lock(&MPIDI_CH3I_RDMA_Process.srq_post_spin_lock); - for(i = 0; i < rdma_num_hcas; i++) { - if(MPIDI_CH3I_RDMA_Process.nic_context[i] == context) { - hca_num = i; - } - } if(-1 == hca_num) { /* Was not able to find the context, @@ -914,6 +917,7 @@ } ibv_ack_async_event(&event); + pthread_mutex_unlock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[hca_num]); } } Modified: 
mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_param.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_param.c 2008-06-05 06:15:04 UTC (rev 2649) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_param.c 2008-06-05 16:57:07 UTC (rev 2650) @@ -244,6 +244,8 @@ fprintf(stderr,"Unknown Mellanox PCI-Express HCA" " best guess as Mellanox PCI-Express SDR\n"); + fprintf(stderr, "rate: %d\n", rate); + hca_type = MLX_PCI_EX_SDR; } Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c 2008-06-05 06:15:04 UTC (rev 2649) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c 2008-06-05 16:57:07 UTC (rev 2650) @@ -126,6 +126,10 @@ rdma_num_rails = rdma_num_hcas * rdma_num_ports * rdma_num_qp_per_port; + for(i = 0; i < rdma_num_hcas; i++) { + pthread_mutex_init(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i], 0); + } + DEBUG_PRINT("num_qp_per_port %d, num_rails = %d\n", rdma_num_qp_per_port, rdma_num_rails); @@ -776,9 +780,13 @@ if (MPIDI_CH3I_RDMA_Process.has_srq) { pthread_cond_destroy(&MPIDI_CH3I_RDMA_Process.srq_post_cond[i]); pthread_mutex_destroy(&MPIDI_CH3I_RDMA_Process.srq_post_mutex_lock[i]); + + pthread_mutex_lock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i]); pthread_cancel(MPIDI_CH3I_RDMA_Process.async_thread[i]); pthread_join(MPIDI_CH3I_RDMA_Process.async_thread[i], NULL); err = ibv_destroy_srq(MPIDI_CH3I_RDMA_Process.srq_hndl[i]); + pthread_mutex_unlock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i]); + pthread_mutex_destroy(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i]); if (err) MPIU_Error_printf("Failed to destroy SRQ (%d)\n", err); } Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h 
=================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h 2008-06-05 06:15:04 UTC (rev 2649) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h 2008-06-05 16:57:07 UTC (rev 2650) @@ -111,6 +111,7 @@ struct ibv_srq *srq_hndl[MAX_NUM_HCAS]; pthread_spinlock_t srq_post_spin_lock; pthread_mutex_t srq_post_mutex_lock[MAX_NUM_HCAS]; + pthread_mutex_t async_mutex_lock[MAX_NUM_HCAS]; pthread_cond_t srq_post_cond[MAX_NUM_HCAS]; uint32_t srq_zero_post_counter[MAX_NUM_HCAS]; pthread_t async_thread[MAX_NUM_HCAS]; From koop at mvapich.cse.ohio-state.edu Thu Jun 5 13:55:20 2008 From: koop at mvapich.cse.ohio-state.edu (koop@mvapich.cse.ohio-state.edu) Date: Thu Jun 5 13:55:38 2008 Subject: [mvapich-commit] r2652 - mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200806051755.m55HtKjF013524@mvapich.cse.ohio-state.edu> Author: koop Date: 2008-06-05 13:55:19 -0400 (Thu, 05 Jun 2008) New Revision: 2652 Modified: mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h Log: * Add additional synchronization so pthread_cancel() does not kill the thread before it has a chance to 'ack' any outstanding events Modified: mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c =================================================================== --- mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c 2008-06-05 17:48:39 UTC (rev 2651) +++ mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c 2008-06-05 17:55:19 UTC (rev 2652) @@ -786,6 +786,14 @@ fprintf(stderr, "Error getting event!\n"); } + for(i = 0; i < rdma_num_hcas; i++) { + if(MPIDI_CH3I_RDMA_Process.nic_context[i] == context) { + hca_num 
= i; + } + } + + pthread_mutex_lock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[hca_num]); + switch (event.event_type) { /* Fatal */ case IBV_EVENT_CQ_ERR: @@ -837,11 +845,6 @@ pthread_spin_lock(&MPIDI_CH3I_RDMA_Process.srq_post_spin_lock); - for(i = 0; i < rdma_num_hcas; i++) { - if(MPIDI_CH3I_RDMA_Process.nic_context[i] == context) { - hca_num = i; - } - } if(-1 == hca_num) { /* Was not able to find the context, @@ -914,6 +917,7 @@ } ibv_ack_async_event(&event); + pthread_mutex_unlock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[hca_num]); } } Modified: mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c =================================================================== --- mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c 2008-06-05 17:48:39 UTC (rev 2651) +++ mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c 2008-06-05 17:55:19 UTC (rev 2652) @@ -126,6 +126,10 @@ rdma_num_rails = rdma_num_hcas * rdma_num_ports * rdma_num_qp_per_port; + for(i = 0; i < rdma_num_hcas; i++) { + pthread_mutex_init(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i], 0); + } + DEBUG_PRINT("num_qp_per_port %d, num_rails = %d\n", rdma_num_qp_per_port, rdma_num_rails); @@ -776,9 +780,13 @@ if (MPIDI_CH3I_RDMA_Process.has_srq) { pthread_cond_destroy(&MPIDI_CH3I_RDMA_Process.srq_post_cond[i]); pthread_mutex_destroy(&MPIDI_CH3I_RDMA_Process.srq_post_mutex_lock[i]); + + pthread_mutex_lock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i]); pthread_cancel(MPIDI_CH3I_RDMA_Process.async_thread[i]); pthread_join(MPIDI_CH3I_RDMA_Process.async_thread[i], NULL); err = ibv_destroy_srq(MPIDI_CH3I_RDMA_Process.srq_hndl[i]); + pthread_mutex_unlock(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i]); + pthread_mutex_destroy(&MPIDI_CH3I_RDMA_Process.async_mutex_lock[i]); if (err) MPIU_Error_printf("Failed to destroy SRQ (%d)\n", err); } Modified: mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h 
=================================================================== --- mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h 2008-06-05 17:48:39 UTC (rev 2651) +++ mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_impl.h 2008-06-05 17:55:19 UTC (rev 2652) @@ -111,6 +111,7 @@ struct ibv_srq *srq_hndl[MAX_NUM_HCAS]; pthread_spinlock_t srq_post_spin_lock; pthread_mutex_t srq_post_mutex_lock[MAX_NUM_HCAS]; + pthread_mutex_t async_mutex_lock[MAX_NUM_HCAS]; pthread_cond_t srq_post_cond[MAX_NUM_HCAS]; uint32_t srq_zero_post_counter[MAX_NUM_HCAS]; pthread_t async_thread[MAX_NUM_HCAS]; From curtisbr at mvapich.cse.ohio-state.edu Mon Jun 9 11:53:23 2008 From: curtisbr at mvapich.cse.ohio-state.edu (curtisbr@mvapich.cse.ohio-state.edu) Date: Mon Jun 9 11:53:39 2008 Subject: [mvapich-commit] r2668 - mvapich2/trunk/src/pm/mpd Message-ID: <200806091553.m59FrNnw010057@mvapich.cse.ohio-state.edu> Author: curtisbr Date: 2008-06-09 11:53:21 -0400 (Mon, 09 Jun 2008) New Revision: 2668 Modified: mvapich2/trunk/src/pm/mpd/mpiexec.py Log: Enable configurable recvTimeout multiplier (MV2_MPD_RECVTIMEOUT_MULTIPLIER) and base the timeout on the number of processes. Modified: mvapich2/trunk/src/pm/mpd/mpiexec.py =================================================================== --- mvapich2/trunk/src/pm/mpd/mpiexec.py 2008-06-09 04:15:48 UTC (rev 2667) +++ mvapich2/trunk/src/pm/mpd/mpiexec.py 2008-06-09 15:53:21 UTC (rev 2668) @@ -841,6 +841,19 @@ print 'no cmd specified' usage() + global recvTimeout + recvTimeoutMultiplier = 0.05 + if os.environ.has_key('MV2_MPD_RECVTIMEOUT_MULTIPLIER'): + try: + recvTimeoutMultiplier = int(os.environ['MV2_MPD_RECVTIMEOUT_MULTIPLIER']) + except ValueError: + try: + recvTimeoutMultiplier = float(os.environ['MV2_MPD_RECVTIMEOUT_MULTIPLIER']) + except ValueError: + print 'Invalid MV2_MPD_RECVTIMEOUT_MULTIPLIER. Value must be a number.' 
+ sys.exit(-1) + recvTimeout = nProcs * recvTimeoutMultiplier + argsetLoRange = nextRange argsetHiRange = nextRange + nProcs - 1 loRange = argsetLoRange From curtisbr at mvapich.cse.ohio-state.edu Mon Jun 9 11:55:15 2008 From: curtisbr at mvapich.cse.ohio-state.edu (curtisbr@mvapich.cse.ohio-state.edu) Date: Mon Jun 9 11:55:32 2008 Subject: [mvapich-commit] r2669 - mvapich2/branches/1.0/src/pm/mpd Message-ID: <200806091555.m59FtFub010078@mvapich.cse.ohio-state.edu> Author: curtisbr Date: 2008-06-09 11:55:15 -0400 (Mon, 09 Jun 2008) New Revision: 2669 Modified: mvapich2/branches/1.0/src/pm/mpd/mpiexec.py Log: Enable configurable recvTimeout multiplier (MV2_MPD_RECVTIMEOUT_MULTIPLIER) and base the timeout on the number of processes. Modified: mvapich2/branches/1.0/src/pm/mpd/mpiexec.py =================================================================== --- mvapich2/branches/1.0/src/pm/mpd/mpiexec.py 2008-06-09 15:53:21 UTC (rev 2668) +++ mvapich2/branches/1.0/src/pm/mpd/mpiexec.py 2008-06-09 15:55:15 UTC (rev 2669) @@ -841,6 +841,19 @@ print 'no cmd specified' usage() + global recvTimeout + recvTimeoutMultiplier = 0.05 + if os.environ.has_key('MV2_MPD_RECVTIMEOUT_MULTIPLIER'): + try: + recvTimeoutMultiplier = int(os.environ['MV2_MPD_RECVTIMEOUT_MULTIPLIER']) + except ValueError: + try: + recvTimeoutMultiplier = float(os.environ['MV2_MPD_RECVTIMEOUT_MULTIPLIER']) + except ValueError: + print 'Invalid MV2_MPD_RECVTIMEOUT_MULTIPLIER. Value must be a number.' 
+ sys.exit(-1) + recvTimeout = nProcs * recvTimeoutMultiplier + argsetLoRange = nextRange argsetHiRange = nextRange + nProcs - 1 loRange = argsetLoRange From curtisbr at mvapich.cse.ohio-state.edu Mon Jun 9 12:35:47 2008 From: curtisbr at mvapich.cse.ohio-state.edu (curtisbr@mvapich.cse.ohio-state.edu) Date: Mon Jun 9 12:36:26 2008 Subject: [mvapich-commit] r2670 - mvapich2/branches/1.0 Message-ID: <200806091635.m59GZlDw010242@mvapich.cse.ohio-state.edu> Author: curtisbr Date: 2008-06-09 12:35:47 -0400 (Mon, 09 Jun 2008) New Revision: 2670 Modified: mvapich2/branches/1.0/CHANGELOG Log: Add notice of configurable MPD mpiexec timeout. Modified: mvapich2/branches/1.0/CHANGELOG =================================================================== --- mvapich2/branches/1.0/CHANGELOG 2008-06-09 15:55:15 UTC (rev 2669) +++ mvapich2/branches/1.0/CHANGELOG 2008-06-09 16:35:47 UTC (rev 2670) @@ -11,6 +11,8 @@ * Fix a startup performance issue when on-demand connection setup is not used. +* Configurable MPD mpiexec based on the number of processes. + MVAPICH2-1.0.2 (02/20/08) * Change the default MV2_DAPL_PROVIDER to OpenIB-cma From curtisbr at mvapich.cse.ohio-state.edu Mon Jun 9 12:40:18 2008 From: curtisbr at mvapich.cse.ohio-state.edu (curtisbr@mvapich.cse.ohio-state.edu) Date: Mon Jun 9 12:40:33 2008 Subject: [mvapich-commit] r2671 - mvapich2/trunk Message-ID: <200806091640.m59GeIPd010266@mvapich.cse.ohio-state.edu> Author: curtisbr Date: 2008-06-09 12:40:18 -0400 (Mon, 09 Jun 2008) New Revision: 2671 Modified: mvapich2/trunk/CHANGELOG Log: Add notice of configurable MPD mpiexec timeout. Modified: mvapich2/trunk/CHANGELOG =================================================================== --- mvapich2/trunk/CHANGELOG 2008-06-09 16:35:47 UTC (rev 2670) +++ mvapich2/trunk/CHANGELOG 2008-06-09 16:40:18 UTC (rev 2671) @@ -3,6 +3,9 @@ This file briefly describes the latest changes to the MVAPICH2 software package. The logs are arranged in the "most recent first" order. 
+06/09/2008 +* Configurable MPD mpiexec timeout based on the number of processes. + 04/24/2008 * Fix a startup performance issue when on-demand connection setup is not used. From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 08:53:07 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 08:53:23 2008 Subject: [mvapich-commit] r2674 - mvapich2/tags Message-ID: <200806101253.m5ACr7IH013212@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 08:53:05 -0400 (Tue, 10 Jun 2008) New Revision: 2674 Added: mvapich2/tags/1.0.3/ Log: Create mvapich2-1.0.3 release tag Copied: mvapich2/tags/1.0.3 (from rev 2673, mvapich2/branches/1.0) From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 08:57:12 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 08:57:27 2008 Subject: [mvapich-commit] r2675 - mvapich2/branches/1.0 Message-ID: <200806101257.m5ACvC9P013233@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 08:57:12 -0400 (Tue, 10 Jun 2008) New Revision: 2675 Modified: mvapich2/branches/1.0/CHANGELOG Log: Update release date in changelog Modified: mvapich2/branches/1.0/CHANGELOG =================================================================== --- mvapich2/branches/1.0/CHANGELOG 2008-06-10 12:53:05 UTC (rev 2674) +++ mvapich2/branches/1.0/CHANGELOG 2008-06-10 12:57:12 UTC (rev 2675) @@ -4,7 +4,7 @@ package. The logs are arranged in the "most recent first" order. 
-MVAPICH2-1.0.3 (05/05/08) +MVAPICH2-1.0.3 (06/10/08) * Post buffers before accepting QP connections for iWARP mode From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 09:11:01 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 09:11:34 2008 Subject: [mvapich-commit] r2676 - mvapich2/branches/1.0 Message-ID: <200806101311.m5ADB18R013291@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 09:11:00 -0400 (Tue, 10 Jun 2008) New Revision: 2676 Removed: mvapich2/branches/1.0/osu_benchmarks/ Log: Remove stale osu_benchmarks to replace with external reference to OMB-3.1 branch From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 09:13:58 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 09:14:13 2008 Subject: [mvapich-commit] r2677 - mvapich2/branches/1.0 Message-ID: <200806101313.m5ADDw0Y013309@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 09:13:58 -0400 (Tue, 10 Jun 2008) New Revision: 2677 Modified: mvapich2/branches/1.0/ Log: Make osu_benchmarks refer to OMB-3.1 branch Property changes on: mvapich2/branches/1.0 ___________________________________________________________________ Name: svn:externals + osu_benchmarks https://mvapich.cse.ohio-state.edu/svn/mpi-benchmarks/branches/OMB-3.1 From narravul at mvapich.cse.ohio-state.edu Tue Jun 10 10:52:04 2008 From: narravul at mvapich.cse.ohio-state.edu (narravul@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 10:52:56 2008 Subject: [mvapich-commit] r2679 - mvapich2/branches/1.0 Message-ID: <200806101452.m5AEq4Tm013490@mvapich.cse.ohio-state.edu> Author: narravul Date: 2008-06-10 10:52:03 -0400 (Tue, 10 Jun 2008) New Revision: 2679 Modified: mvapich2/branches/1.0/CHANGELOG Log: Updating the CHANGELOG for the 1.0.3 release. 
Modified: mvapich2/branches/1.0/CHANGELOG =================================================================== --- mvapich2/branches/1.0/CHANGELOG 2008-06-10 14:40:28 UTC (rev 2678) +++ mvapich2/branches/1.0/CHANGELOG 2008-06-10 14:52:03 UTC (rev 2679) @@ -6,8 +6,14 @@ MVAPICH2-1.0.3 (06/10/08) -* Post buffers before accepting QP connections for iWARP mode +* Add additional synchronization before pthread_cancel() call in + finalization to avoid killing the thread that is supposed to + acknowledge outstanding IB events. Thanks to David Kewley from Dell + for reporting this issue. +* Post buffers before accepting QP connections for iWARP mode. Thanks to + Steve Wise for reporting this issue. + * Fix a startup performance issue when on-demand connection setup is not used. From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 16:32:51 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 16:33:09 2008 Subject: [mvapich-commit] r2681 - mvapich/trunk Message-ID: <200806102032.m5AKWpYS014363@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 16:32:50 -0400 (Tue, 10 Jun 2008) New Revision: 2681 Removed: mvapich/trunk/osu_benchmarks/ Log: Remove copy of osu_benchmarks First step of replacing osu_benchmarks with external reference to OMB repository. 
From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 16:37:31 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 16:37:48 2008 Subject: [mvapich-commit] r2682 - mvapich/trunk Message-ID: <200806102037.m5AKbVIs014387@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 16:37:31 -0400 (Tue, 10 Jun 2008) New Revision: 2682 Modified: mvapich/trunk/ Log: Create external reference to OMB repository Property changes on: mvapich/trunk ___________________________________________________________________ Name: svn:externals + osu_benchmarks https://mvapich.cse.ohio-state.edu/svn/mpi-benchmarks/branches/OMB-3.1 From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 16:50:56 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 16:52:56 2008 Subject: [mvapich-commit] r2683 - mvapich/branches/1.0 Message-ID: <200806102050.m5AKouZ8014725@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 16:50:54 -0400 (Tue, 10 Jun 2008) New Revision: 2683 Removed: mvapich/branches/1.0/osu_benchmarks/ Modified: mvapich/branches/1.0/ Log: Replace copy of osu_benchmarks with reference to OMB repository Property changes on: mvapich/branches/1.0 ___________________________________________________________________ Name: svn:externals + osu_benchmarks https://mvapich.cse.ohio-state.edu/svn/mpi-benchmarks/branches/OMB-3.1 From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 17:09:13 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 17:09:27 2008 Subject: [mvapich-commit] r2685 - mvapich2/tags Message-ID: <200806102109.m5AL9DkH014809@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 17:09:12 -0400 (Tue, 10 Jun 2008) New Revision: 2685 Removed: mvapich2/tags/1.0.3/ Log: Remove incorrect tag (missing CHANGELOG updates) From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 17:09:51 2008 From: perkinjo at 
mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 17:10:06 2008 Subject: [mvapich-commit] r2686 - mvapich2/tags Message-ID: <200806102109.m5AL9pgI014827@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 17:09:51 -0400 (Tue, 10 Jun 2008) New Revision: 2686 Added: mvapich2/tags/1.0.3/ Log: Create mvapich2 1.0.3 release tag Copied: mvapich2/tags/1.0.3 (from rev 2685, mvapich2/branches/1.0) From perkinjo at mvapich.cse.ohio-state.edu Tue Jun 10 17:33:32 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 17:33:48 2008 Subject: [mvapich-commit] r2687 - mvapich2/trunk Message-ID: <200806102133.m5ALXWPD014977@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-06-10 17:33:31 -0400 (Tue, 10 Jun 2008) New Revision: 2687 Removed: mvapich2/trunk/osu_benchmarks/ Modified: mvapich2/trunk/ Log: Replace copy of osu_benchmarks with reference to OMB repository Property changes on: mvapich2/trunk ___________________________________________________________________ Name: svn:externals + osu_benchmarks https://mvapich.cse.ohio-state.edu/svn/mpi-benchmarks/branches/OMB-3.1 From curtisbr at mvapich.cse.ohio-state.edu Tue Jun 10 19:26:50 2008 From: curtisbr at mvapich.cse.ohio-state.edu (curtisbr@mvapich.cse.ohio-state.edu) Date: Tue Jun 10 19:27:06 2008 Subject: [mvapich-commit] r2689 - mvapich2/branches/1.0 Message-ID: <200806102326.m5ANQoID015217@mvapich.cse.ohio-state.edu> Author: curtisbr Date: 2008-06-10 19:26:50 -0400 (Tue, 10 Jun 2008) New Revision: 2689 Modified: mvapich2/branches/1.0/CHANGELOG Log: Change description correction (MPD mpiexec timeout). 
Modified: mvapich2/branches/1.0/CHANGELOG =================================================================== --- mvapich2/branches/1.0/CHANGELOG 2008-06-10 23:02:41 UTC (rev 2688) +++ mvapich2/branches/1.0/CHANGELOG 2008-06-10 23:26:50 UTC (rev 2689) @@ -17,7 +17,7 @@ * Fix a startup performance issue when on-demand connection setup is not used. -* Configurable MPD mpiexec based on the number of processes. +* Configurable MPD mpiexec timeout based on the number of processes. MVAPICH2-1.0.2 (02/20/08) From noronha at mvapich.cse.ohio-state.edu Wed Jun 11 10:56:42 2008 From: noronha at mvapich.cse.ohio-state.edu (noronha@mvapich.cse.ohio-state.edu) Date: Wed Jun 11 10:56:59 2008 Subject: [mvapich-commit] r2690 - in mvapich/trunk: . romio/adio/common Message-ID: <200806111456.m5BEugWr017420@mvapich.cse.ohio-state.edu> Author: noronha Date: 2008-06-11 10:56:41 -0400 (Wed, 11 Jun 2008) New Revision: 2690 Modified: mvapich/trunk/CHANGELOG mvapich/trunk/romio/adio/common/ad_aggregate.c Log: Apply aggregate adio patch to 1.0 trunk. Modified: mvapich/trunk/CHANGELOG =================================================================== --- mvapich/trunk/CHANGELOG 2008-06-10 23:26:50 UTC (rev 2689) +++ mvapich/trunk/CHANGELOG 2008-06-11 14:56:41 UTC (rev 2690) @@ -96,6 +96,9 @@ 02/01/2008 * Add Lustre ADIO driver. This is a contribution from Future Technologies Group, Oak Ridge National Laboratory. +06/10/2008 +* Add aggregate patch, v2 to the Lustre ADIO driver. + 01/25/2008 * Dynamic SRQ Fill-Resize: resize SRQ on limit event. Should decrease the need for tuning based on application and size. 
Modified: mvapich/trunk/romio/adio/common/ad_aggregate.c =================================================================== --- mvapich/trunk/romio/adio/common/ad_aggregate.c 2008-06-10 23:26:50 UTC (rev 2689) +++ mvapich/trunk/romio/adio/common/ad_aggregate.c 2008-06-11 14:56:41 UTC (rev 2690) @@ -93,13 +93,8 @@ #endif /* get an index into our array of aggregators */ - if (fd->file_system == ADIO_LUSTRE) - rank_index = (int) ((off - ALIGNDOWN(min_off, fd_size) + fd_size)/ - fd_size - 1); - else - rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1); + rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1); - /* remember here that even in Rajeev's original code it was the case that * different aggregators could end up with different amounts of data to * aggregate. here we use fd_end[] to make sure that we know how much @@ -154,10 +149,15 @@ /* partition the total file access range equally among nprocs_for_coll processes */ - fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - - 1)/nprocs_for_coll; + if (alignment) { + min_st_offset = ALIGNDOWN(min_st_offset, alignment); + fd_size = ((max_end_offset - min_st_offset + 1) + + nprocs_for_coll - 1)/nprocs_for_coll; fd_size = (fd_size + alignment -1 ) / alignment * alignment; + } else { + fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - + 1)/nprocs_for_coll; } /* ceiling division as in HPF block distribution */ @@ -171,10 +171,7 @@ fd_end = *fd_end_ptr; fd_start[0] = min_st_offset; - if (alignment) - fd_end[0] = ALIGNDOWN(min_st_offset, fd_size) + fd_size - 1; - else - fd_end[0] = min_st_offset + fd_size - 1; + fd_end[0] = min_st_offset + fd_size - 1; for (i=1; i<nprocs_for_coll; i++) { [archive extraction dropped the text from this point through the Message-ID header of the next commit mail, r2692] Author: noronha Date: 2008-06-11 11:20:44 -0400 (Wed, 11 Jun 2008) New Revision: 2692 Modified: mvapich/branches/1.0/CHANGELOG mvapich/branches/1.0/romio/adio/common/ad_aggregate.c Log: Aggregate adio patch, v2. 
Modified: mvapich/branches/1.0/CHANGELOG =================================================================== --- mvapich/branches/1.0/CHANGELOG 2008-06-11 15:04:04 UTC (rev 2691) +++ mvapich/branches/1.0/CHANGELOG 2008-06-11 15:20:44 UTC (rev 2692) @@ -96,6 +96,9 @@ 02/01/2008 * Add Lustre ADIO driver. This is a contribution from Future Technologies Group, Oak Ridge National Laboratory. +06/10/2008 +* Add aggregate patch, v2 to the Lustre ADIO driver. + 01/25/2008 * Dynamic SRQ Fill-Resize: resize SRQ on limit event. Should decrease the need for tuning based on application and size. Modified: mvapich/branches/1.0/romio/adio/common/ad_aggregate.c =================================================================== --- mvapich/branches/1.0/romio/adio/common/ad_aggregate.c 2008-06-11 15:04:04 UTC (rev 2691) +++ mvapich/branches/1.0/romio/adio/common/ad_aggregate.c 2008-06-11 15:20:44 UTC (rev 2692) @@ -93,13 +93,8 @@ #endif /* get an index into our array of aggregators */ - if (fd->file_system == ADIO_LUSTRE) - rank_index = (int) ((off - ALIGNDOWN(min_off, fd_size) + fd_size)/ - fd_size - 1); - else - rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1); + rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1); - /* remember here that even in Rajeev's original code it was the case that * different aggregators could end up with different amounts of data to * aggregate. 
here we use fd_end[] to make sure that we know how much @@ -154,10 +149,15 @@ /* partition the total file access range equally among nprocs_for_coll processes */ - fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - - 1)/nprocs_for_coll; + if (alignment) { + min_st_offset = ALIGNDOWN(min_st_offset, alignment); + fd_size = ((max_end_offset - min_st_offset + 1) + + nprocs_for_coll - 1)/nprocs_for_coll; fd_size = (fd_size + alignment -1 ) / alignment * alignment; + } else { + fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - + 1)/nprocs_for_coll; } /* ceiling division as in HPF block distribution */ @@ -171,10 +171,7 @@ fd_end = *fd_end_ptr; fd_start[0] = min_st_offset; - if (alignment) - fd_end[0] = ALIGNDOWN(min_st_offset, fd_size) + fd_size - 1; - else - fd_end[0] = min_st_offset + fd_size - 1; + fd_end[0] = min_st_offset + fd_size - 1; for (i=1; i<nprocs_for_coll; i++) { [archive extraction dropped the text from this point through the Message-ID header of the next commit mail, r2697] Author: sridharj Date: 2008-06-11 15:06:48 -0400 (Wed, 11 Jun 2008) New Revision: 2697 Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c Log: Clean exit for mpirun_rsh when executable is not specified, for instance - mpirun_rsh -np 2 h1 h2 FOO=BAR Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-11 19:05:36 UTC (rev 2696) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) @@ -1739,7 +1739,7 @@ if(!mpispawn_param_env) goto allocation_error; } - while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1762,6 +1762,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + 
i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; Modified: mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-11 19:05:36 UTC (rev 2696) +++ mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) @@ -1739,7 +1739,7 @@ if(!mpispawn_param_env) goto allocation_error; } - while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1762,6 +1762,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; Modified: mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c 2008-06-11 19:05:36 UTC (rev 2696) +++ mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) @@ -1739,7 +1739,7 @@ if(!mpispawn_param_env) goto allocation_error; } - while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1762,6 +1762,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; Modified: mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c 2008-06-11 19:05:36 UTC (rev 2696) +++ mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) @@ -1741,7 +1741,7 @@ if(!mpispawn_param_env) goto allocation_error; } - 
while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1764,6 +1764,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; From sridharj at mvapich.cse.ohio-state.edu Wed Jun 11 15:07:38 2008 From: sridharj at mvapich.cse.ohio-state.edu (sridharj@mvapich.cse.ohio-state.edu) Date: Wed Jun 11 15:07:53 2008 Subject: [mvapich-commit] r2698 - in mvapich/branches/1.0/mpid: ch_gen2/process ch_gen2_ud/process ch_psm/process ch_smp/process Message-ID: <200806111907.m5BJ7c6i018130@mvapich.cse.ohio-state.edu> Author: sridharj Date: 2008-06-11 15:07:38 -0400 (Wed, 11 Jun 2008) New Revision: 2698 Modified: mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c Log: Clean exit for mpirun_rsh when executable is not specified, for instance - mpirun_rsh -np 2 h1 h2 FOO=BAR Modified: mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) +++ mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-11 19:07:38 UTC (rev 2698) @@ -1739,7 +1739,7 @@ if(!mpispawn_param_env) goto allocation_error; } - while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1762,6 +1762,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; 
Modified: mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) +++ mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-11 19:07:38 UTC (rev 2698) @@ -1739,7 +1739,7 @@ if(!mpispawn_param_env) goto allocation_error; } - while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1762,6 +1762,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; Modified: mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) +++ mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c 2008-06-11 19:07:38 UTC (rev 2698) @@ -1739,7 +1739,7 @@ if(!mpispawn_param_env) goto allocation_error; } - while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1762,6 +1762,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; Modified: mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c 2008-06-11 19:06:48 UTC (rev 2697) +++ mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c 2008-06-11 19:07:38 UTC (rev 2698) @@ -1741,7 +1741,7 @@ if(!mpispawn_param_env) goto allocation_error; } - 
while(strchr(argv[aout_index], '=')) { + while(aout_index != argc && strchr(argv[aout_index], '=')) { name = strdup(argv[aout_index++]); value = strchr(name, '='); value[0] = '\0'; @@ -1764,6 +1764,12 @@ } } + if (aout_index == argc) { + fprintf(stderr, "Incorrect number of arguments.\n"); + usage(); + exit (EXIT_FAILURE); + } + i = argc - aout_index; if(debug_on) i++; if(use_totalview) i++; From noronha at mvapich.cse.ohio-state.edu Wed Jun 11 19:42:58 2008 From: noronha at mvapich.cse.ohio-state.edu (noronha@mvapich.cse.ohio-state.edu) Date: Thu Jun 12 00:57:39 2008 Subject: [mvapich-commit] r2699 - mvapich/trunk/romio/adio/common Message-ID: <200806112342.m5BNgwWH018576@mvapich.cse.ohio-state.edu> Author: noronha Date: 2008-06-11 19:42:57 -0400 (Wed, 11 Jun 2008) New Revision: 2699 Modified: mvapich/trunk/romio/adio/common/ad_aggregate.c Log: Use ADIO_Offset instead of int. Modified: mvapich/trunk/romio/adio/common/ad_aggregate.c =================================================================== --- mvapich/trunk/romio/adio/common/ad_aggregate.c 2008-06-11 19:07:38 UTC (rev 2698) +++ mvapich/trunk/romio/adio/common/ad_aggregate.c 2008-06-11 23:42:57 UTC (rev 2699) @@ -126,7 +126,7 @@ process may directly access only its own file domain. */ ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, fd_size; - int alignment = *fd_size_ptr; + ADIO_Offset alignment = *fd_size_ptr; int i; #ifdef AGG_DEBUG From noronha at mvapich.cse.ohio-state.edu Wed Jun 11 19:50:34 2008 From: noronha at mvapich.cse.ohio-state.edu (noronha@mvapich.cse.ohio-state.edu) Date: Thu Jun 12 01:07:40 2008 Subject: [mvapich-commit] r2700 - mvapich/branches/1.0/romio/adio/common Message-ID: <200806112350.m5BNoY0g018603@mvapich.cse.ohio-state.edu> Author: noronha Date: 2008-06-11 19:50:34 -0400 (Wed, 11 Jun 2008) New Revision: 2700 Modified: mvapich/branches/1.0/romio/adio/common/ad_aggregate.c Log: Use MPI_Offset instead of int. 
Modified: mvapich/branches/1.0/romio/adio/common/ad_aggregate.c =================================================================== --- mvapich/branches/1.0/romio/adio/common/ad_aggregate.c 2008-06-11 23:42:57 UTC (rev 2699) +++ mvapich/branches/1.0/romio/adio/common/ad_aggregate.c 2008-06-11 23:50:34 UTC (rev 2700) @@ -126,7 +126,7 @@ process may directly access only its own file domain. */ ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, fd_size; - int alignment = *fd_size_ptr; + ADIO_Offset alignment = *fd_size_ptr; int i; #ifdef AGG_DEBUG From kumarra at mvapich.cse.ohio-state.edu Tue Jun 24 23:11:13 2008 From: kumarra at mvapich.cse.ohio-state.edu (kumarra@mvapich.cse.ohio-state.edu) Date: Tue Jun 24 23:11:23 2008 Subject: [mvapich-commit] r2783 - in mvapich/trunk: mpid/ch_gen2 src/coll src/env Message-ID: <200806250311.m5P3BDJC002884@mvapich.cse.ohio-state.edu> Author: kumarra Date: 2008-06-24 23:11:04 -0400 (Tue, 24 Jun 2008) New Revision: 2783 Modified: mvapich/trunk/mpid/ch_gen2/shmem_coll.c mvapich/trunk/src/coll/intra_fns_new.c mvapich/trunk/src/env/initutil.c Log: shared memory bcast buffer overflow. Reported by David Kewley@Dell. 
Modified: mvapich/trunk/mpid/ch_gen2/shmem_coll.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2008-06-24 21:35:12 UTC (rev 2782) +++ mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2008-06-25 03:11:04 UTC (rev 2783) @@ -401,6 +401,7 @@ volatile char tmpchar; #endif + file_size = file_size + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA; /* add pid for unique file name */ *bcast_shmem_file = (char *) malloc(sizeof(char) * (SHMEM_COLL_HOSTNAME_LEN + 26 + PID_CHAR_LEN)); Modified: mvapich/trunk/src/coll/intra_fns_new.c =================================================================== --- mvapich/trunk/src/coll/intra_fns_new.c 2008-06-24 21:35:12 UTC (rev 2782) +++ mvapich/trunk/src/coll/intra_fns_new.c 2008-06-25 03:11:04 UTC (rev 2783) @@ -70,6 +70,7 @@ #if (defined(CH_GEN2) || defined(CH_SMP) || defined(CH_GEN2_UD) || defined(CH_PSM)) int shmem_coll_reduce_threshold = (1<<10); int shmem_coll_allreduce_threshold = (1<<15); +int shmem_coll_bcast_threshold = (1<<23); #endif int disable_shmem_bcast =0; #endif @@ -1305,7 +1306,7 @@ MPIR_Datatype_iscontig(datatype->self, &is_contig); if ((comm->comm_coll->shmem_coll_ok) && (disable_shmem_bcast == 0) && - (nbytes > shmem_bcast_large_msg) && (enable_shmem_collectives) && (is_contig) && (is_homogeneous)){ + (nbytes > shmem_bcast_large_msg) && (enable_shmem_collectives) && (is_contig) && (is_homogeneous) && (nbytes < shmem_coll_bcast_threshold)){ return intra_shmem_Bcast_Large( buffer, count, datatype, nbytes, root, comm ); } else{ @@ -1652,7 +1653,7 @@ int stride = 0, is_commutative, leader_comm_size, leader_comm_rank; int leader_root, total_size =0, shmem_comm_rank, num_bytes=0, shmem_offset=-1; int index; - int file_size = 1<<23; + int file_size = shmem_coll_bcast_threshold; /* Get my rank and switch communicators to the hidden collective */ Modified: mvapich/trunk/src/env/initutil.c =================================================================== --- 
mvapich/trunk/src/env/initutil.c 2008-06-24 21:35:12 UTC (rev 2782) +++ mvapich/trunk/src/env/initutil.c 2008-06-25 03:11:04 UTC (rev 2783) @@ -162,6 +162,7 @@ extern int shmem_coll_max_msg_size; extern int shmem_coll_reduce_threshold; extern int shmem_coll_allreduce_threshold; +extern int shmem_coll_bcast_threshold; #endif #endif extern int bcast_knomial_degree; @@ -278,6 +279,9 @@ if ((value = getenv("VIADEV_SHMEM_COLL_ALLREDUCE_THRESHOLD")) != NULL){ shmem_coll_allreduce_threshold = atoi(value); } + if ((value = getenv("VIADEV_SHMEM_COLL_BCAST_THRESHOLD")) != NULL){ + shmem_coll_bcast_threshold = atoi(value); + } if ((shmem_coll_max_msg_size < shmem_coll_reduce_threshold) || (shmem_coll_max_msg_size < shmem_coll_allreduce_threshold)){ printf("Shmem_coll_max_msg_size should be greater than the thresholds\n"); From sridharj at mvapich.cse.ohio-state.edu Wed Jun 25 11:18:29 2008 From: sridharj at mvapich.cse.ohio-state.edu (sridharj@mvapich.cse.ohio-state.edu) Date: Wed Jun 25 11:18:34 2008 Subject: [mvapich-commit] r2787 - in mvapich/branches/1.0/mpid: ch_gen2/process ch_gen2_ud/process ch_psm/process ch_smp/process Message-ID: <200806251518.m5PFIT6Q004773@mvapich.cse.ohio-state.edu> Author: sridharj Date: 2008-06-25 11:18:28 -0400 (Wed, 25 Jun 2008) New Revision: 2787 Modified: mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c Log: Fix for connect() bug on Solaris.
Modified: mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-25 15:18:01 UTC (rev 2786) +++ mvapich/branches/1.0/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) @@ -2171,7 +2171,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { perror("connect"); cleanup(); } Modified: mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-25 15:18:01 UTC (rev 2786) +++ mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) @@ -2171,7 +2171,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { perror("connect"); cleanup(); } Modified: mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c 2008-06-25 15:18:01 UTC (rev 2786) +++ mvapich/branches/1.0/mpid/ch_psm/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) @@ -2171,7 +2171,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { perror("connect"); cleanup(); } Modified: mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c =================================================================== --- mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c 2008-06-25 15:18:01 UTC (rev 2786) +++ mvapich/branches/1.0/mpid/ch_smp/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) @@ -2173,7 +2173,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { 
perror("connect"); cleanup(); } From sridharj at mvapich.cse.ohio-state.edu Wed Jun 25 11:19:50 2008 From: sridharj at mvapich.cse.ohio-state.edu (sridharj@mvapich.cse.ohio-state.edu) Date: Wed Jun 25 11:19:56 2008 Subject: [mvapich-commit] r2788 - in mvapich/trunk/mpid: ch_gen2/process ch_gen2_ud/process ch_psm/process ch_smp/process Message-ID: <200806251519.m5PFJoHA004791@mvapich.cse.ohio-state.edu> Author: sridharj Date: 2008-06-25 11:19:49 -0400 (Wed, 25 Jun 2008) New Revision: 2788 Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c Log: Bug with connect () on solaris Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-06-25 15:19:49 UTC (rev 2788) @@ -2171,7 +2171,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { perror("connect"); cleanup(); } Modified: mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) +++ mvapich/trunk/mpid/ch_gen2_ud/process/mpirun_rsh.c 2008-06-25 15:19:49 UTC (rev 2788) @@ -2171,7 +2171,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { perror("connect"); cleanup(); } Modified: mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) +++ mvapich/trunk/mpid/ch_psm/process/mpirun_rsh.c 2008-06-25 15:19:49 UTC 
(rev 2788) @@ -2171,7 +2171,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { perror("connect"); cleanup(); } Modified: mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c 2008-06-25 15:18:28 UTC (rev 2787) +++ mvapich/trunk/mpid/ch_smp/process/mpirun_rsh.c 2008-06-25 15:19:49 UTC (rev 2788) @@ -2173,7 +2173,7 @@ } if (connect(sock, (struct sockaddr *) &address[0], - (socklen_t)sizeof(addr)) < 0) { + (socklen_t)sizeof(struct sockaddr)) < 0) { perror("connect"); cleanup(); } From kumarra at mvapich.cse.ohio-state.edu Wed Jun 25 15:04:23 2008 From: kumarra at mvapich.cse.ohio-state.edu (kumarra@mvapich.cse.ohio-state.edu) Date: Wed Jun 25 15:04:28 2008 Subject: [mvapich-commit] r2793 - in mvapich/branches/1.0: mpid/ch_gen2 src/coll src/env Message-ID: <200806251904.m5PJ4N1l005247@mvapich.cse.ohio-state.edu> Author: kumarra Date: 2008-06-25 15:04:21 -0400 (Wed, 25 Jun 2008) New Revision: 2793 Modified: mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c mvapich/branches/1.0/src/coll/intra_fns_new.c mvapich/branches/1.0/src/env/initutil.c Log: Merging checkin 2783 to the branch Modified: mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c 2008-06-25 17:41:42 UTC (rev 2792) +++ mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c 2008-06-25 19:04:21 UTC (rev 2793) @@ -401,6 +401,7 @@ volatile char tmpchar; #endif + file_size = file_size + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA; /* add pid for unique file name */ *bcast_shmem_file = (char *) malloc(sizeof(char) * (SHMEM_COLL_HOSTNAME_LEN + 26 + PID_CHAR_LEN)); Modified: mvapich/branches/1.0/src/coll/intra_fns_new.c =================================================================== --- 
mvapich/branches/1.0/src/coll/intra_fns_new.c 2008-06-25 17:41:42 UTC (rev 2792) +++ mvapich/branches/1.0/src/coll/intra_fns_new.c 2008-06-25 19:04:21 UTC (rev 2793) @@ -70,6 +70,7 @@ #if (defined(CH_GEN2) || defined(CH_SMP) || defined(CH_GEN2_UD) || defined(CH_PSM)) int shmem_coll_reduce_threshold = (1<<10); int shmem_coll_allreduce_threshold = (1<<15); +int shmem_coll_bcast_threshold = (1<<23); #endif int disable_shmem_bcast =0; #endif @@ -1305,7 +1306,7 @@ MPIR_Datatype_iscontig(datatype->self, &is_contig); if ((comm->comm_coll->shmem_coll_ok) && (disable_shmem_bcast == 0) && - (nbytes > shmem_bcast_large_msg) && (enable_shmem_collectives) && (is_contig) && (is_homogeneous)){ + (nbytes > shmem_bcast_large_msg) && (enable_shmem_collectives) && (is_contig) && (is_homogeneous) && (nbytes < shmem_coll_bcast_threshold)){ return intra_shmem_Bcast_Large( buffer, count, datatype, nbytes, root, comm ); } else{ @@ -1652,7 +1653,7 @@ int stride = 0, is_commutative, leader_comm_size, leader_comm_rank; int leader_root, total_size =0, shmem_comm_rank, num_bytes=0, shmem_offset=-1; int index; - int file_size = 1<<23; + int file_size = shmem_coll_bcast_threshold; /* Get my rank and switch communicators to the hidden collective */ Modified: mvapich/branches/1.0/src/env/initutil.c =================================================================== --- mvapich/branches/1.0/src/env/initutil.c 2008-06-25 17:41:42 UTC (rev 2792) +++ mvapich/branches/1.0/src/env/initutil.c 2008-06-25 19:04:21 UTC (rev 2793) @@ -162,6 +162,7 @@ extern int shmem_coll_max_msg_size; extern int shmem_coll_reduce_threshold; extern int shmem_coll_allreduce_threshold; +extern int shmem_coll_bcast_threshold; #endif #endif extern int bcast_knomial_degree; @@ -278,6 +279,9 @@ if ((value = getenv("VIADEV_SHMEM_COLL_ALLREDUCE_THRESHOLD")) != NULL){ shmem_coll_allreduce_threshold = atoi(value); } + if ((value = getenv("VIADEV_SHMEM_COLL_BCAST_THRESHOLD")) != NULL){ + shmem_coll_bcast_threshold = atoi(value); + } if 
((shmem_coll_max_msg_size < shmem_coll_reduce_threshold) || (shmem_coll_max_msg_size < shmem_coll_allreduce_threshold)){ printf("Shmem_coll_max_msg_size should be greater than the thresholds\n"); From narravul at mvapich.cse.ohio-state.edu Sat Jun 28 13:20:43 2008 From: narravul at mvapich.cse.ohio-state.edu (narravul@mvapich.cse.ohio-state.edu) Date: Sat Jun 28 13:20:48 2008 Subject: [mvapich-commit] r2798 - mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200806281720.m5SHKhHA014658@mvapich.cse.ohio-state.edu> Author: narravul Date: 2008-06-28 13:20:41 -0400 (Sat, 28 Jun 2008) New Revision: 2798 Modified: mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_1sc.c Log: Fix for the bug seen with direct one-sided operations on ppc64 with -O2. Reported by Steve Wise. Modified: mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_1sc.c =================================================================== --- mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_1sc.c 2008-06-28 17:14:23 UTC (rev 2797) +++ mvapich2/branches/1.0/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_1sc.c 2008-06-28 17:20:41 UTC (rev 2798) @@ -1000,11 +1000,11 @@ Get_Pinned_Buf(win_ptr, (void*) &cc, sizeof(long long)); - *((long long *) cc) = 1; for (i=0; imrail.rails[i].hca_index; remote_addr[i] = (void *)(uintptr_t) (win_ptr->all_completion_counter[target_rank*rdma_num_rails+i]); + *((long long *) cc) = 1; local_addr[i] = (void *)cc; l_key2[i] = win_ptr->pinnedpool_1sc_dentry->memhandle[hca_index]->lkey; r_key2[i] = win_ptr->r_key2[target_rank*rdma_num_hcas + hca_index]; From sridharj at mvapich.cse.ohio-state.edu Sun Jun 29 12:33:02 2008 From: sridharj at mvapich.cse.ohio-state.edu (sridharj@mvapich.cse.ohio-state.edu) Date: Sun Jun 29 12:33:07 2008 Subject: [mvapich-commit] r2800 - in mvapich/trunk/mpid: ch_gen2/process ch_gen2_ud/process ch_psm/process ch_smp/process Message-ID: 
<200806291633.m5TGX2AK002507@mvapich.cse.ohio-state.edu> Author: sridharj Date: 2008-06-29 12:33:02 -0400 (Sun, 29 Jun 2008) New Revision: 2800 Modified: mvapich/trunk/mpid/ch_gen2/process/mpispawn_tree.c mvapich/trunk/mpid/ch_gen2_ud/process/mpispawn_tree.c mvapich/trunk/mpid/ch_psm/process/mpispawn_tree.c mvapich/trunk/mpid/ch_smp/process/mpispawn_tree.c Log: mpispawn connect () issue on solaris Modified: mvapich/trunk/mpid/ch_gen2/process/mpispawn_tree.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpispawn_tree.c 2008-06-29 16:31:43 UTC (rev 2799) +++ mvapich/trunk/mpid/ch_gen2/process/mpispawn_tree.c 2008-06-29 16:33:02 UTC (rev 2800) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } Modified: mvapich/trunk/mpid/ch_gen2_ud/process/mpispawn_tree.c =================================================================== --- mvapich/trunk/mpid/ch_gen2_ud/process/mpispawn_tree.c 2008-06-29 16:31:43 UTC (rev 2799) +++ mvapich/trunk/mpid/ch_gen2_ud/process/mpispawn_tree.c 2008-06-29 16:33:02 UTC (rev 2800) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } Modified: 
mvapich/trunk/mpid/ch_psm/process/mpispawn_tree.c =================================================================== --- mvapich/trunk/mpid/ch_psm/process/mpispawn_tree.c 2008-06-29 16:31:43 UTC (rev 2799) +++ mvapich/trunk/mpid/ch_psm/process/mpispawn_tree.c 2008-06-29 16:33:02 UTC (rev 2800) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } Modified: mvapich/trunk/mpid/ch_smp/process/mpispawn_tree.c =================================================================== --- mvapich/trunk/mpid/ch_smp/process/mpispawn_tree.c 2008-06-29 16:31:43 UTC (rev 2799) +++ mvapich/trunk/mpid/ch_smp/process/mpispawn_tree.c 2008-06-29 16:33:02 UTC (rev 2800) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } From sridharj at mvapich.cse.ohio-state.edu Sun Jun 29 12:34:17 2008 From: sridharj at mvapich.cse.ohio-state.edu (sridharj@mvapich.cse.ohio-state.edu) Date: Sun Jun 29 12:34:23 2008 Subject: [mvapich-commit] r2802 - in mvapich/branches/1.0/mpid: ch_gen2/process ch_gen2_ud/process ch_psm/process ch_smp/process Message-ID: <200806291634.m5TGYHpN002544@mvapich.cse.ohio-state.edu> Author: sridharj Date: 2008-06-29 12:34:17 -0400 (Sun, 
29 Jun 2008) New Revision: 2802 Modified: mvapich/branches/1.0/mpid/ch_gen2/process/mpispawn_tree.c mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpispawn_tree.c mvapich/branches/1.0/mpid/ch_psm/process/mpispawn_tree.c mvapich/branches/1.0/mpid/ch_smp/process/mpispawn_tree.c Log: mpispawn connect () issue on solaris Modified: mvapich/branches/1.0/mpid/ch_gen2/process/mpispawn_tree.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2/process/mpispawn_tree.c 2008-06-29 16:33:38 UTC (rev 2801) +++ mvapich/branches/1.0/mpid/ch_gen2/process/mpispawn_tree.c 2008-06-29 16:34:17 UTC (rev 2802) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } Modified: mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpispawn_tree.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpispawn_tree.c 2008-06-29 16:33:38 UTC (rev 2801) +++ mvapich/branches/1.0/mpid/ch_gen2_ud/process/mpispawn_tree.c 2008-06-29 16:34:17 UTC (rev 2802) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } Modified: 
mvapich/branches/1.0/mpid/ch_psm/process/mpispawn_tree.c =================================================================== --- mvapich/branches/1.0/mpid/ch_psm/process/mpispawn_tree.c 2008-06-29 16:33:38 UTC (rev 2801) +++ mvapich/branches/1.0/mpid/ch_psm/process/mpispawn_tree.c 2008-06-29 16:34:17 UTC (rev 2802) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } Modified: mvapich/branches/1.0/mpid/ch_smp/process/mpispawn_tree.c =================================================================== --- mvapich/branches/1.0/mpid/ch_smp/process/mpispawn_tree.c 2008-06-29 16:33:38 UTC (rev 2801) +++ mvapich/branches/1.0/mpid/ch_smp/process/mpispawn_tree.c 2008-06-29 16:34:17 UTC (rev 2802) @@ -88,7 +88,7 @@ } if(connect(c_socket[i], (struct sockaddr *)&node_addr[children[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { while(i) close(c_socket[--i]); return CONN_LIB_FAILURE; } @@ -204,7 +204,7 @@ int c_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if(connect(c_socket, (struct sockaddr *)&node_addr[child[i]], - sizeof(struct sockaddr_storage)) < 0) { + sizeof(struct sockaddr)) < 0) { perror("mpispawn_tree_init"); return -1; } From kumarra at mvapich.cse.ohio-state.edu Mon Jun 30 13:28:56 2008 From: kumarra at mvapich.cse.ohio-state.edu (kumarra@mvapich.cse.ohio-state.edu) Date: Mon Jun 30 13:29:04 2008 Subject: [mvapich-commit] r2805 - in mvapich/trunk: mpid/ch_gen2 src/coll Message-ID: <200806301728.m5UHSuGF005877@mvapich.cse.ohio-state.edu> Author: kumarra Date: 2008-06-30 13:28:54 -0400 (Mon, 30 Jun 2008) New 
Revision: 2805 Modified: mvapich/trunk/mpid/ch_gen2/coll_shmem.h mvapich/trunk/mpid/ch_gen2/shmem_coll.c mvapich/trunk/src/coll/intra_fns_new.c Log: Do not try to use shmem broadcast if shmem_bcast shared memory initialization fails Modified: mvapich/trunk/mpid/ch_gen2/coll_shmem.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/coll_shmem.h 2008-06-30 15:33:16 UTC (rev 2804) +++ mvapich/trunk/mpid/ch_gen2/coll_shmem.h 2008-06-30 17:28:54 UTC (rev 2805) @@ -120,5 +120,6 @@ }; #define SHMEM_BCAST_FLAGS 1024 +#define SHMEM_BCAST_LEADERS 1024 #define SHMEM_BCAST_METADATA (sizeof(aint_t) + 2*sizeof(int)) /* METADATA: buffer address, offset, num_bytes */ #endif Modified: mvapich/trunk/mpid/ch_gen2/shmem_coll.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2008-06-30 15:33:16 UTC (rev 2804) +++ mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2008-06-30 17:28:54 UTC (rev 2805) @@ -282,7 +282,7 @@ void MPID_SHMEM_COLL_GetShmemBcastBuf(void** output_buf, void* buffer){ char* shmem_coll_buf = (char*)(buffer); - *output_buf = (char*)shmem_coll_buf + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA; + *output_buf = (char*)shmem_coll_buf + 3*SHMEM_BCAST_FLAGS + SHMEM_BCAST_LEADERS*SHMEM_BCAST_METADATA; } void signal_local_processes(int step, int index, char* send_buf, int offset, int bytes, void* mmap_ptr){ @@ -322,7 +322,7 @@ buffer = (aint_t*)tmp; buffer = (int*)(tmp + sizeof(aint_t)); *offset = *((int*)buffer); - *output_buf = (char*)(mmap_ptr) + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA + *offset; + *output_buf = (char*)(mmap_ptr) + 3*SHMEM_BCAST_FLAGS + SHMEM_BCAST_LEADERS*SHMEM_BCAST_METADATA + *offset; buffer = (int*)(tmp + sizeof(aint_t) + sizeof(int)); *bytes = *((int*)buffer); @@ -401,7 +401,7 @@ volatile char tmpchar; #endif - file_size = file_size + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA; + file_size = file_size + 3*SHMEM_BCAST_FLAGS + 
SHMEM_BCAST_LEADERS*SHMEM_BCAST_METADATA; /* add pid for unique file name */ *bcast_shmem_file = (char *) malloc(sizeof(char) * (SHMEM_COLL_HOSTNAME_LEN + 26 + PID_CHAR_LEN)); @@ -418,7 +418,7 @@ perror("open"); fprintf(stderr, "[%d] shmem_coll_init:error in opening " "shared memory file <%s>: %d\n", viadev.global_id, *bcast_shmem_file, errno); - return -1; + return 0; } @@ -430,7 +430,7 @@ unlink(*bcast_shmem_file); fprintf(stderr, "[%d] shmem_coll_init:error in ftruncate to zero " "shared memory file: %d\n", my_rank, errno); - return -1; + return 0; } /* set file size, without touching pages */ @@ -439,8 +439,8 @@ unlink(*bcast_shmem_file); fprintf(stderr, "[%d] shmem_coll_init:error in ftruncate to size " "shared memory file: %d\n", my_rank, errno); - return -1; - } + return 0; + } /* Ignoring optimal memory allocation for now */ #ifndef _X86_64_ @@ -448,9 +448,9 @@ char *buf; buf = (char *) calloc(*bcast_seg_size + 1, sizeof(char)); if (write(*fd, buf, *bcast_seg_size) != *bcast_seg_size) { - printf("[%d] shmem_coll_init:error in writing " "shared memory file: %d\n", my_rank, errno); + fprintf(stderr, "[%d] shmem_coll_init:error in writing " "shared memory file: %d\n", my_rank, errno); free(buf); - return -1; + return 0; } free(buf); } @@ -462,12 +462,12 @@ fprintf(stderr, "[%d] shmem_coll_init:error in lseek " "on shared memory file: %d\n", my_rank, errno); - return -1; + return 0; } } - return MPI_SUCCESS; + return 1; } int MPID_SHMEM_BCAST_mmap(void** mmap_ptr, int bcast_seg_size, int fd, int my_local_rank, char* bcast_shmem_file) Modified: mvapich/trunk/src/coll/intra_fns_new.c =================================================================== --- mvapich/trunk/src/coll/intra_fns_new.c 2008-06-30 15:33:16 UTC (rev 2804) +++ mvapich/trunk/src/coll/intra_fns_new.c 2008-06-30 17:28:54 UTC (rev 2805) @@ -1110,7 +1110,7 @@ if ((nbytes < coll_table[BCAST_IDX][lgn]) || (coll_table[BCAST_IDX][lgn] == -1)) { - mpi_errno = knomial_2level_Bcast(buffer, count, 
datatype, root, + mpi_errno = knomial_2level_Bcast(buffer, count, datatype, root, comm); } else{ @@ -1296,6 +1296,7 @@ int pof2 = 1; int is_contig; int is_homogeneous = 1; + int mpi_errno; #ifdef MPID_HAS_HETERO is_homogeneous = (comm->msgform == MPID_MSG_OK) ? 1 : 0; #endif @@ -1307,7 +1308,9 @@ if ((comm->comm_coll->shmem_coll_ok) && (disable_shmem_bcast == 0) && (nbytes > shmem_bcast_large_msg) && (enable_shmem_collectives) && (is_contig) && (is_homogeneous) && (nbytes < shmem_coll_bcast_threshold)){ - return intra_shmem_Bcast_Large( buffer, count, datatype, nbytes, root, comm ); + mpi_errno = intra_shmem_Bcast_Large( buffer, count, datatype, nbytes, root, comm ); + if (mpi_errno == -1) return scatterGatherBcast(buffer, count, datatype, nbytes, root, comm); + else return mpi_errno; } else{ return scatterGatherBcast(buffer, count, datatype, nbytes, root, comm); @@ -1654,6 +1657,7 @@ int leader_root, total_size =0, shmem_comm_rank, num_bytes=0, shmem_offset=-1; int index; int file_size = shmem_coll_bcast_threshold; + int ret_val = 0, flag = 0; /* Get my rank and switch communicators to the hidden collective */ @@ -1681,8 +1685,10 @@ /* Initialize the bcast segment for the first time */ if (comm_ptr->bcast_mmap_ptr == NULL){ - MPID_SHMEM_BCAST_init(file_size, shmem_comm_rank, local_rank, &(comm_ptr->bcast_seg_size), + ret_val = MPID_SHMEM_BCAST_init(file_size, shmem_comm_rank, local_rank, &(comm_ptr->bcast_seg_size), &(comm_ptr->bcast_shmem_file), &(comm_ptr->bcast_fd)); + MPI_Allreduce(&ret_val, &flag, 1, MPI_INT, MPI_LAND, comm->self); + if (flag == 0) return -1; MPI_Barrier(shmem_comm); MPID_SHMEM_BCAST_mmap(&(comm_ptr->bcast_mmap_ptr), comm_ptr->bcast_seg_size, comm_ptr->bcast_fd, local_rank,comm_ptr->bcast_shmem_file); From kumarra at mvapich.cse.ohio-state.edu Mon Jun 30 13:56:52 2008 From: kumarra at mvapich.cse.ohio-state.edu (kumarra@mvapich.cse.ohio-state.edu) Date: Mon Jun 30 13:56:59 2008 Subject: [mvapich-commit] r2806 - in mvapich/branches/1.0: 
mpid/ch_gen2 src/coll Message-ID: <200806301756.m5UHuqjK005933@mvapich.cse.ohio-state.edu> Author: kumarra Date: 2008-06-30 13:56:51 -0400 (Mon, 30 Jun 2008) New Revision: 2806 Modified: mvapich/branches/1.0/mpid/ch_gen2/coll_shmem.h mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c mvapich/branches/1.0/src/coll/intra_fns_new.c Log: Do not try to use shmem broadcast if shmem_bcast shared memory initialization fails Modified: mvapich/branches/1.0/mpid/ch_gen2/coll_shmem.h =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2/coll_shmem.h 2008-06-30 17:28:54 UTC (rev 2805) +++ mvapich/branches/1.0/mpid/ch_gen2/coll_shmem.h 2008-06-30 17:56:51 UTC (rev 2806) @@ -120,5 +120,6 @@ }; #define SHMEM_BCAST_FLAGS 1024 +#define SHMEM_BCAST_LEADERS 1024 #define SHMEM_BCAST_METADATA (sizeof(aint_t) + 2*sizeof(int)) /* METADATA: buffer address, offset, num_bytes */ #endif Modified: mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c =================================================================== --- mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c 2008-06-30 17:28:54 UTC (rev 2805) +++ mvapich/branches/1.0/mpid/ch_gen2/shmem_coll.c 2008-06-30 17:56:51 UTC (rev 2806) @@ -282,7 +282,7 @@ void MPID_SHMEM_COLL_GetShmemBcastBuf(void** output_buf, void* buffer){ char* shmem_coll_buf = (char*)(buffer); - *output_buf = (char*)shmem_coll_buf + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA; + *output_buf = (char*)shmem_coll_buf + 3*SHMEM_BCAST_FLAGS + SHMEM_BCAST_LEADERS*SHMEM_BCAST_METADATA; } void signal_local_processes(int step, int index, char* send_buf, int offset, int bytes, void* mmap_ptr){ @@ -322,7 +322,7 @@ buffer = (aint_t*)tmp; buffer = (int*)(tmp + sizeof(aint_t)); *offset = *((int*)buffer); - *output_buf = (char*)(mmap_ptr) + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA + *offset; + *output_buf = (char*)(mmap_ptr) + 3*SHMEM_BCAST_FLAGS + SHMEM_BCAST_LEADERS*SHMEM_BCAST_METADATA + *offset; buffer = (int*)(tmp + sizeof(aint_t) 
+ sizeof(int)); *bytes = *((int*)buffer); @@ -401,7 +401,7 @@ volatile char tmpchar; #endif - file_size = file_size + 3*SHMEM_BCAST_FLAGS + 1024*SHMEM_BCAST_METADATA; + file_size = file_size + 3*SHMEM_BCAST_FLAGS + SHMEM_BCAST_LEADERS*SHMEM_BCAST_METADATA; /* add pid for unique file name */ *bcast_shmem_file = (char *) malloc(sizeof(char) * (SHMEM_COLL_HOSTNAME_LEN + 26 + PID_CHAR_LEN)); @@ -418,7 +418,7 @@ perror("open"); fprintf(stderr, "[%d] shmem_coll_init:error in opening " "shared memory file <%s>: %d\n", viadev.global_id, *bcast_shmem_file, errno); - return -1; + return 0; } @@ -430,7 +430,7 @@ unlink(*bcast_shmem_file); fprintf(stderr, "[%d] shmem_coll_init:error in ftruncate to zero " "shared memory file: %d\n", my_rank, errno); - return -1; + return 0; } /* set file size, without touching pages */ @@ -439,8 +439,8 @@ unlink(*bcast_shmem_file); fprintf(stderr, "[%d] shmem_coll_init:error in ftruncate to size " "shared memory file: %d\n", my_rank, errno); - return -1; - } + return 0; + } /* Ignoring optimal memory allocation for now */ #ifndef _X86_64_ @@ -448,9 +448,9 @@ char *buf; buf = (char *) calloc(*bcast_seg_size + 1, sizeof(char)); if (write(*fd, buf, *bcast_seg_size) != *bcast_seg_size) { - printf("[%d] shmem_coll_init:error in writing " "shared memory file: %d\n", my_rank, errno); + fprintf(stderr, "[%d] shmem_coll_init:error in writing " "shared memory file: %d\n", my_rank, errno); free(buf); - return -1; + return 0; } free(buf); } @@ -462,12 +462,12 @@ fprintf(stderr, "[%d] shmem_coll_init:error in lseek " "on shared memory file: %d\n", my_rank, errno); - return -1; + return 0; } } - return MPI_SUCCESS; + return 1; } int MPID_SHMEM_BCAST_mmap(void** mmap_ptr, int bcast_seg_size, int fd, int my_local_rank, char* bcast_shmem_file) Modified: mvapich/branches/1.0/src/coll/intra_fns_new.c =================================================================== --- mvapich/branches/1.0/src/coll/intra_fns_new.c 2008-06-30 17:28:54 UTC (rev 2805) +++ 
mvapich/branches/1.0/src/coll/intra_fns_new.c 2008-06-30 17:56:51 UTC (rev 2806) @@ -1110,7 +1110,7 @@ if ((nbytes < coll_table[BCAST_IDX][lgn]) || (coll_table[BCAST_IDX][lgn] == -1)) { - mpi_errno = knomial_2level_Bcast(buffer, count, datatype, root, + mpi_errno = knomial_2level_Bcast(buffer, count, datatype, root, comm); } else{ @@ -1296,6 +1296,7 @@ int pof2 = 1; int is_contig; int is_homogeneous = 1; + int mpi_errno; #ifdef MPID_HAS_HETERO is_homogeneous = (comm->msgform == MPID_MSG_OK) ? 1 : 0; #endif @@ -1307,7 +1308,9 @@ if ((comm->comm_coll->shmem_coll_ok) && (disable_shmem_bcast == 0) && (nbytes > shmem_bcast_large_msg) && (enable_shmem_collectives) && (is_contig) && (is_homogeneous) && (nbytes < shmem_coll_bcast_threshold)){ - return intra_shmem_Bcast_Large( buffer, count, datatype, nbytes, root, comm ); + mpi_errno = intra_shmem_Bcast_Large( buffer, count, datatype, nbytes, root, comm ); + if (mpi_errno == -1) return scatterGatherBcast(buffer, count, datatype, nbytes, root, comm); + else return mpi_errno; } else{ return scatterGatherBcast(buffer, count, datatype, nbytes, root, comm); @@ -1654,6 +1657,7 @@ int leader_root, total_size =0, shmem_comm_rank, num_bytes=0, shmem_offset=-1; int index; int file_size = shmem_coll_bcast_threshold; + int ret_val = 0, flag = 0; /* Get my rank and switch communicators to the hidden collective */ @@ -1681,8 +1685,10 @@ /* Initialize the bcast segment for the first time */ if (comm_ptr->bcast_mmap_ptr == NULL){ - MPID_SHMEM_BCAST_init(file_size, shmem_comm_rank, local_rank, &(comm_ptr->bcast_seg_size), + ret_val = MPID_SHMEM_BCAST_init(file_size, shmem_comm_rank, local_rank, &(comm_ptr->bcast_seg_size), &(comm_ptr->bcast_shmem_file), &(comm_ptr->bcast_fd)); + MPI_Allreduce(&ret_val, &flag, 1, MPI_INT, MPI_LAND, comm->self); + if (flag == 0) return -1; MPI_Barrier(shmem_comm); MPID_SHMEM_BCAST_mmap(&(comm_ptr->bcast_mmap_ptr), comm_ptr->bcast_seg_size, comm_ptr->bcast_fd, local_rank,comm_ptr->bcast_shmem_file);