From vishnu at mvapich.cse.ohio-state.edu Wed Jun 6 15:22:28 2007 From: vishnu at mvapich.cse.ohio-state.edu (vishnu@mvapich.cse.ohio-state.edu) Date: Wed Jun 6 15:22:45 2007 Subject: [mvapich-commit] r1308 - mvapich/trunk/mpid/ch_gen2_multirail Message-ID: <200706061922.l56JMSSU011860@mvapich.cse.ohio-state.edu> Author: vishnu Date: 2007-06-06 15:22:26 -0400 (Wed, 06 Jun 2007) New Revision: 1308 Modified: mvapich/trunk/mpid/ch_gen2_multirail/viacheck.c Log: Minor fixes to catch the errors early with multi-rail device during polling Modified: mvapich/trunk/mpid/ch_gen2_multirail/viacheck.c =================================================================== --- mvapich/trunk/mpid/ch_gen2_multirail/viacheck.c 2007-06-05 20:23:32 UTC (rev 1307) +++ mvapich/trunk/mpid/ch_gen2_multirail/viacheck.c 2007-06-06 19:22:26 UTC (rev 1308) @@ -162,8 +162,8 @@ if (stripe_size[i] == 0) continue; v = get_vbuf(); + assert(v != NULL); v->shandle = s; - assert(v != NULL); s->rput_flag[i] = 1; v->subchannel = i; v->desc.subchannel = i; @@ -1266,22 +1266,25 @@ #endif for(i = 0; i < num_hcas; i++) { ne = ibv_poll_cq(ibv_dev.cq_hndl[i], 1, sc); - if (ne > 0) + if (ne != 0) break; - } - if (ne > 0) { - readbar(); - if (sc->status != IBV_WC_SUCCESS) { - error_abort_all(IBV_RETURN_ERR, + } + if (ne > 0) { + readbar(); + if (sc->status != IBV_WC_SUCCESS) { + error_abort_all(IBV_RETURN_ERR, "[%s:%d] Got completion with error code %d\n", ibv_dev.my_name, ibv_dev.me, sc->status); - } + } ret = 0; break; + } else if (ne < 0) { + error_abort_all(IBV_RETURN_ERR, "Error polling CQ\n"); } + if (ret == 1) continue; /* error when comes here */ @@ -1330,9 +1333,10 @@ #endif for (i = 0; i < num_hcas; i++){ ne = ibv_poll_cq(ibv_dev.cq_hndl[i], 1, &wc); - if (ne > 0) + if (ne != 0) break; } + if(ne < 0) { error_abort_all(IBV_RETURN_ERR, "Error polling CQ\n"); } else if(ne > 1) { From vishnu at mvapich.cse.ohio-state.edu Wed Jun 6 15:23:36 2007 From: vishnu at mvapich.cse.ohio-state.edu 
(vishnu@mvapich.cse.ohio-state.edu) Date: Wed Jun 6 15:23:50 2007 Subject: [mvapich-commit] r1309 - mvapich/branches/0.9.9/mpid/ch_gen2_multirail Message-ID: <200706061923.l56JNaTr011871@mvapich.cse.ohio-state.edu> Author: vishnu Date: 2007-06-06 15:23:36 -0400 (Wed, 06 Jun 2007) New Revision: 1309 Modified: mvapich/branches/0.9.9/mpid/ch_gen2_multirail/viacheck.c Log: Minor fixes to catch the errors early with multi-rail device during polling (Thanks to ishai@Mellanox for reporting them) Modified: mvapich/branches/0.9.9/mpid/ch_gen2_multirail/viacheck.c =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2_multirail/viacheck.c 2007-06-06 19:22:26 UTC (rev 1308) +++ mvapich/branches/0.9.9/mpid/ch_gen2_multirail/viacheck.c 2007-06-06 19:23:36 UTC (rev 1309) @@ -162,8 +162,8 @@ if (stripe_size[i] == 0) continue; v = get_vbuf(); + assert(v != NULL); v->shandle = s; - assert(v != NULL); s->rput_flag[i] = 1; v->subchannel = i; v->desc.subchannel = i; @@ -1266,22 +1266,25 @@ #endif for(i = 0; i < num_hcas; i++) { ne = ibv_poll_cq(ibv_dev.cq_hndl[i], 1, sc); - if (ne > 0) + if (ne != 0) break; - } - if (ne > 0) { - readbar(); - if (sc->status != IBV_WC_SUCCESS) { - error_abort_all(IBV_RETURN_ERR, + } + if (ne > 0) { + readbar(); + if (sc->status != IBV_WC_SUCCESS) { + error_abort_all(IBV_RETURN_ERR, "[%s:%d] Got completion with error code %d\n", ibv_dev.my_name, ibv_dev.me, sc->status); - } + } ret = 0; break; + } else if (ne < 0) { + error_abort_all(IBV_RETURN_ERR, "Error polling CQ\n"); } + if (ret == 1) continue; /* error when comes here */ @@ -1330,9 +1333,10 @@ #endif for (i = 0; i < num_hcas; i++){ ne = ibv_poll_cq(ibv_dev.cq_hndl[i], 1, &wc); - if (ne > 0) + if (ne != 0) break; } + if(ne < 0) { error_abort_all(IBV_RETURN_ERR, "Error polling CQ\n"); } else if(ne > 1) { From vishnu at mvapich.cse.ohio-state.edu Wed Jun 6 15:40:37 2007 From: vishnu at mvapich.cse.ohio-state.edu (vishnu@mvapich.cse.ohio-state.edu) 
Date: Wed Jun 6 15:40:53 2007 Subject: [mvapich-commit] r1310 - mvapich/trunk Message-ID: <200706061940.l56JebEM011899@mvapich.cse.ohio-state.edu> Author: vishnu Date: 2007-06-06 15:40:35 -0400 (Wed, 06 Jun 2007) New Revision: 1310 Modified: mvapich/trunk/CHANGELOG Log: Adding entry to Changelog for patches given by Ishai @ Mellanox Modified: mvapich/trunk/CHANGELOG =================================================================== --- mvapich/trunk/CHANGELOG 2007-06-06 19:23:36 UTC (rev 1309) +++ mvapich/trunk/CHANGELOG 2007-06-06 19:40:35 UTC (rev 1310) @@ -4,6 +4,12 @@ This file briefly describes the latest changes to MVAPICH software package. The logs are arranged in the "most recent first" order. +06/06/2007 + +* Catching errors early with multi-rail device + + Contributed by Ishai (Mellanox) + 04/25/2007 * Made shared memory macros tunable at run time From mellanox at mvapich.cse.ohio-state.edu Tue Jun 12 05:02:20 2007 From: mellanox at mvapich.cse.ohio-state.edu (mellanox@mvapich.cse.ohio-state.edu) Date: Tue Jun 12 05:02:43 2007 Subject: [mvapich-commit] r1315 - in mvapich/branches/0.9.9/mpid/ch_gen2: . ptmalloc2 Message-ID: <200706120902.l5C92KMR018327@mvapich.cse.ohio-state.edu> Author: mellanox Date: 2007-06-12 05:02:18 -0400 (Tue, 12 Jun 2007) New Revision: 1315 Modified: mvapich/branches/0.9.9/mpid/ch_gen2/Makefile.in mvapich/branches/0.9.9/mpid/ch_gen2/ptmalloc2/Makefile Log: Removing warning flags: -Wall -Wstrict-prototypes These flags cause the PGI compiler to fail during compilation. Modified: mvapich/branches/0.9.9/mpid/ch_gen2/Makefile.in =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/Makefile.in 2007-06-12 06:36:56 UTC (rev 1314) +++ mvapich/branches/0.9.9/mpid/ch_gen2/Makefile.in 2007-06-12 09:02:18 UTC (rev 1315) @@ -37,8 +37,8 @@ # ifeq ($(CC),gcc) - CFLAGS1 = @CFLAGS@ -D_GNU_SOURCE -I${top_srcdir} -I${srcdir} -I. 
$(OPTFLAGS) \ - -Wall + CFLAGS1 = @CFLAGS@ -D_GNU_SOURCE -I${top_srcdir} -I${srcdir} -I. $(OPTFLAGS) + else CFLAGS1 = -D_GNU_SOURCE @CFLAGS@ -I${top_srcdir} -I${srcdir} -I. $(OPTFLAGS) endif @@ -161,7 +161,7 @@ $(AR) $(LIBNAME) malloc.o malloc.o: - ${CC} -fPIC -c @CFLAGS@ -D_GNU_SOURCE=1 -O2 -Wall -Wstrict-prototypes -DUSE_TSD_DATA_HACK \ + ${CC} -fPIC -c @CFLAGS@ -D_GNU_SOURCE=1 -O2 -DUSE_TSD_DATA_HACK \ -Iptmalloc2/sysdeps/pthread -Iptmalloc2/sysdeps/generic -I. \ -DTHREAD_STATS=1 ptmalloc2/malloc.c Modified: mvapich/branches/0.9.9/mpid/ch_gen2/ptmalloc2/Makefile =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/ptmalloc2/Makefile 2007-06-12 06:36:56 UTC (rev 1314) +++ mvapich/branches/0.9.9/mpid/ch_gen2/ptmalloc2/Makefile 2007-06-12 09:02:18 UTC (rev 1315) @@ -126,14 +126,12 @@ linux-pthread: $(MAKE) SYS_FLAGS='-D_GNU_SOURCE=1' \ - WARN_FLAGS='-Wall -Wstrict-prototypes' \ OPT_FLAGS='$(OPT_FLAGS)' THR_FLAGS='-DUSE_TSD_DATA_HACK' \ INC_FLAGS='-Isysdeps/pthread -Isysdeps/generic -I.' M_FLAGS='$(M_FLAGS)' \ TESTS='$(TESTS)' linux-malloc.so: $(MAKE) SYS_FLAGS='-D_GNU_SOURCE=1' \ - WARN_FLAGS='-Wall -Wstrict-prototypes' \ OPT_FLAGS='$(OPT_FLAGS)' THR_FLAGS='-DUSE_TSD_DATA_HACK' \ INC_FLAGS='-Isysdeps/pthread -Isysdeps/generic -I.' M_FLAGS='$(M_FLAGS)' \ malloc.so From surs at mvapich.cse.ohio-state.edu Wed Jun 13 11:04:54 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Wed Jun 13 11:05:12 2007 Subject: [mvapich-commit] r1321 - in mvapich/trunk/mpid/ch_gen2: . 
process ptmalloc2 Message-ID: <200706131504.l5DF4sBo022169@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-06-13 11:04:52 -0400 (Wed, 13 Jun 2007) New Revision: 1321 Added: mvapich/trunk/mpid/ch_gen2/ib_init.h Removed: mvapich/trunk/mpid/ch_gen2/ib_init.h Modified: mvapich/trunk/mpid/ch_gen2/Makefile.in mvapich/trunk/mpid/ch_gen2/mpid_init.c mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpd.c mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/ptmalloc2/Makefile mvapich/trunk/mpid/ch_gen2/viacheck.c mvapich/trunk/mpid/ch_gen2/viutil.h Log: -- Merge 0.9.9 changes into trunk Modified: mvapich/trunk/mpid/ch_gen2/Makefile.in =================================================================== --- mvapich/trunk/mpid/ch_gen2/Makefile.in 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/Makefile.in 2007-06-13 15:04:52 UTC (rev 1321) @@ -37,8 +37,8 @@ # ifeq ($(CC),gcc) - CFLAGS1 = @CFLAGS@ -D_GNU_SOURCE -I${top_srcdir} -I${srcdir} -I. $(OPTFLAGS) \ - -Wall + CFLAGS1 = @CFLAGS@ -D_GNU_SOURCE -I${top_srcdir} -I${srcdir} -I. $(OPTFLAGS) + else CFLAGS1 = -D_GNU_SOURCE @CFLAGS@ -I${top_srcdir} -I${srcdir} -I. $(OPTFLAGS) endif @@ -161,7 +161,7 @@ $(AR) $(LIBNAME) malloc.o malloc.o: - ${CC} -fPIC -c @CFLAGS@ -D_GNU_SOURCE=1 -O2 -Wall -Wstrict-prototypes -DUSE_TSD_DATA_HACK \ + ${CC} -fPIC -c @CFLAGS@ -D_GNU_SOURCE=1 -O2 -DUSE_TSD_DATA_HACK \ -Iptmalloc2/sysdeps/pthread -Iptmalloc2/sysdeps/generic -I. 
\ -DTHREAD_STATS=1 ptmalloc2/malloc.c Deleted: mvapich/trunk/mpid/ch_gen2/ib_init.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/ib_init.h 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/ib_init.h 2007-06-13 15:04:52 UTC (rev 1321) @@ -1,39 +0,0 @@ -#ifndef IB_INIT_H -#define IB_INIT_H - -#include "viaparam.h" -#include "ibverbs_const.h" -#include "viadev.h" -/* -This functions locates PKEY INDEX by PKEY itself -It returns PKEY in the case of success, or int bad_pkey_idx otherwise -*/ -static inline uint16_t get_pkey_index(uint16_t pkey, int port_num) { - static const uint16_t bad_pkey_idx = -1; - uint16_t i; - if(ibv_query_device(viadev.context, &viadev.dev_attr)) { - error_abort_all(GEN_EXIT_ERR, - "Error getting HCA attributes\n"); - } - for (i = 0; i < viadev.dev_attr.max_pkeys ; ++i) { - uint16_t curr_pkey; - ibv_query_pkey(viadev.context, (uint8_t)port_num, (int)i ,&curr_pkey); - if (pkey == ntohs(curr_pkey)) { - return i; - } - } - return bad_pkey_idx; -} - -/* -This functions sets PKEY INDEX according to PKEY, if PKEY was defined by user. -*/ - -static inline void set_pkey_index(uint16_t * pkey_index, int port_num) { - *pkey_index = (viadev_default_pkey == VIADEV_DEFAULT_PKEY ? 
viadev_default_pkey_ix : get_pkey_index(viadev_default_pkey,port_num)); - if (pkey_index < 0 ) { - error_abort_all(IBV_RETURN_ERR, - "Can't find PKEY INDEX according to given PKEY\n"); - } -} -#endif //IB_INIT_H Copied: mvapich/trunk/mpid/ch_gen2/ib_init.h (from rev 1320, mvapich/branches/0.9.9/mpid/ch_gen2/ib_init.h) Modified: mvapich/trunk/mpid/ch_gen2/mpid_init.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/mpid_init.c 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/mpid_init.c 2007-06-13 15:04:52 UTC (rev 1321) @@ -93,7 +93,7 @@ MPIR_debug_state = MPIR_DEBUG_ABORTING; MPIR_Breakpoint(); - pmgr_abort(); + pmgr_abort(code); exit(code); } Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-06-13 15:04:52 UTC (rev 1321) @@ -1019,26 +1019,40 @@ } void wait_for_errors(int s,struct sockaddr_in *sockaddr,unsigned int sockaddr_len){ - - int nread,remote_id,s1,i; - + + int nread,remote_id,local_id,s1,i,flag; + s1 = accept(s,(struct sockaddr *) sockaddr,&sockaddr_len); - nread = read(s1, &remote_id, sizeof(remote_id)); + nread = read(s1, &flag, sizeof(flag)); if (nread == -1) { perror("Termination socket read failed"); } else if (nread == 0) { - } else if (nread != sizeof(remote_id)) { + } else if (nread != sizeof(flag)) { printf("Invalid termination socket on read\n"); cleanup(); } else { - printf("mpirun_rsh: Abort signaled from [%d]\n",remote_id); - /*if (remote_id == ABORT_ERROR) { - + printf("Aborting code !\n"); - + */ - /* shut down all our ports */ - close(s); - close(s1); - cleanup(); + nread = read(s1, &local_id, sizeof(local_id)); + if (nread == -1) { + perror("Termination socket read failed"); + } else if (nread == 0) { + } else if (nread != sizeof(local_id)) { + 
printf("Invalid termination socket on read\n"); + cleanup(); + } else if (flag > -1) { + remote_id=flag; + printf("mpirun_rsh: Abort signaled from [%d : %s] remote host is [%d : %s ]\n",local_id,plist[local_id].hostname,remote_id, plist[remote_id].hostname); + close(s); + close(s1); + cleanup(); + } + else + { + printf("mpirun_rsh: Abort signaled from [%d]\n",local_id); + close(s); + close(s1); + cleanup(); + + } } } Modified: mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h 2007-06-13 15:04:52 UTC (rev 1321) @@ -93,7 +93,7 @@ */ int pmgr_finalize(void); -int pmgr_abort(void); +int pmgr_abort(int); int pmgr_get_mpirun_process(int np,char ***processes_p); #ifdef MCST_SUPPORT Modified: mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpd.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpd.c 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpd.c 2007-06-13 15:04:52 UTC (rev 1321) @@ -315,7 +315,7 @@ /* abort call to process manager. Allows it to clean-up * any resources it might have allocated. */ -int pmgr_abort() +int pmgr_abort(int none) { MPD_Abort(1); return(1); Modified: mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c 2007-06-13 15:04:52 UTC (rev 1321) @@ -439,11 +439,12 @@ * Call into the process spawner, using the same port we were given * at startup time, to tell it to abort the entire job. 
*/ -int pmgr_abort(void) +int pmgr_abort(int flag) { int s; struct sockaddr_in sin; struct hostent *he; + char* str; he = gethostbyname(mpirun_hostname); if (!he) { @@ -466,7 +467,7 @@ /* write our rank to mpirun_rsh (wait_for_error) * for use in nice error messages */ - + write(s, &flag, sizeof(flag)); write(s, &pmgr_me, sizeof(pmgr_me)); close(s); Modified: mvapich/trunk/mpid/ch_gen2/ptmalloc2/Makefile =================================================================== --- mvapich/trunk/mpid/ch_gen2/ptmalloc2/Makefile 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/ptmalloc2/Makefile 2007-06-13 15:04:52 UTC (rev 1321) @@ -126,14 +126,12 @@ linux-pthread: $(MAKE) SYS_FLAGS='-D_GNU_SOURCE=1' \ - WARN_FLAGS='-Wall -Wstrict-prototypes' \ OPT_FLAGS='$(OPT_FLAGS)' THR_FLAGS='-DUSE_TSD_DATA_HACK' \ INC_FLAGS='-Isysdeps/pthread -Isysdeps/generic -I.' M_FLAGS='$(M_FLAGS)' \ TESTS='$(TESTS)' linux-malloc.so: $(MAKE) SYS_FLAGS='-D_GNU_SOURCE=1' \ - WARN_FLAGS='-Wall -Wstrict-prototypes' \ OPT_FLAGS='$(OPT_FLAGS)' THR_FLAGS='-DUSE_TSD_DATA_HACK' \ INC_FLAGS='-Isysdeps/pthread -Isysdeps/generic -I.' 
M_FLAGS='$(M_FLAGS)' \ malloc.so Modified: mvapich/trunk/mpid/ch_gen2/viacheck.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/viacheck.c 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/viacheck.c 2007-06-13 15:04:52 UTC (rev 1321) @@ -325,7 +325,7 @@ if (ret2 == 1) { vbuf_addr = (void *) ((aint_t) sc.wr_id); if (sc.status != IBV_WC_SUCCESS) { - error_abort_all(IBV_STATUS_ERR, + error_abort_all(((vbuf *) vbuf_addr)->grank, "[%s:%d] Got completion with error %s, " "code=%d, dest rank=%d\n", viadev.my_name, viadev.me, @@ -340,7 +340,7 @@ if (ret3 == 1) { vbuf_addr = (void *) ((aint_t) sc.wr_id); if (sc.status != IBV_WC_SUCCESS) { - error_abort_all(IBV_STATUS_ERR, + error_abort_all(((vbuf *) vbuf_addr)->grank, "[%s:%d] Got completion with error %s, " "code=%d, dest rank=%d\n", viadev.my_name, viadev.me, @@ -385,7 +385,7 @@ /* Need to check if it is a completion with error */ if (sc.status != IBV_WC_SUCCESS) { - error_abort_all(IBV_STATUS_ERR, + error_abort_all(((vbuf *) vbuf_addr)->grank, "[%s:%d] Got completion with error %s, " "code=%d, dest rank=%d\n", viadev.my_name, viadev.me, @@ -2365,13 +2365,14 @@ } if (ne > 0) { - + void * vbuf_addr = (void *) ((aint_t) sc->wr_id); if (sc->status != IBV_WC_SUCCESS) { - error_abort_all(IBV_STATUS_ERR, + error_abort_all(((vbuf *) vbuf_addr)->grank, "[%s:%d] Got completion with error %s, " - "code=%d\n", viadev.my_name, - viadev.me, wc_code_to_str(sc->status), - sc->status); + "code=%d, dest rank=%d\n", + viadev.my_name, viadev.me, + wc_code_to_str(sc->status), sc->status, + ((vbuf *) vbuf_addr)->grank); } ret = 0; Modified: mvapich/trunk/mpid/ch_gen2/viutil.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/viutil.h 2007-06-12 22:39:17 UTC (rev 1320) +++ mvapich/trunk/mpid/ch_gen2/viutil.h 2007-06-13 15:04:52 UTC (rev 1321) @@ -48,7 +48,8 @@ } \ fprintf(stderr, message, ##args); \ fprintf(stderr, " at line %d 
in file %s\n", __LINE__, __FILE__);\ - pmgr_abort(); \ + sleep(1); \ + pmgr_abort(code); \ exit(code); \ } From surs at mvapich.cse.ohio-state.edu Thu Jun 14 08:30:42 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Thu Jun 14 08:30:57 2007 Subject: [mvapich-commit] r1324 - mvapich/tags Message-ID: <200706141230.l5ECUgpA025247@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-06-14 08:30:41 -0400 (Thu, 14 Jun 2007) New Revision: 1324 Added: mvapich/tags/0.9.9+psm/ Log: Tagging current trunk with PSM code Copied: mvapich/tags/0.9.9+psm (from rev 1323, mvapich/trunk) From surs at mvapich.cse.ohio-state.edu Thu Jun 14 15:46:32 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Thu Jun 14 15:46:46 2007 Subject: [mvapich-commit] r1325 - mvapich/tags Message-ID: <200706141946.l5EJkVbC026188@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-06-14 15:46:30 -0400 (Thu, 14 Jun 2007) New Revision: 1325 Added: mvapich/tags/0.9.9-infinipath/ Log: Rename 0.9.9+psm to 0.9.9-infinipath Copied: mvapich/tags/0.9.9-infinipath (from rev 1324, mvapich/tags/0.9.9+psm) From surs at mvapich.cse.ohio-state.edu Thu Jun 14 15:46:49 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Thu Jun 14 15:47:03 2007 Subject: [mvapich-commit] r1326 - mvapich/tags Message-ID: <200706141946.l5EJkntH026199@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-06-14 15:46:49 -0400 (Thu, 14 Jun 2007) New Revision: 1326 Removed: mvapich/tags/0.9.9+psm/ Log: Removing old tag From narravul at mvapich.cse.ohio-state.edu Tue Jun 19 18:27:08 2007 From: narravul at mvapich.cse.ohio-state.edu (narravul@mvapich.cse.ohio-state.edu) Date: Tue Jun 19 18:27:25 2007 Subject: [mvapich-commit] r1333 - mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200706192227.l5JMR8LJ027925@mvapich.cse.ohio-state.edu> Author: narravul Date: 2007-06-19 18:27:06 -0400 (Tue, 19 Jun 2007) New Revision: 
1333 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c Log: Fix for RDMA CM finalize hang. With this we should not be seeing the RDMA CM finalize error. Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c 2007-06-19 17:27:48 UTC (rev 1332) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c 2007-06-19 22:27:06 UTC (rev 1333) @@ -772,6 +772,7 @@ if (rdma_cm_host_list[i] == rdma_cm_host_list[pg_rank]) continue; + MPIDI_PG_Get_vc(cached_pg, i, &vc); rdma_destroy_id(vc->mrail.rails[rail_index].cm_ids); } From narravul at mvapich.cse.ohio-state.edu Wed Jun 20 04:52:40 2007 From: narravul at mvapich.cse.ohio-state.edu (narravul@mvapich.cse.ohio-state.edu) Date: Wed Jun 20 04:52:56 2007 Subject: [mvapich-commit] r1337 - mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200706200852.l5K8qe7K029781@mvapich.cse.ohio-state.edu> Author: narravul Date: 2007-06-20 04:52:38 -0400 (Wed, 20 Jun 2007) New Revision: 1337 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c Log: Adding MV2_RDMA_CM_ARP_TIMEOUT env variable to enable large clusters to set the RDMA CM address/route resolution timeout as needed at runtime. 
Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c 2007-06-20 00:08:45 UTC (rev 1336) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c 2007-06-20 08:52:38 UTC (rev 1337) @@ -44,6 +44,7 @@ int *rdma_base_listen_port; int *rdma_cm_host_list; volatile int rdma_cm_finalized = 0; +int rdma_cm_arp_timeout = 2000; char *init_message_buf; /* Used for message exchange in RNIC case */ struct ibv_mr *init_mr; @@ -109,7 +110,7 @@ switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = rdma_resolve_route(cma_id, 2000); + ret = rdma_resolve_route(cma_id, rdma_cm_arp_timeout); if (ret) { ibv_error_abort(IBV_RETURN_ERR, "rdma_resolve_route error %d\n", ret); @@ -259,6 +260,7 @@ { int i = 0; char hostname[64]; + char *value; if (rdma_num_rails > 1) { ibv_error_abort(IBV_RETURN_ERR, @@ -288,6 +290,14 @@ "Cannot create rdma_create_event_channel\n"); } + if ((value = getenv("MV2_RDMA_CM_ARP_TIMEOUT")) != NULL) { + rdma_cm_arp_timeout = atoi(value); + if (rdma_cm_arp_timeout < 0) { + ibv_error_abort(IBV_RETURN_ERR, + "Invalid rdma cm arp timeout value specified\n"); + } + } + /* Create all the active connect cm_ids */ create_cm_ids(proc, pg_rank, pg_size); @@ -591,7 +601,7 @@ sin.sin_addr.s_addr = ipnum; sin.sin_port = rdma_base_listen_port[rank]; ret = rdma_resolve_addr(vc->mrail.rails[rail_index].cm_ids, NULL, (struct sockaddr *) &sin, - 2000); + rdma_cm_arp_timeout); DEBUG_PRINT("Active connect initiated for %d\n", rank); return ret;