From huangwei at mvapich.cse.ohio-state.edu Wed Jul 4 23:56:50 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Wed Jul 4 23:57:08 2007 Subject: [mvapich-commit] r1360 - in mvapich2/branches/0.9.8/src/mpid/osu_ch3: channels/mrail/src/rdma include Message-ID: <200707050356.l653uoHP016872@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-04 23:56:48 -0400 (Wed, 04 Jul 2007) New Revision: 1360 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_rndvtransfer.c mvapich2/branches/0.9.8/src/mpid/osu_ch3/include/mpidimpl.h Log: Fix bi-directional get hanging problem Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_rndvtransfer.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_rndvtransfer.c 2007-07-05 03:25:52 UTC (rev 1359) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_rndvtransfer.c 2007-07-05 03:56:48 UTC (rev 1360) @@ -611,6 +611,7 @@ if (VAPI_PROTOCOL_R3 == req->mrail.protocol) { req->mrail.partner_id = get_resp_pkt->request_handle; + MPIDI_VC_revoke_seqnum_send(vc, get_resp_pkt->seqnum); RENDEZVOUS_IN_PROGRESS(vc, req); req->mrail.nearly_complete = 0; PUSH_FLOWLIST(vc); @@ -629,6 +630,7 @@ if (VAPI_PROTOCOL_R3 == req->mrail.protocol) { req->mrail.partner_id = get_resp_pkt->request_handle; + MPIDI_VC_revoke_seqnum_send(vc, get_resp_pkt->seqnum); RENDEZVOUS_IN_PROGRESS(vc, req); req->mrail.nearly_complete = 0; PUSH_FLOWLIST(vc); Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/include/mpidimpl.h =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/include/mpidimpl.h 2007-07-05 03:25:52 UTC (rev 1359) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/include/mpidimpl.h 2007-07-05 03:56:48 UTC (rev 1360) @@ -515,8 +515,16 @@ MPID_Common_thread_unlock(); \ } # endif +/* OSU-MPI2 */ +# 
define MPIDI_VC_revoke_seqnum_send(vc_, seqnum_) \ + { \ + assert((seqnum_) + 1 == (vc_)->seqnum_send); \ + (vc_)->seqnum_send --; \ + } +/* End of OSU-MPI2 */ #else # define MPIDI_VC_FAI_send_seqnum(vc_, seqnum_out_) +# define MPIDI_VC_revoke_seqnum_send(vc_, seqnum_) #endif /*------------------------------ END VIRTUAL CONNECTION SECTION From huangwei at mvapich.cse.ohio-state.edu Sun Jul 8 09:49:08 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Sun Jul 8 09:49:27 2007 Subject: [mvapich-commit] r1367 - in mvapich2/branches/0.9.8/src/mpid/osu_ch3: channels/mrail/src/gen2 channels/mrail/src/udapl channels/mrail/src/vapi src Message-ID: <200707081349.l68Dn88R016772@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-08 09:49:06 -0400 (Sun, 08 Jul 2007) New Revision: 1367 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mpidi_ch3_rdma_post.h mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/udapl/mpidi_ch3_rdma_post.h mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h mvapich2/branches/0.9.8/src/mpid/osu_ch3/src/ch3u_handle_recv_req.c Log: Apply the one sided fixes to 0.9.8 branch Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mpidi_ch3_rdma_post.h =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mpidi_ch3_rdma_post.h 2007-07-08 03:18:54 UTC (rev 1366) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mpidi_ch3_rdma_post.h 2007-07-08 13:49:06 UTC (rev 1367) @@ -105,6 +105,7 @@ rreq->mrail.rndv_buf_off = rreq->mrail.rndv_buf_sz = 0; \ } \ rreq->mrail.d_entry = NULL; \ + rreq->mrail.protocol = VAPI_PROTOCOL_RENDEZVOUS_UNSPECIFIED; \ } \ } Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/udapl/mpidi_ch3_rdma_post.h =================================================================== --- 
mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/udapl/mpidi_ch3_rdma_post.h 2007-07-08 03:18:54 UTC (rev 1366) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/udapl/mpidi_ch3_rdma_post.h 2007-07-08 13:49:06 UTC (rev 1367) @@ -107,6 +107,7 @@ rreq->mrail.rndv_buf_off = rreq->mrail.rndv_buf_sz = 0; \ } \ rreq->mrail.d_entry = NULL; \ + rreq->mrail.protocol = VAPI_PROTOCOL_RENDEZVOUS_UNSPECIFIED; \ } \ } Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h 2007-07-08 03:18:54 UTC (rev 1366) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h 2007-07-08 13:49:06 UTC (rev 1367) @@ -103,6 +103,7 @@ rreq->mrail.rndv_buf_off = rreq->mrail.rndv_buf_sz = 0; \ } \ rreq->mrail.d_entry = NULL; \ + rreq->mrail.protocol = VAPI_PROTOCOL_RENDEZVOUS_UNSPECIFIED; \ } \ } Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/src/ch3u_handle_recv_req.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/src/ch3u_handle_recv_req.c 2007-07-08 03:18:54 UTC (rev 1366) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/src/ch3u_handle_recv_req.c 2007-07-08 13:49:06 UTC (rev 1367) @@ -665,6 +665,11 @@ } fn_exit: + if (TRUE == *complete && + (VAPI_PROTOCOL_RPUT == rreq->mrail.protocol || + VAPI_PROTOCOL_R3 == rreq->mrail.protocol)) + MPIDI_CH3I_MRAILI_RREQ_RNDV_FINISH(rreq); + in_routine = FALSE; MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_HANDLE_RECV_REQ); return mpi_errno; From narravul at mvapich.cse.ohio-state.edu Wed Jul 11 17:28:16 2007 From: narravul at mvapich.cse.ohio-state.edu (narravul@mvapich.cse.ohio-state.edu) Date: Wed Jul 11 17:28:39 2007 Subject: [mvapich-commit] r1376 - mvapich2/branches/0.9.8 Message-ID: <200707112128.l6BLSG7Z026857@mvapich.cse.ohio-state.edu> 
Author: narravul Date: 2007-07-11 17:28:14 -0400 (Wed, 11 Jul 2007) New Revision: 1376 Modified: mvapich2/branches/0.9.8/CHANGELOG Log: Adding a changelog entry for the rdma_cm related fixes. Modified: mvapich2/branches/0.9.8/CHANGELOG =================================================================== --- mvapich2/branches/0.9.8/CHANGELOG 2007-07-10 23:26:17 UTC (rev 1375) +++ mvapich2/branches/0.9.8/CHANGELOG 2007-07-11 21:28:14 UTC (rev 1376) @@ -3,6 +3,10 @@ This file briefly describes the latest changes to the MVAPICH2 software package. The logs are arranged in the "most recent first" order. +07/10/2007 +* Fix for RDMA_CM finalize rdma_destroy_id failure. Added Timeout env variable for RDMA_CM + ARP. Thanks to Steve Wise for suggesting these. + 03/28/2007 * Fix for RDMA_CM invalid event in finalize. Thanks to Steve Wise and Sean Hefty. From chail at mvapich.cse.ohio-state.edu Wed Jul 11 17:58:46 2007 From: chail at mvapich.cse.ohio-state.edu (chail@mvapich.cse.ohio-state.edu) Date: Wed Jul 11 17:59:06 2007 Subject: [mvapich-commit] r1377 - mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma Message-ID: <200707112158.l6BLwk4j026896@mvapich.cse.ohio-state.edu> Author: chail Date: 2007-07-11 17:58:45 -0400 (Wed, 11 Jul 2007) New Revision: 1377 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_smp_progress.c Log: Unmap and free shared memory files properly. 
Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_smp_progress.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_smp_progress.c 2007-07-11 21:28:14 UTC (rev 1376) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_smp_progress.c 2007-07-11 21:58:45 UTC (rev 1377) @@ -96,6 +96,9 @@ static int *current_bytes; static int *total_bytes; +int size_shmem; +int size_pool; + int smp_eagersize = SMP_EAGERSIZE; int smpi_length_queue = SMPI_LENGTH_QUEUE; int smp_num_send_buffer = SMP_NUM_SEND_BUFFER; @@ -462,7 +465,7 @@ { int mpi_errno = MPI_SUCCESS; int my_rank; - unsigned int i, j, size, size_pool, pool, pid, wait; + unsigned int i, j, pool, pid, wait; int local_num, sh_size, pid_len, rq_len, param_len, limit_len; struct stat file_status; struct stat file_status_pool; @@ -585,11 +588,11 @@ sh_size = sizeof(struct shared_mem) + pid_len + param_len + rq_len + limit_len + SMPI_CACHE_LINE_SIZE * 4; - size = (SMPI_CACHE_LINE_SIZE + sh_size + pagesize + + size_shmem = (SMPI_CACHE_LINE_SIZE + sh_size + pagesize + (smpi.num_local_nodes * (smpi.num_local_nodes - 1) * (SMPI_ALIGN(smpi_length_queue + pagesize)))); - DEBUG_PRINT("sizeof shm file %d\n", size); + DEBUG_PRINT("sizeof shm file %d\n", size_shmem); size_pool = SMPI_ALIGN (sizeof (SEND_BUF_T) * smp_num_send_buffer + @@ -610,7 +613,7 @@ } /* set file size, without touching pages */ - if (ftruncate(smpi.fd, size)) { + if (ftruncate(smpi.fd, size_shmem)) { /* to clean up tmp shared file */ unlink(shmem_file); fprintf(stderr, "[%d] smpi_init:error in ftruncate to size " @@ -639,8 +642,8 @@ #ifndef _X86_64_ { char *buf; - buf = (char *) calloc(size + 1, sizeof(char)); - if (write(smpi.fd, buf, size) != size) { + buf = (char *) calloc(size_shmem + 1, sizeof(char)); + if (write(smpi.fd, buf, size_shmem) != size_shmem) { fprintf(stderr, "[%d] smpi_init:error in writing " "shared memory file: 
%d\n", my_rank, errno); @@ -703,7 +706,7 @@ } usleep(1); } - while (file_status.st_size != size || + while (file_status.st_size != size_shmem || file_status_pool.st_size != size_pool); smpi_shmem = (struct shared_mem *)malloc(sizeof(struct shared_mem)); @@ -711,7 +714,7 @@ DEBUG_PRINT("before mmap\n"); /* mmap of the shared memory file */ - smpi.mmap_ptr = mmap(0, size, + smpi.mmap_ptr = mmap(0, size_shmem, (PROT_READ | PROT_WRITE), (MAP_SHARED), smpi.fd, 0); if (smpi.mmap_ptr == (void *) -1) { @@ -970,15 +973,12 @@ #define FCNAME MPIDI_QUOTE(FUNCNAME) int MPIDI_CH3I_SMP_finalize() { - /* unmap the shared memory file */ - munmap(smpi.mmap_ptr, (SMPI_CACHE_LINE_SIZE + - sizeof(struct shared_mem) + - (smpi.num_local_nodes * - (smpi.num_local_nodes - - 1) * (smpi_length_queue + - SMPI_CACHE_LINE_SIZE)))); + /* unmap the shared memory files */ + munmap(smpi.mmap_ptr, size_shmem); + close(smpi.fd); - close(smpi.fd); + munmap(smpi.send_buf_pool_ptr, size_pool); + close(smpi.fd_pool); if(buffer_head) { free(buffer_head); From gaoq at mvapich.cse.ohio-state.edu Wed Jul 11 18:29:18 2007 From: gaoq at mvapich.cse.ohio-state.edu (gaoq@mvapich.cse.ohio-state.edu) Date: Wed Jul 11 18:29:38 2007 Subject: [mvapich-commit] r1378 - in mvapich2/branches/0.9.8/src: mpid/osu_ch3/channels/mrail/src/gen2 mpid/osu_ch3/channels/mrail/src/rdma pm/mpd Message-ID: <200707112229.l6BMTIjJ026992@mvapich.cse.ohio-state.edu> Author: gaoq Date: 2007-07-11 18:29:16 -0400 (Wed, 11 Jul 2007) New Revision: 1378 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_progress.c mvapich2/branches/0.9.8/src/pm/mpd/mpiexec_cr.c Log: Check-in the minor fixes for CKPT Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c 2007-07-11 21:58:45 UTC (rev 
1377) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c 2007-07-11 22:29:16 UTC (rev 1378) @@ -884,6 +884,7 @@ for (i = 0; i < pg_size; i++) { if (pg_rank == i) { lid_all[i] = MPIDI_CH3I_RDMA_Process.lids[0][0]; + ud_qpn_all[i] = ud_qpn_self; continue; } sprintf(key,"ud_info_%08d",i); Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_progress.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_progress.c 2007-07-11 21:58:45 UTC (rev 1377) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_progress.c 2007-07-11 22:29:16 UTC (rev 1378) @@ -169,8 +169,14 @@ if(spin_count > 5) { spin_count = 0; MPID_Thread_mutex_unlock(&MPIR_Process.global_mutex); +#ifdef CKPT + MPIDI_CH3I_CR_unlock(); +#endif MPIDU_Yield(); MPID_Thread_mutex_lock(&MPIR_Process.global_mutex); +#ifdef CKPT + MPIDI_CH3I_CR_lock(); +#endif } #endif spin_count++; @@ -213,8 +219,14 @@ if(spin_count > 50) { spin_count = 0; MPID_Thread_mutex_unlock(&MPIR_Process.global_mutex); +#ifdef CKPT + MPIDI_CH3I_CR_unlock(); +#endif MPIDU_Yield(); MPID_Thread_mutex_lock(&MPIR_Process.global_mutex); +#ifdef CKPT + MPIDI_CH3I_CR_lock(); +#endif } #endif } Modified: mvapich2/branches/0.9.8/src/pm/mpd/mpiexec_cr.c =================================================================== --- mvapich2/branches/0.9.8/src/pm/mpd/mpiexec_cr.c 2007-07-11 21:58:45 UTC (rev 1377) +++ mvapich2/branches/0.9.8/src/pm/mpd/mpiexec_cr.c 2007-07-11 22:29:16 UTC (rev 1378) @@ -140,12 +140,6 @@ kill(mpiexec_pid, SIGINT); exit(-1); } - else if (signal==SIGCHLD) { - int status; - wait(&status); - CR_DBG("exiting with status: %d\n", WEXITSTATUS(status)); - exit(WEXITSTATUS(status)); - } } static int CR_callback(void *arg) From koop at mvapich.cse.ohio-state.edu Thu Jul 12 12:16:59 2007 From: koop at mvapich.cse.ohio-state.edu (koop@mvapich.cse.ohio-state.edu) Date: Thu Jul 12 12:17:19 
2007 Subject: [mvapich-commit] r1380 - mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200707121616.l6CGGxoh029487@mvapich.cse.ohio-state.edu> Author: koop Date: 2007-07-12 12:16:57 -0400 (Thu, 12 Jul 2007) New Revision: 1380 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c Log: * Cleanup the SRQ async_thread more cleanly Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c 2007-07-12 09:04:30 UTC (rev 1379) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/ibv_channel_manager.c 2007-07-12 16:16:57 UTC (rev 1380) @@ -655,6 +655,9 @@ struct ibv_srq_attr srq_attr; int post_new, i, hca_num = -1; + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + while (1) { if (ibv_get_async_event((struct ibv_context *) context, &event)) { fprintf(stderr, "Error getting event!\n"); Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c 2007-07-12 09:04:30 UTC (rev 1379) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_cm.c 2007-07-12 16:16:57 UTC (rev 1380) @@ -764,6 +764,7 @@ if (MPIDI_CH3I_RDMA_Process.has_srq) { if (!MPIDI_CH3I_RDMA_Process.srq_hndl[i]){ pthread_cancel(MPIDI_CH3I_RDMA_Process.async_thread[i]); + pthread_join(MPIDI_CH3I_RDMA_Process.async_thread[i], NULL); ibv_destroy_srq(MPIDI_CH3I_RDMA_Process.srq_hndl[i]); } } Modified: 
mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c 2007-07-12 09:04:30 UTC (rev 1379) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_init.c 2007-07-12 16:16:57 UTC (rev 1380) @@ -768,6 +768,7 @@ if (MPIDI_CH3I_RDMA_Process.has_srq) { pthread_cancel(MPIDI_CH3I_RDMA_Process.async_thread[i]); + pthread_join(MPIDI_CH3I_RDMA_Process.async_thread[i], NULL); err = ibv_destroy_srq(MPIDI_CH3I_RDMA_Process.srq_hndl[i]); if (err) fprintf(stderr, "Failed to destroy SRQ (%d)\n", err); From perkinjo at mvapich.cse.ohio-state.edu Mon Jul 16 09:45:29 2007 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Mon Jul 16 09:45:50 2007 Subject: [mvapich-commit] r1387 - mvapich/trunk/mpid/ch_gen2/process Message-ID: <200707161345.l6GDjT6Z026286@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2007-07-16 09:45:27 -0400 (Mon, 16 Jul 2007) New Revision: 1387 Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c Log: Added functionality to kill remote processes for the case when the remote shell doesn't propagate the kill signal. Also removed an unnecessary closing of sockets during the boot strapping portion of mpi programs. Incremented PMGR_VERSION to reflect this. Changed to more reliable signal handling. Some other minor code cleanup focused on exits and exit codes as well. 
Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-07-15 21:23:48 UTC (rev 1386) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-07-16 13:45:27 UTC (rev 1387) @@ -59,6 +59,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include @@ -91,20 +92,34 @@ typedef struct { char *hostname; char *device; - int pid; + pid_t pid; + pid_t remote_pid; int port; int control_socket; process_state state; } process; +typedef struct { + const char * hostname; + pid_t * pids; + size_t npids, npids_allocated; +} process_group; + +typedef struct { + process_group * data; + process_group ** index; + size_t npgs, npgs_allocated; +} process_groups; + #define RUNNING(i) ((plist[i].state == P_STARTED || \ plist[i].state == P_CONNECTED || \ plist[i].state == P_RUNNING) ? 1 : 0) /* other information: a.out and rank are implicit. */ -process *plist; -int nprocs; +process_groups * pglist = NULL; +process * plist = NULL; +int nprocs = 0; int aout_index, port; #define MAX_WD_LEN 256 char wd[MAX_WD_LEN]; /* working directory of current process */ @@ -112,11 +127,19 @@ char mpirun_host[MAX_HOST_LEN]; /* hostname of current process */ /* xxx need to add checking for string overflow, do this more carefully ... */ +/* + * Message notifying user of what timed out + */ +static const char * alarm_msg = NULL; #define COMMAND_LEN 2000 #define SEPARATOR ':' - +void free_memory(void); +void pglist_print(void); +void pglist_insert(const char * const, const pid_t const); +void rkill_fast(void); +void rkill_linear(void); void cleanup_handler(int); void nostop_handler(int); void alarm_handler(int); @@ -239,15 +262,19 @@ int hostname_len = 0; totalview_cmd[199] = 0; display[0]='\0'; - + pidglen = sizeof(pid_t); + /* mpirun [-debug] [-xterm] -np N [-hostfile hfile | h1 h2 h3 ... 
hN] a.out [args] */ + atexit(free_memory); + do { c = getopt_long_only(argc, argv, "+", option_table, &option_index); switch (c) { case '?': case ':': usage(); + exit(EXIT_FAILURE); break; case EOF: break; @@ -255,8 +282,10 @@ switch (option_index) { case 0: nprocs = atoi(optarg); - if (nprocs < 1) + if (nprocs < 1) { usage(); + exit(EXIT_FAILURE); + } break; case 1: debug_on = 1; @@ -290,11 +319,11 @@ case 8: show_version(); usage(); - exit(0); + exit(EXIT_SUCCESS); break; case 9: show_version(); - exit(0); + exit(EXIT_SUCCESS); break; case 10: use_totalview = 1; @@ -311,17 +340,19 @@ break; case 11: usage(); - exit(0); + exit(EXIT_SUCCESS); break; default: fprintf(stderr, "Unknown option\n"); usage(); + exit(EXIT_FAILURE); break; } break; default: fprintf(stderr, "Unreachable statement!\n"); usage(); + exit(EXIT_FAILURE); break; } } while (c != EOF); @@ -332,7 +363,7 @@ fprintf(stderr, "Without hostfile option, hostnames must be " "specified on command line.\n"); usage(); - exit(1); + exit(EXIT_FAILURE); } aout_index = nprocs + optind; } else { @@ -361,13 +392,14 @@ plist = malloc(nprocs * sizeof(process)); if (plist == NULL) { perror("malloc"); - exit(1); + exit(EXIT_FAILURE); } for (i = 0; i < nprocs; i++) { plist[i].state = P_NOTSTARTED; plist[i].device = NULL; plist[i].port = -1; + plist[i].remote_pid = 0; } /* grab hosts from command line or file */ @@ -376,7 +408,7 @@ hostname_len = read_hostfile(hostfile); } else { for (i = 0; i < nprocs; i++) { - plist[i].hostname = argv[optind + i]; + plist[i].hostname = (char *)strndup(argv[optind + i], 100); hostname_len = hostname_len > strlen(plist[i].hostname) ? 
hostname_len : strlen(plist[i].hostname); } @@ -388,7 +420,7 @@ if (!mpirun_processes) { perror("malloc"); - exit(1); + exit(EXIT_FAILURE); } else { memset(mpirun_processes, 0, nprocs * (hostname_len + 4)); } @@ -412,18 +444,18 @@ s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (s < 0) { perror("socket"); - exit(1); + exit(EXIT_FAILURE); } sockaddr.sin_addr.s_addr = INADDR_ANY; sockaddr.sin_port = 0; if (bind(s, (struct sockaddr *) &sockaddr, sockaddr_len) < 0) { perror("bind"); - exit(1); + exit(EXIT_FAILURE); } if (getsockname(s, (struct sockaddr *) &sockaddr, &sockaddr_len) < 0) { perror("getsockname"); - exit(1); + exit(EXIT_FAILURE); } port = (int) ntohs(sockaddr.sin_port); @@ -431,14 +463,31 @@ if (!show_on) { - signal(SIGHUP, cleanup_handler); - signal(SIGINT, cleanup_handler); - signal(SIGTSTP, nostop_handler); - signal(SIGCHLD, child_handler); - signal(SIGALRM, alarm_handler); + struct sigaction signal_handler; + signal_handler.sa_handler = cleanup_handler; + sigfillset(&signal_handler.sa_mask); + signal_handler.sa_flags = 0; + + sigaction(SIGHUP, &signal_handler, NULL); + sigaction(SIGINT, &signal_handler, NULL); + sigaction(SIGTERM, &signal_handler, NULL); + + signal_handler.sa_handler = nostop_handler; + + sigaction(SIGTSTP, &signal_handler, NULL); + + signal_handler.sa_handler = alarm_handler; + + sigaction(SIGALRM, &signal_handler, NULL); + + signal_handler.sa_handler = child_handler; + sigemptyset(&signal_handler.sa_mask); + + sigaction(SIGCHLD, &signal_handler, NULL); } alarm(1000); + alarm_msg = "Timeout during client startup.\n"; /* long timeout for testing, where process may be stopped in debugger */ #ifdef USE_DDD @@ -511,7 +560,7 @@ } if (show_on) - exit(0); + exit(EXIT_SUCCESS); /*Hostid exchange start */ /* accept incoming connections, read port numbers */ @@ -522,6 +571,9 @@ ACCEPT_HID: sockaddr_len = sizeof(sockaddr); s1 = accept(s, (struct sockaddr *) &sockaddr, &sockaddr_len); + + alarm_msg = "Timeout during hostid exchange.\n"; + if 
(s1 < 0) { if (errno == EINTR) goto ACCEPT_HID; @@ -592,7 +644,7 @@ hostids = (int *) malloc(hostidlen * nprocs); if (hostids == NULL) { perror("malloc"); - exit(1); + exit(EXIT_FAILURE); } } @@ -626,66 +678,33 @@ } } - /* close all opend sockets */ - for (i = 0; i < nprocs; i++) { - close(plist[i].control_socket); - } - alarm(1000); - /* let enbale the timer again*/ + alarm_msg = "Timeout during address exchange.\n"; + /* lets enable the timer again*/ /* Lets read all other information, LID QP,etc..*/ /* accept incoming connections, read port numbers */ for (i = 0; i < nprocs; i++) { - int version, rank, nread; - char pidstr[12]; -ACCEPT: - sockaddr_len = sizeof(sockaddr); - s1 = accept(s, (struct sockaddr *) &sockaddr, &sockaddr_len); - if (s1 < 0) { - if (errno == EINTR) - goto ACCEPT; - perror("accept"); - cleanup(); - } + int nread; /* * protocol: - * We don't need version number, - * 0. read rank of process - * 1. read address length - * 2. read address itself - * 3. send array of all addresses + * We don't need the version number or the rank, + * 0. read address length + * 1. read address itself + * 2. send array of all addresses */ - /* 0. Find out who we're talking to */ - nread = read(s1, &rank, sizeof(rank)); - if (nread != sizeof(rank)) { - perror("read"); - cleanup(); - } + plist[i].state = P_CONNECTED; - if (rank < 0 || rank >= nprocs - || plist[rank].state != P_STARTED) { - fprintf(stderr, "mpirun: invalid rank received. \n"); - cleanup(); - } - - plist[rank].control_socket = s1; - plist[rank].state = P_CONNECTED; - /* Let us know connection was established * printf("MPIRUN_RSH: Process rank %d connected\n",rank); */ /* 1. Find out length of the data */ - nread = read(s1, &addrlen, sizeof(addrlen)); + nread = read(plist[i].control_socket, &addrlen, sizeof(addrlen)); if (nread != sizeof(addrlen)) { - /* nread == 0 is not actually an error! 
*/ - if (nread == 0) - continue; - perror("read"); cleanup(); } @@ -707,21 +726,20 @@ alladdrs = (int *) malloc(addrlen * nprocs); if (alladdrs == NULL) { perror("malloc"); - exit(1); + exit(EXIT_FAILURE); } } /* 2. Read info from each process */ /* for byte location */ - alladdrs_char = (char *) &alladdrs[rank * addrlen / sizeof(int)]; + alladdrs_char = (char *) &alladdrs[i * addrlen / sizeof(int)]; tot_nread = 0; while (tot_nread < addrlen) { - nread = - read(s1, (void *) (alladdrs_char + tot_nread), - addrlen - tot_nread); + nread = read(plist[i].control_socket, + (void *) (alladdrs_char + tot_nread), addrlen - tot_nread); if (nread < 0) { perror("read"); @@ -733,36 +751,32 @@ read_pid: /* 3. Find out length of the data */ - nread = read(s1, &pidlen, sizeof(pidlen)); + nread = read(plist[i].control_socket, &pidlen, sizeof(pidlen)); if (nread != sizeof(pidlen)) { perror("read"); cleanup(); } /*fprintf(stderr, "read Pid lengths %d and %d \n", pidlen, nread);*/ - if (i == 0) { - pidglen = pidlen; - } else { - if (pidlen != pidglen) { - fprintf(stderr, "Pid lengths %d and %d do not match\n", - pidlen, pidglen); - cleanup(); - } - } + if (pidlen != pidglen) { + fprintf(stderr, "Pid lengths %d and %d do not match\n", + pidlen, pidglen); + cleanup(); + } if (i == 0) { - /* allocate as soon as we know the address length */ + /* allocate as soon as we know the pid length */ allpids = (char *)malloc(pidlen * nprocs); if (allpids == NULL) { perror("malloc"); - exit(1); + exit(EXIT_FAILURE); } } tot_nread=0; while(tot_nread < pidlen) { - nread = read(s1, (void*)(allpids+rank*pidlen+tot_nread), - pidlen - tot_nread); + nread = read(plist[i].control_socket, + (void*)(allpids+i*pidlen+tot_nread), pidlen - tot_nread); /*fprintf(stderr, "read length %d \n", nread);*/ if(nread < 0) { perror("read"); @@ -770,6 +784,9 @@ } tot_nread += nread; } + + plist[i].remote_pid = *((pid_t *)(allpids+i*pidlen)); + pglist_insert(plist[i].hostname, plist[i].remote_pid); } @@ -795,7 +812,7 @@ 
out_addrs = (int *) malloc(out_addrs_len); if (out_addrs == NULL) { perror("malloc"); - exit(1); + exit(EXIT_FAILURE); } for (i = 0; i < nprocs; i++) { @@ -876,8 +893,7 @@ sleep(100); } close(s); - exit(0); - + exit(EXIT_SUCCESS); } int start_process(int i, char *command_name, char *env) @@ -925,12 +941,12 @@ if ((remote_command = malloc(str_len)) == NULL) { fprintf(stderr, "Failed to malloc %d bytes for remote_command\n", str_len); - exit(1); + exit(EXIT_FAILURE); } if ((xterm_command = malloc(str_len)) == NULL) { fprintf(stderr, "Failed to malloc %d bytes for xterm_command\n", str_len); - exit(1); + exit(EXIT_FAILURE); } @@ -1010,7 +1026,7 @@ if (!show_on) { perror("RSH/SSH command failed!"); } - exit(1); + exit(EXIT_FAILURE); } free(remote_command); @@ -1189,8 +1205,6 @@ fprintf(stderr, "\ta.out => " "name of MPI binary\n"); fprintf(stderr, "\targs => " "arguments for MPI binary\n"); fprintf(stderr, "\n"); - - exit(1); } /* finds first non-whitespace char in input string */ @@ -1221,7 +1235,7 @@ if (hf == NULL) { fprintf(stderr, "Can't open hostfile %s\n", hostfile_name); perror("open"); - exit(1); + exit(EXIT_FAILURE); } for (i = 0; i < nprocs; i++) { @@ -1287,7 +1301,7 @@ } else { fprintf(stderr, "End of file reached on " "hostfile at %d of %d hostnames\n", i, nprocs); - exit(1); + exit(EXIT_FAILURE); } } fclose(hf); @@ -1321,14 +1335,14 @@ if ((pf = fopen(paramfile, "r")) == NULL) { sprintf(errstr, "Cant open paramfile = %s", paramfile); perror(errstr); - exit(1); + exit(EXIT_FAILURE); } if ( strlen(env) == 0 ){ /* Allocating space for env first time */ if ((env = malloc(ENV_LEN)) == NULL) { fprintf(stderr, "Malloc of env failed in read_param_file\n"); - exit(1); + exit(EXIT_FAILURE); } env_left = ENV_LEN - 1; }else{ @@ -1367,7 +1381,7 @@ (ENV_LEN > e_len + 1 ? 
ENV_LEN : e_len + 1) + strlen(env); if ((env = realloc(env, newlen)) == NULL) { fprintf(stderr, "realloc failed in read_param_file\n"); - exit(1); + exit(EXIT_FAILURE); } if (param_debug) { printf("realloc to %d\n", newlen); @@ -1395,15 +1409,213 @@ } cleanup(); - exit(1); + exit(EXIT_FAILURE); } +void pglist_print(void) { + if(pglist) { + int i, j; + size_t npids = 0, npids_allocated = 0; + + fprintf(stderr, "\n--pglist--\ndata:\n"); + for(i = 0; i < pglist->npgs; i++) { + fprintf(stderr, "%p - %s:", &pglist->data[i], + pglist->data[i].hostname); + + for(j = 0; j < pglist->data[i].npids; j++) { + fprintf(stderr, " %d", pglist->data[i].pids[j]); + } + + fprintf(stderr, "\n"); + npids += pglist->data[i].npids; + npids_allocated += pglist->data[i].npids_allocated; + } + + fprintf(stderr, "\nindex:"); + for(i = 0; i < pglist->npgs; i++) { + fprintf(stderr, " %p", pglist->index[i]); + } + + fprintf(stderr, "\nnpgs/allocated: %d/%d (%d%%)\n", pglist->npgs, + pglist->npgs_allocated, (int)(pglist->npgs_allocated ? 100. * + pglist->npgs / pglist->npgs_allocated : 100.)); + fprintf(stderr, "npids/allocated: %d/%d (%d%%)\n", npids, + npids_allocated, (int)(npids_allocated ? 100. * npids / + npids_allocated : 100.)); + fprintf(stderr, "--pglist--\n\n"); + } +} + +void pglist_insert(const char * const hostname, const pid_t const pid) { + const size_t increment = nprocs > 4 ? 
nprocs / 4 : 1; + size_t index = 0, bottom = 0, top; + static size_t alloc_error = 0; + int i, strcmp_result; + process_group * pg; + void * backup_ptr; + + if(alloc_error) return; + if(pglist == NULL) goto init_pglist; + + top = pglist->npgs - 1; + index = (top + bottom) / 2; + + while(strcmp_result = strcmp(hostname, pglist->index[index]->hostname)) { + if(bottom >= top) break; + + if(strcmp_result > 0) { + bottom = index + 1; + } + + else { + top = index - 1; + } + + index = (top + bottom) / 2; + } + + if(!strcmp_result) goto insert_pid; + if(strcmp_result > 0) index++; + + goto add_process_group; + +init_pglist: + pglist = malloc(sizeof(process_groups)); + + if(pglist) { + pglist->data = NULL; + pglist->index = NULL; + pglist->npgs = 0; + pglist->npgs_allocated = 0; + } + + else { + goto register_alloc_error; + } + +add_process_group: + if(pglist->npgs == pglist->npgs_allocated) { + process_group * pglist_data_backup = pglist->data; + process_group ** pglist_index_backup = pglist->index; + ptrdiff_t offset; + + pglist->npgs_allocated += increment; + + backup_ptr = pglist->data; + pglist->data = realloc(pglist->data, sizeof(process_group) * + pglist->npgs_allocated); + + if(pglist->data == NULL) { + pglist->data = backup_ptr; + goto register_alloc_error; + } + + backup_ptr = pglist->index; + pglist->index = realloc(pglist->index, sizeof(process_group *) * + pglist->npgs_allocated); + + if(pglist->index == NULL) { + pglist->index = backup_ptr; + goto register_alloc_error; + } + + if(offset = (size_t)pglist->data - (size_t)pglist_data_backup) { + for(i = 0; i < pglist->npgs; i++) { + pglist->index[i] = (process_group *)((size_t)pglist->index[i] + + offset); + } + } + } + + for(i = pglist->npgs; i > index; i--) { + pglist->index[i] = pglist->index[i-1]; + } + + pglist->data[pglist->npgs].hostname = hostname; + pglist->data[pglist->npgs].pids = NULL; + pglist->data[pglist->npgs].npids = 0; + pglist->data[pglist->npgs].npids_allocated = 0; + + pglist->index[index] = 
&pglist->data[pglist->npgs++]; + +insert_pid: + pg = pglist->index[index]; + + if(pg->npids == pg->npids_allocated) { + if(pg->npids_allocated) { + pg->npids_allocated <<= 1; + + if(pg->npids_allocated < pg->npids) pg->npids_allocated = SIZE_MAX; + if(pg->npids_allocated > nprocs) pg->npids_allocated = nprocs; + } + + else { + pg->npids_allocated = 1; + } + + backup_ptr = pg->pids; + pg->pids = realloc(pg->pids, pg->npids_allocated * sizeof(pid_t)); + + if(pg->pids == NULL) { + pg->pids = backup_ptr; + goto register_alloc_error; + } + } + + pg->pids[pg->npids++] = pid; + + return; + +register_alloc_error: + if(pglist) { + if(pglist->data) { + process_group * pg = pglist->data; + + while(pglist->npgs--) { + if(pg->pids) free((pg++)->pids); + } + + free(pglist->data); + } + + if(pglist->index) free(pglist->index); + + free(pglist); + } + + alloc_error = 1; +} + +void free_memory(void) { + if(pglist) { + if(pglist->data) { + process_group * pg = pglist->data; + + while(pglist->npgs--) { + if(pg->pids) free((pg++)->pids); + } + + free(pglist->data); + } + + if(pglist->index) free(pglist->index); + + free(pglist); + } + + if(plist) { + while(nprocs--) { + if(plist[nprocs].device) free(plist[nprocs].device); + if(plist[nprocs].hostname) free(plist[nprocs].hostname); + } + + free(plist); + } +} + void cleanup(void) { int i; - /* could walk through list of processes, but it looks - like we can just send the signal to the process group - */ if (use_totalview) { fprintf(stderr, "Cleaning up all processes ..."); @@ -1417,39 +1629,183 @@ } for (i = 0; i < nprocs; i++) { - if (RUNNING(i)) { - /* send terminal interrupt, which will hopefully - propagate to the other side. (not sure what xterm will - do here. - */ - kill(plist[i].pid, SIGINT); - } + if (RUNNING(i)) { + /* send terminal interrupt, which will hopefully + propagate to the other side. (not sure what xterm will + do here. 
+ */ + kill(plist[i].pid, SIGINT); + } } + sleep(1); for (i = 0; i < nprocs; i++) { - if (plist[i].state != P_NOTSTARTED) { - /* send regular interrupt to rsh */ - kill(plist[i].pid, SIGTERM); - } + if (plist[i].state != P_NOTSTARTED) { + /* send regular interrupt to rsh */ + kill(plist[i].pid, SIGTERM); + } } sleep(1); for (i = 0; i < nprocs; i++) { - if (plist[i].state != P_NOTSTARTED) { - /* Kill the processes */ - kill(plist[i].pid, SIGKILL); - } + if (plist[i].state != P_NOTSTARTED) { + /* Kill the processes */ + kill(plist[i].pid, SIGKILL); + } } - fprintf(stderr, "done.\n"); + if(pglist) { + rkill_fast(); + } - exit(1); + else { + rkill_linear(); + } + exit(EXIT_FAILURE); } +void rkill_fast(void) { + int i, j, tryagain, spawned_pid[pglist->npgs]; + fprintf(stderr, "Killing remote processes..."); + + for(i = 0; i < pglist->npgs; i++) { + if(0 == (spawned_pid[i] = fork())) { + if(pglist->index[i]->npids) { + const size_t bufsize = 40 + 10 * pglist->index[i]->npids; + const process_group * pg = pglist->index[i]; + char kill_cmd[bufsize], tmp[10]; + + kill_cmd[0] = '\0'; + strcat(kill_cmd, "kill -s SIGKILL"); + + for(j = 0; j < pg->npids; j++) { + snprintf(tmp, 10, " %d", pg->pids[j]); + strcat(kill_cmd, tmp); + } + + strcat(kill_cmd, " >&/dev/null"); + + if(use_rsh) { + execl(RSH_CMD, RSH_CMD, pg->hostname, kill_cmd, NULL); + } + + else { + execl(SSH_CMD, SSH_CMD, SSH_ARG, "-x", pg->hostname, + kill_cmd, NULL); + } + + perror(NULL); + exit(EXIT_FAILURE); + } + + else { + exit(EXIT_SUCCESS); + } + } + } + + while(1) { + static int iteration = 0; + tryagain = 0; + + sleep(1 << iteration); + + for (i = 0; i < pglist->npgs; i++) { + if(spawned_pid[i]) { + if(!(spawned_pid[i] = waitpid(spawned_pid[i], NULL, WNOHANG))) { + tryagain = 1; + } + } + } + + if(++iteration == 5 || !tryagain) { + fprintf(stderr, "DONE\n"); + break; + } + } + + if(tryagain) { + fprintf(stderr, "The following processes may have not been killed:\n"); + for (i = 0; i < pglist->npgs; i++) { + 
if(spawned_pid[i]) { + const process_group * pg = pglist->index[i]; + + fprintf(stderr, "%s:", pg->hostname); + + for (j = 0; j < pg->npids; j++) { + fprintf(stderr, " %d", pg->pids[j]); + } + + fprintf(stderr, "\n"); + } + } + } +} + +void rkill_linear(void) { + int i, j, tryagain, spawned_pid[nprocs]; + + fprintf(stderr, "Killing remote processes..."); + + for (i = 0; i < nprocs; i++) { + if(0 == (spawned_pid[i] = fork())) { + char kill_cmd[80]; + + if(!plist[i].remote_pid) exit(EXIT_SUCCESS); + + snprintf(kill_cmd, 80, "kill -s SIGKILL %d >&/dev/null", + plist[i].remote_pid); + + if(use_rsh) { + execl(RSH_CMD, RSH_CMD, plist[i].hostname, kill_cmd, NULL); + } + + else { + execl(SSH_CMD, SSH_CMD, SSH_ARG, "-x", + plist[i].hostname, kill_cmd, NULL); + } + + perror(NULL); + exit(EXIT_FAILURE); + } + } + + while(1) { + static int iteration = 0; + tryagain = 0; + + sleep(1 << iteration); + + for (i = 0; i < nprocs; i++) { + if(spawned_pid[i]) { + if(!(spawned_pid[i] = waitpid(spawned_pid[i], NULL, WNOHANG))) { + tryagain = 1; + } + } + } + + if(++iteration == 5 || !tryagain) { + fprintf(stderr, "DONE\n"); + break; + } + } + + if(tryagain) { + fprintf(stderr, "The following processes may have not been killed:\n"); + for (i = 0; i < nprocs; i++) { + if(spawned_pid[i]) { + fprintf(stderr, "%s [%d]\n", plist[i].hostname, + plist[i].remote_pid); + } + } + } +} + + void nostop_handler(int signal) { printf("Stopping from the terminal not allowed\n"); @@ -1457,9 +1813,13 @@ void alarm_handler(int signal) { + extern const char * alarm_msg; + if (use_totalview) { fprintf(stderr, "Timeout alarm signaled\n"); } + + if(alarm_msg) fprintf(stderr, alarm_msg); cleanup(); } @@ -1467,19 +1827,21 @@ void child_handler(int signal) { int status, i, child, pid; - int exitstatus = 0; + int exitstatus = EXIT_SUCCESS; if (use_totalview) { fprintf(stderr, "mpirun: child died. Waiting for others.\n"); } alarm(10); + alarm_msg = "Child died. 
Timeout while waiting for others.\n"; + for (i = 0; i < nprocs; i++) { pid = wait(&status); if (pid == -1) { perror("wait"); - exitstatus = 1; + exitstatus = EXIT_FAILURE; } else if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - exitstatus = 1; + exitstatus = EXIT_FAILURE; } for (child = 0; child < nprocs; child++) { if (plist[child].pid == pid) { @@ -1489,9 +1851,11 @@ } if (child == nprocs) { fprintf(stderr, "Unable to find child %d!\n", pid); - exitstatus = 1; + exitstatus = EXIT_FAILURE; } } alarm(0); exit(exitstatus); } + +/* vi:set sw=4 sts=4 tw=80: */ Modified: mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h 2007-07-15 21:23:48 UTC (rev 1386) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_client.h 2007-07-16 13:45:27 UTC (rev 1387) @@ -108,6 +108,6 @@ * of the spawner, e.g. mpirun_rsh, to check that it understands * the version of the executable. */ -#define PMGR_VERSION 5 +#define PMGR_VERSION 6 #endif Modified: mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c 2007-07-15 21:23:48 UTC (rev 1386) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c 2007-07-16 13:45:27 UTC (rev 1387) @@ -171,6 +171,9 @@ int nwritten; int version; struct sockaddr_in sockaddr; + + if(phase != 0) return; + /* * Exchange information with the mpirun program. Send it our * socket address, get back addresses for our siblings. 
@@ -208,14 +211,12 @@ */ version = PMGR_VERSION; - if (0 == phase) { - /* first, send a version number */ - nwritten = write(mpirun_socket, &version, sizeof(version)); - if (nwritten != sizeof(version)) { - sleep(2); - perror("write"); - exit(1); - } + /* first, send a version number */ + nwritten = write(mpirun_socket, &version, sizeof(version)); + if (nwritten != sizeof(version)) { + sleep(2); + perror("write"); + exit(1); } /* next, send our rank */ @@ -264,7 +265,6 @@ tot_nread = tot_nread + nread; } fflush(stdout); - close(mpirun_socket); return 1; } @@ -280,7 +280,6 @@ pid_t *ppids = (pid_t *)pallpids; pid_t *allpids = NULL; - pmgr_init_connection(1); /* next, send size of addr */ nwritten = write(mpirun_socket, &addrlen, sizeof(addrlen)); if (nwritten != sizeof(addrlen)) { @@ -314,7 +313,7 @@ exit(1); } - /* next, send size of addr */ + /* next, send size of pid */ nwritten = write(mpirun_socket, &pidlen, sizeof(pidlen)); if (nwritten != sizeof(mypid_len)) { sleep(2); @@ -322,6 +321,7 @@ exit(1); } + /* next, send our pid */ if (pidlen != 0) { nwritten = write(mpirun_socket, &my_pid_int, (size_t) pidlen); if (nwritten != pidlen) { @@ -345,7 +345,7 @@ if (pidlen != 0) { tot_nread=0; - /* finally, read addresses from all processes */ + /* finally, read pids from all processes */ while (tot_nread < pmgr_nprocs*pidlen) { nread = read(mpirun_socket, (void*)((char *)allpids+tot_nread), (size_t) ((pmgr_nprocs*pidlen)-tot_nread)); From mamidala at mvapich.cse.ohio-state.edu Tue Jul 17 12:01:22 2007 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Tue Jul 17 12:01:44 2007 Subject: [mvapich-commit] r1388 - mvapich/branches/0.9.9/src/context Message-ID: <200707171601.l6HG1M3b029595@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2007-07-17 12:01:20 -0400 (Tue, 17 Jul 2007) New Revision: 1388 Modified: mvapich/branches/0.9.9/src/context/comm_free.c Log: checking in the patch for MPI_Finalize seg. 
fault Modified: mvapich/branches/0.9.9/src/context/comm_free.c =================================================================== --- mvapich/branches/0.9.9/src/context/comm_free.c 2007-07-16 13:45:27 UTC (rev 1387) +++ mvapich/branches/0.9.9/src/context/comm_free.c 2007-07-17 16:01:20 UTC (rev 1388) @@ -59,6 +59,9 @@ #define DBG(a) #define OUTFILE stdout +#if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) +int flag = 0; +#endif extern int enable_rdma_collectives; #ifdef _SMP_ extern int enable_shmem_collectives; @@ -183,7 +186,7 @@ #endif #if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) - if((comm->comm_coll == comm) && (comm->comm_type == MPIR_INTRA) && (enable_shmem_collectives)) { + if((comm->comm_coll == comm) && (comm->comm_type == MPIR_INTRA) && (enable_shmem_collectives) && (!flag)) { free_2level_comm(comm); } #endif @@ -214,7 +217,15 @@ /* Free collective communicator (unless it refers back to myself) */ if ( comm->comm_coll != comm ) { MPI_Comm ctmp = comm->comm_coll->self; +#if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) + if (comm->self == MPI_COMM_SELF){ + flag = 1; + } +#endif MPI_Comm_free ( &ctmp ); +#if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) + flag = 0; +#endif } /* Put this after freeing the collective comm because it may have From mamidala at mvapich.cse.ohio-state.edu Tue Jul 17 12:02:56 2007 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Tue Jul 17 12:03:15 2007 Subject: [mvapich-commit] r1389 - mvapich/trunk/src/context Message-ID: <200707171602.l6HG2u7G029605@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2007-07-17 12:02:55 -0400 (Tue, 17 Jul 2007) New Revision: 1389 Modified: mvapich/trunk/src/context/comm_free.c Log: checking in patch for MPI_Finalize seg. 
fault Modified: mvapich/trunk/src/context/comm_free.c =================================================================== --- mvapich/trunk/src/context/comm_free.c 2007-07-17 16:01:20 UTC (rev 1388) +++ mvapich/trunk/src/context/comm_free.c 2007-07-17 16:02:55 UTC (rev 1389) @@ -59,6 +59,9 @@ #define DBG(a) #define OUTFILE stdout +#if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) +int flag = 0; +#endif extern int enable_rdma_collectives; #ifdef _SMP_ extern int enable_shmem_collectives; @@ -183,7 +186,7 @@ #endif #if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) - if((comm->comm_coll == comm) && (comm->comm_type == MPIR_INTRA) && (enable_shmem_collectives)) { + if((comm->comm_coll == comm) && (comm->comm_type == MPIR_INTRA) && (enable_shmem_collectives) && (!flag)) { free_2level_comm(comm); } #endif @@ -214,7 +217,15 @@ /* Free collective communicator (unless it refers back to myself) */ if ( comm->comm_coll != comm ) { MPI_Comm ctmp = comm->comm_coll->self; +#if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) + if (comm->self == MPI_COMM_SELF){ + flag = 1; + } +#endif MPI_Comm_free ( &ctmp ); +#if (defined(_SMP_) && (defined(CH_GEN2))) ||defined(CH_SMP) + flag = 0; +#endif } /* Put this after freeing the collective comm because it may have From surs at mvapich.cse.ohio-state.edu Fri Jul 20 15:35:10 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Fri Jul 20 15:35:31 2007 Subject: [mvapich-commit] r1396 - mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200707201935.l6KJZAoS006702@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-07-20 15:35:08 -0400 (Fri, 20 Jul 2007) New Revision: 1396 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c 
mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h Log: Patch to fix hang for memory allocations whose size is bigger than 2^31-1 Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c 2007-07-20 18:31:07 UTC (rev 1395) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c 2007-07-20 19:35:08 UTC (rev 1396) @@ -953,9 +953,9 @@ } #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len) +void find_and_free_dregs_inside(void *buf, size_t len) { - int i; + unsigned long i; unsigned long pagenum_low, pagenum_high; unsigned long npages, begin, end; unsigned long user_low_a, user_high_a; Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h 2007-07-20 18:31:07 UTC (rev 1395) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h 2007-07-20 19:35:08 UTC (rev 1396) @@ -240,7 +240,7 @@ dreg_entry *dreg_new_entry(void *buf, int len); #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len); +void find_and_free_dregs_inside(void *buf, size_t len); #endif #ifdef CKPT Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c 2007-07-20 18:31:07 UTC (rev 1395) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c 2007-07-20 19:35:08 UTC (rev 1396) @@ -96,7 +96,7 @@ #ifndef DISABLE_MUNMAP_HOOK -int mvapich2_munmap(void *buf, int len) +int mvapich2_munmap(void *buf, size_t len) { if(!mvapich2_minfo.munmap) { set_real_munmap_ptr(); 
Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h 2007-07-20 18:31:07 UTC (rev 1395) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h 2007-07-20 19:35:08 UTC (rev 1396) @@ -42,7 +42,7 @@ void mvapich2_mfin(void); #ifndef DISABLE_MUNMAP_HOOK -int mvapich2_munmap(void *buf, int len); +int mvapich2_munmap(void *buf, size_t len); #endif #ifndef DISABLE_TRAP_SBRK From surs at mvapich.cse.ohio-state.edu Fri Jul 20 17:03:13 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Fri Jul 20 17:03:36 2007 Subject: [mvapich-commit] r1397 - mvapich/trunk/mpid/ch_gen2 Message-ID: <200707202103.l6KL3DkM006843@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-07-20 17:03:07 -0400 (Fri, 20 Jul 2007) New Revision: 1397 Modified: mvapich/trunk/mpid/ch_gen2/dreg.c mvapich/trunk/mpid/ch_gen2/dreg.h mvapich/trunk/mpid/ch_gen2/mem_hooks.c mvapich/trunk/mpid/ch_gen2/mem_hooks.h Log: Patch to fix hang for memory allocations whose size is bigger than 2^31-1 Modified: mvapich/trunk/mpid/ch_gen2/dreg.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/dreg.c 2007-07-20 19:35:08 UTC (rev 1396) +++ mvapich/trunk/mpid/ch_gen2/dreg.c 2007-07-20 21:03:07 UTC (rev 1397) @@ -974,9 +974,9 @@ #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len) +void find_and_free_dregs_inside(void *buf, size_t len) { - int i; + unsigned long i; unsigned long pagenum_low, pagenum_high; unsigned long npages, begin, end; unsigned long user_low_a, user_high_a; Modified: mvapich/trunk/mpid/ch_gen2/dreg.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/dreg.h 2007-07-20 19:35:08 UTC (rev 1396) +++ mvapich/trunk/mpid/ch_gen2/dreg.h 2007-07-20 21:03:07 
UTC (rev 1397) @@ -180,7 +180,7 @@ dreg_entry *dreg_new_entry(void *buf, int len, int acl); #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len); +void find_and_free_dregs_inside(void *buf, size_t len); #endif #endif /* _DREG_H */ Modified: mvapich/trunk/mpid/ch_gen2/mem_hooks.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/mem_hooks.c 2007-07-20 19:35:08 UTC (rev 1396) +++ mvapich/trunk/mpid/ch_gen2/mem_hooks.c 2007-07-20 21:03:07 UTC (rev 1397) @@ -94,7 +94,7 @@ #ifndef DISABLE_MUNMAP_HOOK -int mvapich_munmap(void *buf, int len) +int mvapich_munmap(void *buf, size_t len) { if(!mvapich_minfo.munmap) { set_real_munmap_ptr(); Modified: mvapich/trunk/mpid/ch_gen2/mem_hooks.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/mem_hooks.h 2007-07-20 19:35:08 UTC (rev 1396) +++ mvapich/trunk/mpid/ch_gen2/mem_hooks.h 2007-07-20 21:03:07 UTC (rev 1397) @@ -42,7 +42,7 @@ void mvapich_mfin(void); #ifndef DISABLE_MUNMAP_HOOK -int mvapich_munmap(void *buf, int len); +int mvapich_munmap(void *buf, size_t len); #endif #ifndef DISABLE_TRAP_SBRK From surs at mvapich.cse.ohio-state.edu Sat Jul 21 11:41:59 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Sat Jul 21 11:42:19 2007 Subject: [mvapich-commit] r1398 - mvapich2/branches/0.9.8 Message-ID: <200707211541.l6LFfx3p013321@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-07-21 11:41:57 -0400 (Sat, 21 Jul 2007) New Revision: 1398 Modified: mvapich2/branches/0.9.8/CHANGELOG Log: Add credits Modified: mvapich2/branches/0.9.8/CHANGELOG =================================================================== --- mvapich2/branches/0.9.8/CHANGELOG 2007-07-20 21:03:07 UTC (rev 1397) +++ mvapich2/branches/0.9.8/CHANGELOG 2007-07-21 15:41:57 UTC (rev 1398) @@ -3,9 +3,14 @@ This file briefly describes the latest changes to the MVAPICH2 software package. 
The logs are arranged in the "most recent first" order. +07/20/2007 +* Fix for hang in memory allocations > 2^31 - 1. + Thanks to Bryan Putnam (Purdue) for reporting this. + 07/10/2007 -* Fix for RDMA_CM finalize rdma_destroy_id failure. Added Timeout env variable for RDMA_CM - ARP. Thanks to Steve Wise for suggesting these. +* Fix for RDMA_CM finalize rdma_destroy_id failure. + Added Timeout env variable for RDMA_CM ARP. + Thanks to Steve Wise for suggesting these. 03/28/2007 * Fix for RDMA_CM invalid event in finalize. Thanks to Steve Wise and Sean Hefty. From mamidala at mvapich.cse.ohio-state.edu Wed Jul 25 11:17:46 2007 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 11:18:06 2007 Subject: [mvapich-commit] r1409 - mvapich/branches/0.9.9/src/context Message-ID: <200707251517.l6PFHkf6009973@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2007-07-25 11:17:46 -0400 (Wed, 25 Jul 2007) New Revision: 1409 Modified: mvapich/branches/0.9.9/src/context/create_2level_comm.c Log: checking in fix for multiple concurrent communicator hang. Thanks to Pasha from Mellanox for reporting this problem. 
Modified: mvapich/branches/0.9.9/src/context/create_2level_comm.c =================================================================== --- mvapich/branches/0.9.9/src/context/create_2level_comm.c 2007-07-24 21:31:19 UTC (rev 1408) +++ mvapich/branches/0.9.9/src/context/create_2level_comm.c 2007-07-25 15:17:46 UTC (rev 1409) @@ -35,7 +35,6 @@ int shmem_coll_blocks=4; int shmem_comm_count = 0; extern shmem_coll_region *shmem_coll; -static pthread_mutex_t shmem_coll_lock = PTHREAD_MUTEX_INITIALIZER; void clear_2level_comm (struct MPIR_COMMUNICATOR* comm_ptr) { @@ -140,11 +139,11 @@ MPI_Comm_rank(comm_ptr->shmem_comm, &my_local_id); if (my_local_id == 0){ - pthread_mutex_lock(&shmem_coll_lock); + pthread_mutex_lock(&shmem_coll->shmem_coll_lock); shmem_coll->shmem_comm_count++; - pthread_mutex_unlock(&shmem_coll_lock); + shmem_comm_count = shmem_coll->shmem_comm_count; + pthread_mutex_unlock(&shmem_coll->shmem_coll_lock); } - shmem_comm_count = shmem_coll->shmem_comm_count; MPI_Bcast (&shmem_comm_count, 1, MPI_INT, 0, comm_ptr->shmem_comm); From mamidala at mvapich.cse.ohio-state.edu Wed Jul 25 11:19:27 2007 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 11:19:47 2007 Subject: [mvapich-commit] r1410 - mvapich/branches/0.9.9/mpid/ch_gen2 Message-ID: <200707251519.l6PFJRwm009983@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2007-07-25 11:19:27 -0400 (Wed, 25 Jul 2007) New Revision: 1410 Modified: mvapich/branches/0.9.9/mpid/ch_gen2/coll_shmem.h mvapich/branches/0.9.9/mpid/ch_gen2/shmem_coll.c Log: checking in fix for multiple communicator problem. Thanks to Pasha from Mellanox for reporting this. 
Modified: mvapich/branches/0.9.9/mpid/ch_gen2/coll_shmem.h =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/coll_shmem.h 2007-07-25 15:17:46 UTC (rev 1409) +++ mvapich/branches/0.9.9/mpid/ch_gen2/coll_shmem.h 2007-07-25 15:19:27 UTC (rev 1410) @@ -100,6 +100,7 @@ volatile int barrier_gather[SHMEM_COLL_NUM_COMM][SHMEM_COLL_NUM_PROCS]; volatile int barrier_bcast[SHMEM_COLL_NUM_COMM][SHMEM_COLL_NUM_PROCS]; volatile int shmem_comm_count; + pthread_mutex_t shmem_coll_lock; /* the collective buffer */ char shmem_coll_buf; Modified: mvapich/branches/0.9.9/mpid/ch_gen2/shmem_coll.c =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/shmem_coll.c 2007-07-25 15:17:46 UTC (rev 1409) +++ mvapich/branches/0.9.9/mpid/ch_gen2/shmem_coll.c 2007-07-25 15:19:27 UTC (rev 1410) @@ -163,6 +163,7 @@ shmem_coll->root_complete_gather[j][i] = 1; } } + pthread_mutex_init(&shmem_coll->shmem_coll_lock,NULL); } return MPI_SUCCESS; From mamidala at mvapich.cse.ohio-state.edu Wed Jul 25 11:38:38 2007 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 11:38:59 2007 Subject: [mvapich-commit] r1411 - in mvapich/trunk: mpid/ch_gen2 src/context Message-ID: <200707251538.l6PFcc7p010012@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2007-07-25 11:38:36 -0400 (Wed, 25 Jul 2007) New Revision: 1411 Modified: mvapich/trunk/mpid/ch_gen2/coll_shmem.h mvapich/trunk/mpid/ch_gen2/shmem_coll.c mvapich/trunk/src/context/create_2level_comm.c Log: checking in fix for multiple communicator hang problem. Thanks to Pasha from Mellanox for reporting this. 
Modified: mvapich/trunk/mpid/ch_gen2/coll_shmem.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/coll_shmem.h 2007-07-25 15:19:27 UTC (rev 1410) +++ mvapich/trunk/mpid/ch_gen2/coll_shmem.h 2007-07-25 15:38:36 UTC (rev 1411) @@ -100,6 +100,7 @@ volatile int barrier_gather[SHMEM_COLL_NUM_COMM][SHMEM_COLL_NUM_PROCS]; volatile int barrier_bcast[SHMEM_COLL_NUM_COMM][SHMEM_COLL_NUM_PROCS]; volatile int shmem_comm_count; + pthread_mutex_t shmem_coll_lock; /* the collective buffer */ char shmem_coll_buf; Modified: mvapich/trunk/mpid/ch_gen2/shmem_coll.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2007-07-25 15:19:27 UTC (rev 1410) +++ mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2007-07-25 15:38:36 UTC (rev 1411) @@ -163,6 +163,7 @@ shmem_coll->root_complete_gather[j][i] = 1; } } + pthread_mutex_init(&shmem_coll->shmem_coll_lock,NULL); } return MPI_SUCCESS; Modified: mvapich/trunk/src/context/create_2level_comm.c =================================================================== --- mvapich/trunk/src/context/create_2level_comm.c 2007-07-25 15:19:27 UTC (rev 1410) +++ mvapich/trunk/src/context/create_2level_comm.c 2007-07-25 15:38:36 UTC (rev 1411) @@ -35,7 +35,6 @@ int shmem_coll_blocks=4; int shmem_comm_count = 0; extern shmem_coll_region *shmem_coll; -static pthread_mutex_t shmem_coll_lock = PTHREAD_MUTEX_INITIALIZER; void clear_2level_comm (struct MPIR_COMMUNICATOR* comm_ptr) { @@ -140,11 +139,11 @@ MPI_Comm_rank(comm_ptr->shmem_comm, &my_local_id); if (my_local_id == 0){ - pthread_mutex_lock(&shmem_coll_lock); + pthread_mutex_lock(&shmem_coll->shmem_coll_lock); shmem_coll->shmem_comm_count++; - pthread_mutex_unlock(&shmem_coll_lock); + shmem_comm_count = shmem_coll->shmem_comm_count; + pthread_mutex_unlock(&shmem_coll->shmem_coll_lock); } - shmem_comm_count = shmem_coll->shmem_comm_count; MPI_Bcast (&shmem_comm_count, 1, MPI_INT, 0, comm_ptr->shmem_comm); 
From surs at mvapich.cse.ohio-state.edu Wed Jul 25 12:33:45 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 12:34:06 2007 Subject: [mvapich-commit] r1414 - in mvapich/branches/0.9.9: . mpid/ch_gen2 Message-ID: <200707251633.l6PGXjhH010129@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-07-25 12:33:43 -0400 (Wed, 25 Jul 2007) New Revision: 1414 Modified: mvapich/branches/0.9.9/CHANGELOG mvapich/branches/0.9.9/mpid/ch_gen2/dreg.c mvapich/branches/0.9.9/mpid/ch_gen2/dreg.h mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.c mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.h Log: Patch to fix hang for memory allocations whose size is bigger than 2^31-1. Thanks to Bryan Putnam (Purdue) for reporting this. Modified: mvapich/branches/0.9.9/CHANGELOG =================================================================== --- mvapich/branches/0.9.9/CHANGELOG 2007-07-25 16:16:09 UTC (rev 1413) +++ mvapich/branches/0.9.9/CHANGELOG 2007-07-25 16:33:43 UTC (rev 1414) @@ -4,6 +4,11 @@ This file briefly describes the latest changes to MVAPICH software package. The logs are arranged in the "most recent first" order. +07/25/2007 + +* Patch to fix hang for memory allocations whose size is bigger than 2^31-1. + Thanks to Bryan Putnam (Purdue) for reporting this. 
+ 04/25/2007 * Made shared memory macros tunable at run time Modified: mvapich/branches/0.9.9/mpid/ch_gen2/dreg.c =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/dreg.c 2007-07-25 16:16:09 UTC (rev 1413) +++ mvapich/branches/0.9.9/mpid/ch_gen2/dreg.c 2007-07-25 16:33:43 UTC (rev 1414) @@ -974,9 +974,9 @@ #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len) +void find_and_free_dregs_inside(void *buf, size_t len) { - int i; + unsigned long i; unsigned long pagenum_low, pagenum_high; unsigned long npages, begin, end; unsigned long user_low_a, user_high_a; Modified: mvapich/branches/0.9.9/mpid/ch_gen2/dreg.h =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/dreg.h 2007-07-25 16:16:09 UTC (rev 1413) +++ mvapich/branches/0.9.9/mpid/ch_gen2/dreg.h 2007-07-25 16:33:43 UTC (rev 1414) @@ -180,7 +180,7 @@ dreg_entry *dreg_new_entry(void *buf, int len, int acl); #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len); +void find_and_free_dregs_inside(void *buf, size_t len); #endif #endif /* _DREG_H */ Modified: mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.c =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.c 2007-07-25 16:16:09 UTC (rev 1413) +++ mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.c 2007-07-25 16:33:43 UTC (rev 1414) @@ -94,7 +94,7 @@ #ifndef DISABLE_MUNMAP_HOOK -int mvapich_munmap(void *buf, int len) +int mvapich_munmap(void *buf, size_t len) { if(!mvapich_minfo.munmap) { set_real_munmap_ptr(); Modified: mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.h =================================================================== --- mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.h 2007-07-25 16:16:09 UTC (rev 1413) +++ mvapich/branches/0.9.9/mpid/ch_gen2/mem_hooks.h 2007-07-25 16:33:43 UTC (rev 1414) @@ -42,7 +42,7 @@ void mvapich_mfin(void); 
#ifndef DISABLE_MUNMAP_HOOK -int mvapich_munmap(void *buf, int len); +int mvapich_munmap(void *buf, size_t len); #endif #ifndef DISABLE_TRAP_SBRK From mamidala at mvapich.cse.ohio-state.edu Wed Jul 25 13:49:30 2007 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 13:49:53 2007 Subject: [mvapich-commit] r1415 - in mvapich2/branches/0.9.8/src: mpi/comm mpid/osu_ch3/channels/mrail/src/rdma Message-ID: <200707251749.l6PHnUQY010262@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2007-07-25 13:49:29 -0400 (Wed, 25 Jul 2007) New Revision: 1415 Modified: mvapich2/branches/0.9.8/src/mpi/comm/create_2level_comm.c mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_shmem_coll.c mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/coll_shmem.h Log: checking in patch to solve the concurrent communicator hang problem. Observed with running IMB with -multi option. Modified: mvapich2/branches/0.9.8/src/mpi/comm/create_2level_comm.c =================================================================== --- mvapich2/branches/0.9.8/src/mpi/comm/create_2level_comm.c 2007-07-25 16:33:43 UTC (rev 1414) +++ mvapich2/branches/0.9.8/src/mpi/comm/create_2level_comm.c 2007-07-25 17:49:29 UTC (rev 1415) @@ -24,7 +24,6 @@ int shmem_comm_count = 0; extern shmem_coll_region *shmem_coll; -static pthread_mutex_t shmem_coll_lock = PTHREAD_MUTEX_INITIALIZER; extern int shmem_coll_blocks; void clear_2level_comm (MPID_Comm* comm_ptr) @@ -138,12 +137,12 @@ MPI_Comm_rank(comm_ptr->shmem_comm, &my_local_id); if (my_local_id == 0){ - pthread_mutex_lock(&shmem_coll_lock); + pthread_mutex_lock(&shmem_coll->shmem_coll_lock); shmem_coll->shmem_comm_count++; - pthread_mutex_unlock(&shmem_coll_lock); + shmem_comm_count = shmem_coll->shmem_comm_count; + pthread_mutex_unlock(&shmem_coll->shmem_coll_lock); } - shmem_comm_count = shmem_coll->shmem_comm_count; MPI_Bcast (&shmem_comm_count, 1, MPI_INT, 0, comm_ptr->shmem_comm); if 
(shmem_comm_count <= shmem_coll_blocks){ Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_shmem_coll.c =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_shmem_coll.c 2007-07-25 16:33:43 UTC (rev 1414) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/ch3_shmem_coll.c 2007-07-25 17:49:29 UTC (rev 1415) @@ -180,6 +180,7 @@ shmem_coll->root_complete_gather[j][i] = 1; } } + pthread_mutex_init(&shmem_coll->shmem_coll_lock,NULL); } return MPI_SUCCESS; Modified: mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/coll_shmem.h =================================================================== --- mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/coll_shmem.h 2007-07-25 16:33:43 UTC (rev 1414) +++ mvapich2/branches/0.9.8/src/mpid/osu_ch3/channels/mrail/src/rdma/coll_shmem.h 2007-07-25 17:49:29 UTC (rev 1415) @@ -99,6 +99,7 @@ volatile int barrier_gather[SHMEM_COLL_NUM_COMM][SHMEM_COLL_NUM_PROCS]; volatile int barrier_bcast[SHMEM_COLL_NUM_COMM][SHMEM_COLL_NUM_PROCS]; volatile int shmem_comm_count; + pthread_mutex_t shmem_coll_lock; /* the collective buffer */ char shmem_coll_buf; From mamidala at mvapich.cse.ohio-state.edu Wed Jul 25 13:51:23 2007 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 13:51:44 2007 Subject: [mvapich-commit] r1416 - mvapich2/branches/0.9.8 Message-ID: <200707251751.l6PHpNed010278@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2007-07-25 13:51:23 -0400 (Wed, 25 Jul 2007) New Revision: 1416 Modified: mvapich2/branches/0.9.8/CHANGELOG Log: committing changelog Modified: mvapich2/branches/0.9.8/CHANGELOG =================================================================== --- mvapich2/branches/0.9.8/CHANGELOG 2007-07-25 17:49:29 UTC (rev 1415) +++ mvapich2/branches/0.9.8/CHANGELOG 2007-07-25 17:51:23 UTC (rev 1416) @@ -3,6 +3,10 @@ 
This file briefly describes the latest changes to the MVAPICH2 software package. The logs are arranged in the "most recent first" order. +07/25/2007 +* Fix for hang while using IMB with -multi option. + Thanks to Pasha (Mellanox) for reporting this. + 07/20/2007 * Fix for hang in memory allocations > 2^31 - 1. Thanks to Bryan Putnam (Purdue) for reporting this. From huangwei at mvapich.cse.ohio-state.edu Wed Jul 25 21:33:29 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 21:33:50 2007 Subject: [mvapich-commit] r1424 - in mvapich2/trunk/src/mpi: coll comm init Message-ID: <200707260133.l6Q1XT6Z011364@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-25 21:33:29 -0400 (Wed, 25 Jul 2007) New Revision: 1424 Modified: mvapich2/trunk/src/mpi/coll/allreduce.c mvapich2/trunk/src/mpi/coll/barrier.c mvapich2/trunk/src/mpi/coll/reduce.c mvapich2/trunk/src/mpi/comm/comm_create.c mvapich2/trunk/src/mpi/comm/comm_dup.c mvapich2/trunk/src/mpi/comm/comm_split.c mvapich2/trunk/src/mpi/init/init.c mvapich2/trunk/src/mpi/init/initthread.c Log: License information for shm_coll files Modified: mvapich2/trunk/src/mpi/coll/allreduce.c =================================================================== --- mvapich2/trunk/src/mpi/coll/allreduce.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/coll/allreduce.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. 
+ * + */ + #include "mpiimpl.h" /* -- Begin Profiling Symbol Block for routine MPI_Allreduce */ Modified: mvapich2/trunk/src/mpi/coll/barrier.c =================================================================== --- mvapich2/trunk/src/mpi/coll/barrier.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/coll/barrier.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. + * + */ + #include "mpiimpl.h" /* -- Begin Profiling Symbol Block for routine MPI_Barrier */ Modified: mvapich2/trunk/src/mpi/coll/reduce.c =================================================================== --- mvapich2/trunk/src/mpi/coll/reduce.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/coll/reduce.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. 
+ * + */ + #include "mpiimpl.h" /* -- Begin Profiling Symbol Block for routine MPI_Reduce */ Modified: mvapich2/trunk/src/mpi/comm/comm_create.c =================================================================== --- mvapich2/trunk/src/mpi/comm/comm_create.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/comm/comm_create.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. + * + */ + #include "mpiimpl.h" #include "mpicomm.h" Modified: mvapich2/trunk/src/mpi/comm/comm_dup.c =================================================================== --- mvapich2/trunk/src/mpi/comm/comm_dup.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/comm/comm_dup.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. 
+ * + */ + #include "mpiimpl.h" #include "mpicomm.h" Modified: mvapich2/trunk/src/mpi/comm/comm_split.c =================================================================== --- mvapich2/trunk/src/mpi/comm/comm_split.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/comm/comm_split.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. + * + */ + #include "mpiimpl.h" #include "mpicomm.h" Modified: mvapich2/trunk/src/mpi/init/init.c =================================================================== --- mvapich2/trunk/src/mpi/init/init.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/init/init.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. 
+ * + */ + #include "mpiimpl.h" #include "mpi_init.h" Modified: mvapich2/trunk/src/mpi/init/initthread.c =================================================================== --- mvapich2/trunk/src/mpi/init/initthread.c 2007-07-26 01:30:05 UTC (rev 1423) +++ mvapich2/trunk/src/mpi/init/initthread.c 2007-07-26 01:33:29 UTC (rev 1424) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. + * + */ + #include "mpiimpl.h" #include "datatype.h" #include "mpi_init.h" From huangwei at mvapich.cse.ohio-state.edu Wed Jul 25 21:47:58 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 21:48:19 2007 Subject: [mvapich-commit] r1426 - mvapich2/trunk/src/mpid/osu_ch3/src Message-ID: <200707260147.l6Q1lwHV011396@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-25 21:47:58 -0400 (Wed, 25 Jul 2007) New Revision: 1426 Modified: mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c mvapich2/trunk/src/mpid/osu_ch3/src/mpid_vc.c Log: Add license information Modified: mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c 2007-07-26 01:40:14 UTC (rev 1425) +++ mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c 2007-07-26 01:47:58 UTC (rev 1426) @@ -4,6 +4,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. 
+ * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. + * + */ + #include "mpidimpl.h" #if defined(HAVE_LIMITS_H) Modified: mvapich2/trunk/src/mpid/osu_ch3/src/mpid_vc.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/src/mpid_vc.c 2007-07-26 01:40:14 UTC (rev 1425) +++ mvapich2/trunk/src/mpid/osu_ch3/src/mpid_vc.c 2007-07-26 01:47:58 UTC (rev 1426) @@ -4,6 +4,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. 
+ * + */ + #include "mpidimpl.h" /*S From huangwei at mvapich.cse.ohio-state.edu Wed Jul 25 22:09:15 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 22:09:35 2007 Subject: [mvapich-commit] r1430 - mvapich2/trunk Message-ID: <200707260209.l6Q29F7g011517@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-25 22:09:15 -0400 (Wed, 25 Jul 2007) New Revision: 1430 Modified: mvapich2/trunk/LICENSE.TXT Log: LICENSE file should be 2007 Modified: mvapich2/trunk/LICENSE.TXT =================================================================== --- mvapich2/trunk/LICENSE.TXT 2007-07-26 02:05:30 UTC (rev 1429) +++ mvapich2/trunk/LICENSE.TXT 2007-07-26 02:09:15 UTC (rev 1430) @@ -1,6 +1,6 @@ MVAPICH2 -Copyright 2003-2006 The Ohio State University. +Copyright 2003-2007 The Ohio State University. Portions Copyright 1999-2002 The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from U.S. Dept. of Energy). 
From huangwei at mvapich.cse.ohio-state.edu Wed Jul 25 22:15:13 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 22:15:33 2007 Subject: [mvapich-commit] r1431 - in mvapich2/trunk: osu_benchmarks src/mpid/osu_ch3/channels/mrail/src/udapl src/pm/mpd Message-ID: <200707260215.l6Q2FD5Q011554@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-25 22:15:12 -0400 (Wed, 25 Jul 2007) New Revision: 1431 Modified: mvapich2/trunk/osu_benchmarks/osu_bibw.c mvapich2/trunk/osu_benchmarks/osu_bw.c mvapich2/trunk/osu_benchmarks/osu_latency.c mvapich2/trunk/osu_benchmarks/osu_latency_mt.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.h mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.h mvapich2/trunk/src/pm/mpd/mv2_checkpoint Log: More license files ... Modified: mvapich2/trunk/osu_benchmarks/osu_bibw.c =================================================================== --- mvapich2/trunk/osu_benchmarks/osu_bibw.c 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/osu_benchmarks/osu_bibw.c 2007-07-26 02:15:12 UTC (rev 1431) @@ -3,7 +3,7 @@ * OSU MPI Bidirectional bandwidth test v2.3 */ /* - * Copyright (C) 2002-2006 the Network-Based Computing Laboratory + * Copyright (C) 2002-2007 the Network-Based Computing Laboratory * (NBCL), The Ohio State University. * * Contact: Dr. D. K. 
Panda (panda@cse.ohio-state.edu) Modified: mvapich2/trunk/osu_benchmarks/osu_bw.c =================================================================== --- mvapich2/trunk/osu_benchmarks/osu_bw.c 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/osu_benchmarks/osu_bw.c 2007-07-26 02:15:12 UTC (rev 1431) @@ -3,7 +3,7 @@ * OSU MPI Bandwidth test v2.3 */ /* - * Copyright (C) 2002-2006 the Network-Based Computing Laboratory + * Copyright (C) 2002-2007 the Network-Based Computing Laboratory * (NBCL), The Ohio State University. * * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) Modified: mvapich2/trunk/osu_benchmarks/osu_latency.c =================================================================== --- mvapich2/trunk/osu_benchmarks/osu_latency.c 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/osu_benchmarks/osu_latency.c 2007-07-26 02:15:12 UTC (rev 1431) @@ -3,7 +3,7 @@ * OSU MPI Latency test v2.2 */ /* - * Copyright (C) 2002-2006 the Network-Based Computing Laboratory + * Copyright (C) 2002-2007 the Network-Based Computing Laboratory * (NBCL), The Ohio State University. * * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) Modified: mvapich2/trunk/osu_benchmarks/osu_latency_mt.c =================================================================== --- mvapich2/trunk/osu_benchmarks/osu_latency_mt.c 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/osu_benchmarks/osu_latency_mt.c 2007-07-26 02:15:12 UTC (rev 1431) @@ -3,7 +3,7 @@ */ /* - * Copyright (C) 2002-2006 the Network-Based Computing Laboratory + * Copyright (C) 2002-2007 the Network-Based Computing Laboratory * (NBCL), The Ohio State University. * * Contact: Dr. D. K. 
Panda (panda@cse.ohio-state.edu) Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.c 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.c 2007-07-26 02:15:12 UTC (rev 1431) @@ -12,7 +12,7 @@ * **/ -/* Copyright (c) 2002-2006, The Ohio State University. All rights +/* Copyright (c) 2002-2007, The Ohio State University. All rights * reserved. * * This file is part of the MVAPICH2 software package developed by the Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.h =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.h 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/avl.h 2007-07-26 02:15:12 UTC (rev 1431) @@ -10,7 +10,7 @@ * **/ -/* Copyright (c) 2002-2006, The Ohio State University. All rights +/* Copyright (c) 2002-2007, The Ohio State University. All rights * reserved. * * This file is part of the MVAPICH2 software package developed by the Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.c 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.c 2007-07-26 02:15:12 UTC (rev 1431) @@ -12,7 +12,7 @@ * Michael Welcome */ -/* Copyright (c) 2002-2006, The Ohio State University. All rights +/* Copyright (c) 2002-2007, The Ohio State University. All rights * reserved. 
* * This file is part of the MVAPICH2 software package developed by the Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.h =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.h 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/udapl/dreg.h 2007-07-26 02:15:12 UTC (rev 1431) @@ -12,7 +12,7 @@ * Michael Welcome */ -/* Copyright (c) 2002-2006, The Ohio State University. All rights +/* Copyright (c) 2002-2007, The Ohio State University. All rights * reserved. * * This file is part of the MVAPICH2 software package developed by the Modified: mvapich2/trunk/src/pm/mpd/mv2_checkpoint =================================================================== --- mvapich2/trunk/src/pm/mpd/mv2_checkpoint 2007-07-26 02:09:15 UTC (rev 1430) +++ mvapich2/trunk/src/pm/mpd/mv2_checkpoint 2007-07-26 02:15:12 UTC (rev 1431) @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2002-2006, The Ohio State University. All rights +# Copyright (c) 2002-2007, The Ohio State University. All rights # reserved. 
# This file is part of the MVAPICH software package developed by the # team members of The Ohio State University's Network-Based Computing From huangwei at mvapich.cse.ohio-state.edu Wed Jul 25 22:20:28 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Wed Jul 25 22:20:48 2007 Subject: [mvapich-commit] r1434 - mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200707260220.l6Q2KSk7011608@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-25 22:20:27 -0400 (Wed, 25 Jul 2007) New Revision: 1434 Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c Log: Add back mutex locks for CR code Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c 2007-07-26 02:19:36 UTC (rev 1433) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/cr.c 2007-07-26 02:20:27 UTC (rev 1434) @@ -329,7 +329,9 @@ } MPIDI_CH3I_CR_unlock(); if (MPIDI_Process.use_sync_ckpt) { + pthread_mutex_lock(&MVAPICH2_sync_ckpt_lock); pthread_cond_signal(&MVAPICH2_sync_ckpt_cond); + pthread_mutex_unlock(&MVAPICH2_sync_ckpt_lock); } MPICR_is_restarting = 0; } From perkinjo at mvapich.cse.ohio-state.edu Thu Jul 26 10:06:22 2007 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Thu Jul 26 10:06:42 2007 Subject: [mvapich-commit] r1436 - mvapich2/trunk/src/pm/mpd Message-ID: <200707261406.l6QE6MWH013373@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2007-07-26 10:06:22 -0400 (Thu, 26 Jul 2007) New Revision: 1436 Modified: mvapich2/trunk/src/pm/mpd/configure mvapich2/trunk/src/pm/mpd/configure.in Log: Fixes issue where totalview support is not detected properly on 64bit architectures. 
Modified: mvapich2/trunk/src/pm/mpd/configure =================================================================== --- mvapich2/trunk/src/pm/mpd/configure 2007-07-26 14:05:42 UTC (rev 1435) +++ mvapich2/trunk/src/pm/mpd/configure 2007-07-26 14:06:22 UTC (rev 1436) @@ -2735,7 +2735,7 @@ if test "$enable_totalview" = "yes" ; then echo "$as_me:$LINENO: checking that $pypgm has development tools for totalview" >&5 echo $ECHO_N "checking that $pypgm has development tools for totalview... $ECHO_C" >&6 - if test -f /usr/lib/$pypgm/config/Makefile ; then + if test -f /usr/lib/$pypgm/config/Makefile -o -f /usr/lib64/$pypgm/config/Makefile ; then echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6 else Modified: mvapich2/trunk/src/pm/mpd/configure.in =================================================================== --- mvapich2/trunk/src/pm/mpd/configure.in 2007-07-26 14:05:42 UTC (rev 1435) +++ mvapich2/trunk/src/pm/mpd/configure.in 2007-07-26 14:06:22 UTC (rev 1436) @@ -112,7 +112,7 @@ if test "$enable_totalview" = "yes" ; then AC_MSG_CHECKING([that $pypgm has development tools for totalview]) - if test -f /usr/lib/$pypgm/config/Makefile ; then + if test -f /usr/lib/$pypgm/config/Makefile -o -f /usr/lib64/$pypgm/config/Makefile ; then AC_MSG_RESULT(yes) else AC_MSG_RESULT(no) From huangwei at mvapich.cse.ohio-state.edu Thu Jul 26 11:25:14 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Thu Jul 26 11:25:37 2007 Subject: [mvapich-commit] r1437 - mvapich2/trunk/osu_benchmarks Message-ID: <200707261525.l6QFPECb013533@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-26 11:25:11 -0400 (Thu, 26 Jul 2007) New Revision: 1437 Added: mvapich2/trunk/osu_benchmarks/osu_mbw_mr.c Log: Add multi-bandwidth test Added: mvapich2/trunk/osu_benchmarks/osu_mbw_mr.c =================================================================== --- mvapich2/trunk/osu_benchmarks/osu_mbw_mr.c 2007-07-26 14:06:22 UTC (rev 1436) +++ 
mvapich2/trunk/osu_benchmarks/osu_mbw_mr.c 2007-07-26 15:25:11 UTC (rev 1437) @@ -0,0 +1,332 @@ +/* + * OSU MPI Multiple Bandwidth / Message Rate test v1.0 + */ +/* + * Copyright (C) 2002-2007 the Network-Based Computing Laboratory + * (NBCL), The Ohio State University. + * + * Contact: Dr. D. K. Panda (panda@cse.ohio-state.edu) + */ + +/* +This program is available under BSD licensing. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +(1) Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +(2) Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +(3) Neither the name of The Ohio State University nor the names of +their contributors may be used to endorse or promote products derived +from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + + + +#include "mpi.h" +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#define DEFAULT_WINDOW (64) + +#define ITERS_SMALL (100) +#define WARMUP_ITERS_SMALL (10) +#define ITERS_LARGE (20) +#define WARMUP_ITERS_LARGE (2) +#define LARGE_THRESHOLD (8192) + +#define WINDOW_SIZES {8, 16, 32, 64, 128} +#define WINDOW_SIZES_COUNT (5) + +#define MAX_MSG_SIZE (1<<22) +#define MAX_ALIGNMENT (16384) +#define MY_BUF_SIZE (MAX_MSG_SIZE + MAX_ALIGNMENT) + +char s_buf1[MY_BUF_SIZE]; +char r_buf1[MY_BUF_SIZE]; + +MPI_Request * request; +MPI_Status * reqstat; + +double calc_bw(int rank, int size, int num_pairs, int window_size, char *s_buf, char *r_buf); +void usage(); + +int main(int argc, char *argv[]) +{ + char *s_buf, *r_buf; + + int numprocs, rank, align_size; + int pairs, print_rate; + int window_size, window_varied; + int c, curr_size; + + MPI_Init (&argc, &argv); + + MPI_Comm_size (MPI_COMM_WORLD, &numprocs); + MPI_Comm_rank (MPI_COMM_WORLD, &rank); + + /* default values */ + pairs = numprocs / 2; + window_size = DEFAULT_WINDOW; + window_varied = 0; + print_rate = 1; + + while ((c = getopt(argc, argv, "p:w:r:v")) != -1) { + switch (c) + { + case 'p': + pairs = atoi(optarg); + if (pairs > (numprocs / 2)) { + if(0 == rank) + usage(); + goto error; + } + break; + case 'w': + window_size = atoi(optarg); + break; + case 'v': + window_varied = 1; + break; + case 'r': + print_rate = atoi(optarg); + if(0 != print_rate && 1 != print_rate) { + if(0 == rank) + usage(); + goto error; + } + break; + default: + if(0 == rank) + usage(); + goto error; + } + } + + align_size = getpagesize(); + s_buf = + (char *) (((unsigned long) s_buf1 + (align_size - 1)) / + align_size * align_size); + r_buf = + (char *) (((unsigned long) r_buf1 + (align_size - 1)) / + align_size * align_size); + + + + + + if (rank == 0) { + fprintf(stdout, "# OSU MPI Multi BW / Message Rate Test (Version 1.0)\n"); + if (window_varied) { + fprintf(stdout, "# [ pairs: %d ] [ window size: varied ]\n", pairs); + 
fprintf(stdout, "\n# Uni-directional Bandwidth (MB/sec)\n"); + } else { + fprintf(stdout, "# [ pairs: %d ] [ window size: %d ]\n", pairs, window_size); + if (print_rate) { + fprintf(stdout, "\n# Size MB/sec Messages/sec\n"); + } else { + fprintf(stdout, "\n# Size MB/sec\n"); + } + } + } + + /* More than one window size */ + + if (window_varied) { + int window_array[] = WINDOW_SIZES; + double ** bandwidth_results; + int log_val = 1, tmp_message_size = MAX_MSG_SIZE; + int i, j; + + for(i = 0; i < WINDOW_SIZES_COUNT; i++) { + if(window_array[i] > window_size) + window_size = window_array[i]; + } + + request = (MPI_Request *) malloc(sizeof(MPI_Request) * window_size); + reqstat = (MPI_Status *) malloc(sizeof(MPI_Status) * window_size); + + while (tmp_message_size >>= 1) { log_val++; } + bandwidth_results = (double **) malloc(sizeof(double *) * log_val); + for (i = 0; i < log_val; i++) { + bandwidth_results[i] = + (double *) malloc(sizeof(double) * WINDOW_SIZES_COUNT); + } + + if(rank == 0) { + fprintf(stdout, "# "); + for (i = 0; i < WINDOW_SIZES_COUNT; i++) { + fprintf(stdout, " %10d", window_array[i]); + } + fprintf(stdout, "\n"); + } + + j = 0; + for (curr_size = 1; curr_size <= MAX_MSG_SIZE; curr_size *= 2) { + if(rank == 0) { + fprintf(stdout, "%7d", curr_size); + } + for (i = 0; i < WINDOW_SIZES_COUNT; i++) { + bandwidth_results[j][i] = + calc_bw (rank, curr_size, pairs, window_array[i], s_buf, r_buf); + if(rank == 0) { + fprintf(stdout, " %10.2f", bandwidth_results[j][i]); + } + } + if(rank == 0) { + fprintf(stdout,"\n"); + } + j++; + } + + if (rank == 0 && print_rate) { + fprintf(stdout, "\n# Message Rate Profile\n"); + fprintf(stdout, "# "); + for (i = 0; i < WINDOW_SIZES_COUNT; i++) { + fprintf(stdout, " %10d", window_array[i]); + } + fprintf(stdout, "\n"); + + c = 0; + for (curr_size = 1; curr_size <= MAX_MSG_SIZE; curr_size *= 2) { + fprintf(stdout, "%7d", curr_size); + for (i = 0; i < WINDOW_SIZES_COUNT; i++) { + double rate; + rate = 1000 * 1000 * 
bandwidth_results[c][i] / curr_size; + fprintf(stdout, " %10.2f", rate); + } + fprintf(stdout,"\n"); + c++; + } + + } + + } else { + /* Just one window size */ + request = (MPI_Request *) malloc(sizeof(MPI_Request) * window_size); + reqstat = (MPI_Status *) malloc(sizeof(MPI_Status) * window_size); + + for (curr_size = 1; curr_size <= MAX_MSG_SIZE; curr_size *= 2) { + double bw, rate; + bw = calc_bw (rank, curr_size, pairs, window_size, s_buf, r_buf); + if (rank == 0) { + rate = 1000 * 1000 * bw / curr_size; + if(print_rate) { + fprintf(stdout, "%7d %7.2f %10.2f\n", curr_size, bw, rate); + } else { + fprintf(stdout, "%7d %7.2f\n", curr_size, bw); + } + } + } + } + +error: + MPI_Finalize(); + return 0; +} + +void usage() { + + printf("Options:\n"); + printf(" -r=<0,1> Print uni-directional message rate (default 1)\n"); + printf(" -p=<pairs> Number of pairs involved (default np / 2)\n"); + printf(" -w=<window> Number of messages sent before acknowledgement (64, 10)\n"); + printf(" [cannot be used with -v]\n"); + printf(" -v Vary the window size (default no)\n"); + printf(" [cannot be used with -w]\n"); + +} + +double calc_bw(int rank, int size, int num_pairs, int window_size, char *s_buf, char *r_buf) +{ + double t_start = 0.0, t_end = 0.0, t = 0.0, maxtime=0.0, bw=0.0; + int i, j, target; + int loop, skip; + int mult = (DEFAULT_WINDOW / window_size) > 0 ? 
(DEFAULT_WINDOW / window_size) : 1; + + for (i = 0; i < size; i++) { + s_buf[i] = 'a'; + r_buf[i] = 'b'; + } + + if (size > LARGE_THRESHOLD) { + loop = ITERS_LARGE * mult; + skip = WARMUP_ITERS_LARGE * mult; + } else { + loop = ITERS_SMALL * mult; + skip = WARMUP_ITERS_SMALL * mult; + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank < num_pairs) { + target = rank + num_pairs; + for (i = 0; i < loop + skip; i++) { + if (i == skip) { + MPI_Barrier(MPI_COMM_WORLD); + t_start = MPI_Wtime(); + } + for (j = 0; j < window_size; j++) { + MPI_Isend(s_buf, size, MPI_CHAR, target, 100, + MPI_COMM_WORLD, request + j); + } + MPI_Waitall(window_size, request, reqstat); + MPI_Recv(r_buf, 4, MPI_CHAR, target, 101, MPI_COMM_WORLD, + &reqstat[0]); + } + t_end = MPI_Wtime(); + t = t_end - t_start; + } + else if (rank < num_pairs * 2) + { + target = rank - num_pairs; + for (i = 0; i < loop + skip; i++) { + if (i == skip) { + MPI_Barrier(MPI_COMM_WORLD); + } + for (j = 0; j < window_size; j++) { + MPI_Irecv(r_buf, size, MPI_CHAR, target, 100, + MPI_COMM_WORLD, request + j); + } + MPI_Waitall (window_size, request, reqstat); + MPI_Send (s_buf, 4, MPI_CHAR, target, 101, MPI_COMM_WORLD); + } + } + else { + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Reduce(&t, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + + if (rank == 0) { + double tmp; + tmp = ( (num_pairs * size * 1.0) / (1000 * 1000) ); + tmp = tmp * loop * window_size; + bw = tmp / maxtime; + return bw; + } + return 0; +} + + From huangwei at mvapich.cse.ohio-state.edu Thu Jul 26 11:31:23 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Thu Jul 26 11:31:45 2007 Subject: [mvapich-commit] r1440 - in mvapich2/trunk/src: include mpi/errhan Message-ID: <200707261531.l6QFVNHx013580@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-26 11:31:22 -0400 (Thu, 26 Jul 2007) New Revision: 1440 Modified: mvapich2/trunk/src/include/mpierrs.h mvapich2/trunk/src/mpi/errhan/errutil.c Log: 
License info Modified: mvapich2/trunk/src/include/mpierrs.h =================================================================== --- mvapich2/trunk/src/include/mpierrs.h 2007-07-26 15:29:32 UTC (rev 1439) +++ mvapich2/trunk/src/include/mpierrs.h 2007-07-26 15:31:22 UTC (rev 1440) @@ -4,6 +4,19 @@ * (C) 2001 by Argonne National Laboratory. * See COPYRIGHT in top-level directory. */ + +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. + * + */ + #ifndef MPIERRS_H_INCLUDED #define MPIERRS_H_INCLUDED /* ------------------------------------------------------------------------- */ Modified: mvapich2/trunk/src/mpi/errhan/errutil.c =================================================================== --- mvapich2/trunk/src/mpi/errhan/errutil.c 2007-07-26 15:29:32 UTC (rev 1439) +++ mvapich2/trunk/src/mpi/errhan/errutil.c 2007-07-26 15:31:22 UTC (rev 1440) @@ -5,6 +5,18 @@ * See COPYRIGHT in top-level directory. */ +/* Copyright (c) 2003-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH2 software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH2 in the top level MVAPICH2 directory. 
+ * + */ + /* style: allow:fprintf:4 sig:0 */ /* stdarg is required to handle the variable argument lists for From narravul at mvapich.cse.ohio-state.edu Thu Jul 26 15:43:20 2007 From: narravul at mvapich.cse.ohio-state.edu (narravul@mvapich.cse.ohio-state.edu) Date: Thu Jul 26 15:43:42 2007 Subject: [mvapich-commit] r1441 - mvapich2/trunk Message-ID: <200707261943.l6QJhKon014083@mvapich.cse.ohio-state.edu> Author: narravul Date: 2007-07-26 15:43:18 -0400 (Thu, 26 Jul 2007) New Revision: 1441 Modified: mvapich2/trunk/make.mvapich2.ofa Log: Removing the flag -DOFED_VERSION_1_1 as a default option from the make.mvapich2.ofa script. Modified: mvapich2/trunk/make.mvapich2.ofa =================================================================== --- mvapich2/trunk/make.mvapich2.ofa 2007-07-26 15:31:22 UTC (rev 1440) +++ mvapich2/trunk/make.mvapich2.ofa 2007-07-26 19:43:18 UTC (rev 1441) @@ -91,7 +91,7 @@ RDMA_CM_LIBS="-lrdmacm" RDMA_CM_FLAG="-DRDMA_CM" - RDMA_CM_FLAG="-DRDMA_CM -DOFED_VERSION_1_1" +# RDMA_CM_FLAG="-DRDMA_CM -DOFED_VERSION_1_1" fi # Whether or not to build with ROMIO MPI I/O support. Disabled by default. 
From huangwei at mvapich.cse.ohio-state.edu Thu Jul 26 16:18:40 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Thu Jul 26 16:19:01 2007 Subject: [mvapich-commit] r1442 - mvapich2/tags Message-ID: <200707262018.l6QKIe2H014186@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-26 16:18:39 -0400 (Thu, 26 Jul 2007) New Revision: 1442 Added: mvapich2/tags/1.0-BETA/ Log: Create tag for 1.0-beta release Copied: mvapich2/tags/1.0-BETA (from rev 1441, mvapich2/trunk) From huangwei at mvapich.cse.ohio-state.edu Thu Jul 26 16:19:25 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Thu Jul 26 16:19:46 2007 Subject: [mvapich-commit] r1443 - mvapich2/branches Message-ID: <200707262019.l6QKJPQ1014196@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-26 16:19:24 -0400 (Thu, 26 Jul 2007) New Revision: 1443 Added: mvapich2/branches/1.0/ Log: Create branch for 1.0 release Copied: mvapich2/branches/1.0 (from rev 1442, mvapich2/trunk) From gaoq at mvapich.cse.ohio-state.edu Fri Jul 27 13:11:49 2007 From: gaoq at mvapich.cse.ohio-state.edu (gaoq@mvapich.cse.ohio-state.edu) Date: Fri Jul 27 13:12:14 2007 Subject: [mvapich-commit] r1445 - mvapich2/trunk/src/mpid/osu_ch3/src Message-ID: <200707271711.l6RHBnLM017263@mvapich.cse.ohio-state.edu> Author: gaoq Date: 2007-07-27 13:11:45 -0400 (Fri, 27 Jul 2007) New Revision: 1445 Modified: mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c Log: Apply Sync ckpt changes to trunk Modified: mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c 2007-07-27 14:36:14 UTC (rev 1444) +++ mvapich2/trunk/src/mpid/osu_ch3/src/mpid_init.c 2007-07-27 17:11:45 UTC (rev 1445) @@ -494,12 +494,20 @@ int MVAPICH2_Sync_Checkpoint() { MPID_Comm * comm_ptr; + MPIU_THREADPRIV_DECL; + MPIU_THREADPRIV_GET; + if (MPIDI_Process.use_sync_ckpt == 0) /*Not 
enabled*/ return 0; MPID_Comm_get_ptr (MPI_COMM_WORLD, comm_ptr); + + MPIU_THREAD_SINGLE_CS_ENTER("coll"); + MPIR_Nest_incr(); MPIR_Barrier(comm_ptr); - + MPIR_Nest_decr(); + MPIU_THREAD_SINGLE_CS_EXIT("coll"); + if (MPIDI_Process.my_pg_rank == 0) {/*Notify console to take checkpoint*/ MPIDI_CH3I_CR_Sync_ckpt_request(); From surs at mvapich.cse.ohio-state.edu Fri Jul 27 14:38:50 2007 From: surs at mvapich.cse.ohio-state.edu (surs@mvapich.cse.ohio-state.edu) Date: Fri Jul 27 14:39:12 2007 Subject: [mvapich-commit] r1446 - mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200707271838.l6RIcouq017427@mvapich.cse.ohio-state.edu> Author: surs Date: 2007-07-27 14:38:48 -0400 (Fri, 27 Jul 2007) New Revision: 1446 Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h Log: Patch to fix hang for memory allocations whose size is bigger than 2^31-1. Thanks to Bryan Putnam (Purdue) for reporting this. 
Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c 2007-07-27 17:11:45 UTC (rev 1445) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.c 2007-07-27 18:38:48 UTC (rev 1446) @@ -956,9 +956,9 @@ } #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len) +void find_and_free_dregs_inside(void *buf, size_t len) { - int i; + unsigned long i; unsigned long pagenum_low, pagenum_high; unsigned long npages, begin, end; unsigned long user_low_a, user_high_a; Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h 2007-07-27 17:11:45 UTC (rev 1445) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/dreg.h 2007-07-27 18:38:48 UTC (rev 1446) @@ -240,7 +240,7 @@ dreg_entry *dreg_new_entry(void *buf, int len); #ifndef DISABLE_PTMALLOC -void find_and_free_dregs_inside(void *buf, int len); +void find_and_free_dregs_inside(void *buf, size_t len); #endif #ifdef CKPT Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c 2007-07-27 17:11:45 UTC (rev 1445) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.c 2007-07-27 18:38:48 UTC (rev 1446) @@ -96,7 +96,7 @@ #ifndef DISABLE_MUNMAP_HOOK -int mvapich2_munmap(void *buf, int len) +int mvapich2_munmap(void *buf, size_t len) { if(!mvapich2_minfo.munmap) { set_real_munmap_ptr(); Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h 2007-07-27 17:11:45 UTC 
(rev 1445) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/mem_hooks.h 2007-07-27 18:38:48 UTC (rev 1446) @@ -42,7 +42,7 @@ void mvapich2_mfin(void); #ifndef DISABLE_MUNMAP_HOOK -int mvapich2_munmap(void *buf, int len); +int mvapich2_munmap(void *buf, size_t len); #endif #ifndef DISABLE_TRAP_SBRK From rowland at mvapich.cse.ohio-state.edu Mon Jul 30 13:17:35 2007 From: rowland at mvapich.cse.ohio-state.edu (rowland@mvapich.cse.ohio-state.edu) Date: Mon Jul 30 13:18:08 2007 Subject: [mvapich-commit] r1447 - in mvapich2/trunk/test/mpi: . cxx errors f77 f90 Message-ID: <200707301717.l6UHHZ7O010480@mvapich.cse.ohio-state.edu> Author: rowland Date: 2007-07-30 13:17:34 -0400 (Mon, 30 Jul 2007) New Revision: 1447 Modified: mvapich2/trunk/test/mpi/cxx/testlist.in mvapich2/trunk/test/mpi/errors/testlist.in mvapich2/trunk/test/mpi/f77/testlist.in mvapich2/trunk/test/mpi/f90/testlist.in mvapich2/trunk/test/mpi/testlist.in Log: Removed spawn tests as that functionality is not implemented. 
Modified: mvapich2/trunk/test/mpi/cxx/testlist.in =================================================================== --- mvapich2/trunk/test/mpi/cxx/testlist.in 2007-07-27 18:38:48 UTC (rev 1446) +++ mvapich2/trunk/test/mpi/cxx/testlist.in 2007-07-30 17:17:34 UTC (rev 1447) @@ -5,5 +5,5 @@ info datatype @iodir@ -@spawndir@ +#@spawndir@ @rmadir@ Modified: mvapich2/trunk/test/mpi/errors/testlist.in =================================================================== --- mvapich2/trunk/test/mpi/errors/testlist.in 2007-07-27 18:38:48 UTC (rev 1446) +++ mvapich2/trunk/test/mpi/errors/testlist.in 2007-07-30 17:17:34 UTC (rev 1447) @@ -3,7 +3,7 @@ group pt2pt @rmadir@ -@spawndir@ +#@spawndir@ @iodir@ @f77dir@ @cxxdir@ Modified: mvapich2/trunk/test/mpi/f77/testlist.in =================================================================== --- mvapich2/trunk/test/mpi/f77/testlist.in 2007-07-27 18:38:48 UTC (rev 1446) +++ mvapich2/trunk/test/mpi/f77/testlist.in 2007-07-30 17:17:34 UTC (rev 1447) @@ -3,7 +3,7 @@ datatype pt2pt info -@spawndir@ +#@spawndir@ @iodir@ @rmadir@ init Modified: mvapich2/trunk/test/mpi/f90/testlist.in =================================================================== --- mvapich2/trunk/test/mpi/f90/testlist.in 2007-07-27 18:38:48 UTC (rev 1446) +++ mvapich2/trunk/test/mpi/f90/testlist.in 2007-07-30 17:17:34 UTC (rev 1447) @@ -9,6 +9,6 @@ pt2pt datatype @rmadir@ -@spawndir@ +#@spawndir@ timer topo Modified: mvapich2/trunk/test/mpi/testlist.in =================================================================== --- mvapich2/trunk/test/mpi/testlist.in 2007-07-27 18:38:48 UTC (rev 1446) +++ mvapich2/trunk/test/mpi/testlist.in 2007-07-30 17:17:34 UTC (rev 1447) @@ -11,7 +11,7 @@ init pt2pt @rmadir@ -@spawndir@ +#@spawndir@ topo @iodir@ @f77dir@ From narravul at mvapich.cse.ohio-state.edu Tue Jul 31 10:14:13 2007 From: narravul at mvapich.cse.ohio-state.edu (narravul@mvapich.cse.ohio-state.edu) Date: Tue Jul 31 10:14:43 2007 Subject: [mvapich-commit] r1448 - 
mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2 Message-ID: <200707311414.l6VEEDgZ013125@mvapich.cse.ohio-state.edu> Author: narravul Date: 2007-07-31 10:14:11 -0400 (Tue, 31 Jul 2007) New Revision: 1448 Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_priv.c Log: Fix for one process jobs using rdma-cm. Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_priv.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_priv.c 2007-07-30 17:17:34 UTC (rev 1447) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/gen2/rdma_iba_priv.c 2007-07-31 14:14:11 UTC (rev 1448) @@ -748,6 +748,12 @@ } } +#ifdef RDMA_CM + if (proc->use_rdma_cm){ + ib_init_rdma_cm(proc, pg_rank, pg_size); + goto fn_exit; + } +#endif if (MPIDI_CH3I_RDMA_Process.has_ring_startup && pg_size > 1) { DEBUG_PRINT("ENTERING MPDRING CASE\n"); @@ -759,13 +765,6 @@ "**fail", "**fail %s", "cannot create cq"); } -#ifdef RDMA_CM - if (proc->use_rdma_cm){ - ib_init_rdma_cm(proc, pg_rank, pg_size); - goto fn_exit; - } -#endif - /* Create complete Queue and Queue pairs */ memset(&boot_attr, 0, sizeof boot_attr); boot_attr.cap.max_send_wr = rdma_default_max_wqe; From huangwei at mvapich.cse.ohio-state.edu Tue Jul 31 23:57:25 2007 From: huangwei at mvapich.cse.ohio-state.edu (huangwei@mvapich.cse.ohio-state.edu) Date: Tue Jul 31 23:57:55 2007 Subject: [mvapich-commit] r1449 - mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi Message-ID: <200708010357.l713vPnB014687@mvapich.cse.ohio-state.edu> Author: huangwei Date: 2007-07-31 23:57:23 -0400 (Tue, 31 Jul 2007) New Revision: 1449 Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_1sc.c mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_init.c 
mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/vapi_channel_manager.c Log: Fix vapi compilation errors Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h 2007-07-31 14:14:11 UTC (rev 1448) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/mpidi_ch3_rdma_post.h 2007-08-01 03:57:23 UTC (rev 1449) @@ -184,7 +184,7 @@ int MPIDI_CH3I_MRAILI_Waiting_msg(MPIDI_VC_t * vc, vbuf **, int); -int MPIDI_CH3I_MRAILI_Cq_poll(vbuf **, MPIDI_VC_t *, int); +int MPIDI_CH3I_MRAILI_Cq_poll(vbuf **, MPIDI_VC_t *, int, int is_blocking); int MRAILI_Send_noop_if_needed(MPIDI_VC_t *vc, const MRAILI_Channel_info *channel); Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_1sc.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_1sc.c 2007-07-31 14:14:11 UTC (rev 1448) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_1sc.c 2007-08-01 03:57:23 UTC (rev 1449) @@ -148,13 +148,6 @@ MPIDI_RMA_ops *curr_ptr; #ifdef _SMP_ MPIDI_VC_t *vc; - if (SMP_INIT) - { - /*correspoding post has not been issued */ - flag = 0; - break; - } - #endif MPID_Comm_get_ptr(win_ptr->comm, comm_ptr); Modified: mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_init.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_init.c 2007-07-31 14:14:11 UTC (rev 1448) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/rdma_iba_init.c 2007-08-01 03:57:23 UTC (rev 1449) @@ -338,7 +338,6 @@ mallopt(M_TRIM_THRESHOLD, -1); mallopt(M_MMAP_MAX, 0); #endif - gethostname(tmp_hname, 255); cached_pg = pg; cached_pg_rank = pg_rank; Modified: 
mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/vapi_channel_manager.c =================================================================== --- mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/vapi_channel_manager.c 2007-07-31 14:14:11 UTC (rev 1448) +++ mvapich2/trunk/src/mpid/osu_ch3/channels/mrail/src/vapi/vapi_channel_manager.c 2007-08-01 03:57:23 UTC (rev 1449) @@ -247,7 +247,7 @@ if (i != vc->mrail.num_total_subrails) { vbuf * vbuffer; - MPIDI_CH3I_MRAILI_Cq_poll(&vbuffer, vc, 1); + MPIDI_CH3I_MRAILI_Cq_poll(&vbuffer, vc, 1, 0); } return type; @@ -323,7 +323,7 @@ #endif } - type = MPIDI_CH3I_MRAILI_Cq_poll(vbuf_handle, vc, 0); + type = MPIDI_CH3I_MRAILI_Cq_poll(vbuf_handle, vc, 0, 0); if (type != T_CHANNEL_NO_ARRIVE) { switch(type) { case (T_CHANNEL_EXACT_ARRIVE): @@ -366,7 +366,8 @@ * #define T_CHANNEL_CONTROL_MSG_ARRIVE 3 * #define T_CHANNEL_ERROR -1 */ -int MPIDI_CH3I_MRAILI_Cq_poll(vbuf **vbuf_handle, MPIDI_VC_t * vc_req, int receiving) +int MPIDI_CH3I_MRAILI_Cq_poll(vbuf **vbuf_handle, MPIDI_VC_t * vc_req, + int receiving, int is_blocking) { VAPI_ret_t ret1; MPIDI_VC_t *vc;