From perkinjo at mvapich.cse.ohio-state.edu Fri Feb 1 10:20:53 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Fri Feb 1 10:21:06 2008 Subject: [mvapich-commit] r1942 - mvapich/trunk/mpid/ch_gen2/process Message-ID: <200802011520.m11FKrT4000392@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-02-01 10:20:52 -0500 (Fri, 01 Feb 2008) New Revision: 1942 Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h Log: Fix logic so alarm is reset after mpi ranks are spawned. Also fix a couple issues with -show -xterm -debug flag. Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-01-31 16:58:34 UTC (rev 1941) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-02-01 15:20:52 UTC (rev 1942) @@ -67,8 +67,6 @@ int use_xlauncher = 0; int xlauncher_width = 8; int aout_index, port; -int mc_port; -int mc_socket; char *wd; /* working directory of current process */ #define MAX_HOST_LEN 256 char mpirun_host[MAX_HOST_LEN]; /* hostname of current process */ @@ -165,9 +163,6 @@ int main(int argc, char *argv[]) { - struct sockaddr_in mc_sockaddr; - unsigned int mc_sockaddr_len = sizeof(mc_sockaddr); - int i, s, s1, c, option_index; int hostfile_on = 0; #define HOSTFILE_LEN 256 @@ -219,7 +214,6 @@ case 1: debug_on = 1; xterm_on = 1; - legacy_startup = 1; break; case 2: xterm_on = 1; @@ -270,28 +264,28 @@ legacy_startup = 1; break; case 11: - legacy_startup = 1; + legacy_startup = 1; break; - case 12: - use_xlauncher = 1; - break; - case 13: - xlauncher_width = atoi(optarg); - if (xlauncher_width < 1) { - usage(); - exit(EXIT_FAILURE); - } - break; - case 14: + case 12: + use_xlauncher = 1; + break; + case 13: + xlauncher_width = atoi(optarg); + if (xlauncher_width < 1) { + usage(); + exit(EXIT_FAILURE); + } + break; + case 14: + usage(); + 
exit(EXIT_SUCCESS); + break; + default: + fprintf(stderr, "Unknown option\n"); usage(); - exit(EXIT_SUCCESS); - break; - default: - fprintf(stderr, "Unknown option\n"); - usage(); exit(EXIT_FAILURE); - break; - } + break; + } break; default: fprintf(stderr, "Unreachable statement!\n"); @@ -301,6 +295,11 @@ } } while (c != EOF); + if(!nprocs) { + usage(); + exit(EXIT_FAILURE); + } + if (!hostfile_on) { /* get hostnames from argument list */ if (argc - optind < nprocs + 1) { @@ -384,52 +383,27 @@ gethostname(mpirun_host, MAX_HOST_LEN); get_display_str(); - if (legacy_startup) { - server_socket = s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - if (s < 0) { - perror("socket"); - exit(EXIT_FAILURE); - } - sockaddr.sin_addr.s_addr = INADDR_ANY; - sockaddr.sin_port = 0; - if (bind(s, (struct sockaddr *) &sockaddr, sockaddr_len) < 0) { - perror("bind"); - exit(EXIT_FAILURE); - } - if (getsockname(s, (struct sockaddr *) &sockaddr, &sockaddr_len) < 0) { - perror("getsockname"); - exit(EXIT_FAILURE); - } - - port = (int) ntohs(sockaddr.sin_port); - listen(s, nprocs); + server_socket = s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s < 0) { + perror("socket"); + exit(EXIT_FAILURE); } - - mc_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - - if (mc_socket < 0) { - perror("socket"); - exit(EXIT_FAILURE); + sockaddr.sin_addr.s_addr = INADDR_ANY; + sockaddr.sin_port = 0; + if (bind(s, (struct sockaddr *) &sockaddr, sockaddr_len) < 0) { + perror("bind"); + exit(EXIT_FAILURE); } - mc_sockaddr.sin_addr.s_addr = INADDR_ANY; - mc_sockaddr.sin_port = 0; - - if (bind(mc_socket, (struct sockaddr *)&mc_sockaddr, mc_sockaddr_len) < 0) { - perror("bind"); - exit(EXIT_FAILURE); + if (getsockname(s, (struct sockaddr *) &sockaddr, &sockaddr_len) < 0) { + perror("getsockname"); + exit(EXIT_FAILURE); } - if (getsockname(mc_socket, (struct sockaddr *)&mc_sockaddr, &mc_sockaddr_len) - < 0) { - perror("getsockname"); - exit(EXIT_FAILURE); - } + port = (int) ntohs(sockaddr.sin_port); + 
listen(s, nprocs); - mc_port = (int) ntohs(mc_sockaddr.sin_port); - listen(mc_socket, nprocs); - if (!show_on) { struct sigaction signal_handler; signal_handler.sa_handler = cleanup_handler; @@ -462,7 +436,7 @@ * situations where people might use two different hostnames for the * same host. */ - pglist_insert(plist[i].hostname, i); + pglist_insert(plist[i].hostname, i); } alarm(1000); @@ -505,100 +479,101 @@ } } - else { - if(pglist && !legacy_startup) { - spawn_fast(argc, argv, totalview_cmd, env); - if(show_on) exit(EXIT_SUCCESS); - mpispawn_checkin(mc_socket, (struct sockaddr *)&mc_sockaddr, - mc_sockaddr_len); - } + else if(pglist && !legacy_startup) { + spawn_fast(argc, argv, totalview_cmd, env); - else { - spawn_linear(argc, argv, totalview_cmd, env); - } + if(show_on) exit(EXIT_SUCCESS); + + mpispawn_checkin(server_socket, (struct sockaddr *)&sockaddr, + sockaddr_len); + alarm(0); + wait_for_mpispawn_errors (server_socket, &sockaddr, sockaddr_len); + + /* + * This exit should never be reached. 
+ */ + exit(EXIT_FAILURE); } - if (show_on) - exit(EXIT_SUCCESS); + spawn_linear(argc, argv, totalview_cmd, env); - if (legacy_startup) { - /* build up an array of file descriptors for pmgr_processops */ - int* fds = (int*) malloc(nprocs*sizeof(int)); + if(show_on) exit(EXIT_SUCCESS); - if(fds == NULL) { - perror("allocating temporary array for socket file descriptors"); - cleanup(); - } - - /* accept incoming connections */ - - for (i = 0; i < nprocs; i++) { - int rank; -ACCEPT_HID: - sockaddr_len = sizeof(sockaddr); - s1 = accept(s, (struct sockaddr *) &sockaddr, &sockaddr_len); + /* build up an array of file descriptors for pmgr_processops */ + int* fds = (int*) malloc(nprocs*sizeof(int)); - alarm_msg = "Timeout during hostid exchange.\n"; + if(fds == NULL) { + perror("allocating temporary array for socket file descriptors"); + cleanup(); + } - if (s1 < 0) { - if ((errno == EINTR) || (errno == EAGAIN)) - goto ACCEPT_HID; - perror("accept"); - cleanup(); - } + /* accept incoming connections */ - /* - * protocol: - * 0. read protocol version number - * 1. read rank of process - */ + for (i = 0; i < nprocs; i++) { + int rank; +ACCEPT_HID: + sockaddr_len = sizeof(sockaddr); + s1 = accept(s, (struct sockaddr *) &sockaddr, &sockaddr_len); - /* 0. Find out what version of the startup protocol the executable - * was compiled to use. */ + alarm_msg = "Timeout during hostid exchange.\n"; - if(read_socket(s1, &version, sizeof(version))) cleanup (); + if (s1 < 0) { + if ((errno == EINTR) || (errno == EAGAIN)) + goto ACCEPT_HID; + perror("accept"); + cleanup(); + } - if(version != PMGR_VERSION) { - fprintf(stderr, "mpirun: executable version %d does not match" - " our version %d.\n", version, PMGR_VERSION); - cleanup(); - } + /* + * protocol: + * 0. read protocol version number + * 1. read rank of process + */ - /* 1. Find out who we're talking to */ - if(read_socket(s1, &rank, sizeof(rank))) cleanup (); + /* 0. 
Find out what version of the startup protocol the executable + * was compiled to use. */ - if (rank < 0 || rank >= nprocs || - ( !(use_xlauncher) && plist[rank].state != P_STARTED)) { - fprintf(stderr, "mpirun: invalid rank received. \n"); - cleanup(); - } + if(read_socket(s1, &version, sizeof(version))) cleanup (); - fds[rank] = plist[rank].control_socket = s1; - } - /* at this point, all processes have checked in hostids */ - /* cancel the timeout */ - alarm(0); + if(version != PMGR_VERSION) { + fprintf(stderr, "mpirun: executable version %d does not match" + " our version %d.\n", version, PMGR_VERSION); + cleanup(); + } - pmgr_processops(fds, nprocs); + /* 1. Find out who we're talking to */ + if(read_socket(s1, &rank, sizeof(rank))) cleanup (); - /* free it off (processops closes each socket before returning control) */ - free(fds); + if (rank < 0 || rank >= nprocs || + ( !(use_xlauncher) && plist[rank].state != P_STARTED)) { + fprintf(stderr, "mpirun: invalid rank received. \n"); + cleanup(); + } - for (i = 0; i < nprocs; i++) { - plist[i].state = P_RUNNING; - } + fds[rank] = plist[rank].control_socket = s1; + } + /* at this point, all processes have checked in hostids */ + /* cancel the timeout */ + alarm(0); - wait_for_errors (s, (struct sockaddr *)&sockaddr, sockaddr_len); + pmgr_processops(fds, nprocs); + + /* free it off (processops closes each socket before returning control) */ + free(fds); + + for (i = 0; i < nprocs; i++) { + plist[i].state = P_RUNNING; } if(use_xlauncher) { - int status; + int status; - waitpid(md_id, &status, 0); - exit(WIFEXITED(status) ? WEXITSTATUS (status) : MD_EXIT_MINIDAEMON_SIG); + waitpid(md_id, &status, 0); + exit(WIFEXITED(status) ? WEXITSTATUS (status) : MD_EXIT_MINIDAEMON_SIG); } - wait_for_mpispawn_errors (mc_socket, &mc_sockaddr, mc_sockaddr_len); + wait_for_errors (s, (struct sockaddr *)&sockaddr, sockaddr_len); + /* * This return should never be reached. 
*/ @@ -683,12 +658,14 @@ } if(use_totalview) { - len = sprintf(remote_command, "%s MPIRUN_PROCESSES='%s' %s ", remote_command, mpirun_processes, command_name); + len = snprintf(remote_command, str_len, "%s MPIRUN_PROCESSES='%s' %s ", + remote_command, mpirun_processes, command_name); } else { - len = sprintf(remote_command, "%s NOT_USE_TOTALVIEW=1 %s ", remote_command, command_name); + len = snprintf(remote_command, str_len, "%s NOT_USE_TOTALVIEW=1 %s ", + remote_command, command_name); } - if (len > str_len) { + if (len >= str_len) { fprintf(stderr, "Internal error - overflowed remote_command\n"); exit(1); } @@ -791,23 +768,23 @@ int wfe_socket, wfe_abort_code, wfe_abort_mid; while((wfe_socket = accept(s, (struct sockaddr *) sockaddr, - &sockaddr_len)) < 0) { - if(errno == EINTR || errno == EAGAIN) continue; - perror("accept"); - cleanup(); + &sockaddr_len)) < 0) { + if(errno == EINTR || errno == EAGAIN) continue; + perror("accept"); + cleanup(); } if(read_socket(wfe_socket, &wfe_abort_code, sizeof(int)) - || read_socket(wfe_socket, &wfe_abort_mid, sizeof(int))) { - fprintf(stderr, "Termination socket read failed!\n"); + || read_socket(wfe_socket, &wfe_abort_mid, sizeof(int))) { + fprintf(stderr, "Termination socket read failed!\n"); } else { - fprintf(stderr, "Exit code %d signaled from %s\n", wfe_abort_code, - pglist->index[wfe_abort_mid]->hostname); + fprintf(stderr, "Exit code %d signaled from %s\n", wfe_abort_code, + pglist->index[wfe_abort_mid]->hostname); } close (wfe_socket); - cleanup(); + cleanup(); } void process_termination() @@ -1674,7 +1651,7 @@ goto allocation_error; } - tmp = mkstr("%s MPISPAWN_CHECKIN_PORT=%d", mpispawn_env, mc_port); + tmp = mkstr("%s MPISPAWN_CHECKIN_PORT=%d", mpispawn_env, port); if(tmp) { free(mpispawn_env); @@ -1849,10 +1826,9 @@ } for(i = 0; i < pglist->npgs; i++) { - size_t arg_offset = 0; - if(!(pglist->index[i]->pid = fork())) { - const char* argv[6]; + size_t arg_offset = 0; + const char* argv[7]; char *command; tmp = 
mkstr("%s MPISPAWN_ID=%d", mpispawn_env, i); @@ -1931,34 +1907,31 @@ } } - if(use_rsh) { - argv[0] = RSH_CMD; + if(xterm_on) { + argv[arg_offset++] = XTERM; + argv[arg_offset++] = "-e"; } - else { - argv[arg_offset] = SSH_CMD; - argv[++arg_offset] = SSH_ARG; + if(use_rsh) { + argv[arg_offset++] = RSH_CMD; } - if(xterm_on) { - argv[++arg_offset] = "-X"; - command = mkstr("cd %s; %s %s %s %s -e %s/mpispawn", wd, - ENV_CMD, mpispawn_env, env, XTERM, binary_dirname); - } - else { - command = mkstr("cd %s; %s %s %s %s/mpispawn", wd, ENV_CMD, - mpispawn_env, env, binary_dirname); + argv[arg_offset++] = SSH_CMD; + argv[arg_offset++] = SSH_ARG; } + command = mkstr("cd %s; %s %s %s %s/mpispawn", wd, ENV_CMD, + mpispawn_env, env, binary_dirname); + if(!command) { fprintf(stderr, "Couldn't allocate string for remote command!\n"); exit(EXIT_FAILURE); } - argv[arg_offset + 1] = pglist->index[i]->hostname; - argv[arg_offset + 2] = command; - argv[arg_offset + 3] = NULL; + argv[arg_offset++] = pglist->index[i]->hostname; + argv[arg_offset++] = command; + argv[arg_offset++] = NULL; if(show_on) { size_t arg = 0; @@ -2103,7 +2076,6 @@ if(++num_exited == num_children) { if (legacy_startup) close(server_socket); - close (mc_socket); exit(WEXITSTATUS(status)); } } Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h 2008-01-31 16:58:34 UTC (rev 1941) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h 2008-02-01 15:20:52 UTC (rev 1942) @@ -51,7 +51,7 @@ #ifdef USE_DDD #define DEBUGGER "/usr/bin/ddd" #else -#define DEBUGGER "gdb" +#define DEBUGGER "/usr/bin/gdb" #endif #define _GNU_SOURCE From koop at mvapich.cse.ohio-state.edu Fri Feb 1 14:04:13 2008 From: koop at mvapich.cse.ohio-state.edu (koop@mvapich.cse.ohio-state.edu) Date: Fri Feb 1 14:04:24 2008 Subject: [mvapich-commit] r1943 - mvapich/trunk/mpid/ch_gen2 Message-ID: 
<200802011904.m11J4Dil000739@mvapich.cse.ohio-state.edu> Author: koop Date: 2008-02-01 14:04:12 -0500 (Fri, 01 Feb 2008) New Revision: 1943 Modified: mvapich/trunk/mpid/ch_gen2/viaparam.c Log: * Change default MTU of ConnectX DDR to 1K for higher performance Modified: mvapich/trunk/mpid/ch_gen2/viaparam.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/viaparam.c 2008-02-01 15:20:52 UTC (rev 1942) +++ mvapich/trunk/mpid/ch_gen2/viaparam.c 2008-02-01 19:04:12 UTC (rev 1943) @@ -1285,7 +1285,7 @@ break; case CONNECTX_DDR: - viadev_default_mtu = IBV_MTU_2048; + viadev_default_mtu = IBV_MTU_1024; viadev_use_srq = 1; viadev_credit_preserve = 100; viadev_initial_credits = viadev_credit_preserve + 100; From koop at mvapich.cse.ohio-state.edu Fri Feb 1 14:20:07 2008 From: koop at mvapich.cse.ohio-state.edu (koop@mvapich.cse.ohio-state.edu) Date: Fri Feb 1 14:20:20 2008 Subject: [mvapich-commit] r1944 - in mvapich/trunk: . mpid/ch_gen2 Message-ID: <200802011920.m11JK7eK000766@mvapich.cse.ohio-state.edu> Author: koop Date: 2008-02-01 14:20:05 -0500 (Fri, 01 Feb 2008) New Revision: 1944 Modified: mvapich/trunk/mpid/ch_gen2/mpid_send.c mvapich/trunk/mpid/ch_gen2/viacheck.c mvapich/trunk/mpid/ch_gen2/viaparam.c mvapich/trunk/mpid/ch_gen2/viapriv.h mvapich/trunk/mvapich.conf Log: * To enable asynchronous progress the -DASYNC flag is now required. Change made due to performance concerns. 
Modified: mvapich/trunk/mpid/ch_gen2/mpid_send.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/mpid_send.c 2008-02-01 19:04:12 UTC (rev 1943) +++ mvapich/trunk/mpid/ch_gen2/mpid_send.c 2008-02-01 19:20:05 UTC (rev 1944) @@ -197,13 +197,16 @@ } #endif +#ifdef ASYNC if(VIADEV_UNLIKELY(VIADEV_PROTOCOL_ASYNC == viadev_rndv_protocol)) { while (!shandle->is_complete) MPID_DeviceCheck(MPID_NOTBLOCKING); if (VIADEV_PROTOCOL_ASYNC == shandle->protocol) { SEND_COMPLETE(shandle); } - } else { + } else +#endif + { while (!shandle->is_complete) MPID_DeviceCheck(MPID_BLOCKING); } @@ -238,13 +241,16 @@ dest_grank, msgrep, (MPI_Request) shandle, error_code); +#ifdef ASYNC if(VIADEV_UNLIKELY(VIADEV_PROTOCOL_ASYNC == viadev_rndv_protocol)) { while (!shandle->is_complete) MPID_DeviceCheck(MPID_NOTBLOCKING); if (VIADEV_PROTOCOL_ASYNC == shandle->protocol) { SEND_COMPLETE(shandle); } - } else { + } else +#endif + { while (!shandle->is_complete) MPID_DeviceCheck(MPID_BLOCKING); } @@ -352,6 +358,7 @@ { MPIR_SHANDLE *shandle = (MPIR_SHANDLE *) request; +#ifdef ASYNC if(VIADEV_UNLIKELY(VIADEV_PROTOCOL_ASYNC == viadev_rndv_protocol)) { while (!shandle->is_complete) { MPID_DeviceCheck(MPID_NOTBLOCKING); @@ -359,7 +366,9 @@ if (VIADEV_PROTOCOL_ASYNC == shandle->protocol) { SEND_COMPLETE(shandle); } - } else { + } else +#endif + { while (!shandle->is_complete) MPID_DeviceCheck(MPID_BLOCKING); } Modified: mvapich/trunk/mpid/ch_gen2/viacheck.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/viacheck.c 2008-02-01 19:04:12 UTC (rev 1943) +++ mvapich/trunk/mpid/ch_gen2/viacheck.c 2008-02-01 19:20:05 UTC (rev 1944) @@ -456,6 +456,7 @@ }while(0 == ret5); #endif +#ifdef ASYNC if(VIADEV_UNLIKELY(VIADEV_PROTOCOL_ASYNC == viadev_rndv_protocol)) { if(!(VBUF_QUEUE_EMPTY(&viadev.async_info.vbufq))) { @@ -482,6 +483,7 @@ } } } +#endif } /* make progress on pending rendezvous transfers */ @@ -1134,6 +1136,7 
@@ #endif #endif +#ifdef ASYNC if (VIADEV_UNLIKELY(VIADEV_PROTOCOL_ASYNC == viadev_rndv_protocol)) { vbuf *new_vbuf; while (header->type != VIADEV_PACKET_NOOP && @@ -1162,7 +1165,9 @@ } } } - } else { + } else +#endif + { #ifdef ADAPTIVE_RDMA_FAST_PATH /* handle out of order messages for send/receive channel */ Modified: mvapich/trunk/mpid/ch_gen2/viaparam.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/viaparam.c 2008-02-01 19:04:12 UTC (rev 1943) +++ mvapich/trunk/mpid/ch_gen2/viaparam.c 2008-02-01 19:20:05 UTC (rev 1944) @@ -564,6 +564,13 @@ #ifdef ADAPTIVE_RDMA_FAST_PATH viadev_rdma_eager_limit = 0; #endif +#ifndef ASYNC + error_abort_all(GEN_EXIT_ERR, "VIADEV_RNDV_PROTOCOL " + "must be either \"RPUT\", \"RGET\" or \"R3\"" + "-- Support for ASYNC is not available since " + "the MPI library was not compiled with -DASYNC"); + +#endif } else error_abort_all(GEN_EXIT_ERR, "VIADEV_RNDV_PROTOCOL " Modified: mvapich/trunk/mpid/ch_gen2/viapriv.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/viapriv.h 2008-02-01 19:04:12 UTC (rev 1943) +++ mvapich/trunk/mpid/ch_gen2/viapriv.h 2008-02-01 19:20:05 UTC (rev 1944) @@ -702,20 +702,37 @@ #define ASYNC_QPN(_c) ((_c)->async_conn.qp->qp_num) #define ASYNC_QP_LID(_c) +#ifdef ASYNC #define ASYNC_THREAD_LOCK { \ if(VIADEV_UNLIKELY(VIADEV_PROTOCOL_ASYNC == viadev_rndv_protocol)) { \ pthread_spin_lock(&viadev.async_info.lock); \ } \ } - #define ASYNC_THREAD_UNLOCK { \ if(VIADEV_UNLIKELY(VIADEV_PROTOCOL_ASYNC == viadev_rndv_protocol)) { \ pthread_spin_unlock(&viadev.async_info.lock); \ RMB(); \ } \ } +#define QLOCK_LOCK() { \ + if (VIADEV_UNLIKELY(qlock_needed)) { \ + pthread_spin_lock(&qlock); \ + } \ +} +#define QLOCK_UNLOCK() { \ + if (VIADEV_UNLIKELY(qlock_needed)) { \ + pthread_spin_unlock(&qlock); \ + } \ +} +#else +#define ASYNC_THREAD_LOCK +#define ASYNC_THREAD_UNLOCK +#define QLOCK_LOCK() +#define QLOCK_UNLOCK() +#endif + 
#define FLUSH_EXT_SQUEUE(_c) { \ if(VIADEV_UNLIKELY((_c)->send_wqes_avail && (_c)->ext_sendq_head)) { \ viadev_ext_sendq_send(_c); \ @@ -832,18 +849,7 @@ c->rdma_read_tail = r; \ } -#define QLOCK_LOCK() { \ - if (VIADEV_UNLIKELY(qlock_needed)) { \ - pthread_spin_lock(&qlock); \ - } \ -} -#define QLOCK_UNLOCK() { \ - if (VIADEV_UNLIKELY(qlock_needed)) { \ - pthread_spin_unlock(&qlock); \ - } \ -} - #define VIADEV_EAGER_OK(_len, _c) \ (VIADEV_LIKELY(viadev_use_srq) ? \ (VIADEV_LIKELY((_c)->ext_sendq_size <= \ Modified: mvapich/trunk/mvapich.conf =================================================================== --- mvapich/trunk/mvapich.conf 2008-02-01 19:04:12 UTC (rev 1943) +++ mvapich/trunk/mvapich.conf 2008-02-01 19:20:05 UTC (rev 1944) @@ -101,10 +101,14 @@ # # Options are: # - RPUT : Send large messages using RDMA write operations (zero-copy) -# - RGET : Allows for more overlap (zero-copy) +# - RGET : Potentially allows for more overlap (zero-copy) # - R3 : Sends messages without registering memory by using a copy-based approach # - ASYNC : Uses an RGET based protocol to achieve asynchronous progress on large # transfers. Currently sets VIADEV_ADAPTIVE_RDMA_THRESHOLD=0. 
+# +# NOTE: ASYNC is only available if the library was compiled with the -DASYNC CFLAG +# (not defined by default) +# #----------------------------------------------------------------------------------------- # VIADEV_MAX_RDMA_SIZE=1048576 # From mamidala at mvapich.cse.ohio-state.edu Fri Feb 1 18:06:12 2008 From: mamidala at mvapich.cse.ohio-state.edu (mamidala@mvapich.cse.ohio-state.edu) Date: Fri Feb 1 18:06:23 2008 Subject: [mvapich-commit] r1946 - in mvapich/trunk: mpid/ch_gen2 src/coll src/context src/env Message-ID: <200802012306.m11N6CKX001110@mvapich.cse.ohio-state.edu> Author: mamidala Date: 2008-02-01 18:06:11 -0500 (Fri, 01 Feb 2008) New Revision: 1946 Modified: mvapich/trunk/mpid/ch_gen2/coll_shmem.h mvapich/trunk/mpid/ch_gen2/comm.h mvapich/trunk/mpid/ch_gen2/shmem_coll.c mvapich/trunk/src/coll/intra_fns_new.c mvapich/trunk/src/context/create_2level_comm.c mvapich/trunk/src/env/initutil.c Log: checking in two main changes: 1) New Allgather algorithm obtained from TACC experience when processes are distributed in block fashion. 2) SGI changes for multiple communicators and shared memory collectives to use optimized algorithm instead of falling back to default. 
Modified: mvapich/trunk/mpid/ch_gen2/coll_shmem.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/coll_shmem.h 2008-02-01 23:04:07 UTC (rev 1945) +++ mvapich/trunk/mpid/ch_gen2/coll_shmem.h 2008-02-01 23:06:11 UTC (rev 1946) @@ -93,7 +93,7 @@ #define SHMEM_COLL_OFFSET (shmem_coll_blocks * SHMEM_COLL_BLOCK_SIZE) #define FLAG_TYPES 6 #define SHMEM_COLL_BUF_SIZE (shmem_coll_blocks * SHMEM_COLL_BLOCK_SIZE + sizeof(shmem_coll_region) \ - + FLAG_TYPES*shmem_coll_num_comm*smpi.num_local_nodes) + + FLAG_TYPES*shmem_coll_num_comm*smpi.num_local_nodes + shmem_coll_blocks) /* the shared area itself */ typedef struct { @@ -115,6 +115,7 @@ int **root_complete_gather; int **barrier_gather; int **barrier_bcast; + char *shmem_avail; char* shmem_coll_buf; }; Modified: mvapich/trunk/mpid/ch_gen2/comm.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/comm.h 2008-02-01 23:04:07 UTC (rev 1945) +++ mvapich/trunk/mpid/ch_gen2/comm.h 2008-02-01 23:06:11 UTC (rev 1946) @@ -174,6 +174,10 @@ void* bcast_mmap_ptr; char* bcast_shmem_file; int bcast_seg_size; + int allg_cyclic_ok; + MPI_Group new_group; + MPI_Comm new_comm; + int* new_ranks; #endif }; Modified: mvapich/trunk/mpid/ch_gen2/shmem_coll.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2008-02-01 23:04:07 UTC (rev 1945) +++ mvapich/trunk/mpid/ch_gen2/shmem_coll.c 2008-02-01 23:06:11 UTC (rev 1946) @@ -193,6 +193,9 @@ (int*)(shmem_coll_obj.barrier_bcast[0] + j*smpi.num_local_nodes); } + shmem_coll_obj.shmem_avail = (char*)(shmem_coll_obj.child_complete_bcast[0] + + 6*shmem_coll_num_comm*smpi.num_local_nodes); + if (smpi.my_local_id == 0){ memset(shmem_coll_obj.mmap_ptr, 0, shmem_coll_size); for(j=0; j < shmem_coll_num_comm; j++){ @@ -203,12 +206,17 @@ shmem_coll_obj.root_complete_gather[j][i] = 1; } } - pthread_spin_init(&shmem_coll->shmem_coll_lock,0); + for 
(j=0;j<shmem_coll_blocks;j++){ + shmem_coll_obj.shmem_avail[j]=1; + } + pthread_spin_init(&shmem_coll->shmem_coll_lock,0); } - shmem_coll_obj.shmem_coll_buf = - (char*)(shmem_coll_obj.child_complete_bcast[0] + 6*shmem_coll_num_comm*smpi.num_local_nodes); + + shmem_coll_obj.shmem_coll_buf = (char*)(shmem_coll_obj.child_complete_bcast[0] + + 6*shmem_coll_num_comm*smpi.num_local_nodes + shmem_coll_blocks); + return MPI_SUCCESS; } Modified: mvapich/trunk/src/coll/intra_fns_new.c =================================================================== --- mvapich/trunk/src/coll/intra_fns_new.c 2008-02-01 23:04:07 UTC (rev 1945) +++ mvapich/trunk/src/coll/intra_fns_new.c 2008-02-01 23:06:11 UTC (rev 1946) @@ -76,7 +76,7 @@ /* Knomial Bcast */ int bcast_knomial_degree = 4; - +extern int viadev_use_allgather_new; #ifdef MCST_SUPPORT #define UD_MTU_SIZE 2048 #define UD_HEADER 40 @@ -2854,6 +2854,7 @@ int rdma_allgather_enabled, is_contig, sendtype_size; void *send_mybuf, *recv_mybuf; int mylen; + struct MPIR_COMMUNICATOR *new_comm_ptr; MPID_Msgrep_t msgrep = MPID_MSGREP_RECEIVER; MPID_Msg_pack_t msgact = MPID_MSG_OK; @@ -2874,6 +2875,24 @@ nbytes = sendtype_size * sendcount; MPIR_Datatype_iscontig(sendtype->self, &is_contig); + if (viadev_use_allgather_new && comm->allg_cyclic_ok && (recvcount*size*type_size < MPIR_ALLGATHER_LONG_MSG) + && is_contig){ + new_comm_ptr = MPIR_GET_COMM_PTR(comm->new_comm); + void* tmp_recv_buf; + MPIR_ALLOC(tmp_recv_buf, (void *)MALLOC(nbytes*size), comm, MPI_ERR_EXHAUSTED, "MPI_ALLGATHER"); + + intra_Allgather (sendbuf, sendcount, sendtype, tmp_recv_buf, recvcount,recvtype,new_comm_ptr); + + for (i=0; i<size; i++){ + memcpy((void*)((char*)recvbuf + (comm->new_ranks[i])*nbytes), + (char*)tmp_recv_buf+ i*nbytes, nbytes); + } + FREE(tmp_recv_buf); + return mpi_errno; + } + + + /* check if comm_size is a power of two */ pof2 = 1; while (pof2 < size) Modified: mvapich/trunk/src/context/create_2level_comm.c =================================================================== --- mvapich/trunk/src/context/create_2level_comm.c 2008-02-01 23:04:07 UTC (rev 1945) +++ 
mvapich/trunk/src/context/create_2level_comm.c 2008-02-01 23:06:11 UTC (rev 1946) @@ -40,7 +40,10 @@ int shmem_comm_count = 0; extern shmem_coll_region *shmem_coll; extern int disable_shmem_bcast; +extern int viadev_use_allgather_new; +extern struct shmem_coll_mgmt shmem_coll_obj; + void clear_2level_comm (struct MPIR_COMMUNICATOR* comm_ptr) { comm_ptr->shmem_coll_ok = 0; @@ -50,6 +53,10 @@ comm_ptr->bcast_shmem_file = NULL; comm_ptr->bcast_fd = -1; comm_ptr->bcast_index = 0; + comm_ptr->allg_cyclic_ok = 0; + comm_ptr->new_comm = MPI_COMM_NULL; + comm_ptr->leader_comm = MPI_COMM_NULL; + comm_ptr->shmem_comm = MPI_COMM_NULL; } void free_2level_comm (struct MPIR_COMMUNICATOR* comm_ptr) @@ -66,7 +73,27 @@ if (comm_ptr->bcast_shmem_file){ free(comm_ptr->bcast_shmem_file); } + + if (comm_ptr->new_comm != MPI_COMM_NULL){ + MPI_Comm_free(&(comm_ptr->new_comm)); + MPI_Group_free(&(comm_ptr->new_group)); + free(comm_ptr->new_ranks); + } + if (comm_ptr->leader_comm != MPI_COMM_NULL) { + MPI_Comm_free(&(comm_ptr->leader_comm)); + } + + if (comm_ptr->shmem_comm != MPI_COMM_NULL) { + struct MPIR_COMMUNICATOR* shmem_ptr; + shmem_ptr= MPIR_GET_COMM_PTR(comm_ptr->shmem_comm); + pthread_spin_lock(&shmem_coll->shmem_coll_lock); + shmem_coll_obj.shmem_avail[shmem_ptr->shmem_comm_rank] = 1; + pthread_spin_unlock(&shmem_coll->shmem_coll_lock); + MPI_Comm_free(&(comm_ptr->shmem_comm)); + } + + clear_2level_comm(comm_ptr); } @@ -100,7 +127,6 @@ /* Creating leader group */ int leader = 0; leader = shmem_group[0]; - free(shmem_group); /* Gives the mapping to any process's leader in comm */ comm_ptr->leader_map = malloc(sizeof(int) * size); @@ -163,16 +189,24 @@ if (my_local_id == 0){ pthread_spin_lock(&shmem_coll->shmem_coll_lock); - shmem_coll->shmem_comm_count++; - shmem_comm_count = shmem_coll->shmem_comm_count; + shmem_comm_count = shmem_coll_blocks; + + for (i=0; i < shmem_coll_blocks;i++){ + if (shmem_coll_obj.shmem_avail[i]== 1){ + shmem_comm_count = i; + 
shmem_coll_obj.shmem_avail[i] = 0; + break; + } + } + pthread_spin_unlock(&shmem_coll->shmem_coll_lock); } MPI_Bcast (&shmem_comm_count, 1, MPI_INT, 0, comm_ptr->shmem_comm); - if (shmem_comm_count <= shmem_coll_blocks){ - shmem_ptr->shmem_comm_rank = shmem_comm_count-1; + if (shmem_comm_count < shmem_coll_blocks){ + shmem_ptr->shmem_comm_rank = shmem_comm_count; input_flag = 1; } else{ @@ -182,20 +216,64 @@ MPI_Allreduce(&input_flag, &output_flag, 1, MPI_INT, MPI_LAND, comm_ptr->self); comm_ptr->bcast_shmem_file = NULL; +#if 1 + if (viadev_use_allgather_new){ + int is_contig =1, check_leader =1, check_size=1, is_local_ok=0,is_block=0; + int PPN; + MPI_Bcast (&leader_rank, 1, MPI_INT, 0, comm_ptr->shmem_comm); + + for ( i=1; i < shmem_grp_size; i++ ){ + if (shmem_group[i] != shmem_group[i-1]+1){ + is_contig =0; + break; + } + } + + if (leader != (shmem_grp_size*leader_rank)){ + check_leader=0; + } + + if (shmem_grp_size != (size/leader_group_size)){ + check_size=0; + } + + is_local_ok = is_contig && check_leader && check_size; + + MPI_Allreduce(&is_local_ok, &is_block, 1, MPI_INT, MPI_LAND, comm_ptr->self); + + if (is_block){ + int counter=0,j; + MPI_Comm new_comm; + comm_ptr->new_ranks = (int*) malloc(sizeof(int)*size); + + PPN = shmem_grp_size; + for (j=0; j < PPN; j++){ + for (i=0; i < leader_group_size; i++){ + comm_ptr->new_ranks[counter] = j + i*PPN; + counter++; + } + } + + MPI_Group_incl(comm_group, size, comm_ptr->new_ranks, &(comm_ptr->new_group)); + MPI_Comm_create(comm_ptr->self, comm_ptr->new_group, &(comm_ptr->new_comm)); + comm_ptr->allg_cyclic_ok=1; + + } + } +#endif + if (output_flag == 1){ comm_ptr->shmem_coll_ok = 1; } else{ comm_ptr->shmem_coll_ok = 0; - free_2level_comm(comm_ptr); - if (comm_ptr->leader_comm) { MPI_Comm_free(&(comm_ptr->leader_comm));} - if (comm_ptr->shmem_comm) { MPI_Comm_free(&(comm_ptr->shmem_comm));} - MPI_Group_free(&subgroup1); - MPI_Group_free(&comm_group); } + MPI_Group_free(&subgroup1); + MPI_Group_free(&comm_group); 
++comm_count; + free(shmem_group); } int check_comm_registry(struct MPIR_COMMUNICATOR* comm) Modified: mvapich/trunk/src/env/initutil.c =================================================================== --- mvapich/trunk/src/env/initutil.c 2008-02-01 23:04:07 UTC (rev 1945) +++ mvapich/trunk/src/env/initutil.c 2008-02-01 23:06:11 UTC (rev 1946) @@ -165,6 +165,7 @@ #endif #endif extern int bcast_knomial_degree; +int viadev_use_allgather_new =1; int enable_rdma_collectives = 0; /* MPIR_Init - Initialize the MPI execution environment @@ -313,6 +314,13 @@ } } #endif + if ((value = getenv("VIADEV_USE_ALLGATHER_NEW")) != NULL){ + viadev_use_allgather_new = atoi(value); + } + + if (enable_shmem_collectives == 0){ + viadev_use_allgather_new = 0; + } /* If we wanted to be able to check if we're being debugged, * (so that we could explicitly request that the other processes From noronha at mvapich.cse.ohio-state.edu Fri Feb 1 18:04:09 2008 From: noronha at mvapich.cse.ohio-state.edu (noronha@mvapich.cse.ohio-state.edu) Date: Fri Feb 1 21:09:42 2008 Subject: [mvapich-commit] r1945 - in mvapich/trunk/romio: . 
adio adio/ad_lustre adio/common adio/include Message-ID: <200802012304.m11N491A001098@mvapich.cse.ohio-state.edu> Author: noronha Date: 2008-02-01 18:04:07 -0500 (Fri, 01 Feb 2008) New Revision: 1945 Added: mvapich/trunk/romio/adio/ad_lustre/ mvapich/trunk/romio/adio/ad_lustre/Makefile.in mvapich/trunk/romio/adio/ad_lustre/README mvapich/trunk/romio/adio/ad_lustre/ad_lustre.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre.h mvapich/trunk/romio/adio/ad_lustre/ad_lustre_close.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_fcntl.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_hints.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_open.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rdcoll.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rwcontig.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_wrcoll.c Removed: mvapich/trunk/romio/adio/ad_lustre/Makefile.in mvapich/trunk/romio/adio/ad_lustre/README mvapich/trunk/romio/adio/ad_lustre/ad_lustre.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre.h mvapich/trunk/romio/adio/ad_lustre/ad_lustre_close.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_fcntl.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_hints.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_open.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rdcoll.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rwcontig.c mvapich/trunk/romio/adio/ad_lustre/ad_lustre_wrcoll.c Modified: mvapich/trunk/romio/Makefile.in mvapich/trunk/romio/adio/common/ad_aggregate.c mvapich/trunk/romio/adio/common/ad_fstype.c mvapich/trunk/romio/adio/common/ad_read_coll.c mvapich/trunk/romio/adio/common/ad_write_coll.c mvapich/trunk/romio/adio/common/iscontig.c mvapich/trunk/romio/adio/include/adio.h mvapich/trunk/romio/adio/include/adioi.h mvapich/trunk/romio/adio/include/adioi_fs_proto.h mvapich/trunk/romio/adio/include/mpio_error.h mvapich/trunk/romio/adio/include/romioconf.h.in mvapich/trunk/romio/configure mvapich/trunk/romio/configure.in Log: Check-in ad_lustre device. 
Currently BTIO small size, nprocs=1 and 4, subtype=full have been tested. Modified: mvapich/trunk/romio/Makefile.in =================================================================== --- mvapich/trunk/romio/Makefile.in 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/Makefile.in 2008-02-01 23:04:07 UTC (rev 1945) @@ -14,7 +14,7 @@ MPIO_DIRS = mpi-io EXTRA_SRC_DIRS = @EXTRA_SRC_DIRS@ FILE_SYS_DIRS = @FILE_SYS_DIRS@ -ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 adio/ad_panfs adio/ad_gridftp test +ALL_DIRS = mpi-io mpi-io/fortran mpi2-other/info mpi2-other/info/fortran mpi2-other/array mpi2-other/array/fortran adio/common adio/ad_pfs adio/ad_piofs adio/ad_nfs adio/ad_ufs adio/ad_xfs adio/ad_hfs adio/ad_sfs adio/ad_testfs adio/ad_pvfs adio/ad_pvfs2 adio/ad_panfs adio/ad_gridftp adio/ad_lustre test SHELL = /bin/sh @VPATH@ Copied: mvapich/trunk/romio/adio/ad_lustre (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre) Deleted: mvapich/trunk/romio/adio/ad_lustre/Makefile.in =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/Makefile.in 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/Makefile.in 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,49 +0,0 @@ -CC = @CC@ -AR = @AR@ -RANLIB = @RANLIB@ -LIBNAME = @LIBNAME@ -srcdir = @srcdir@ -CC_SHL = @CC_SHL@ -SHLIBNAME = @SHLIBNAME@ - -INCLUDE_DIR = -I@MPI_INCLUDE_DIR@ -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include -CFLAGS = @CPPFLAGS@ @CFLAGS@ $(INCLUDE_DIR) - -top_builddir = @master_topbuild_dir@ -LIBTOOL = @LIBTOOL@ -C_COMPILE_SHL = $(CC_SHL) @CFLAGS@ $(INCLUDE_DIR) - -@VPATH@ - -AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \ - 
ad_lustre_rwcontig.o \ - ad_lustre_wrcoll.o ad_lustre_rdcoll.o \ - ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o - -default: $(LIBNAME) - @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ - $(MAKE) $(SHLIBNAME).la ;\ - fi - -.SUFFIXES: $(SUFFIXES) .p .lo - -.c.o: - $(CC) $(CFLAGS) -c $< -.c.lo: - $(C_COMPILE_SHL) -c $< -o _s$*.o - @mv -f _s$*.o $*.lo - -$(LIBNAME): $(AD_LUSTRE_OBJECTS) - $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS) - $(RANLIB) $(LIBNAME) - -AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo) -$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS) - $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS) - -coverage: - -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \ - gcov -b -f $$file ; done - -clean: - @rm -f *.o *.lo Copied: mvapich/trunk/romio/adio/ad_lustre/Makefile.in (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/Makefile.in) Deleted: mvapich/trunk/romio/adio/ad_lustre/README =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/README 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/README 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,64 +0,0 @@ ------------------------------------------------------ -V03: ------------------------------------------------------ - o Correct detection of fs_type when lustre: prefix is not given - o Further fix on stripe alignment - o Tested/Enabled striping hints over Cray XT (Catamount and CNL) - ------------------------------------------------------ -V02: ------------------------------------------------------ -The Lustre ADIO driver has been cleaned up quite a lot. 
Compared -to the initial posting, here are the changes: - o Removal of dead/redundant code - o Removal of asynchronous IO piece as it appears outdated - o Bug fixes for setting Lustre Hints - o Bug fixes for data sieving - o Improved Setsize operation with one process calling ftruncate - o Improved collective IO with domain partitioning on - Lustre stripe boundary - ------------------------------------------------------ -FAQ: - Q: How do I configure the Lustre ADIO driver? - A: Please include these options when configuring MVAPICH: - --with-romio --with-file-system=lustre - - Q: What if I need to enable Lustre support along with others? - A: You can add support for more file systems as follows: - --with-romio --with-file-system=lustre+nfs+pvfs2 - - Q: How do I run a program using the Lustre ADIO driver? - A: First, please make sure you are on a Lustre file system. - Second, if you have enabled support for multiple file systems, - please give a prefix "lustre:" to the name of the file being accessed. - For example: - # mpicc -o perf romio/test/perf.c - # bin/mpirun_rsh -np 2 node01 node02 ./perf -fname lustre:testfile - - Q: My program bails out with this error: - **io No such file or directory*? - A: Please make sure the following conditions are true: - -- you are running on the correct file system - -- you have given a correct pathname for the file - -- you have set the correct permission for the file or directory - - Q: My program segfaults with this error: - File locking failed in ADIOI_Set_lock? - A: Recent lustre releases require an additional mount option to have - correct file locks. So please insert this option into - your lustre mount command: "-o localflock". - For example: - # mount -o localflock -t lustre xxxx@o2ib:/datafs /mnt/datafs - ------------------------------------------------------ - -Contributing: - o You may contribute in many different ways, such as - testing results, bug reports, and new feature patches.
- o We appreciate any courtesy reference of this work. - o Disclaimer: you are welcome to try the code, but at your own risk. - -Contact info: - For more info, visit http://ft.ornl.gov/projects/io/ - Copied: mvapich/trunk/romio/adio/ad_lustre/README (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/README) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,35 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 2001 University of Chicago. - * See COPYRIGHT notice in top-level directory. - * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" - -struct ADIOI_Fns_struct ADIO_LUSTRE_operations = { - ADIOI_LUSTRE_Open, /* Open */ - ADIOI_LUSTRE_ReadContig, /* ReadContig */ - ADIOI_LUSTRE_WriteContig, /* WriteContig */ - ADIOI_LUSTRE_ReadStridedColl, /* ReadStridedColl */ - ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */ - ADIOI_GEN_SeekIndividual, /* SeekIndividual */ - ADIOI_LUSTRE_Fcntl, /* Fcntl */ - ADIOI_LUSTRE_SetInfo, /* SetInfo */ - ADIOI_GEN_ReadStrided, /* ReadStrided */ - ADIOI_GEN_WriteStrided, /* WriteStrided */ - ADIOI_LUSTRE_Close, /* Close */ - ADIOI_GEN_IreadContig, /* IreadContig */ - ADIOI_GEN_IwriteContig, /* IwriteContig */ - ADIOI_GEN_IODone, /* ReadDone */ - ADIOI_GEN_IODone, /* WriteDone */ - ADIOI_GEN_IOComplete, /* ReadComplete */ - ADIOI_GEN_IOComplete, /* WriteComplete */ - ADIOI_GEN_IreadStrided, /* IreadStrided */ - ADIOI_GEN_IwriteStrided, /* IwriteStrided */ - ADIOI_GEN_Flush, /* Flush */ - ADIOI_GEN_Resize, /* Resize */ - ADIOI_GEN_Delete, /* Delete */ -}; Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre.c (from rev 1944, 
mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre.c) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre.h =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre.h 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre.h 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,35 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. - * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#ifndef AD_UNIX_INCLUDE -#define AD_UNIX_INCLUDE - -/* temp*/ -#define HAVE_ASM_TYPES_H 1 - -#include -#include -#include -#include -#include "lustre/lustre_user.h" -#include "adio.h" -/*#include "adioi.h"*/ - -#ifdef HAVE_SIGNAL_H -#include -#endif - -#ifdef HAVE_AIO_H -#include -#ifdef HAVE_SYS_AIO_H -#include -#endif -#endif /* End of HAVE_SYS_AIO_H */ - -#endif /* End of AD_UNIX_INCLUDE */ Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre.h (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre.h) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_close.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_close.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre_close.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,41 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. 
- * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" - -#ifdef PROFILE -#include "mpe.h" -#endif - -void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) -{ - int err, derr=0; - static char myname[] = "ADIOI_LUSTRE_CLOSE"; - -#ifdef PROFILE - MPE_Log_event(9, 0, "start close"); -#endif - - err = close(fd->fd_sys); - -#ifdef PROFILE - MPE_Log_event(10, 0, "end close"); -#endif - - fd->fd_sys = -1; - - if (err == -1 || derr == -1) { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**io", - "**io %s", strerror(errno)); - } - else *error_code = MPI_SUCCESS; -} Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_close.c (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_close.c) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_fcntl.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_fcntl.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre_fcntl.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,98 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. 
- * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" -#include "adio_extern.h" - -void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, int *error_code) -{ - int i, ntimes; - ADIO_Offset curr_fsize, alloc_size, size, len, done; - ADIO_Status status; - char *buf; -#if defined(MPICH2) || !defined(PRINT_ERR_MSG) - static char myname[] = "ADIOI_LUSTRE_FCNTL"; -#endif - - switch(flag) { - case ADIO_FCNTL_GET_FSIZE: - fcntl_struct->fsize = lseek(fd->fd_sys, 0, SEEK_END); - if (fd->fp_sys_posn != -1) - lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); - if (fcntl_struct->fsize == -1) { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, myname, __LINE__, - MPI_ERR_IO, "**io", "**io %s", strerror(errno)); - } - else *error_code = MPI_SUCCESS; - break; - - case ADIO_FCNTL_SET_DISKSPACE: - /* will be called by one process only */ - /* On file systems with no preallocation function, I have to - explicitly write - to allocate space. Since there could be holes in the file, - I need to read up to the current file size, write it back, - and then write beyond that depending on how much - preallocation is needed. - read/write in sizes of no more than ADIOI_PREALLOC_BUFSZ */ - - curr_fsize = lseek(fd->fd_sys, 0, SEEK_END); - alloc_size = fcntl_struct->diskspace; - - size = ADIOI_MIN(curr_fsize, alloc_size); - - ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; - buf = (char *) ADIOI_Malloc(ADIOI_PREALLOC_BUFSZ); - done = 0; - - for (i=0; i curr_fsize) { - memset(buf, 0, ADIOI_PREALLOC_BUFSZ); - size = alloc_size - curr_fsize; - ntimes = (size + ADIOI_PREALLOC_BUFSZ - 1)/ADIOI_PREALLOC_BUFSZ; - for (i=0; ifp_sys_posn != -1) - lseek(fd->fd_sys, fd->fp_sys_posn, SEEK_SET); - *error_code = MPI_SUCCESS; - break; - - case ADIO_FCNTL_SET_ATOMICITY: - fd->atomicity = (fcntl_struct->atomicity == 0) ? 
0 : 1; - *error_code = MPI_SUCCESS; - break; - - default: - FPRINTF(stderr, "Unknown flag passed to ADIOI_LUSTRE_Fcntl\n"); - MPI_Abort(MPI_COMM_WORLD, 1); - } -} Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_fcntl.c (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_fcntl.c) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_hints.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_hints.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre_hints.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,123 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. - * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" - -void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) -{ - char *value, *value_in_fd; - int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1; - struct lov_user_md lum = { 0 }; - int err, myrank, fd_sys, perm, amode, old_mask; - - if ( (fd->info) == MPI_INFO_NULL) { - /* This must be part of the open call. can set striping parameters - if necessary. */ - MPI_Info_create(&(fd->info)); - - /* has user specified striping or server buffering parameters - and do they have the same value on all processes? 
*/ - if (users_info != MPI_INFO_NULL) { - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); - - MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, - value, &flag); - if (flag) { - str_unit=atoi(value); - } - - MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, - value, &flag); - if (flag) { - str_factor=atoi(value); - } - - MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, - value, &flag); - if (flag) { - start_iodev=atoi(value); - } - - ADIOI_Free(value); - } - - MPI_Comm_rank(fd->comm, &myrank); - if (myrank == 0) { - tmp_val[0] = str_factor; - tmp_val[1] = str_unit; - tmp_val[2] = start_iodev; - } - MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm); - - if (tmp_val[0] != str_factor - || tmp_val[1] != str_unit - || tmp_val[2] != start_iodev) { - FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" - "-striping_factor:striping_unit:start_iodevice " - "need to be identical across all processes\n"); - MPI_Abort(MPI_COMM_WORLD, 1); - } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { - /* if user has specified striping info, process 0 tries to set it */ - if (!myrank) { - if (fd->perm == ADIO_PERM_NULL) { - old_mask = umask(022); - umask(old_mask); - perm = old_mask ^ 0666; - } - else perm = fd->perm; - - amode = 0; - if (fd->access_mode & ADIO_CREATE) - amode = amode | O_CREAT; - if (fd->access_mode & ADIO_RDONLY) - amode = amode | O_RDONLY; - if (fd->access_mode & ADIO_WRONLY) - amode = amode | O_WRONLY; - if (fd->access_mode & ADIO_RDWR) - amode = amode | O_RDWR; - if (fd->access_mode & ADIO_EXCL) - amode = amode | O_EXCL; - - /* we need to create file so ensure this is set */ - amode = amode | O_LOV_DELAY_CREATE | O_CREAT; - - fd_sys = open(fd->filename, amode, perm); - if (fd_sys == -1) { - if (errno != EEXIST) - fprintf(stderr, - "Failure to open file %s %d %d\n",strerror(errno), amode, perm); - } else { - lum.lmm_magic = LOV_USER_MAGIC; - lum.lmm_pattern = 0; - lum.lmm_stripe_size = str_unit; - 
lum.lmm_stripe_count = str_factor; - lum.lmm_stripe_offset = start_iodev; - - err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); - if (err == -1 && errno != EEXIST) { - fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); - } - close(fd_sys); - } - } /* End of striping parameters validation */ - } - - MPI_Barrier(fd->comm); - /* set the values for collective I/O and data sieving parameters */ - ADIOI_GEN_SetInfo(fd, users_info, error_code); - } else { - /* The file has been opened previously and fd->fd_sys is a valid - file descriptor. cannot set striping parameters now. */ - - /* set the values for collective I/O and data sieving parameters */ - ADIOI_GEN_SetInfo(fd, users_info, error_code); - } - - *error_code = MPI_SUCCESS; -} Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_hints.c (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_hints.c) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_open.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_open.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre_open.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,121 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. 
- * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" - -void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) -{ - int perm, old_mask, amode; - struct lov_user_md lum = { 0 }; - char *value; - -#if defined(MPICH2) || !defined(PRINT_ERR_MSG) - static char myname[] = "ADIOI_LUSTRE_OPEN"; -#endif - - if (fd->perm == ADIO_PERM_NULL) { - old_mask = umask(022); - umask(old_mask); - perm = old_mask ^ 0666; - } - else perm = fd->perm; - - amode = 0; - if (fd->access_mode & ADIO_CREATE) - amode = amode | O_CREAT; - if (fd->access_mode & ADIO_RDONLY) - amode = amode | O_RDONLY; - if (fd->access_mode & ADIO_WRONLY) - amode = amode | O_WRONLY; - if (fd->access_mode & ADIO_RDWR) - amode = amode | O_RDWR; - if (fd->access_mode & ADIO_EXCL) - amode = amode | O_EXCL; - - fd->fd_sys = open(fd->filename, amode, perm); - - if (fd->fd_sys != -1) { - int err; - - value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); - - /* get file striping information and set it in info */ - lum.lmm_magic = LOV_USER_MAGIC; - err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); - - if (!err) { - sprintf(value, "%d", lum.lmm_stripe_size); - MPI_Info_set(fd->info, "striping_unit", value); - - sprintf(value, "%d", lum.lmm_stripe_count); - MPI_Info_set(fd->info, "striping_factor", value); - - sprintf(value, "%d", lum.lmm_stripe_offset); - MPI_Info_set(fd->info, "start_iodevice", value); - } - ADIOI_Free(value); - - if (fd->access_mode & ADIO_APPEND) - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); - } - - if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) - fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); - - /* --BEGIN ERROR HANDLING-- */ - if (fd->fd_sys == -1) { - if (errno == ENAMETOOLONG) - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, myname, - __LINE__, MPI_ERR_BAD_FILE, - "**filenamelong", - "**filenamelong %s %d", - 
fd->filename, - strlen(fd->filename)); - else if (errno == ENOENT) - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, myname, - __LINE__, MPI_ERR_NO_SUCH_FILE, - "**filenoexist", - "**filenoexist %s", - fd->filename); - else if (errno == ENOTDIR || errno == ELOOP) - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, - myname, __LINE__, - MPI_ERR_BAD_FILE, - "**filenamedir", - "**filenamedir %s", - fd->filename); - else if (errno == EACCES) { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, myname, - __LINE__, MPI_ERR_ACCESS, - "**fileaccess", - "**fileaccess %s", - fd->filename ); - } - else if (errno == EROFS) { - /* Read only file or file system and write access requested */ - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, myname, - __LINE__, MPI_ERR_READ_ONLY, - "**ioneedrd", 0 ); - } - else { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, myname, - __LINE__, MPI_ERR_IO, "**io", - "**io %s", strerror(errno)); - } - } - /* --END ERROR HANDLING-- */ - else *error_code = MPI_SUCCESS; - -} Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_open.c (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_open.c) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rdcoll.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_rdcoll.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rdcoll.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,19 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. 
- * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" - -void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, - MPI_Datatype datatype, int file_ptr_type, - ADIO_Offset offset, ADIO_Status *status, int - *error_code) -{ - ADIOI_GEN_ReadStridedColl(fd, buf, count, datatype, file_ptr_type, - offset, status, error_code); -} Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rdcoll.c (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_rdcoll.c) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rwcontig.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_rwcontig.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rwcontig.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,86 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. 
- * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" - -static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count, - MPI_Datatype datatype, int file_ptr_type, - ADIO_Offset offset, ADIO_Status *status, - int io_mode, int *error_code); - -static void ADIOI_LUSTRE_IOContig(ADIO_File fd, void *buf, int count, - MPI_Datatype datatype, int file_ptr_type, - ADIO_Offset offset, ADIO_Status *status, - int io_mode, int *error_code) -{ - int err=-1, datatype_size, len; -#if defined(MPICH2) || !defined(PRINT_ERR_MSG) - static char myname[] = "ADIOI_LUSTRE_IOCONTIG"; -#endif - - MPI_Type_size(datatype, &datatype_size); - len = datatype_size * count; - - if (file_ptr_type == ADIO_INDIVIDUAL) { - offset = fd->fp_ind; - } - - if (fd->fp_sys_posn != offset) { - err = lseek(fd->fd_sys, offset, SEEK_SET); - if (err == -1) goto ioerr; - } - - if (io_mode) - err = write(fd->fd_sys, buf, len); - else - err = read(fd->fd_sys, buf, len); - - if (err == -1) goto ioerr; - fd->fp_sys_posn = offset + err; - - if (file_ptr_type == ADIO_INDIVIDUAL) { - fd->fp_ind += err; - } - -#ifdef HAVE_STATUS_SET_BYTES - if (status) MPIR_Status_set_bytes(status, datatype, err); -#endif - *error_code = MPI_SUCCESS; - -ioerr: - /* --BEGIN ERROR HANDLING-- */ - if (err == -1) { - *error_code = MPIO_Err_create_code(MPI_SUCCESS, - MPIR_ERR_RECOVERABLE, - myname, __LINE__, - MPI_ERR_IO, "**io", - "**io %s", strerror(errno)); - fd->fp_sys_posn = -1; - return; - } - /* --END ERROR HANDLING-- */ -} - -void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, - MPI_Datatype datatype, int file_ptr_type, - ADIO_Offset offset, ADIO_Status *status, int *error_code) -{ - ADIOI_LUSTRE_IOContig(fd, buf, count, - datatype, file_ptr_type, - offset, status, 1, error_code); -} - -void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, - MPI_Datatype datatype, int file_ptr_type, - ADIO_Offset offset, 
ADIO_Status *status, int *error_code) -{ - ADIOI_LUSTRE_IOContig(fd, buf, count, - datatype, file_ptr_type, - offset, status, 0, error_code); -} Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_rwcontig.c (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_rwcontig.c) Deleted: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_wrcoll.c =================================================================== --- mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_wrcoll.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/ad_lustre/ad_lustre_wrcoll.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -1,19 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (C) 1997 University of Chicago. - * See COPYRIGHT notice in top-level directory. - * - * With contributions from Future Technologies Group, - * Oak Ridge National Laboratory (http://ft.ornl.gov/) - */ - -#include "ad_lustre.h" - -void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, - MPI_Datatype datatype, int file_ptr_type, - ADIO_Offset offset, ADIO_Status *status, int - *error_code) -{ - ADIOI_GEN_WriteStridedColl(fd, buf, count, datatype, file_ptr_type, - offset, status, error_code); -} Copied: mvapich/trunk/romio/adio/ad_lustre/ad_lustre_wrcoll.c (from rev 1944, mvapich/branches/mvapich-lustre/romio/adio/ad_lustre/ad_lustre_wrcoll.c) Modified: mvapich/trunk/romio/adio/common/ad_aggregate.c =================================================================== --- mvapich/trunk/romio/adio/common/ad_aggregate.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/common/ad_aggregate.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -2,6 +2,9 @@ /* * Copyright (C) 1997-2001 University of Chicago. * See COPYRIGHT notice in top-level directory. 
+ * + * With contributions from Future Technologies Group, + * Oak Ridge National Laboratory (http://ft.ornl.gov/) */ #include "adio.h" @@ -12,6 +15,8 @@ #undef AGG_DEBUG +#define ALIGNDOWN(sz, al) ((sz)/al*al) + /* This file contains four functions: * * ADIOI_Calc_aggregator() @@ -88,8 +93,13 @@ #endif /* get an index into our array of aggregators */ - rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1); + if (fd->file_system == ADIO_LUSTRE) + rank_index = (int) ((off - ALIGNDOWN(min_off, fd_size) + fd_size)/ + fd_size - 1); + else + rank_index = (int) ((off - min_off + fd_size)/ fd_size - 1); + /* remember here that even in Rajeev's original code it was the case that * different aggregators could end up with different amounts of data to * aggregate. here we use fd_end[] to make sure that we know how much @@ -121,6 +131,7 @@ process may directly access only its own file domain. */ ADIO_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, fd_size; + int alignment = *fd_size_ptr; int i; #ifdef AGG_DEBUG @@ -145,6 +156,10 @@ processes */ fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - 1)/nprocs_for_coll; + if (alignment) { + fd_size = (fd_size + alignment -1 ) / alignment * alignment; + } + /* ceiling division as in HPF block distribution */ *fd_start_ptr = (ADIO_Offset *) @@ -156,7 +171,10 @@ fd_end = *fd_end_ptr; fd_start[0] = min_st_offset; - fd_end[0] = min_st_offset + fd_size - 1; + if (alignment) + fd_end[0] = ALIGNDOWN(min_st_offset, fd_size) + fd_size - 1; + else + fd_end[0] = min_st_offset + fd_size - 1; for (i=1; ihints->cb_read == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_read == ADIOI_HINT_AUTO))) { + int filerange_is_contig = 0; + /* don't do aggregation */ if (fd->hints->cb_read != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); @@ -144,8 +149,13 @@ fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + if (!filetype_is_contig) + ADIOI_Filetype_range_iscontig(fd, offset, 
file_ptr_type, + datatype, count, &filerange_is_contig); - if (buftype_is_contig && filetype_is_contig) { + if (buftype_is_contig && (filetype_is_contig || + filerange_is_contig)) { + if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (fd->etype_size) * offset; ADIO_ReadContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, @@ -176,6 +186,19 @@ * needs to be mapped to an actual rank in the communicator later. * */ + if (fd->file_system == ADIO_LUSTRE) { + char *value; + int sflag; + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, + value, &sflag); + if (sflag) + fd_size = atoi(value); + else + fd_size = ADIOI_LUSTRE_STRIPE_DFLT; + ADIOI_Free(value); + } + ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, &fd_end, &fd_size); Modified: mvapich/trunk/romio/adio/common/ad_write_coll.c =================================================================== --- mvapich/trunk/romio/adio/common/ad_write_coll.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/common/ad_write_coll.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -4,6 +4,9 @@ * * Copyright (C) 1997 University of Chicago. * See COPYRIGHT notice in top-level directory. 
+ * + * With contributions from Future Technologies Group, + * Oak Ridge National Laboratory (http://ft.ornl.gov/) */ #include "adio.h" @@ -72,7 +75,7 @@ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank; int contig_access_count, interleave_count = 0, buftype_is_contig; int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; - ADIO_Offset orig_fp, start_offset, end_offset, fd_size, min_st_offset, off; + ADIO_Offset orig_fp, start_offset, end_offset, fd_size=0, min_st_offset, off; ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *fd_start = NULL, *fd_end = NULL, *end_offsets = NULL; int *buf_idx = NULL, *len_list = NULL; @@ -129,6 +132,8 @@ if (fd->hints->cb_write == ADIOI_HINT_DISABLE || (!interleave_count && (fd->hints->cb_write == ADIOI_HINT_AUTO))) { + int filerange_is_contig = 0; + /* use independent accesses */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); @@ -139,8 +144,12 @@ fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); + if (!filetype_is_contig) + ADIOI_Filetype_range_iscontig(fd, offset, file_ptr_type, + datatype, count, &filerange_is_contig); - if (buftype_is_contig && filetype_is_contig) { + if (buftype_is_contig && (filetype_is_contig || + filerange_is_contig)) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (fd->etype_size) * offset; ADIO_WriteContig(fd, buf, count, datatype, @@ -160,6 +169,19 @@ done by (logically) dividing the file into file domains (FDs); each process may directly access only its own file domain. 
*/ + if (fd->file_system == ADIO_LUSTRE) { + char *value; + int sflag; + value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); + MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, + value, &sflag); + if (sflag) + fd_size = atoi(value); + else + fd_size = ADIOI_LUSTRE_STRIPE_DFLT; + ADIOI_Free(value); + } + ADIOI_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, &min_st_offset, &fd_start, &fd_end, &fd_size); Modified: mvapich/trunk/romio/adio/common/iscontig.c =================================================================== --- mvapich/trunk/romio/adio/common/iscontig.c 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/common/iscontig.c 2008-02-01 23:04:07 UTC (rev 1945) @@ -2,9 +2,13 @@ /* * Copyright (C) 1997 University of Chicago. * See COPYRIGHT notice in top-level directory. + * + * With contributions from Future Technologies Group, + * Oak Ridge National Laboratory (http://ft.ornl.gov/) */ #include "adio.h" +#include "adio_extern.h" /* #ifdef MPISGI #include "mpisgi2.h" #endif */ @@ -101,3 +105,85 @@ in other cases as well.*/ } #endif + +void ADIOI_Filetype_range_start(ADIO_File fd, ADIO_Offset offset, int file_ptr_type, + int *start_index, int *start_ftype, int *start_offset, int *start_io_size) +{ + ADIOI_Flatlist_node *flat_file; + ADIO_Offset disp, abs_off_in_filetype=0; + MPI_Aint filetype_extent; + + int i, st_io_size=0, st_index=0; + int sum, n_etypes_in_filetype, size_in_filetype; + int n_filetypes, etype_in_filetype; + int flag, filetype_size, etype_size; + + flat_file = ADIOI_Flatlist; + while (flat_file->type != fd->filetype) flat_file = flat_file->next; + disp = fd->disp; + + MPI_Type_size(fd->filetype, &filetype_size); + MPI_Type_extent(fd->filetype, &filetype_extent); + etype_size = fd->etype_size; + + if (file_ptr_type == ADIO_INDIVIDUAL) { + offset = fd->fp_ind; /* in bytes */ + n_filetypes = -1; + flag = 0; + while (!flag) { + n_filetypes++; + for (i=0; icount; i++) { + if (disp + 
flat_file->indices[i] + + (ADIO_Offset) n_filetypes*filetype_extent + flat_file->blocklens[i] + >= offset) { + st_index = i; + st_io_size = (int) (disp + flat_file->indices[i] + + (ADIO_Offset) n_filetypes*filetype_extent + + flat_file->blocklens[i] - offset); + flag = 1; + break; + } + } + } + } else { + n_etypes_in_filetype = filetype_size/etype_size; + n_filetypes = (int) (offset / n_etypes_in_filetype); + etype_in_filetype = (int) (offset % n_etypes_in_filetype); + size_in_filetype = etype_in_filetype * etype_size; + + sum = 0; + for (i=0; icount; i++) { + sum += flat_file->blocklens[i]; + if (sum > size_in_filetype) { + st_index = i; + st_io_size = sum - size_in_filetype; + abs_off_in_filetype = flat_file->indices[i] + + size_in_filetype - (sum - flat_file->blocklens[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + (ADIO_Offset) n_filetypes*filetype_extent + abs_off_in_filetype; + } + + *start_index = st_index; + *start_io_size = st_io_size; + *start_offset = offset; + *start_ftype = n_filetypes; +} + +void ADIOI_Filetype_range_iscontig(ADIO_File fd, ADIO_Offset offset, + int file_ptr_type, MPI_Datatype datatype, int count, int *flag) +{ + int srclen, datatype_size; + int st_index, st_ftype, st_offset, st_io_size; + + MPI_Type_size(datatype, &datatype_size); + srclen = datatype_size * count; + + ADIOI_Filetype_range_start(fd, offset, file_ptr_type, + &st_index, &st_ftype, &st_offset, &st_io_size); + *flag = st_io_size >= srclen ? 
1 : 0; +} + Modified: mvapich/trunk/romio/adio/include/adio.h =================================================================== --- mvapich/trunk/romio/adio/include/adio.h 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/include/adio.h 2008-02-01 23:04:07 UTC (rev 1945) @@ -293,6 +293,7 @@ #define ADIO_PVFS2 160 /* PVFS2: 2nd generation PVFS */ #define ADIO_PANFS 161 /* Panasas FS */ #define ADIO_GRIDFTP 162 /* Globus GridFTP */ +#define ADIO_LUSTRE 163 /* Lustre */ #define ADIO_SEEK_SET SEEK_SET #define ADIO_SEEK_CUR SEEK_CUR Modified: mvapich/trunk/romio/adio/include/adioi.h =================================================================== --- mvapich/trunk/romio/adio/include/adioi.h 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/include/adioi.h 2008-02-01 23:04:07 UTC (rev 1945) @@ -4,6 +4,9 @@ * * Copyright (C) 1997 University of Chicago. * See COPYRIGHT notice in top-level directory. + * + * With contributions from Future Technologies Group, + * Oak Ridge National Laboratory (http://ft.ornl.gov/) */ @@ -188,6 +191,7 @@ #define ADIOI_PREALLOC_BUFSZ 4194304 /* buffer size used to preallocate disk space */ +#define ADIOI_LUSTRE_STRIPE_DFLT (1<<20) /* default values for some hints */ /* buffer size for collective I/O = 4MB */ @@ -305,6 +309,10 @@ void *ADIOI_Realloc_fn(void *ptr, size_t size, int lineno, char *fname); void ADIOI_Free_fn(void *ptr, int lineno, char *fname); void ADIOI_Datatype_iscontig(MPI_Datatype datatype, int *flag); +void ADIOI_Filetype_range_iscontig(ADIO_File fd, ADIO_Offset offset, + int file_ptr_type, MPI_Datatype datatype, int count, int *flag); +void ADIOI_Filetype_range_start(ADIO_File fd, ADIO_Offset offset, int file_ptr_type, + int *start_index, int *start_ftype, int *start_offset, int *start_io_size); void ADIOI_Get_position(ADIO_File fd, ADIO_Offset *offset); void ADIOI_Get_eof_offset(ADIO_File fd, ADIO_Offset *eof_offset); void ADIOI_Get_byte_offset(ADIO_File fd, ADIO_Offset offset, Modified: 
mvapich/trunk/romio/adio/include/adioi_fs_proto.h =================================================================== --- mvapich/trunk/romio/adio/include/adioi_fs_proto.h 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/include/adioi_fs_proto.h 2008-02-01 23:04:07 UTC (rev 1945) @@ -50,6 +50,32 @@ /* prototypes are in adio/ad_sfs/ad_sfs.h */ #endif +#ifdef ROMIO_LUSTRE +extern struct ADIOI_Fns_struct ADIO_LUSTRE_operations; + +void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); +void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); +void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int + *error_code); +void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int + *error_code); +void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int + *error_code); +void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status *status, int + *error_code); +void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, + int *error_code); +void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); +#endif + #ifdef ROMIO_NTFS extern struct ADIOI_Fns_struct ADIO_NTFS_operations; /* prototypes are in adio/ad_ntfs/ad_ntfs.h */ Modified: mvapich/trunk/romio/adio/include/mpio_error.h =================================================================== --- mvapich/trunk/romio/adio/include/mpio_error.h 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/include/mpio_error.h 2008-02-01 23:04:07 UTC (rev 1945) @@ -63,6 +63,7 @@ #define MPIR_ERR_FILETYPE 33 #define MPIR_ERR_NO_NTFS 35 #define MPIR_ERR_NO_TESTFS 36 +#define MPIR_ERR_NO_LUSTRE 37 
/* MPI_ERR_COMM */ #ifndef MPIR_ERR_COMM_NULL Modified: mvapich/trunk/romio/adio/include/romioconf.h.in =================================================================== --- mvapich/trunk/romio/adio/include/romioconf.h.in 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/adio/include/romioconf.h.in 2008-02-01 23:04:07 UTC (rev 1945) @@ -267,6 +267,9 @@ /* Define for ROMIO with PVFS2 */ #undef ROMIO_PVFS2 +/* Define for ROMIO with LUSTRE */ +#undef ROMIO_LUSTRE + /* Define if int64_t must be defined for PVFS */ #undef ROMIO_PVFS_NEEDS_INT64_DEFINITION Modified: mvapich/trunk/romio/configure =================================================================== --- mvapich/trunk/romio/configure 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/configure 2008-02-01 23:04:07 UTC (rev 1945) @@ -1398,7 +1398,7 @@ # have_aio=no # -known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp" +known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre" known_mpi_impls="mpich2_mpi mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" # # Defaults @@ -7390,6 +7390,14 @@ fi +if test -n "$file_system_lustre"; then + +cat >>confdefs.h <<\_ACEOF +#define ROMIO_LUSTRE 1 +_ACEOF + +fi + # # Check for presence and characteristics of async. I/O calls if # not disabled. 
@@ -11264,7 +11272,7 @@ # are active will be called by the top level ROMIO make ac_config_commands="$ac_config_commands default-1" - ac_config_files="$ac_config_files Makefile localdefs mpi-io/Makefile mpi2-other/info/Makefile mpi2-other/array/Makefile adio/common/Makefile test/Makefile test/misc.c test/large_file.c test/runtests util/romioinstall include/mpio.h include/mpiof.h adio/ad_nfs/Makefile adio/ad_ufs/Makefile adio/ad_panfs/Makefile adio/ad_xfs/Makefile adio/ad_sfs/Makefile adio/ad_pfs/Makefile adio/ad_testfs/Makefile adio/ad_pvfs/Makefile adio/ad_pvfs2/Makefile adio/ad_gridftp/Makefile mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile mpi2-other/array/fortran/Makefile test/fmisc.f test/fcoll_test.f test/pfcoll_test.f test/fperf.f mpi-io/glue/mpich2/Makefile mpi-io/glue/mpich1/Makefile mpi-io/glue/default/Makefile" + ac_config_files="$ac_config_files Makefile localdefs mpi-io/Makefile mpi2-other/info/Makefile mpi2-other/array/Makefile adio/common/Makefile test/Makefile test/misc.c test/large_file.c test/runtests util/romioinstall include/mpio.h include/mpiof.h adio/ad_nfs/Makefile adio/ad_ufs/Makefile adio/ad_panfs/Makefile adio/ad_xfs/Makefile adio/ad_sfs/Makefile adio/ad_pfs/Makefile adio/ad_testfs/Makefile adio/ad_pvfs/Makefile adio/ad_pvfs2/Makefile adio/ad_gridftp/Makefile adio/ad_lustre/Makefile mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile mpi2-other/array/fortran/Makefile test/fmisc.f test/fcoll_test.f test/pfcoll_test.f test/fperf.f
mpi-io/glue/mpich2/Makefile mpi-io/glue/mpich1/Makefile mpi-io/glue/default/Makefile" cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure @@ -11822,6 +11830,7 @@ "adio/ad_pvfs/Makefile" ) CONFIG_FILES="$CONFIG_FILES adio/ad_pvfs/Makefile" ;; "adio/ad_pvfs2/Makefile" ) CONFIG_FILES="$CONFIG_FILES adio/ad_pvfs2/Makefile" ;; "adio/ad_gridftp/Makefile" ) CONFIG_FILES="$CONFIG_FILES adio/ad_gridftp/Makefile" ;; + "adio/ad_lustre/Makefile" ) CONFIG_FILES="$CONFIG_FILES adio/ad_lustre/Makefile" ;; "mpi-io/fortran/Makefile" ) CONFIG_FILES="$CONFIG_FILES mpi-io/fortran/Makefile" ;; "mpi2-other/info/fortran/Makefile" ) CONFIG_FILES="$CONFIG_FILES mpi2-other/info/fortran/Makefile" ;; "mpi2-other/array/fortran/Makefile" ) CONFIG_FILES="$CONFIG_FILES mpi2-other/array/fortran/Makefile" ;; Modified: mvapich/trunk/romio/configure.in =================================================================== --- mvapich/trunk/romio/configure.in 2008-02-01 19:20:05 UTC (rev 1944) +++ mvapich/trunk/romio/configure.in 2008-02-01 23:04:07 UTC (rev 1945) @@ -92,7 +92,7 @@ # have_aio=no # -known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp" +known_filesystems="nfs ufs pfs pvfs pvfs2 testfs xfs panfs gridftp lustre" known_mpi_impls="mpich2_mpi mpich_mpi sgi_mpi hp_mpi cray_mpi lam_mpi" # # Defaults @@ -985,6 +985,9 @@ if test -n "$file_system_testfs"; then AC_DEFINE(ROMIO_TESTFS,1,[Define for ROMIO with TESTFS]) fi +if test -n "$file_system_lustre"; then + AC_DEFINE(ROMIO_LUSTRE,1,[Define for ROMIO with LUSTRE]) +fi if test -n "$file_system_xfs"; then AC_DEFINE(ROMIO_XFS,1,[Define for ROMIO with XFS]) @@ -1890,6 +1893,7 @@ adio/ad_testfs/Makefile adio/ad_pvfs/Makefile \ adio/ad_pvfs2/Makefile \ adio/ad_gridftp/Makefile \ + adio/ad_lustre/Makefile \ mpi-io/fortran/Makefile mpi2-other/info/fortran/Makefile \ mpi2-other/array/fortran/Makefile test/fmisc.f \ test/fcoll_test.f
test/pfcoll_test.f test/fperf.f \ From perkinjo at mvapich.cse.ohio-state.edu Sun Feb 3 12:31:30 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Sun Feb 3 12:31:42 2008 Subject: [mvapich-commit] r1947 - mvapich/trunk/mpid/ch_gen2/process Message-ID: <200802031731.m13HVUD5023632@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-02-03 12:31:29 -0500 (Sun, 03 Feb 2008) New Revision: 1947 Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c Log: Fix legacy startup path Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-02-01 23:06:11 UTC (rev 1946) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-02-03 17:31:29 UTC (rev 1947) @@ -261,7 +261,6 @@ TOTALVIEW_CMD); sprintf(totalview_cmd, "%s", TOTALVIEW_CMD); } - legacy_startup = 1; break; case 11: legacy_startup = 1; @@ -582,97 +581,55 @@ int start_process(int i, char *command_name, char *env) { - char *remote_command; - char *xterm_command; - char xterm_title[100]; + char * remote_command = NULL; + char * xterm_command = NULL; + char * xterm_title = NULL; - char *ld_library_path; - - char *device_port_env = NULL; int id = getpid(); - int str_len, len; - if (plist[i].device != NULL && strlen(plist[i].device) != 0){ - device_port_env = (char * )malloc(BASE_ENV_LEN + strlen(plist[i].device) + 1); - sprintf(device_port_env, "VIADEV_DEVICE=%s \0", plist[i].device); - } - if (plist[i].port != -1){ - if (device_port_env != NULL){ - device_port_env = (char*)realloc(device_port_env, - strlen(device_port_env) + 1 + BASE_ENV_LEN - +sizeof(plist[i].port) + 1); - sprintf(&device_port_env[strlen(device_port_env)], "VIADEV_DEFAULT_PORT=%d \0", - plist[i].port); - } else { - device_port_env = (char *) malloc(BASE_ENV_LEN + - sizeof(plist[i].port) + 1); - sprintf(device_port_env, 
"VIADEV_DEFAULT_PORT=%d \0", plist[i].port); - } - } + remote_command = mkstr("cd %s; %s LD_LIBRARY_PATH=%s", wd, ENV_CMD, LD_LIBRARY_PATH_MPI); - if (device_port_env==NULL) { - device_port_env=strdup("\0"); + if(getenv("LD_LIBRARY_PATH")) { + remote_command = append_str(remote_command, mkstr(":%s", + getenv("LD_LIBRARY_PATH"))); } - if(use_totalview) { - str_len = strlen(command_name) + strlen(env) + strlen(wd) + - strlen(mpirun_processes) + strlen(device_port_env) + 512; - } else { - str_len = strlen(command_name) + strlen(env) + strlen(wd) + - strlen(device_port_env) + 530; - } + remote_command = append_str(remote_command, mkstr(" MPIRUN_MPD=0")); + remote_command = append_str(remote_command, mkstr(" MPIRUN_HOST=%s", + mpirun_host)); + remote_command = append_str(remote_command, mkstr(" MPIRUN_PORT=%d", port)); + remote_command = append_str(remote_command, mkstr(" MPIRUN_RANK=%d", i)); + remote_command = append_str(remote_command, mkstr(" MPIRUN_NPROCS=%d", + nprocs)); + remote_command = append_str(remote_command, mkstr(" MPIRUN_ID=%d", id)); + remote_command = append_str(remote_command, mkstr(" %s %s", display, env)); - if ((ld_library_path = getenv( "LD_LIBRARY_PATH" ) ) != NULL ) { - str_len += strlen(ld_library_path); + if(plist[i].device) { + remote_command = append_str(remote_command, mkstr(" VIADEV_DEVICE=%s", + plist[i].device)); } - if ((remote_command = malloc(str_len)) == NULL) { - fprintf(stderr, "Failed to malloc %d bytes for remote_command\n", - str_len); - exit(EXIT_FAILURE); + if(plist[i].port != -1) { + remote_command = append_str(remote_command, + mkstr(" VIADEV_DEFAULT_PORT=%d", plist[i].port)); } - if ((xterm_command = malloc(str_len)) == NULL) { - fprintf(stderr, "Failed to malloc %d bytes for xterm_command\n", - str_len); - exit(EXIT_FAILURE); - } - - /* - * this is the remote command we execute whether we were are using - * an xterm or using rsh directly - */ - if (ld_library_path != NULL ) { - sprintf(remote_command, "cd %s; %s 
LD_LIBRARY_PATH=%s:%s " - "MPIRUN_MPD=0 MPIRUN_HOST=%s MPIRUN_PORT=%d " - "MPIRUN_RANK=%d MPIRUN_NPROCS=%d MPIRUN_ID=%d %s %s %s", - wd, ENV_CMD,LD_LIBRARY_PATH_MPI,ld_library_path, - mpirun_host, port, i, - nprocs, id, display,env,device_port_env); - } else { - sprintf(remote_command, "cd %s; %s LD_LIBRARY_PATH=%s " - "MPIRUN_MPD=0 MPIRUN_HOST=%s MPIRUN_PORT=%d " - "MPIRUN_RANK=%d MPIRUN_NPROCS=%d MPIRUN_ID=%d %s %s %s", - wd, ENV_CMD,LD_LIBRARY_PATH_MPI, mpirun_host, port, i, - nprocs, id, display,env,device_port_env); - } - if(use_totalview) { - len = snprintf(remote_command, str_len, "%s MPIRUN_PROCESSES='%s' %s ", - remote_command, mpirun_processes, command_name); - } else { - len = snprintf(remote_command, str_len, "%s NOT_USE_TOTALVIEW=1 %s ", - remote_command, command_name); + remote_command = append_str(remote_command, + mkstr(" MPIRUN_PROCESSES='%s'", mpirun_processes)); } - if (len >= str_len) { - fprintf(stderr, "Internal error - overflowed remote_command\n"); - exit(1); + else { + remote_command = append_str(remote_command, + mkstr(" NOT_USE_TOTALVIEW=1")); } + remote_command = append_str(remote_command, mkstr(" %s", command_name)); + fprintf(stderr, "remote_command: %s\n", remote_command); + if (xterm_on) { - sprintf(xterm_command, "%s; echo process exited", remote_command); - sprintf(xterm_title, "\"mpirun process %d of %d\"", i, nprocs); + xterm_command = mkstr("%s; echo process exited", remote_command); + xterm_title = mkstr("\"mpirun process %d of %d\"", i, nprocs); } plist[i].pid = fork(); @@ -726,6 +683,7 @@ free(remote_command); free(xterm_command); + free(xterm_title); return (0); } Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c 2008-02-01 23:06:11 UTC (rev 1946) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c 2008-02-03 17:31:29 UTC (rev 1947) @@ -13,57 +13,81 @@ #include "mpirun_util.h" #include -char * 
mkstr(const char * format, ...) { +/* + * ptr must be suitable for a call to realloc + */ +char * vedit_str(char * const ptr, const char * format, va_list args) { va_list ap; int size; char * str; - va_start(ap, format); + va_copy(ap, args); size = vsnprintf(NULL, 0, format, ap); va_end(ap); if(size++ < 0) return NULL; - str = malloc(sizeof(char) * size); + str = realloc(ptr, sizeof(char [size])); - if(str) { - va_start(ap, format); - size = vsnprintf(str, size, format, ap); - va_end(ap); - - if(size < 0) return NULL; + if(!str) { + perror("vedit_str [realloc]"); + exit(EXIT_FAILURE); } + va_copy(ap, args); + size = vsnprintf(str, size, format, ap); + va_end(ap); + + if(size < 0) return NULL; + return str; } /* * ptr must be suitable for a call to realloc */ -char * chstr(char * ptr, const char * format, ...) { +char * edit_str(char * const ptr, char const * const format, ...) { va_list ap; - int size; char * str; va_start(ap, format); - size = vsnprintf(NULL, 0, format, ap); + str = vedit_str(ptr, format, ap); va_end(ap); + + return str; +} - if(size++ < 0) return NULL; +char * mkstr(char const * const format, ...) 
{ + va_list ap; + char * str; - str = realloc(ptr, sizeof(char) * size); + va_start(ap, format); + str = vedit_str(NULL, format, ap); + va_end(ap); - if(str) { - va_start(ap, format); - size = vsnprintf(str, size, format, ap); - va_end(ap); + return str; +} - if(size < 0) return NULL; +/* + * ptr & suffix must be dynamically allocated + */ +char * append_str(char * ptr, char * const suffix) { + va_list ap; + + ptr = realloc(ptr, sizeof(char [strlen(ptr) + strlen(suffix) + 1])); + + if(!ptr) { + perror("append_str [realloc]"); + exit(EXIT_FAILURE); } - return str; + strcat(ptr, suffix); + free(suffix); + + return ptr; } + int read_socket(int socket, void * buffer, size_t bytes) { char * data = buffer; ssize_t rv; From perkinjo at mvapich.cse.ohio-state.edu Sun Feb 3 12:49:36 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Sun Feb 3 12:49:47 2008 Subject: [mvapich-commit] r1950 - mvapich/trunk/mpid/ch_gen2/process Message-ID: <200802031749.m13Hna97023678@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-02-03 12:49:35 -0500 (Sun, 03 Feb 2008) New Revision: 1950 Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c Log: Remove debugging printout Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-02-03 17:44:21 UTC (rev 1949) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-02-03 17:49:35 UTC (rev 1950) @@ -625,7 +625,6 @@ } remote_command = append_str(remote_command, mkstr(" %s", command_name)); - fprintf(stderr, "remote_command: %s\n", remote_command); if (xterm_on) { xterm_command = mkstr("%s; echo process exited", remote_command); From perkinjo at mvapich.cse.ohio-state.edu Sun Feb 3 12:41:37 2008 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Sun Feb 3 13:45:16 2008 Subject: [mvapich-commit] r1948 - in mvapich/trunk/mpid/ch_gen2: 
. process Message-ID: <200802031741.m13HfboF023653@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2008-02-03 12:41:36 -0500 (Sun, 03 Feb 2008) New Revision: 1948 Modified: mvapich/trunk/mpid/ch_gen2/mpirun.ch_gen2.in mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.h mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.c mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.h mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_mpirun.c mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_mpispawn.c mvapich/trunk/mpid/ch_gen2/viainit.c Log: Remove debugging printout Modified: mvapich/trunk/mpid/ch_gen2/mpirun.ch_gen2.in =================================================================== --- mvapich/trunk/mpid/ch_gen2/mpirun.ch_gen2.in 2008-02-03 17:31:29 UTC (rev 1947) +++ mvapich/trunk/mpid/ch_gen2/mpirun.ch_gen2.in 2008-02-03 17:41:36 UTC (rev 1948) @@ -62,6 +62,12 @@ else via_args="$via_show $via_xterm $via_debug" + if [ "$debugger" == "totalview" ] ; then + # user threw -tv or -totalview on command line, mpirun.args set + # debugger=totalview, and now we need to pass -tv on to mpirun_rsh + # which reads TOTALVIEW from environment + via_args="$via_args -tv" + fi if [ "x$via_paramfile" != "x" ] ; then via_args="$via_args -paramfile $via_paramfile" fi Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-02-03 17:31:29 UTC (rev 1947) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2008-02-03 17:41:36 UTC (rev 1948) @@ -625,7 +625,6 @@ } remote_command = append_str(remote_command, mkstr(" %s", command_name)); - fprintf(stderr, "remote_command: %s\n", remote_command); if (xterm_on) { xterm_command = mkstr("%s; echo process exited", remote_command); Modified: 
mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c 2008-02-03 17:31:29 UTC (rev 1947) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c 2008-02-03 17:41:36 UTC (rev 1948) @@ -64,28 +64,61 @@ #include #include #include +#include +#include +#include #include "pmgr_collective_client.h" -char *mpirun_hostname; -struct hostent *mpirun_hostent; -int mpirun_port; -int mpirun_socket; -int pmgr_me, pmgr_nprocs, pmgr_id; +/* set env variable to select which trees to use, if any -- all enabled by default */ +#ifndef MPIRUN_USE_TREES +#define MPIRUN_USE_TREES (0) +#endif +#ifndef MPIRUN_USE_GATHER_TREE +#define MPIRUN_USE_GATHER_TREE (0) +#endif +#ifndef MPIRUN_USE_BCAST_TREE +#define MPIRUN_USE_BCAST_TREE (0) +#endif + +#ifndef MPIRUN_CONNECT_TRIES +#define MPIRUN_CONNECT_TRIES (7) +#endif +#ifndef MPIRUN_CONNECT_TIMEOUT +#define MPIRUN_CONNECT_TIMEOUT (2) /* seconds */ +#endif +#ifndef MPIRUN_CONNECT_BACKOFF +#define MPIRUN_CONNECT_BACKOFF (5) /* seconds */ +#endif + +/* set envvar MPIRUN_USE_TREES={0,1} to disable/enable tree algorithms */ +int mpirun_use_trees = MPIRUN_USE_TREES; +/* set envvar MPIRUN_USE_GATHER_TREE={0,1} to disable/enable gather tree */ +int mpirun_use_gather_tree = MPIRUN_USE_GATHER_TREE; +/* set envvar MPIRUN_USE_BCAST_TREE={0,1} to disable/enable bcast tree */ +int mpirun_use_bcast_tree = MPIRUN_USE_BCAST_TREE; + +int mpirun_connect_tries = MPIRUN_CONNECT_TRIES; +int mpirun_connect_timeout = MPIRUN_CONNECT_TIMEOUT; /* seconds */ +int mpirun_connect_backoff = MPIRUN_CONNECT_BACKOFF; /* seconds */ + +char* mpirun_hostname; +int mpirun_port; +int mpirun_socket; +int pmgr_nprocs = -1; +int pmgr_id = -1; static int pmgr_close_on_abort = 0; /* tree data structures */ -int pmgr_parent; /* MPI rank of parent */ -int pmgr_parent_s; /* socket fd to parent */ -int* pmgr_child; /* MPI ranks of children */ -int* 
pmgr_child_s; /* socket fds to children */ -int pmgr_num_child; /* number of children */ -int* pmgr_child_incl;/* number of children each child is responsible for (includes itself) */ +int pmgr_parent; /* MPI rank of parent */ +int pmgr_parent_s; /* socket fd to parent */ +int* pmgr_child; /* MPI ranks of children */ +int* pmgr_child_s; /* socket fds to children */ +int pmgr_num_child; /* number of children */ +int* pmgr_child_incl; /* number of children each child is responsible for (includes itself) */ int pmgr_num_child_incl; /* total number of children this node is responsible for */ -/* set env variable to select which trees to use, if any -- all enabled by default */ -int mpirun_use_trees; /* set by MPIRUN_USE_TREES={0,1} to disable/enable tree algorithms */ -int mpirun_use_gather_tree; /* set by MPIRUN_USE_GATHER_TREE={0,1} to disable/enable gather tree */ -int mpirun_use_bcast_tree; /* set by MPIRUN_USE_BCAST_TREE={0,1} to disable/enable bcast tree */ +/* startup time, time between starting pmgr_open and finishing pmgr_close */ +struct timeval time_open, time_close; /* * ============================= @@ -93,54 +126,264 @@ * ============================= */ -/* Reads environment variable, bails if not set */ -char* pmgr_getenv(char* envvar) +/* read size bytes into buf from mpirun_socket */ +int pmgr_read(void* buf, int size) { - char* str = getenv(envvar); - if (str == NULL) { - pmgr_error("Can't read %s", envvar); - exit(1); + return pmgr_read_fd(mpirun_socket, buf, size); +} + +/* write size bytes into mpirun_socket from buf */ +int pmgr_write(void* buf, int size) +{ + return pmgr_write_fd(mpirun_socket, buf, size); +} + +/* write integer into mpirun_socket */ +int pmgr_write_int(int value) +{ + return pmgr_write(&value, sizeof(value)); +} + +/* + * ============================= + * The mpirun_* functions implement PMGR_COLLECTIVE operations through + * the mpirun process. Typically, this amounts to a flat tree with the + * mpirun process at the root. 
These functions implement the client side + * of the protocol specified in pmgr_collective_mpirun.c. + * ============================= + */ + +/* + * Perform barrier, each task writes an int then waits for an int + */ +int mpirun_barrier() +{ + /* send BARRIER op code, then wait on integer reply */ + int buf; + + pmgr_write_int(PMGR_BARRIER); + pmgr_read(&buf, sizeof(int)); + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Broadcast, root writes sendcount bytes from buf, + * into mpirun_socket, all receive sendcount bytes into buf + */ +int mpirun_bcast(void* buf, int sendcount, int root) +{ + /* send BCAST op code, then root, then size of data */ + pmgr_write_int(PMGR_BCAST); + pmgr_write_int(root); + pmgr_write_int(sendcount); + + /* if i am root, send data */ + if (pmgr_me == root) { + pmgr_write(buf, sendcount); } - return str; + + /* read in data */ + pmgr_read(buf, sendcount); + + return PMGR_SUCCESS; } -/* read size bytes into buf from mpirun_socket */ -int pmgr_read(void* buf, int size) { - return pmgr_read_fd(mpirun_socket, buf, size); +/* + * Perform MPI-like Gather, each task writes sendcount bytes from sendbuf + * into mpirun_socket, then root receives N*sendcount bytes into recvbuf + */ +int mpirun_gather(void* sendbuf, int sendcount, void* recvbuf, int root) +{ + /* send GATHER op code, then root, then size of data, then data itself */ + pmgr_write_int(PMGR_GATHER); + pmgr_write_int(root); + pmgr_write_int(sendcount); + pmgr_write(sendbuf, sendcount); + + /* only the root receives data */ + if (pmgr_me == root) { + pmgr_read(recvbuf, sendcount * pmgr_nprocs); + } + + return PMGR_SUCCESS; } -/* write size bytes into mpirun_socket from buf */ -int pmgr_write(void* buf, int size) { - return pmgr_write_fd(mpirun_socket, buf, size); +/* + * Perform MPI-like Scatter, root writes N*sendcount bytes from sendbuf + * into mpirun_socket, then each task receives sendcount bytes into recvbuf + */ +int mpirun_scatter(void* sendbuf, int sendcount, void* 
recvbuf, int root) +{ + /* send SCATTER op code, then root, then size of data, then data itself */ + pmgr_write_int(PMGR_SCATTER); + pmgr_write_int(root); + pmgr_write_int(sendcount); + + /* if i am root, send all chunks to mpirun */ + if (pmgr_me == root) { + pmgr_write(sendbuf, sendcount * pmgr_nprocs); + } + + /* receive my chunk */ + pmgr_read(recvbuf, sendcount); + + return PMGR_SUCCESS; } -/* write integer into mpirun_socket */ -int pmgr_write_int(int value) { - return pmgr_write(&value, sizeof(value)); +/* + * Perform MPI-like Allgather, each task writes sendcount bytes from sendbuf + * into mpirun_socket, then receives N*sendcount bytes into recvbuf + */ +int mpirun_allgather(void* sendbuf, int sendcount, void* recvbuf) +{ + /* send ALLGATHER op code, then size of data, then data itself */ + pmgr_write_int(PMGR_ALLGATHER); + pmgr_write_int(sendcount); + pmgr_write(sendbuf, sendcount); + pmgr_read (recvbuf, sendcount * pmgr_nprocs); + + return PMGR_SUCCESS; } -/* ============================= +/* + * Perform MPI-like Alltoall, each task writes N*sendcount bytes from sendbuf + * into mpirun_socket, then recieves N*sendcount bytes into recvbuf + */ +int mpirun_alltoall(void* sendbuf, int sendcount, void* recvbuf) +{ + /* send ALLTOALL op code, then size of data, then data itself */ + pmgr_write_int(PMGR_ALLTOALL); + pmgr_write_int(sendcount); + pmgr_write(sendbuf, sendcount * pmgr_nprocs); + pmgr_read (recvbuf, sendcount * pmgr_nprocs); + + return PMGR_SUCCESS; +} + +/* + * ============================= * Functions to open/close/gather/bcast the TCP/socket tree. * ============================= */ -/* connect to given IP:port and return opened socket file descriptor */ -int pmgr_connect(struct in_addr ip, int port) +/* Open a connection on socket FD to peer at ADDR (which LEN bytes long). + * This function uses a non-blocking filedescriptor for the connect(), + * and then does a bounded poll() for the connection to complete. 
This + * allows us to timeout the connect() earlier than TCP might do it on + * its own. We have seen timeouts that failed after several minutes, + * where we would really prefer to time out earlier and retry the connect. + * + * Return 0 on success, -1 for errors. + */ +static int pmgr_connect_w_timeout (int fd, struct sockaddr const * addr, + socklen_t len, int millisec) { - int sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - if (sockfd < 0) { - perror("socket"); - exit(1); + int rc, flags, err, err_len; + struct pollfd ufds; + + flags = fcntl(fd, F_GETFL); + fcntl(fd, F_SETFL, flags | O_NONBLOCK); + + err = 0; + rc = connect(fd , addr , len); + if (rc < 0 && errno != EINPROGRESS) { + return -1; } + if (rc == 0) { + goto done; /* connect completed immediately */ + } + ufds.fd = fd; + ufds.events = POLLIN | POLLOUT; + ufds.revents = 0; + +again: rc = poll(&ufds, 1, millisec); + if (rc == -1) { + /* poll failed */ + if (errno == EINTR) { + /* NOTE: connect() is non-interruptible in Linux */ + pmgr_error("EINTR while polling connection: (poll() %m errno=%d) @ file %s:%d", + errno, __FILE__, __LINE__); + goto again; + } else { + pmgr_error("Polling connection: (poll() %m errno=%d) @ file %s:%d", + errno, __FILE__, __LINE__); + } + return -1; + } else if (rc == 0) { + /* poll timed out before any socket events */ + /* perror("pmgr_connect_w_timeout poll timeout"); */ + return -1; + } else { + /* poll saw some event on the socket + * We need to check if the connection succeeded by + * using getsockopt. The revent is not necessarily + * POLLERR when the connection fails! */ + err_len = sizeof(err); + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, + &err, &err_len) < 0) + { + return -1; /* solaris pending error */ + } + } + +done: + fcntl(fd, F_SETFL, flags); + + /* NOTE: Connection refused is typically reported for + * non-responsived nodes plus attempts to communicate + * with terminated launcher. 
*/ + if (err) { + pmgr_error("Error on socket in pmgr_connect_w_timeout() @ file %s:%d", + __FILE__, __LINE__); + return -1; + } + + return 0; +} + +/* Connect to given IP:port. Upon successful connection, pmgr_connect + * shall return the connected socket file descriptor. Otherwise, -1 shall be + * returned. + */ +int pmgr_connect(struct in_addr ip, int port) +{ struct sockaddr_in sockaddr; + int sockfd; + int i; + + /* set up address to connect to */ sockaddr.sin_family = AF_INET; sockaddr.sin_addr = ip; sockaddr.sin_port = port; - if (connect(sockfd, (struct sockaddr *) &sockaddr, sizeof(sockaddr)) < 0) { - perror("connect"); - exit(1); + /* Try making the connection several times, with a random backoff + between tries. */ + for (i = 0; ; i++) { + /* create a socket */ + sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sockfd < 0) { + pmgr_error("Creating socket (socket() %m errno=%d) @ file %s:%d", + errno, __FILE__, __LINE__); + return -1; + } + + /* connect socket to address */ + if (pmgr_connect_w_timeout(sockfd, (struct sockaddr *) &sockaddr, + sizeof(sockaddr), mpirun_connect_timeout * 1000) < 0) { + if (i >= mpirun_connect_tries) { + pmgr_error("Connecting socket: pmgr_connect_w_timeout() failed @ file %s:%d", + __FILE__, __LINE__); + close(sockfd); + return -1; + } else { + close(sockfd); + usleep(((rand() % (mpirun_connect_backoff * 1000)) + 1) * 1000); + } + } else { + break; + } } return sockfd; @@ -154,35 +397,39 @@ /* initialize parent and children based on pmgr_me and pmgr_nprocs */ int n = 1; int max_children = 0; - while(n < pmgr_nprocs) { n <<= 1; max_children++; } + while (n < pmgr_nprocs) { + n <<= 1; + max_children++; + } pmgr_parent = 0; pmgr_num_child = 0; pmgr_num_child_incl = 0; - pmgr_child = malloc(max_children * sizeof(int)); - pmgr_child_s = malloc(max_children * sizeof(int)); - pmgr_child_incl = malloc(max_children * sizeof(int)); + pmgr_child = (int*) pmgr_malloc(max_children * sizeof(int), "Child MPI rank array"); + 
pmgr_child_s = (int*) pmgr_malloc(max_children * sizeof(int), "Child socket fd array"); + pmgr_child_incl = (int*) pmgr_malloc(max_children * sizeof(int), "Child children count array"); /* find our parent and list of children */ int low = 0; int high = pmgr_nprocs - 1; while (high - low > 0) { - int mid = (high - low) / 2 + (high - low) % 2 + low; - if (low == pmgr_me) { - pmgr_child[pmgr_num_child] = mid; - pmgr_child_incl[pmgr_num_child] = high - mid + 1; - pmgr_num_child++; - pmgr_num_child_incl += (high - mid + 1); - } - if (mid == pmgr_me) { pmgr_parent = low; } - if (mid <= pmgr_me) { low = mid; } - else { high = mid-1; } + int mid = (high - low) / 2 + (high - low) % 2 + low; + if (low == pmgr_me) { + pmgr_child[pmgr_num_child] = mid; + pmgr_child_incl[pmgr_num_child] = high - mid + 1; + pmgr_num_child++; + pmgr_num_child_incl += (high - mid + 1); + } + if (mid == pmgr_me) { pmgr_parent = low; } + if (mid <= pmgr_me) { low = mid; } + else { high = mid-1; } } /* create a socket to accept connection from parent */ int sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (sockfd < 0) { - perror("socket"); + pmgr_error("Creating parent socket (socket() %m errno=%d) @ file %s:%d", + errno, __FILE__, __LINE__); exit(1); } @@ -190,41 +437,50 @@ memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = htons(0); /* bind ephemeral port */ + sin.sin_port = htons(0); /* bind ephemeral port - OS will assign us a free port */ + /* bind socket */ if (bind(sockfd, (struct sockaddr *) &sin, sizeof(sin)) < 0) { - perror("binding socket"); + pmgr_error("Binding parent socket (bind() %m errno=%d) @ file %s:%d", + errno, __FILE__, __LINE__); exit(1); } - listen(sockfd, 1); + /* set the socket to listen for connections */ + if (listen(sockfd, 1) < 0) { + pmgr_error("Setting parent socket to listen (listen() %m errno=%d) @ file %s:%d", + errno, __FILE__, __LINE__); + exit(1); + } + /* ask which port the OS assigned our 
socket to */ socklen_t len = sizeof(sin); if (getsockname(sockfd, (struct sockaddr *) &sin, &len) < 0) { - perror("getting sockname"); + pmgr_error("Reading parent socket port number (getsockname() %m errno=%d) @ file %s:%d", + errno, __FILE__, __LINE__); exit(1); } + /* extract our ip and port number to send to mpirun */ char hn[256]; gethostname(hn, 256); - struct hostent * he = gethostbyname(hn); + struct hostent* he = gethostbyname(hn); struct in_addr ip = * (struct in_addr *) *(he->h_addr_list); short port = sin.sin_port; - /* gather socket data to rank 0 */ + /* allocate buffers to receive ip:port table for all tasks */ int sendcount = sizeof(ip) + sizeof(port); - void* sendbuf = malloc(sendcount); - void* recvbuf = malloc(sendcount * pmgr_nprocs); + void* sendbuf = (void*) pmgr_malloc(sendcount, "Send buffer for socket data"); + void* recvbuf = (void*) pmgr_malloc(sendcount * pmgr_nprocs, "Receive buffer for socket table"); + /* fill in send buffer with our ip:port */ memcpy(sendbuf, &ip, sizeof(ip)); memcpy((char*)sendbuf + sizeof(ip), &port, sizeof(port)); - pmgr_gather(sendbuf, sendcount, recvbuf, 0); + /* gather ip:port info to rank 0 -- explicitly call mpirun_gather since tcp tree is not setup */ + mpirun_gather(sendbuf, sendcount, recvbuf, 0); - /* - * if i'm not rank 0, accept a connection (from parent) and receive socket - * table - */ + /* if i'm not rank 0, accept a connection (from parent) and receive socket table */ if (pmgr_me != 0) { socklen_t parent_len; struct sockaddr parent_addr; @@ -240,11 +496,16 @@ struct in_addr child_ip = * (struct in_addr *) ((char*)recvbuf + sendcount*c); short child_port = * (short*) ((char*)recvbuf + sendcount*c + sizeof(ip)); pmgr_child_s[i] = pmgr_connect(child_ip, child_port); + if (pmgr_child_s[i] == -1) { + pmgr_error("Connecting to child failed (rank %d) @ file %s:%d", + c, __FILE__, __LINE__); + exit(1); + } pmgr_write_fd(pmgr_child_s[i], recvbuf, sendcount * pmgr_nprocs); } - free(sendbuf); - free(recvbuf); 
+ pmgr_free(sendbuf); + pmgr_free(recvbuf); return PMGR_SUCCESS; } @@ -256,18 +517,20 @@ int pmgr_close_tree() { /* if i'm not rank 0, close socket connection with parent */ - if (pmgr_me != 0) + if (pmgr_me != 0) { close(pmgr_parent_s); + } /* and all children */ int i; - for(i=0; i max) { max = all[i]; } + } + pmgr_free(all); } - else { - mpirun_barrier(); - } + /* broadcast max int from rank 0 and set recvint */ + pmgr_bcast((void*) &max, sizeof(int), 0); + *recvint = max; + pmgr_gettimeofday(&end); + pmgr_debug(2, "Exiting pmgr_allreducemaxint(), took %f seconds for %d procs", pmgr_getsecs(&end,&start), pmgr_nprocs); return PMGR_SUCCESS; } /* - * Perform MPI-like Broadcast, root writes sendcount bytes from buf, - * into mpirun_socket, all receive sendcount bytes into buf + * Perform MPI-like Allgather of NULL-terminated strings (whose lengths may vary + * from task to task). + * + * Each task provides a pointer to its NULL-terminated string as input. + * Each task then receives an array of pointers to strings indexed by rank number + * and also a pointer to the buffer holding the string data. + * When done with the strings, both the array of string pointers and the + * buffer should be freed. + * + * Example Usage: + * char host[256], **hosts, *buf; + * gethostname(host, sizeof(host)); + * pmgr_allgatherstr(host, &hosts, &buf); + * for(int i=0; i