From koop at mvapich.cse.ohio-state.edu Mon Nov 5 11:07:13 2007 From: koop at mvapich.cse.ohio-state.edu (koop@mvapich.cse.ohio-state.edu) Date: Mon Nov 5 11:07:51 2007 Subject: [mvapich-commit] r1626 - mvapich/trunk/mpid/ch_gen2 Message-ID: <200711051607.lA5G7DRn010737@mvapich.cse.ohio-state.edu> Author: koop Date: 2007-11-05 11:07:12 -0500 (Mon, 05 Nov 2007) New Revision: 1626 Modified: mvapich/trunk/mpid/ch_gen2/viaparam.c Log: * Increase default send WQEs to 64 Modified: mvapich/trunk/mpid/ch_gen2/viaparam.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/viaparam.c 2007-10-29 21:14:06 UTC (rev 1625) +++ mvapich/trunk/mpid/ch_gen2/viaparam.c 2007-11-05 16:07:12 UTC (rev 1626) @@ -68,7 +68,7 @@ * Must be within NIC MaxQpEntries limit. * Size will be adjusted below. */ -unsigned long viadev_sq_size = 40; +unsigned long viadev_sq_size = 64; /* How many sWQEs can be in use before we start doing coalescing * for small messages From perkinjo at mvapich.cse.ohio-state.edu Mon Nov 5 11:54:41 2007 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Mon Nov 19 16:52:34 2007 Subject: [mvapich-commit] r1627 - in mvapich/trunk: . 
mpid/ch_gen2 mpid/ch_gen2/process util Message-ID: <200711051654.lA5GsfSq010806@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2007-11-05 11:54:39 -0500 (Mon, 05 Nov 2007) New Revision: 1627 Added: mvapich/trunk/mpid/ch_gen2/process/minidaemon.c mvapich/trunk/mpid/ch_gen2/process/minidaemon.h mvapich/trunk/mpid/ch_gen2/process/minidaemon_client.c mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c mvapich/trunk/mpid/ch_gen2/process/mpirun_util.h Removed: mvapich/trunk/mpid/ch_gen2/process/minidaemon/ Modified: mvapich/trunk/make.mvapich.gen2 mvapich/trunk/mpid/ch_gen2/Makefile.in mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h mvapich/trunk/mpid/ch_gen2/process/mpispawn.c mvapich/trunk/mpid/ch_gen2/process/mpispawn.h mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c mvapich/trunk/util/mpiinstall.in Log: Checking in changes to allow for MELLANOX xlauncher. Modified: mvapich/trunk/make.mvapich.gen2 =================================================================== --- mvapich/trunk/make.mvapich.gen2 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/make.mvapich.gen2 2007-11-05 16:54:39 UTC (rev 1627) @@ -96,7 +96,7 @@ export FFLAGS=${FFLAGS:--L${IBHOME_LIB}} export CFLAGS=${CFLAGS:--D${ARCH} -D${COMPAT} ${PTMALLOC} -DEARLY_SEND_COMPLETION -DMEMORY_SCALE -DVIADEV_RPUT_SUPPORT -D_SMP_ -D_SMP_RNDV_ -DCH_GEN2 -D_GNU_SOURCE ${COMPILER_FLAG} ${HAVE_MPD_RING} -I${IBHOME}/include $OPT_FLAG} -export MPIRUN_CFLAGS="${MPIRUN_CFLAGS} -DLD_LIBRARY_PATH_MPI=\\\"${PREFIX}/lib/shared\\\" -DPARAM_GLOBAL=\\\"${PREFIX}/etc/mvapich.conf\\\"" +export MPIRUN_CFLAGS="${MPIRUN_CFLAGS} -DLD_LIBRARY_PATH_MPI=\\\"${PREFIX}/lib/shared\\\" -DMPI_PREFIX=\\\"${PREFIX}/\\\" -DPARAM_GLOBAL=\\\"${PREFIX}/etc/mvapich.conf\\\"" # Prelogue make distclean &>/dev/null Modified: mvapich/trunk/mpid/ch_gen2/Makefile.in =================================================================== --- mvapich/trunk/mpid/ch_gen2/Makefile.in 2007-11-05 
16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/Makefile.in 2007-11-05 16:54:39 UTC (rev 1627) @@ -123,23 +123,33 @@ default_all: default -default: setlinks lib $(MPIRUN) $(IBMGRP) $(PTMALLOC) +default: setlinks lib $(MPIRUN) $(IBMGRP) $(PTMALLOC) minidaemon_client lib: $(VIAOBJECTS) $(MPICHOBJECTS) $(AR) $(LIBNAME) $? +minidaemon_client: process/minidaemon.o + cd process &&\ + $(CC) $(MPIRUN_CFLAGS) minidaemon.c -c -o minidaemon.o &&\ + $(CC) $(MPIRUN_CFLAGS) -o minidaemon_client minidaemon.o mpirun_util.o minidaemon_client.c &&\ + install -m 755 minidaemon_client ${top_srcdir}/bin + mpirun_rsh: process/pmgr_client_mpirun_rsh.o cd process &&\ $(CC) getopt.c -c -o getopt.o &&\ $(CC) getopt1.c -c -o getopt1.o &&\ - $(CC) $(MPIRUN_CFLAGS) -o mpispawn mpispawn.c &&\ + $(CC) $(MPIRUN_CFLAGS) -c mpirun_util.c &&\ + $(CC) $(MPIRUN_CFLAGS) -c minidaemon.c &&\ + $(CC) $(MPIRUN_CFLAGS) -c pmgr_client_mpirun_rsh.c &&\ + $(CC) $(MPIRUN_CFLAGS) -o minidaemon_client minidaemon.o mpirun_util.o minidaemon_client.c &&\ + $(CC) $(MPIRUN_CFLAGS) -o mpispawn mpispawn.c mpirun_util.o &&\ + $(CC) $(MPIRUN_CFLAGS) -o mpirun_rsh $(MPIRUN_MAC) minidaemon.o mpirun_util.o mpirun_rsh.c &&\ + install -m 755 minidaemon_client ${top_srcdir}/bin &&\ install -m 755 mpispawn ${top_srcdir}/bin &&\ - $(CC) $(MPIRUN_CFLAGS) -o mpirun_rsh $(MPIRUN_MAC) mpirun_rsh.c &&\ install -m 755 mpirun_rsh ${top_srcdir}/bin &&\ - $(CC) $(MPIRUN_CFLAGS) -c pmgr_client_mpirun_rsh.c &&\ - $(AR) $(LIBNAME) pmgr_client_mpirun_rsh.o + $(AR) $(LIBNAME) pmgr_client_mpirun_rsh.o minidaemon.o -(cd $(top_srcdir)/mpid/ch_gen2/process &&\ - ar -d $(LIBNAME) pmgr_client_mpd.o) + ar -d $(LIBNAME) pmgr_client_mpd.o minidaemon.o) mpirun_mpd: process/pmgr_client_mpd.o cd process &&\ @@ -181,10 +191,10 @@ VIAFILES = viainit.c viasend.c viarecv.c viapriv.c viaparam.c viutil.c vbuf.c \ mpid_init.c mpid_send.c mpid_recv.c viacheck.c mpid_pack.c \ - process/pmgr_client_fork.c process/pmgr_client_mpirun_rsh.c \ + 
process/pmgr_client_fork.c process/minidaemon.c process/pmgr_client_mpirun_rsh.c \ process/mpirun_rsh.c mpid_hsend.c mpid_hrecv.c \ req.h vbuf.h viapacket.h viapriv.h viutil.h \ - process/pmgr_client.h mpid_misc.c viaparam.h viaconfig.h viadev.h \ + process/pmgr_client.h process/minidaemon.h mpid_misc.c viaparam.h viaconfig.h viadev.h \ cmnargs.c dreg.h mpid.h mpid_smpi.h mpid_smpi.c # @@ -208,7 +218,7 @@ clean: /bin/rm -f *.o *.d *~ process/*~ process/*.d process/*.o \ - process/mpirun_rsh process/mpispawn process/core $(VIAOBJECTS) \ + process/mpirun_rsh process/mpispawn process/core process/minidaemon_client $(VIAOBJECTS) \ $(MPICHOBJECTS) $(MPICHSOURCE) core ibmcgrp/*.o ibmcgrp/ibmcgrp \ ${top_srcdir}/bin/ibmcgrp -@(cd ../mpd; \ Added: mvapich/trunk/mpid/ch_gen2/process/minidaemon.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/minidaemon.c 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/minidaemon.c 2007-11-05 16:54:39 UTC (rev 1627) @@ -0,0 +1,1302 @@ +/** +Minidaemon ADT Implementation, provided by Mellanox, MPI Team +mailto : xalex@mellanox.co.il +**/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef MAC_OSX +#include +#else +#include +#endif +#include +#include +#include + + + +#include "minidaemon.h" + +#define RSH_CMD "/usr/bin/rsh" +#define SSH_CMD "/usr/bin/ssh" +#define SSH_ARG "-q" + +#define ENV_CMD "/usr/bin/env" + +#ifndef LD_LIBRARY_PATH_MPI +#define LD_LIBRARY_PATH_MPI "/usr/mvapich/lib/shared" +#endif + +#ifndef MPI_PREFIX +#define MPI_PREFIX /usr/mvapich/ +#endif + +#define MPI_LIB lib/shared/ +#define MPI_BIN bin/ +#define MINIDAEMON_CLIENT_NAME minidaemon_client + +/*#define MPI_LD_LIBRARY_PATH MPI_PREFIX##MPI_LIB +#define MPI_BIN_PATH MPI_PREFIX##MPI_BIN +#define MINIDAEMON_EXEC_PATH MPI_BIN_PATH##MINIDAEMON_CLIENT_NAME*/ +#define STR_CONCAT(a,b,c) a ## b ## c +#define 
STR_CONCAT2(a,b) a ## b + +#define STR(s) #s +#define XSTR(s) STR(s) +#define MPI_BIN_PATH + +#define MAX_WD_LEN 256 +#define MAX_HOST_LEN 256 + +#define MAX_MESSAGE_SIZE 512 +#define MINIDAEMON_PORT 4000 +#define ACCEPT_TIMEOUT 100 +#define NO_TIMEOUT 1000 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + + + +/* #define CH_LIST_FOREACH_MD_NODES */ + +/* Minidaemon basic types definition */ +typedef enum {CH_JOB_EXT = -1,P_OK = 0, P_KILLED, STATUS_REQUEST, STATUS , KILL} MD_message; +typedef enum {MD_INIT = 0, MD_QP_LID, MD_WORK, MD_TERM } MD_phase; +typedef enum {MD_FINISH = 0, MD_NORMAL, MD_UNKNOWN, MD_NOT_STARTED} MD_status; //TODO MD_RUN instead of NORMAL + +/*Timeout values for WORK stage */ +static const int send2father_work_timeout = 10; +static const int recv_parent_work_timeout = 31; +static const int recv_children_work_timeout = 31; +static const int recv_jobs_work_timeout = NO_TIMEOUT; + +/* Timeout values for TERM stage */ +static const int send2father_term_timeout = NO_TIMEOUT; +static const int recv_parent_term_timeout = NO_TIMEOUT; +static const int recv_children_term_timeout = 30; +static const int recv_jobs_term_timeout = 10; + +/** These 2 structures are for temporarily testing only **/ +/**-----------------------------------------------------------**/ + +struct minidaemon_t { + /* private data members */ + const char *parent_name; + int tree_width; /* md_num */ + int ch_num; /* total process number, i.e the length of children_md_arr */ + int root_ch_num; + + MD_phase phase; /* phase of execution */ + + MD_entry children_md_arr; /* The list of children and parent minidaemon process and own jobs */ + + ChildrenList childrenList; /* The list of parameters for minidaemon itself and its subtree */ + + /*********** PUBLIC METHODS *******************/ + + /* TODO this methods would be implemented in xlauncher ver 2.0 */ + /*void (*timeout_handler)(); + void (*data_handler)();*/ + + int total_running_jobs; + int 
total_running_mds; + int curr_running_jobs; + int curr_running_mds; + + /* timeout variables */ + int send2father_timeout ; + int recv_parent_timeout ; + int recv_children_timeout ; + int recv_jobs_timeout ; + + const char *command_params; + const char *command; + char *root_hostname; + int mpi_param_len; + + int max_fd_num; + + int isParent; + int parent_fd; + + int pid; + int mpirun_port; + int ppid; + int isLeaf; + int array_it; + +}; +static struct minidaemon_t md_entity; + +struct md_entry_t { + int fd; + MD_status stat; + struct sockaddr * sock_addr; + char resp_stat; +}; + +struct childrenList_t { + char *hostname; + char *device; + int pid; + int port; + int rank; + + MD_status proc_state; + int child_list_len; /* Only for MD nodes */ +}; + +int resp_status_def = 1; +const int MD_do_not_set = -2; +const int attempt_no = 10; + +typedef int ChildrenListIterator; + +/* TODO all functions that use 'private' data member Minidaemon should be static */ +static int children_list_comp (const void *p1, const void *p2); +static ChildrenListIterator getListNextNode(ChildrenListIterator it, int offset); + +static void md_socket_create(); +void md_connect_to_parent(); +void md_listen_parent(int sd); +void md_general_listen(); +void md_socket_close(); +void md_base_init(int ch_num, int width, const char * command_params,const char * command, + const char * root_host,int root_ch_num, int mpirun_port, int ppid); + +void md_start_node_tree(); +int md_start_node(ChildrenListIterator i); +void md_run_own_jobs(); + +void md_children_proc_handler(int ind,int time_delta); +void md_handler(int sd, int i); +void md_status_msg_handler(const int * msg_buf,int i, int size); +void md_req_status_handler(const char * buff); + +void md_send_status_message(int md_index, ChildrenListIterator iter); +void md_send_init_message(int fd); +void md_rebuild_mask(fd_set * mask); +int md_update_socket_status(int ind,int time_delta); + +void md_start_termination_process(); +void 
md_forced_cleanup_handler(); +void md_send_termination_message(int sd); + +void md_print_status_message(); +void md_read_int(int * buff, int sd); +int md_read_gen_type(void * buff,size_t len,int sd, int att_no); +void md_read_string(char ** buff, int sd); +void md_build_command_string(int i); +void md_list_print(); +static void get_display_str(char * display); + +void timeout_handler (int delta_time); +void sigpipe_handler(int signo); + +static int children_list_comp (const void *p1, const void *p2) { + ChildrenList proc1 = (ChildrenList ) p1; + ChildrenList proc2 = (ChildrenList ) p2; + return strcmp(proc1->hostname, proc2->hostname); /* i.e, 2 string are equal iff strcmp==0 */ +} + +static ChildrenListIterator getListNextNode(ChildrenListIterator it, int offset) { + ChildrenListIterator it_next = it + 1; + + if (it < 0 || offset < 0) { + MD_USR_ERROR(" Bad Iterator received"); + } + + if (it_next >= md_entity.ch_num ) + return md_entity.ch_num; + + while (offset > 0 && it_next < md_entity.ch_num) { + while (children_list_comp( &md_entity.childrenList[it], &md_entity.childrenList[it_next]) == 0) { + if ( (++it_next >= md_entity.ch_num) ) + return md_entity.ch_num; /* return there maximum iterator value +1 in order to notify list end */ + } + --offset; + it=it_next; + } + return it_next; /*the actual index or the end of the list (ch_num) should be returned */ +} + +void listPrint() { + int i, j=0; + MD_PRINT(DDEBUG,"Node %d:",j); + for (i = 0 ; i < md_entity.ch_num; ++i) { + MD_PRINT(DDEBUG," %s",md_entity.childrenList[i].hostname); + if (children_list_comp( &md_entity.childrenList[i], &md_entity.childrenList[j]) != 0) { + j=i; + MD_PRINT(DDEBUG,"\nNode %d", j); + } + } + fprintf(stderr,"\n"); +} + +/** +Create a root instance of MiniDaemon +This function should be +**/ +void minidaemon_create(process * procList, int nproc, int width, const char * command_params, const char * command, int mpirun_port, int ppid) { + int i; + MD_PRINT(DDEBUG," Starting root 
minidaemon with following parameters:"); + MD_PRINT(DDEBUG,"nproc = %d, width = %d, command = %s,\n\t command_params = %s\n", + nproc,width,command,command_params); + md_base_init(nproc, width, command_params, command,NULL,nproc,mpirun_port, ppid ); + for (i = 0; i < nproc; ++i) { + //TODO memcpy(&md_entity.childrenList[i], &procList[i], sizeof(procList[i].hostname)+sizeof(procList[i].device) + sizeof(procList[i].port)); + md_entity.childrenList[i].hostname = procList[i].hostname; + md_entity.childrenList[i].device = procList[i].device; + md_entity.childrenList[i].port = procList[i].port; + md_entity.childrenList[i].rank = i; + md_entity.childrenList[i].proc_state = MD_NOT_STARTED; + } + + listPrint(); + qsort(md_entity.childrenList, nproc, sizeof(struct childrenList_t), children_list_comp); + listPrint(); + md_entity.isParent = 1; + if ( !(md_entity.root_hostname = (char *) malloc(MAX_HOST_LEN*sizeof(char)))) { + MD_SYS_ERROR(" : malloc"); + } + if (gethostname(md_entity.root_hostname,MAX_HOST_LEN) == -1) { + MD_SYS_ERROR(" : gethostname"); + }; + MD_PRINT(DDEBUG," md_entity.root_hostname is %s\n",md_entity.root_hostname); +} + +/* Init all data member structures */ +void minidaemon_init ( const char * par_name , int ch_num, int width, const char * command_params,const char * command, + const char * root_host , int root_ch_num, int mpirun_port, int ppid, int array_it) { + + MD_PRINT(DDEBUG," Starting minidaemon with following parameters:"); + MD_PRINT(DDEBUG,"parent_name =%s,ch_num = %d, width = %d, command = %s,command_params = %s\n command=%s, root_host=%s, root_ch_num=%d,\ + mpirun_port=%d",par_name,ch_num,width,command,command_params,command,root_host,root_ch_num,mpirun_port,ppid); + + md_base_init(ch_num, width, command_params, command, root_host,root_ch_num, mpirun_port, ppid); + + md_entity.parent_name = par_name; + md_entity.isParent = 0; + md_entity.array_it = array_it; + md_entity.pid = getpid(); + +} + +void md_base_init(int ch_num, int width, const 
char * command_params,const char * command, const char * root_host ,int root_ch_num, int mpirun_port, int ppid) { + + if ( !(md_entity.children_md_arr = malloc((width+ch_num+1) * sizeof(struct md_entry_t))) || + !(md_entity.childrenList = malloc(ch_num * sizeof(struct childrenList_t))) ) { + MD_SYS_ERROR(" malloc failed"); + } + + md_entity.ch_num = ch_num; + md_entity.root_ch_num = root_ch_num; + md_entity.root_hostname = (char * ) root_host; + md_entity.tree_width = width; + + md_entity.command_params = command_params; + md_entity.mpi_param_len = strlen(command_params); + md_entity.command = command; + + md_entity.max_fd_num = 0; + md_entity.total_running_mds = 0; + md_entity.phase = MD_INIT; + md_entity.mpirun_port = mpirun_port; + md_entity.ppid = ppid; + md_entity.isLeaf = 0; +} + +/* create parent socket and children minidaemon sockets */ +static void md_socket_create() { + + struct hostent* host; + struct utsname name; + struct sockaddr_in srv_sock_name,clnt_sock_name; + int s; + long arg; + + md_entity.phase = MD_WORK; + if (md_entity.total_running_mds < 1) + return; + + if ((s = socket (AF_INET, SOCK_STREAM, 0)) < 0) { + perror ("socket"); + exit(1); + } + + + memset ((char *)&srv_sock_name, '\0', sizeof(srv_sock_name)); + srv_sock_name.sin_port = htons (MINIDAEMON_PORT); + srv_sock_name.sin_family = AF_INET; + + /* determine host system name and internet address*/ + if (uname(&name) == -1) { + perror ("uname"); + exit(1); + } + + if ( (host=gethostbyname(name.nodename)) == NULL) { + MD_PRINT(DDEBUG,"The problem is here, nodename is %s\n",name.nodename); + perror ("gethostbyname"); + exit(1); + } + + /*memcpy ( (char*) &srv_sock_name.sin_addr, host->h_addr_list, host->h_length);*/ + srv_sock_name.sin_addr.s_addr = htonl(INADDR_ANY); + if (bind (s, (struct sockaddr *) &srv_sock_name, sizeof (srv_sock_name)) < 0) { + perror ("bind"); + close (s); + exit(1); + } + + /* Set non-blocking */ + if( (arg = fcntl(s, F_GETFL, NULL)) == 0) { + fprintf(stderr, "Error 
fcntl(..., F_GETFL) (%s)\n", strerror(errno)); + exit(0); + } + arg |= O_NONBLOCK; + if( fcntl(s, F_SETFL, arg) != 0) { + fprintf(stderr, "Error fcntl(..., F_SETFL) (%s)\n", strerror(errno)); + exit(0); + } + + /* prepare to listen cp for multiple connections*/ + if (listen (s, md_entity.tree_width) ==-1 ) { + perror ("listen"); + close (s); + exit(1); + } + + MD_PRINT(DDEBUG,"Waiting for children to connect...\n"); + int loop = md_entity.total_running_jobs +1 ; + + //TODO: The following block should be spinned-of as a private method (function) + struct timeval timeout; + timeout.tv_sec = ACCEPT_TIMEOUT; + timeout.tv_usec = 0; + int res; + fd_set mask; + FD_ZERO(&mask); + FD_SET(s,&mask); + socklen_t client_len = (socklen_t) sizeof(clnt_sock_name); + MD_PRINT(DDEBUG," total_running_mds = %d\n",md_entity.total_running_mds); + while (loop < md_entity.total_running_mds+ md_entity.total_running_jobs +1) { + switch (res = select(s+1, &mask, NULL, NULL, &timeout)) { + case -1: + /*signal or error were received*/ + /*perror ("select"); + close (s); + exit(1);*/ + + case 0 : + /* timeout was reached */ + MD_PRINT(DDEBUG,"Timeout problem when connecting to children minidaemons"); + md_forced_cleanup_handler(); + exit(1); + default: + if (res < 0) { + perror ("Bad select result\n"); + exit(1); + } + if (FD_ISSET (s, &mask)) { + //TODO : yet another try + /* There may not always be a connection waiting after a SIGIO is delivered or select(2) or poll(2) + return a readability event because the connection might have been removed by an asynchronous + network error or another thread before accept is called. If this happens then the call will block + waiting for the next connection to arrive. 
To ensure that accept never blocks, the passed socket + s needs to have the O_NONBLOCK flag set + */ + if ( (md_entity.children_md_arr[loop].fd = + accept(s, (struct sockaddr *) &clnt_sock_name, &client_len)) == -1 ) { + perror ("listen"); + close (s); + exit(1); + } + md_entity.max_fd_num = + ( md_entity.max_fd_num > md_entity.children_md_arr[loop].fd + ? md_entity.max_fd_num : md_entity.children_md_arr[loop].fd) ; + MD_PRINT(DDEBUG,"Another one had connected to parent\n"); + md_send_init_message(md_entity.children_md_arr[loop].fd); + /*children_md_arr[loop].sock_addr = clnt_sock_name;*/ //TODO ? + ++loop; + } + break; + } + //TODO real update socket status with time delta + timeout.tv_sec = ACCEPT_TIMEOUT; + timeout.tv_usec = 0; + FD_ZERO(&mask); + FD_SET(s,&mask); + } + return; +} + +/* Start listening to messages from other Minidaemons */ +void minidaemon_run () { + + ChildrenListIterator i; + int node_number = 0; + if (!md_entity.isParent) { + md_connect_to_parent(); + } + md_start_node_tree(); + md_run_own_jobs(); + md_socket_create(); + + /* Here are 4 different stages : + 1. - INIT STATE : Get parameters from parent minidaemon (this stage should be done in md_connect_to_parent() + 2. - QP_LID STAGE : Listen to and handle QP_LID messages in order to run own jobs and pass params to children minidaemons + 3. - WORK STAGE : Listen to STATUS messages from other MD's and keep I-AM-ALIVE on + 4. 
- TERMINATION STAGE : Listen to messages in order to verify clear shutdown + */ + + md_general_listen(); + MD_PRINT(DDEBUG,"Exiting : unhandled signal, pid=%d\n",md_entity.pid); + md_print_status_message(); + exit(0); +} + +void md_connect_to_parent() { + struct hostent* host; + struct utsname name; + struct sockaddr_in srv_sock_name,clnt_sock_name; + int s; + long arg; + int len; + int i = 0; + + MD_PRINT(DDEBUG,"Connecting to parent : %s\n", md_entity.parent_name); + if ((s = socket (AF_INET, SOCK_STREAM, 0)) < 0) { + perror ("socket"); + exit(1); + } + + memset ((char *)&srv_sock_name, '\0', sizeof(srv_sock_name)); + srv_sock_name.sin_port = htons (MINIDAEMON_PORT); + srv_sock_name.sin_family = AF_INET; + + if ( (host=gethostbyname(md_entity.parent_name)) == NULL) { + MD_PRINT(DDEBUG,"The problem is here, parent name = %s\n",md_entity.parent_name); + perror ("gethostbyname"); + exit(1); + } + + //TODO host->h_addr_list + memcpy ( (char*) &srv_sock_name.sin_addr, host->h_addr, host->h_length); + + + if (connect (s, (struct sockaddr *) &srv_sock_name, sizeof (srv_sock_name)) < 0) { + perror ("connect"); + close (s); + exit(1); + } + /* Set non-blocking */ + if( (arg = fcntl(s, F_GETFL, NULL)) == 0) { + fprintf(stderr, "Error fcntl(..., F_GETFL) (%s)\n", strerror(errno)); + exit(0); + } + arg |= O_NONBLOCK; + if( fcntl(s, F_SETFL, arg) != 0) { + fprintf(stderr, "Error fcntl(..., F_SETFL) (%s)\n", strerror(errno)); + exit(0); + } + + MD_PRINT(DDEBUG,"Connected to parent\n"); + md_entity.max_fd_num = ( md_entity.max_fd_num > s ? md_entity.max_fd_num : s); + + /* + Here we not need the communication to be non-blocking + Minidaemon should receive data that is essential for future flow. + That is, without information about own processes and children minidaemon it can't continue + */ + md_entity.parent_fd = (md_entity.isParent ? 
0 : s); + md_listen_parent(s); + MD_PRINT(DDEBUG,"Children List received from the parent\n"); +} + +void md_listen_parent(int sd) { + int i = 0; + int tmp; + /* First of all, write to parent your array_iterator */ + /* That's in order to allow to parent md estimate who is talking to him*/ + MD_PRINT(DPATH," writing our array_it\n"); + if ((tmp=write(sd,(void *) &md_entity.array_it, sizeof(int))) < sizeof(int)) { + MD_SYS_ERROR(" : write"); + }; + MD_PRINT(DPATH," %d bytes was written\n",tmp); + while (i < md_entity.ch_num ) { + md_read_string(&md_entity.childrenList[i].hostname,sd); + MD_PRINT(DPATH," Reading entry No. %d\n",i); + MD_PRINT(DPATH,"Hostname received: %s\n",md_entity.childrenList[i].hostname); + + md_read_string(&md_entity.childrenList[i].device,sd); + MD_PRINT(DPATH,"Device received: %s\n",md_entity.childrenList[i].device); + + md_read_int(&md_entity.childrenList[i].port,sd); + MD_PRINT(DPATH,"Port received: %d\n",md_entity.childrenList[i].port); + + md_read_int(&md_entity.childrenList[i].rank,sd); + MD_PRINT(DPATH,"Rank received: %d\n",md_entity.childrenList[i].rank); + MD_PRINT(DPATH,"----------- Entry %d was succesfully recieved\n",i); + ++i; + } +} + +/************ Utility functions ***************************/ +int md_read_gen_type(void * buff,size_t len,int sd, int attempt_no) { + fd_set mask; + int res, n = 0; + struct timeval timeout; + static const int time_sec = 20; + + while (n < len) { + n += (res = read(sd,buff+n,len)); + if (res > 0) { + MD_PRINT(DPATH," : got %d bytes from socket %d\n",res,sd); + } + else if (res < 0) { + if ((errno == EINTR) || (errno == EAGAIN) ) { + n-=res; + } + else + { + return res; + } + } + else if (--attempt_no == 0) { + return 0; + } + } + return n; + +} + +void md_read_int(int * buff, int sd) { + md_read_gen_type((void*) buff, sizeof(int),sd,attempt_no); + MD_PRINT(DDEBUG," Int received is %d\n",*buff); +} + +void md_read_string(char ** buff,int sd) { + int str_len; + md_read_int((void*) &str_len,sd); + 
MD_PRINT(DDEBUG,"The str_len received is %d\n",str_len); + if (str_len < 0) + MD_SYS_ERROR(" Received string length is negative ! Exiting...\n"); + if ( (*buff = malloc(str_len+1)) == NULL) { + MD_SYS_ERROR(",malloc()"); + } + (*buff)[str_len] = '\0'; + md_read_gen_type((void *) *buff,str_len,sd,attempt_no); + MD_PRINT(DDEBUG," : the buffer received is %s\n",*buff); +} + + + +/* Counts the exact number of DIFFERENT NODES, except of the first. +That is, for the array {host1, host1, host2, host3, host3, host4 } the output should be 3 +*/ +int md_get_node_counter() { + ChildrenListIterator it; + int md_counter = 0; + MD_PRINT(DDEBUG, " getListNextNode(0,1)=%d\n",getListNextNode(0,1)); + for (it = getListNextNode(0,1) ; it < md_entity.ch_num; it=getListNextNode(it,1)) { + ++md_counter; + } + return md_counter; +} + + +/* Launch the tree of minidaemons over the cluster */ +void md_start_node_tree() { + char * minidaemon_command; + minidaemon_command = mkstr("%s%s%s", XSTR(MPI_PREFIX),XSTR(MPI_BIN), XSTR(MINIDAEMON_CLIENT_NAME) ); + /* STR_CONCAT(XSTR(MPI_PREFIX), XSTR(MPI_BIN), XSTR(MINIDAEMON_CLIENT_NAME)) ; // XSTR(MINIDAEMON_CLIENT_NAME) ; */ + int nproc, width; + const int nproc_len = 8; + const int width_len = 8; + char nproc_str[nproc_len]; + char root_nproc_str[nproc_len]; + char width_str[width_len]; + char mpiport_str[nproc_len]; + char ppid_str[nproc_len]; + char array_it_str[nproc_len]; + + + /* It's very important caclulation + According to the algorithm, the minidaemon should run its own jobs , then + divide the rest of nodes array between (maximum) width minidaemons. 
+ For example, if nodes array equals to { host1, host1, host2, host2, host3, host4 ,host5, host5}host5, host5}, and width =2, + than minidaemon on host1 (MD1) will run MD2 with array= {host2,host2,host3 } and MD4 with array = {host4 ,host5, host5} + After, MD2 will run an additional MD3 with array={host3} (lead mindaemon ) and MD4 will run an additional MD5 with + array = {host5, host5} + */ + int md_node_counter = md_get_node_counter(); + if (md_node_counter < 1) { + md_entity.isLeaf = 1; + MD_PRINT(DDEBUG,"We have reached the bottom level : this minidaemon is a leaf\n"); + return ; + } + ChildrenListIterator i = getListNextNode(0,1); + int md_step = (md_node_counter / md_entity.tree_width) + (md_node_counter % md_entity.tree_width ? 1 : 0); + int md_total = md_node_counter; + MD_PRINT(DDEBUG,"Starting node tree, total node counter is %d, md_step = %d \n",md_node_counter,md_step); + while (md_node_counter > 0) { + md_node_counter -= md_step; + nproc = getListNextNode(i,md_step)-i; + md_entity.childrenList[i].child_list_len = nproc; + MD_PRINT(DDEBUG," nproc=%d , i=%d\n",nproc,i); + assert (nproc > 0); + + + sprintf(width_str, "%d " ,md_entity.tree_width); + sprintf(nproc_str, "%d " , nproc); + sprintf(root_nproc_str, "%d ", md_entity.root_ch_num); + sprintf(mpiport_str, "%d ", md_entity.mpirun_port); + sprintf(ppid_str, "%d ", md_entity.ppid); + sprintf(array_it_str,"%d ", i); + + md_entity.childrenList[i].pid=fork(); + if (md_entity.childrenList[i].pid == 0) { + int j; + MD_PRINT(DDEBUG,"Starting Child Minidaemon No. 
%d of %d at host number %s ,Exec command : %s\n", + md_total-md_node_counter, md_total,md_entity.childrenList[i].hostname, minidaemon_command); + + if (!md_entity.isParent) + close(md_entity.parent_fd); + + execl(RSH_CMD,RSH_CMD,md_entity.childrenList[i].hostname,minidaemon_command, + md_entity.childrenList[0].hostname,nproc_str,width_str,md_entity.command_params, + md_entity.command,md_entity.root_hostname, root_nproc_str,mpiport_str, ppid_str, array_it_str,NULL); + /*If we've reached this line, ssh failed*/ + MD_SYS_ERROR ("RSH/SSH command failed!"); + } + int j; + i += nproc; + for (j = 0 ; j < i ; ++j) { + md_entity.childrenList[i].proc_state = MD_NORMAL; //TODO insert negative number to status message + } + md_entity.total_running_mds +=1; + int own_jobs_num = (int) getListNextNode(0,1); + md_entity.children_md_arr[own_jobs_num + md_entity.total_running_mds].stat = MD_WORK; + md_entity.children_md_arr[own_jobs_num + md_entity.total_running_mds].resp_stat = !resp_status_def; + MD_PRINT(DDEBUG,"own_jobs_num=%d, md_entity.total_running_mds=%d, !resp_status_def = %d\n", + own_jobs_num,md_entity.total_running_mds,!resp_status_def); + } + return ; /* parent process */ +} + +void md_run_own_jobs() { + + ChildrenListIterator i=0; + + const int add_param_len = 45; /* the lengths of VIADEV_DEVICE etc. */ + const int max_rank_len = 10; /* the maximum length of rank number in symbols , i.e. 
1000000 mpi jobs */ + const int max_port_len = 8; + char * curr_command_params = NULL; + char * curr_command = NULL; + char rank_str[max_rank_len]; + char port_str[max_port_len]; + int curr_param_len = 0; + static const char env_command[13]="/usr/bin/env"; + do { + int pd[2]; + + /* create pipe in order to communicate with children process */ + if (pipe(pd) == -1) { + MD_SYS_ERROR("pipe"); + } + md_entity.children_md_arr[i].fd = pd[0]; + md_build_command_string(i/*,&curr_command*/); + MD_PRINT(DDEBUG," : Starting children processes\n"); + if ((md_entity.childrenList[i].pid = fork())== 0) { /* Affinity should be set in vianit.c */ + close(pd[0]); /* closing read side of pipe in the child process */ + + /* close all other filedescriptors, i.e. all except */ + int j; + for (j = 0 ; j < i ; ++j) { + close(md_entity.children_md_arr[i].fd); + } + + MD_PRINT(DINFO,"Starting child process: %s\n", md_entity.command); + execl(md_entity.command, md_entity.command, NULL); + MD_SYS_ERROR("execl failed"); + + } + else { + /* close write side */ + free(curr_command); + close(pd[1]); + md_entity.children_md_arr[i].stat = MD_WORK; //TODO - REPAIR THIS LINE + md_entity.childrenList[i].proc_state = MD_NORMAL; + md_entity.children_md_arr[i].resp_stat = !resp_status_def; + ++i; + ++md_entity.total_running_jobs; //TODO = i + md_entity.max_fd_num = ( md_entity.max_fd_num > pd[0] ? 
md_entity.max_fd_num : pd[0]) ; + + } + } while ( i < md_entity.ch_num && (children_list_comp(&md_entity.childrenList[0],&md_entity.childrenList[i]) == 0)) ; + + /* Update minidaemon parent entry */ + md_entity.children_md_arr[md_entity.total_running_jobs].fd = md_entity.parent_fd; + md_entity.children_md_arr[md_entity.total_running_jobs].stat = MD_WORK; + md_entity.children_md_arr[md_entity.total_running_jobs].resp_stat = !resp_status_def; + + /* Free the resources */ + free(curr_command_params); + curr_command_params = NULL; + //TODO while (i < MIN (md_entity.ch_num , getNextListNode(0,1)) +} + +void md_build_command_string(int i /*, char ** exec_command*/) { + char *xterm_command; + char xterm_title[100]; + char *ld_library_path; + char *device_port_env=NULL; + static char wd[MAX_WD_LEN]="\0"; + + const int max_rank_len = 10; /* the maximum length of rank number in symbols , i.e. 1000000 mpi jobs */ + const int max_port_len = 8; + const int max_pid_len = 10; + const int max_ch_num_len = 10; + /* const int max_mpirun_port_len = 8; */ + char rank_str[max_rank_len] /*= "\0"*/; + char port_str[max_port_len] /*= "\0"*/; + char pid_str[max_pid_len] /*= "\0"*/; + char ch_num_str[max_ch_num_len] ; + char mpirun_port_str[max_port_len] ; + +#define MAX_DISPLAY_LEN 200 + static char display[MAX_DISPLAY_LEN] = "\0"; +#define BASE_ENV_LEN 17 + int str_len, len; + + int xterm_on = 0; + + if (wd[0] == '\0') + getcwd(wd, MAX_WD_LEN); + if (display[0] == '\0') { + get_display_str(display); + } + putenv(display); + + if (md_entity.childrenList[i].device != NULL && strlen(md_entity.childrenList[i].device ) != 0) { + setenv("VIADEV_DEVICE",md_entity.childrenList[i].device,1); + } + + if (md_entity.childrenList[i].port != -1) { + sprintf(port_str,"%d",md_entity.childrenList[i].port); + setenv("VIADEV_PORT",port_str,1); + } + + //TODO insert MPI PARAMETER parsing + + ld_library_path = getenv("LD_LIBRARY_PATH"); + + MD_PRINT(DDEBUG,"LD_LIBRARY_PATH=%s\n",ld_library_path); + if 
(ld_library_path != NULL) { + setenv("LD_LIBRARY_PATH",ld_library_path,1); + } else { + setenv("LD_LIBRARY_PATH",LD_LIBRARY_PATH_MPI,1); + } + + /* + * this is the remote command we execute whether we were are using + * an xterm or using rsh directly + */ + sprintf(rank_str,"%d",md_entity.childrenList[i].rank); + setenv("MPIRUN_RANK",rank_str,1); + + setenv("MPIRUN_MPD","0",1); + setenv("MPIRUN_HOST",md_entity.root_hostname,1); + MD_PRINT(DDEBUG," md_entity.root_hostname=%s\n",md_entity.root_hostname); + + sprintf(ch_num_str,"%d",md_entity.root_ch_num); + setenv("MPIRUN_NPROCS",ch_num_str,1); + + + sprintf(pid_str,"%d",md_entity.ppid); + setenv("MPIRUN_ID",pid_str,1); + + sprintf(mpirun_port_str,"%d",md_entity.mpirun_port); + setenv("MPIRUN_PORT",mpirun_port_str,1); + + putenv("NOT_USE_TOTALVIEW=1"); + +} + +//TODO rebuild minidaemonListen() to be a "Class" with general methods, printf("Hostname received: %s\n",passed by parameters + +void md_general_listen() { + /* Registrate Event Handlers, if any */ + /* listen to all fd'md_general_listen()s, including self-pipes, if any */ + int status, + res, + stat, + j; + time_t time_wasted; + char msg_buf[MAX_MESSAGE_SIZE]; + fd_set wr_mask; + struct timeval timeout; + + md_entity.send2father_timeout = send2father_work_timeout; + md_entity.recv_parent_timeout = recv_parent_work_timeout; + md_entity.recv_children_timeout = (md_entity.isLeaf ? NO_TIMEOUT :recv_children_work_timeout ); + md_entity.recv_jobs_timeout = recv_jobs_work_timeout; + + md_entity.curr_running_jobs = md_entity.total_running_jobs; + md_entity.curr_running_mds = md_entity.total_running_mds; + + md_entity.pid = getpid(); + timeout.tv_usec = 0; + + MD_PRINT(DDEBUG,"Minidaemon is starting to listen...\n"); + MD_PRINT(DDEBUG,"The sockets table for process No. 
%d are:",md_entity.pid); + for (j=0; j < 1 + md_entity.total_running_jobs + md_entity.total_running_mds; ++j) + MD_PRINT(DDEBUG,"%d, ",md_entity.children_md_arr[j].fd); + fprintf(stderr,"\n"); + + /*signal(SIGPIPE, sigpipe_handler);*/ + signal(SIGTERM, sigpipe_handler); + signal(SIGHUP, sigpipe_handler); + while (1) { + /* Wait for father and children to respond, no more than fixed timeout */ + time_wasted = time(NULL); + md_rebuild_mask(&wr_mask); /* should be also general function */ + static int count = 0; + MD_PRINT(DDEBUG,"%d: send2father_timeout=%d ,recv_children_to=%d, pid = %d\n", + count++,md_entity.send2father_timeout,md_entity.recv_children_timeout,md_entity.pid); + if (md_entity.phase == MD_WORK) { + timeout.tv_sec = MIN(md_entity.send2father_timeout,md_entity.recv_children_timeout); + } + else { + if (md_entity.isParent) + timeout.tv_sec = ( md_entity.recv_jobs_timeout == NO_TIMEOUT ? md_entity.recv_children_timeout : + MAX(md_entity.recv_jobs_timeout,md_entity.recv_children_timeout)); + else + timeout.tv_sec = MIN(md_entity.recv_jobs_timeout,md_entity.recv_children_timeout); + } + MD_PRINT(DDEBUG,"Setting current select timeout to %d\n",timeout.tv_sec); + switch (res = select(md_entity.max_fd_num+1,&wr_mask,NULL,NULL,&timeout)) { + case -1: + /* Signal or error was received */ + perror(" Unhandled signal\n"); + md_print_status_message(); + exit(1); + case 0: + /* Is parent timeout ? 
*/ + timeout_handler((int)(time(NULL) - time_wasted)); + break; + default: + /* wait for confirmation status from children */ + assert(res > 0); + int i; + MD_PRINT(DDEBUG,"Got message from own jobs or other minidaemons : handling...\n"); + /* Find all file descriptors that had been updated */ + for (i = 0 ; i < md_entity.total_running_mds + md_entity.total_running_jobs + 1; ++i) { + if (FD_ISSET(md_entity.children_md_arr[i].fd, &wr_mask)) { + /* Child process was ended */ + /* TODO : spin this block out as a separate function */ + + if (i < md_entity.total_running_jobs) { + /* TODO set appropriate timeout */ + /* TODO handle different cases */ + MD_PRINT(DDEBUG,"One of children processes[%d] was ended\n",i); + md_entity.children_md_arr[i].stat = MD_do_not_set; + md_children_proc_handler(i,(int)(time(NULL) - time_wasted)); + } + /* Parent or children minidaemons had sent us a message */ + else { + MD_PRINT(DDEBUG,"Parent or children minidaemons had sent us a message\n"); + md_handler(md_entity.children_md_arr[i].fd,i); /*for parent and children md messages */ + md_update_socket_status(i,(int)(time(NULL) - time_wasted)); + } + //TODO : res optimization + /*if(--res == 0 ) + break;*/ + } + } /* for */ + break; + } /* switch */ + } /* while */ +} + +void timeout_handler (int delta_time) { + + /* in this case, timeout_ch is a current "receive-from-children-or-parent" timeout + and timeout_other is timeout of sending to parent */ + switch(md_entity.phase) { + case MD_WORK : + if (md_entity.send2father_timeout < md_entity.recv_children_timeout) { + if (!md_entity.isParent) { + MD_PRINT(DDEBUG," Sending status to parent, pid=%d\n",md_entity.pid); + md_send_status_message(md_entity.total_running_jobs,-1); + } + md_entity.send2father_timeout = send2father_work_timeout; + md_entity.recv_children_timeout -= delta_time; + assert(md_entity.recv_children_timeout > 0); + } + /* We got timeout from children and/or parent */ + else { + md_start_termination_process(); + 
md_entity.recv_children_timeout = (md_entity.isLeaf ? NO_TIMEOUT :recv_children_term_timeout); + md_entity.recv_jobs_timeout = recv_jobs_term_timeout; + } + break; + + /* in this case, timeout_ch is a current "receive-from-chilren-or-parent" timeout + and timeout_other is timeout for own-jobs-termination */ + case MD_TERM : + /* here we should wait only for children process and children minidaemons */ + //TODO propagation delay, i.e. to_value = f(base_to_value,tree_depth) + if (md_entity.total_running_jobs !=0) { + md_forced_cleanup_handler(); + } + /* the last status Message we've send */ + //TODO Send Timeout with forced FINISH + if (!md_entity.isParent) { + md_send_status_message(md_entity.total_running_jobs,-1); + } + MD_PRINT(DDEBUG,"Minidaemon exits right now, pid=%d\n",md_entity.pid); + md_socket_close(); + md_print_status_message(); + exit(0); + + default: + break; + } +} + +void md_children_proc_handler(int ind, int time_delta) { + + /* static int counter = -1 ; */ + int *loc_state; + /*md_entity.children_md_arr[ind].stat = */ + waitpid (md_entity.childrenList[ind].pid,loc_state,WNOHANG); + MD_PRINT(DDEBUG," Job %d was finished with status %d\n",md_entity.childrenList[ind].pid, WEXITSTATUS (*loc_state)); + md_entity.childrenList[ind].proc_state = (loc_state ? 
WEXITSTATUS (*loc_state) : -1); + --md_entity.curr_running_jobs; + + if ( md_entity.curr_running_jobs == 0) { /* the last job */ + md_entity.recv_jobs_timeout = NO_TIMEOUT; + md_entity.recv_children_timeout -= time_delta; + MD_PRINT(DDEBUG," The last own jobs was finished with pid %d\n",md_entity.childrenList[ind].pid); + if (md_entity.curr_running_mds == 0) { + MD_PRINT(DDEBUG," All own jobs and children minidaemons finished : exiting\n"); + md_entity.phase = MD_FINISH; + if (md_entity.isParent) { + md_print_status_message(); + } else { + md_send_status_message(md_entity.total_running_jobs,ind /*md_entity.childrenList[ind].proc_state*/); + } + exit(0); + } + } + if (md_entity.curr_running_jobs == (md_entity.total_running_jobs-1) ) { + md_entity.phase = MD_TERM; + md_entity.recv_jobs_timeout = recv_jobs_term_timeout; + md_entity.recv_children_timeout = recv_children_term_timeout; + + } /* "else" statement is bug when curr_running jobs == 1 ! */ + else { + md_entity.recv_jobs_timeout -= time_delta; + md_entity.recv_children_timeout -= time_delta; + } + if (!md_entity.isParent) { + md_send_status_message(md_entity.total_running_jobs, md_entity.childrenList[ind].proc_state); + } +} + +void md_print_status_message() { + int i; + MD_PRINT(DDEBUG,"Printing the status of all the processes: "); + for (i = 0; i < md_entity.ch_num ; ++i) { + fprintf(stderr,"%d ", md_entity.childrenList[i].proc_state); fflush(stderr); + } + fprintf(stderr,"\n"); fflush(stderr); +} + +//TODO array of pointer to appropriate function , i.e. 
func_array[msg_buff[0]].handler(); +void md_handler(int sd, int i) { + + static int msg_buf[MAX_MESSAGE_SIZE]; //TODO spin it out + static const int small_msg_size = sizeof(int) * 2; + static const int big_msg_size = sizeof(int) * 4; + int n = 0; + static const short int array_it_index = 1; + + n = md_read_gen_type(msg_buf,small_msg_size,sd, attempt_no); + /* Connection to minidaemon with socket sd was closed */ + if (n <= 0) { + /* update status table */ + --md_entity.curr_running_mds; + md_entity.children_md_arr[i].stat = MD_FINISH; + md_start_termination_process(); + return; + } + MD_PRINT(DDEBUG,"Handling incoming message of size %d\n",n); + //TODO Once we got msg_buf[0], we can read the rest of the data (md_read_gen_type) according to its type + switch ( msg_buf[0]) { + + case STATUS_REQUEST: + MD_PRINT(DDEBUG," Status request was received\n"); + break; + case STATUS: + MD_PRINT(DDEBUG," Status message was received\n"); + if (msg_buf[array_it_index] != -1) { + n += md_read_gen_type((void *) &msg_buf[2],big_msg_size - small_msg_size,sd,attempt_no); + } + md_status_msg_handler(msg_buf,i, n); + break; + case KILL: + MD_PRINT(DDEBUG," KILL message was received\n"); + md_start_termination_process(); + break; + default: + MD_USR_ERROR(" : Bad opcode received\n"); + } + return; + +} + +/* + Message structure : + Byte1 : index of child minidaemon in the list of parent minidaemon + Byte2 : OFFSET (-1 in the case of void message) + Byte3 : Status value (void if previous byte is equal to -1) +*/ +void md_status_msg_handler(const int * msg_buf,int i, int size) { + static const short int array_it_index = 1; + static const short int offset_index = 2; + static const short int proc_stat_val_index = 3; + int iter; + MD_PRINT(DDEBUG,"Status message received\n"); + assert(size >= sizeof(int)*2); + if (size == sizeof(int)*2) { + if (msg_buf[array_it_index] != -1) { + MD_SYS_ERROR(" : message too short\n"); + } + if (i != md_entity.curr_running_jobs) { /* If parent had answered 
us, there's no need to send him msg right now */ + md_send_status_message(i,-1); + } + return ; + } + assert (size == sizeof(int)*4) ; + iter = msg_buf[array_it_index] + msg_buf[offset_index]; + MD_PRINT(DDEBUG," Setting status %d to index %d\n", msg_buf[proc_stat_val_index],iter); + md_entity.childrenList[iter].proc_state = msg_buf[proc_stat_val_index]; + if (i != md_entity.curr_running_jobs) { /* If parent had answered us, there's no need to send him msg right now */ + md_send_status_message(i,-1); + } +} + +/* This message will be used for : +1. Implementation of the I-AM-ALIVE protocol. In this case, information about child jobs of the sender may remain unchanegd +2. Notification on status changing of one or more processes. In this case, a status of an appropriate job will be changed +*/ +void md_send_status_message(int md_index, ChildrenListIterator iter) { + const int status_buff_len = 4; + static int buff[] = {STATUS,-1,-1,-1}; + int write_size = status_buff_len * sizeof(int); + int sd = md_entity.children_md_arr[md_index].fd; + + if (iter < 0) { /* Send only I-AM-ALIVE message */ + buff[1] = -1; + write_size = sizeof(int) * 2; + } else { + buff[1] = md_entity.array_it ; + buff[2] = iter; + buff[3] = md_entity.childrenList[iter].proc_state; + write_size = sizeof(int) * status_buff_len; + } + MD_PRINT(DDEBUG,"Sending status message to socket %d \n",sd); + if (write(sd,buff,write_size) < write_size ) { + if (md_entity.phase != MD_TERM) + MD_SYS_ERROR(""); + } + +} + +// TODO : define which set - parent, md or proc children should be set +void md_rebuild_mask(fd_set * mask) { + + int i; + FD_ZERO(mask); + //TODO not curr_running, but total_running; + MD_PRINT(DDEBUG,"\n"); + for (i = 0 ; i < md_entity.total_running_jobs; ++i) { + if (md_entity.children_md_arr[i].stat != MD_do_not_set) { + MD_PRINT(DDEBUG," mask for job %d was set\n",i); + FD_SET (md_entity.children_md_arr[i].fd, mask) ; + } + } + + for (i = md_entity.total_running_jobs ; i < 
md_entity.total_running_mds + md_entity.total_running_jobs +1; ++i) { + if (i == md_entity.total_running_jobs && md_entity.isParent) + continue; /* do not set "parent" socket for root minidaemon */ + MD_PRINT(DDEBUG," i = %d , md_entity.children_md_arr[i].stat= %d, \ +md_entity.children_md_arr[i].resp_stat= %d , resp_status_def=%d\n", + i,md_entity.children_md_arr[i].stat,md_entity.children_md_arr[i].resp_stat,resp_status_def); + + if ((md_entity.children_md_arr[i].resp_stat != resp_status_def) && (md_entity.children_md_arr[i].stat == MD_WORK)) { + MD_PRINT(DDEBUG," mask for md %d was set\n",i); + FD_SET (md_entity.children_md_arr[i].fd, mask) ; + } + } +} + + +int md_update_socket_status(int i,int time_delta) { + + int res=0; + static int num_of_md_responded = 0; + MD_PRINT(DDEBUG," : index = %d, delta = %d, my pid = %d\n",i, time_delta, md_entity.pid); + + if (md_entity.children_md_arr[i].resp_stat != resp_status_def) { + md_entity.children_md_arr[i].resp_stat = resp_status_def; + ++num_of_md_responded; + + if (res=(num_of_md_responded == (md_entity.total_running_mds+ (!md_entity.isParent)))) { /* all mds and parent */ + num_of_md_responded = 0; + resp_status_def = !resp_status_def; /* We change the definition of resp_status_def instead of all the status array*/ + md_entity.recv_children_timeout = recv_children_work_timeout; + } + else /*if (num_of_md_responded == 0)*/ { + md_entity.recv_children_timeout-=time_delta; + } + } + md_entity.send2father_timeout-=time_delta; + return res; +} + +void md_start_termination_process() { + int i ; + MD_PRINT(DDEBUG," Checking\n"); + if (md_entity.phase != MD_TERM) { + MD_PRINT(DDEBUG,"Starting termination process, pid=%d\n",md_entity.pid); + md_entity.phase = MD_TERM; + for (i = md_entity.total_running_jobs +1 ; i < md_entity.total_running_mds+md_entity.total_running_jobs +1; ++i) { + /* send status message to other MD (with termination status) */ + md_send_termination_message(md_entity.children_md_arr[i].fd); + } + } +} + 
+void md_send_termination_message(int sd) { + + static const int buff[1] = {KILL}; + MD_PRINT(DDEBUG,"Sending termination message to children from process %d: socket %d\n",md_entity.pid, sd); + if (write (sd,buff,sizeof(int)) < sizeof(int)) { + MD_SYS_ERROR("write"); + } +} + +void md_forced_cleanup_handler() { + /* send kill to its own jobs */ + ChildrenListIterator i; + int *loc_state; + /* send forced kill (kill -9) to its own jobs */ + MD_PRINT(DDEBUG," Starting ...\n"); + for (i=0; i < getListNextNode(0,1) ; ++i) { + kill(md_entity.childrenList[i].pid,SIGKILL); + } + for (i=0; i < getListNextNode(0,1) ; ++i) { + waitpid(md_entity.childrenList[i].pid,loc_state,WNOHANG); + md_entity.childrenList[i].proc_state = (loc_state ? WEXITSTATUS (*loc_state) : -1); + } +} + +void md_socket_close() { + + int i; + for (i = 0; i < md_entity.total_running_jobs; ++i) { + close(md_entity.children_md_arr[i].fd); + } + if (!md_entity.isParent) + close(md_entity.children_md_arr[md_entity.total_running_jobs].fd); + + if (!md_entity.isLeaf) + for (i = 0; i < 1 + md_entity.total_running_jobs+md_entity.total_running_jobs; ++i) { + close(md_entity.children_md_arr[i].fd); + } + +} + +void md_req_status_handler(const char * buff) { + MD_PRINT(DDEBUG,"Request for status message received, in pid=%d\n",md_entity.pid); + switch (buff[1]) { + /*case MD_INIT: + md_send_init_message();*/ + default: + md_send_status_message(md_entity.total_running_mds,-1); + } +} + +//TODO optimize with receive width times at the client side +void md_send_init_message(int fd) { + MD_PRINT(DDEBUG," : starting\n"); + ChildrenListIterator it = 0, array_it; + int len; + + /* First of all, we have to know who we are talking to: */ + MD_PRINT(DDEBUG,"Reading array iterator number from child socket no. %d\n",fd); + md_read_int(&array_it,fd); + + MD_PRINT(DDEBUG,"Sending %d childrenList entries to child to socket no. 
%d\n",md_entity.childrenList[array_it].child_list_len,fd); + MD_PRINT(DDEBUG,"array_it = %d, md_entity.childrenList[array_it].child_list_len = %d\n", + array_it, md_entity.childrenList[array_it].child_list_len); + + for (it = array_it; it < (md_entity.childrenList[array_it].child_list_len + array_it) ; ++it) { + len= (md_entity.childrenList[it].hostname == NULL + ? 0 : strlen(md_entity.childrenList[it].hostname)*sizeof(char)); + MD_PRINT(DDEBUG,"The len to send is %d\n",len); + if (write(fd, &len,sizeof(int)) < sizeof(int)) + perror("write"); + if (write(fd, md_entity.childrenList[it].hostname,len) < len) + perror("write"); + + len= (md_entity.childrenList[it].device == NULL + ? 0 : strlen(md_entity.childrenList[it].device)*sizeof(char)); + if (write(fd, &len,sizeof(int)) < sizeof(int)) + perror("write"); + + if (write(fd,md_entity.childrenList[it].hostname,len) < len) + perror("write"); + + if (write(fd,&md_entity.childrenList[it].port,sizeof(int)) < sizeof(int)) + perror("write"); + MD_PRINT(DDEBUG," : The port is %d\n",md_entity.childrenList[it].port); + + if (write(fd,&md_entity.childrenList[it].rank,sizeof(int)) < sizeof(int)) + perror("write"); + MD_PRINT(DDEBUG," : The rank is %d\n",md_entity.childrenList[it].rank); + } +} + +void sigpipe_handler(int signo) { + + MD_PRINT(DDEBUG,"Handling SIGPIPE\n"); + md_forced_cleanup_handler(); + md_print_status_message(); + exit(1); +} + +static void get_display_str(char * display) { + char *p; + char str[200]; + + if ( (p = getenv( "DISPLAY" ) ) != NULL ) { + strcpy(str, p ); /* For X11 programs */ + sprintf(display,"DISPLAY=%s",str); + } +} Added: mvapich/trunk/mpid/ch_gen2/process/minidaemon.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/minidaemon.h 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/minidaemon.h 2007-11-05 16:54:39 UTC (rev 1627) @@ -0,0 +1,95 @@ +#ifndef _MINIDAEMON_H +#define _MINIDAEMON_H 1 + +/** +* Minidaemon 
ADT provided by Mellanox, MPI Team +* mailto : xalex@mellanox.co.il +* +* Minidaemon interface provides the possibilty to create and run +* daemon process on clusters node. Each instance of it will be created +* at application run time and will destroyed right after application termination. +* Each minidaemon instance may run user-defined jobs on a local machine as well as launch other minidaemons on remote machines. +* So, the user can define the execution scheme ("launch tree") for his needs. The concrete +* Thus, Minidaemon provide an answer to the following issues : + 1. Multicore management. It's only one daemon per machine that will start all jobs there + 2. Scalable and fast launch of (MPI) jobs on a cluster, using different start-up schemes (flat, multilevel tree) + 3. Cluster job management and cleanup (this version is ad-hoc MPI tuned) + 4. Ease of use and simplicity for cluster administrators. There's no constant daemon that requires + additional management and resources +**/ + +/*#include */ +#include "mpirun_rsh.h" +#define MD_USR_ERROR(msg) \ + do { \ + fprintf(stderr,"%s\n",msg); \ + exit(1); \ + } while (0); + +#define MD_SYS_ERROR(msg) \ + do { \ + perror(msg); \ + exit(1); \ + } while (0); + +/* Support for debug prints. */ + +/*#define DNONE 0*/ /* Message with this debug level will be always printed */ +/*#define DINFO 1*/ /* Info messages level is applicable for rare and informative messages */ +/*#define DDEBUG 2 *//* Debug level, should be turned off in normal run */ +/*#define DPATH 3 *//* The highest level, for use in loops or critical sections */ + +/*#define DGLOBAL_LEVEL 3*/ + +/* DGLOBAL_LEVEL should be defined in compilation stage, otherwise it should be zero */ +/*#define MD_PRINT(dlevel,fmt, args...) 
{if (dlevel <= DGLOBAL_LEVEL){\ + fprintf(stderr, "[%s:%d, pid=%d]", __FILE__, __LINE__,getpid());\ + fprintf(stderr, fmt, ## args); fflush(stderr);}}*/ + + + + +/** +Minidaemon ADT provided by Mellanox, MPI Team + +Minidaemon interface provides the possibilty to create and run +daemon process on clusters node. Each instance of it will be created +at application run time and will destroyed right after application termination. +**/ +/*typedef enum { + P_NOTSTARTED, + P_STARTED, + P_CONNECTED, + P_DISCONNECTED, + P_RUNNING, + P_FINISHED, + P_EXITED +} process_state; + +struct process_t { + char *hostname; + char *device; + pid_t pid; + pid_t remote_pid; + int port; + int control_socket; + process_state state; +} ;*/ + +/* Minidaemon Data Structures */ + +typedef struct childrenList_t * ChildrenList; +typedef struct md_entry_t * MD_entry; +typedef struct minidaemon_t * Minidaemon; +/*typedef struct process_t process;*/ + +void minidaemon_create(process * procList, int nproc, int width,const char * command_params,const char * command, int mpirun_port,int ppid); + +/* Init all data member structures */ +void minidaemon_init (const char * par_name , int ch_num, int width, const char * command_params,const char * command, + const char * root_hostname, int root_ch_num,int mpirun_port, int ppid, int array_it); + +/* Start listening to messages from other Minidaemons */ +void minidaemon_run (); + +#endif /* _MINIDAEMON_H */ Added: mvapich/trunk/mpid/ch_gen2/process/minidaemon_client.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/minidaemon_client.c 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/minidaemon_client.c 2007-11-05 16:54:39 UTC (rev 1627) @@ -0,0 +1,31 @@ +#include +#include "minidaemon.h" +#define DEFAULT_PARAM_NUMBER 10 + +#define NAME_INDEX 1 +#define NUM_INDEX 2 +#define WIDTH_INDEX 3 +#define COM_PAR_INDEX 4 +#define COM_INDEX 5 +#define ROOT_NAME 6 +#define ROOT_CH_NUM 7 
+#define MPIRUN_PORT_INDEX 8 +#define PID_INDEX 9 +#define ARRAY_IT_INDEX 10 + +int main (int argc, char * argv[]) { + int width; + if (argc < DEFAULT_PARAM_NUMBER) { + MD_USR_ERROR("Minidaemon client : too few parameters to run minidaemon"); + } + if ( (width = atoi(argv[WIDTH_INDEX])) < 1) { + MD_USR_ERROR("Minidaemon client : invalid tree width received"); + } + /** Minidaemon functions will exit on error **/ + + /*void minidaemon_init (const char * par_name , int ch_num, int width, const char * command_params,const char * command, int root_ch_num);*/ + minidaemon_init(argv[NAME_INDEX],atoi(argv[NUM_INDEX]),atoi(argv[WIDTH_INDEX]), argv[COM_PAR_INDEX],argv[COM_INDEX], + argv[ROOT_NAME],atoi(argv[ROOT_CH_NUM]),atoi(argv[MPIRUN_PORT_INDEX]), atoi(argv[PID_INDEX]), atoi(argv[ARRAY_IT_INDEX])); + minidaemon_run(); + return 0; +} Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-11-05 16:54:39 UTC (rev 1627) @@ -61,6 +61,8 @@ process_groups * pglist = NULL; process * plist = NULL; int nprocs = 0; +int use_xlauncher = 0; +int xlauncher_width = 8; int aout_index, port; #define MAX_WD_LEN 256 char wd[MAX_WD_LEN]; /* working directory of current process */ @@ -95,7 +97,7 @@ void wait_for_errors(int s,struct sockaddr_in *sockaddr,unsigned int sockaddr_len); int set_fds(fd_set * rfds, fd_set * efds); static int read_hostfile(char *hostfile_name); -char * mkstr(const char *, ...); +void make_command_strings(int argc, char * argv[], char * totalview_cmd, char * command_name, char * command_name_tv); #ifdef USE_SSH @@ -120,6 +122,8 @@ {"v", no_argument, 0, 0}, {"tv", no_argument, 0, 0}, {"legacy", no_argument, 0, 0}, + {"use_xlauncher", no_argument, 0, 0}, + {"xlauncher_width", required_argument, 0, 0}, {0, 0, 0, 0} }; @@ -132,6 +136,7 @@ int debug_on = 0, 
xterm_on = 0, show_on = 0; int param_debug = 0; int use_totalview = 0; +int server_socket; char * mpirun_processes; char display[200]; char * binary_dirname; @@ -268,9 +273,19 @@ legacy_startup = 1; break; case 11: - legacy_startup = 1; + legacy_startup = 1; break; - case 12: + case 12: + use_xlauncher = 1; + break; + case 13: + xlauncher_width = atoi(optarg); + if (xlauncher_width < 1) { + usage(); + exit(EXIT_FAILURE); + } + break; + case 14: usage(); exit(EXIT_SUCCESS); break; @@ -373,7 +388,7 @@ get_display_str(); - s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + server_socket = s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (s < 0) { perror("socket"); exit(EXIT_FAILURE); @@ -393,7 +408,6 @@ port = (int) ntohs(sockaddr.sin_port); listen(s, nprocs); - if (!show_on) { struct sigaction signal_handler; signal_handler.sa_handler = cleanup_handler; @@ -412,10 +426,12 @@ sigaction(SIGALRM, &signal_handler, NULL); - signal_handler.sa_handler = child_handler; - sigemptyset(&signal_handler.sa_mask); + if (!use_xlauncher) { + signal_handler.sa_handler = child_handler; + sigemptyset(&signal_handler.sa_mask); - sigaction(SIGCHLD, &signal_handler, NULL); + sigaction(SIGCHLD, &signal_handler, NULL); + } } for (i = 0; i < nprocs; i++) { @@ -430,13 +446,40 @@ alarm(1000); alarm_msg = "Timeout during client startup.\n"; /* long timeout for testing, where process may be stopped in debugger */ + + if (use_xlauncher) { + int md_id = fork(); + if (md_id == 0) { + char command_name[COMMAND_LEN]; + char command_name_tv[COMMAND_LEN]; + char *ld_library_path; + char *mpi_prefix; + make_command_strings(argc, argv, totalview_cmd, command_name, command_name_tv); + fprintf(stderr,"Creating the instance of minidaemon, MPI command should be %s\n",command_name); + ld_library_path = getenv("LD_LIBRARY_PATH"); + if (ld_library_path != NULL) { + fprintf(stderr," Setting LD_LIBRARY_PATH = %s\n",ld_library_path); + setenv("LD_LIBRARY_PATH",ld_library_path,1); + } + mpi_prefix = 
getenv("MPI_PREFIX"); + if (mpi_prefix != NULL) { + fprintf(stderr," Setting MPI_PREFIX = %s\n",mpi_prefix); + setenv("MPI_PREFIX",mpi_prefix,1); + } - if(pglist && !legacy_startup) { - spawn_fast(argc, argv, totalview_cmd, env); + minidaemon_create(plist,nprocs,xlauncher_width,"VIADEV_PARAM=DEMO_PARAM",command_name, port, getpid()); + minidaemon_run(); + } } + + else { + if(pglist && !legacy_startup) { + spawn_fast(argc, argv, totalview_cmd, env); + } - else { - spawn_linear(argc, argv, totalview_cmd, env); + else { + spawn_linear(argc, argv, totalview_cmd, env); + } } if (show_on) @@ -449,6 +492,7 @@ int version, rank, nread; char pidstr[12]; static const int hca_type_len = sizeof(int); + int retries, retry_max = 10; ACCEPT_HID: sockaddr_len = sizeof(sockaddr); s1 = accept(s, (struct sockaddr *) &sockaddr, &sockaddr_len); @@ -474,13 +518,17 @@ /* 0. Find out what version of the startup protocol the executable * was compiled to use. */ - nread = read(s1, &version, sizeof(version)); - - if (nread != sizeof(version)) { - perror("read"); - cleanup(); + for(retries = 0, tot_nread = 0; tot_nread < sizeof(version) && retries < + retry_max; retries++) { + nread = read(s1, &version, sizeof(version) - tot_nread); + tot_nread += nread; } + if(tot_nread != sizeof(version)) { + perror("read"); + cleanup(); + } + if (version != PMGR_VERSION) { fprintf(stderr, "mpirun: executable version %d does not match" " our version %d.\n", version, PMGR_VERSION); @@ -488,26 +536,31 @@ } /* 1. 
Find out who we're talking to */ - nread = read(s1, &rank, sizeof(rank)); + for(retries = 0, tot_nread = 0; tot_nread < sizeof(rank) && retries < + retry_max; retries++) { + nread = read(s1, &rank, sizeof(rank) - tot_nread); + tot_nread += nread; + } - if (nread != sizeof(rank)) { + if (tot_nread != sizeof(rank)) { perror("read"); cleanup(); } - if (rank < 0 || rank >= nprocs || plist[rank].state != P_STARTED) { + if (rank < 0 || rank >= nprocs || ( !(use_xlauncher) && plist[rank].state != P_STARTED)) { fprintf(stderr, "mpirun: invalid rank received. \n"); cleanup(); } plist[rank].control_socket = s1; /* 2. Find out length of the data */ - nread = read(s1, &hostidlen, sizeof(hostidlen)); - if (nread != sizeof(hostidlen)) { + for(retries = 0, tot_nread = 0; tot_nread < sizeof(hostidlen) && retries < + retry_max; retries++) { + nread = read(s1, &hostidlen, sizeof(hostidlen) - tot_nread); + tot_nread += nread; + } - /* nread == 0 is not actually an error! */ - if (nread == 0) - continue; + if (tot_nread != sizeof(hostidlen)) { perror("read"); cleanup(); } @@ -595,9 +648,8 @@ /* lets enable the timer again*/ /* Lets read all other information, LID QP,etc..*/ - - /* accept incoming connections, read port numbers */ for (i = 0; i < nprocs; i++) { + int retries, retry_max = 10; int nread; /* @@ -615,8 +667,14 @@ */ /* 1. Find out length of the data */ - nread = read(plist[i].control_socket, &addrlen, sizeof(addrlen)); - if (nread != sizeof(addrlen)) { + for(retries = 0, tot_nread = 0; tot_nread < sizeof(addrlen) && retries < + retry_max; retries++) { + nread = read(plist[i].control_socket, &addrlen, sizeof(addrlen) - + tot_nread); + tot_nread += nread; + } + + if (tot_nread != sizeof(addrlen)) { perror("read"); cleanup(); } @@ -663,8 +721,14 @@ read_pid: /* 3. 
Find out length of the data */ - nread = read(plist[i].control_socket, &pidlen, sizeof(pidlen)); - if (nread != sizeof(pidlen)) { + for(retries = 0, tot_nread = 0; tot_nread < sizeof(pidlen) && retries < + retry_max; retries++) { + nread = read(plist[i].control_socket, &pidlen, sizeof(pidlen) - + tot_nread); + tot_nread += nread; + } + + if(tot_nread != sizeof(pidlen)) { perror("read"); cleanup(); } @@ -791,12 +855,18 @@ */ /* close all opend sockets */ - for (i = 0; i < nprocs; i++) - close(plist[i].control_socket); + for (i = 0; i < nprocs; i++) { + close(plist[i].control_socket); + } + /* clients have all information now. Just sit and wait for them to die, which we will detect via sockets or signal from ssh/rsh signal. */ + if (use_xlauncher) { + exit(EXIT_SUCCESS); + } + wait_for_errors(s,&sockaddr,sockaddr_len); /* this while is unused now. We are in block wait in wait_for_errors */ @@ -1108,7 +1178,8 @@ { fprintf(stderr, "usage: mpirun_rsh [-v] [-rsh|-ssh] " "[-paramfile=pfile] " - "[-debug] -[tv] [-xterm] [-show] [-legacy] -np N " + "[-debug] -[tv] [-xterm] [-show] [-legacy] [-use_xlauncher]" + "[xlauncher-width W] -np N " "(-hostfile hfile | h1 h2 ... hN) a.out args\n"); fprintf(stderr, "Where:\n"); fprintf(stderr, "\tv => Show version and exit\n"); @@ -1773,30 +1844,6 @@ } } -char * mkstr(const char * format, ...) 
{ - va_list ap; - int size; - char * str; - - va_start(ap, format); - size = vsnprintf(NULL, 0, format, ap); - va_end(ap); - - if(size++ < 0) return NULL; - - str = malloc(sizeof(char) * size); - - if(str) { - va_start(ap, format); - size = vsnprintf(str, size, format, ap); - va_end(ap); - - if(size < 0) return NULL; - } - - return str; -} - void spawn_fast(int argc, char *argv[], char *totalview_cmd, char *env) { char * mpispawn_env, * tmp, * ld_library_path; char * name, * value; @@ -2126,6 +2173,31 @@ char command_name_tv[COMMAND_LEN]; int i; + make_command_strings(argc, argv, totalview_cmd, command_name, command_name_tv); + + /* start all processes */ + for (i = 0; i < nprocs; i++) { + if((use_totalview) && (i == 0)) { + if (start_process(i, command_name_tv, env) < 0) { + fprintf(stderr, + "Unable to start process %d on %s. Aborting.\n", + i, plist[i].hostname); + cleanup(); + } + } else { + if (start_process(i, command_name, env) < 0) { + fprintf(stderr, + "Unable to start process %d on %s. Aborting.\n", + i, plist[i].hostname); + cleanup(); + } + } + } +} + +void make_command_strings(int argc, char *argv[], char *totalview_cmd, char * command_name, char * command_name_tv) +{ + int i; if (debug_on) { char keyval_list[COMMAND_LEN]; sprintf(keyval_list, "%s", " "); @@ -2169,25 +2241,6 @@ } strcat(command_name_tv, " -mpichtv"); } - - /* start all processes */ - for (i = 0; i < nprocs; i++) { - if((use_totalview) && (i == 0)) { - if (start_process(i, command_name_tv, env) < 0) { - fprintf(stderr, - "Unable to start process %d on %s. Aborting.\n", - i, plist[i].hostname); - cleanup(); - } - } else { - if (start_process(i, command_name, env) < 0) { - fprintf(stderr, - "Unable to start process %d on %s. 
Aborting.\n", - i, plist[i].hostname); - cleanup(); - } - } - } } void nostop_handler(int signal) @@ -2220,7 +2273,10 @@ if(pid == 0) break; if(pid != -1 && WIFEXITED(status) && WEXITSTATUS(status) == 0) { - if(++num_exited == num_children) exit(WEXITSTATUS(status)); + if(++num_exited == num_children) { + close(server_socket); + exit(WEXITSTATUS(status)); + } } else { Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h 2007-11-05 16:54:39 UTC (rev 1627) @@ -77,6 +77,7 @@ #include #include #include "pmgr_client.h" +#include "mpirun_util.h" /* Support for debug prints. */ @@ -147,6 +148,7 @@ #endif + #endif /* vi:set sw=4 sts=4 tw=80: */ Added: mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_util.c 2007-11-05 16:54:39 UTC (rev 1627) @@ -0,0 +1,65 @@ +/* Copyright (c) 2002-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH in the top level MPICH directory. + * + */ + +#include "mpirun_util.h" + +char * mkstr(const char * format, ...) 
{ + va_list ap; + int size; + char * str; + + va_start(ap, format); + size = vsnprintf(NULL, 0, format, ap); + va_end(ap); + + if(size++ < 0) return NULL; + + str = malloc(sizeof(char) * size); + + if(str) { + va_start(ap, format); + size = vsnprintf(str, size, format, ap); + va_end(ap); + + if(size < 0) return NULL; + } + + return str; +} + +/* + * ptr must be suitable for a call to realloc + */ +char * chstr(char * ptr, const char * format, ...) { + va_list ap; + int size; + char * str; + + va_start(ap, format); + size = vsnprintf(NULL, 0, format, ap); + va_end(ap); + + if(size++ < 0) return NULL; + + str = realloc(ptr, sizeof(char) * size); + + if(str) { + va_start(ap, format); + size = vsnprintf(str, size, format, ap); + va_end(ap); + + if(size < 0) return NULL; + } + + return str; +} +/* vi:set sw=4 sts=4 tw=80 */ Added: mvapich/trunk/mpid/ch_gen2/process/mpirun_util.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_util.h 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_util.h 2007-11-05 16:54:39 UTC (rev 1627) @@ -0,0 +1,21 @@ +#ifndef MPIRUN_UTIL_H +#define MPIRUN_UTIL_H +/* Copyright (c) 2002-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH in the top level MPICH directory. 
+ * + */ + +#include +#include + +char * mkstr(const char *, ...); +char * chstr(char *, const char *, ...); + +#endif Modified: mvapich/trunk/mpid/ch_gen2/process/mpispawn.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpispawn.c 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/mpispawn.c 2007-11-05 16:54:39 UTC (rev 1627) @@ -29,30 +29,6 @@ return (env_ptr = getenv(env_ptr)) ? strdup(env_ptr) : NULL; } -char * mkstr(const char * format, ...) { - va_list ap; - int size; - char * str; - - va_start(ap, format); - size = vsnprintf(NULL, 0, format, ap); - va_end(ap); - - if(size++ < 0) return NULL; - - str = malloc(sizeof(char) * size); - - if(str) { - va_start(ap, format); - size = vsnprintf(str, size, format, ap); - va_end(ap); - - if(size < 0) return NULL; - } - - return str; -} - lvalues get_lvalues(int i) { lvalues v; char * buffer = NULL; Modified: mvapich/trunk/mpid/ch_gen2/process/mpispawn.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpispawn.h 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/mpispawn.h 2007-11-05 16:54:39 UTC (rev 1627) @@ -1,5 +1,5 @@ -#ifndef MPISPAWN -#define MPISPAWN +#ifndef MPISPAWN_H +#define MPISPAWN_H /* Copyright (c) 2002-2007, The Ohio State University. All rights * reserved. 
* Modified: mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_client_mpirun_rsh.c 2007-11-05 16:54:39 UTC (rev 1627) @@ -171,8 +171,9 @@ void pmgr_init_connection(int phase) { - int nwritten; + int nwritten; int version; + int connect_attempt = 0, max_connect_attempts = 5; struct sockaddr_in sockaddr; if(phase != 0) return; @@ -198,10 +199,18 @@ sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list); sockaddr.sin_port = htons(mpirun_port); - if (connect(mpirun_socket, (struct sockaddr *) &sockaddr, - sizeof(sockaddr)) < 0) { - perror("connect"); - exit(1); + srand(pmgr_me); + + while(connect(mpirun_socket, (struct sockaddr *) &sockaddr, + sizeof(sockaddr)) < 0) { + if(connect_attempt++ < max_connect_attempts) { + sleep(1 + (int) (10.0 * (rand() / (RAND_MAX + 1.0)))); + } + + else { + perror("connect"); + exit(1); + } } /* we are now connected to the mpirun program */ @@ -239,6 +248,7 @@ int tot_nread = 0; int toread_len = 0; int nread, nwritten; + int retries, retry_max = 10; pmgr_init_connection(0); /* next, send size of addr */ @@ -271,17 +281,24 @@ toread_len = pmgr_nprocs * sizeof(int); /* finally, read addresses from all processes */ - while (tot_nread < toread_len) { + for(retries = 0; tot_nread < toread_len && retries < retry_max; retries++) { nread = read(mpirun_socket, ((void *)allhostids) + tot_nread, toread_len - tot_nread); - if (nread <= 0) { + if (nread < 0) { perror("read"); sleep(2); exit(1); } tot_nread = tot_nread + nread; } + + if(tot_nread != toread_len) { + fprintf(stderr, "Unable to read from mpirun_socket after %d tries!\n", + retry_max); + exit(EXIT_FAILURE); + } + fflush(stdout); return 1; } @@ -292,6 +309,7 @@ int i, tot_nread = 0; int toread_len = 0; int nread, nwritten; + int retries, retry_max = 10; 
pid_t my_pid_int = getpid(); int pidlen, mypid_len; @@ -350,30 +368,43 @@ toread_len = 3 * pmgr_nprocs * sizeof(int); /* finally, read addresses from all processes */ - while (tot_nread < toread_len) { + for(retries = 0; tot_nread < toread_len && retries < retry_max; retries++) { nread = read(mpirun_socket, (void *) ((char *) alladdrs + tot_nread), (size_t) (toread_len - tot_nread)); - if (nread <= 0) { + if (nread < 0) { perror("read"); exit(1); } tot_nread = tot_nread + nread; } + if(tot_nread != toread_len) { + fprintf(stderr, "Unable to read from mpirun_socket after %d tries!\n", + retry_max); + exit(EXIT_FAILURE); + } + if (pidlen != 0) { - tot_nread=0; - /* finally, read pids from all processes */ - while (tot_nread < pmgr_nprocs*pidlen) { + tot_nread=0; + /* finally, read pids from all processes */ + for(retries = 0; tot_nread < pmgr_nprocs*pidlen && retries < retry_max; + retries++) { nread = read(mpirun_socket, (void*)((char *)allpids+tot_nread), - (size_t) ((pmgr_nprocs*pidlen)-tot_nread)); - if (nread <= 0) { - perror("read"); - sleep(2); - exit(1); + (size_t) ((pmgr_nprocs*pidlen)-tot_nread)); + if (nread < 0) { + perror("read"); + sleep(2); + exit(1); } tot_nread = tot_nread + nread; - } + } + + if(tot_nread != pmgr_nprocs*pidlen) { + fprintf(stderr, "Unable to read from mpirun_socket after %d " + "tries!\n", retry_max); + exit(EXIT_FAILURE); + } } fflush(stdout); Modified: mvapich/trunk/util/mpiinstall.in =================================================================== --- mvapich/trunk/util/mpiinstall.in 2007-11-05 16:07:12 UTC (rev 1626) +++ mvapich/trunk/util/mpiinstall.in 2007-11-05 16:54:39 UTC (rev 1627) @@ -531,6 +531,9 @@ if [ -x bin/mpispawn ] ; then FixupFile2 bin/mpispawn ${bindir}/mpispawn $XMODE fi + if [ -x bin/minidaemon_client ] ; then + FixupFile2 bin/minidaemon_client ${bindir}/minidaemon_client $XMODE + fi # MVAPICH: We need to copy `ibmcgrp' to bin for Multicast support From perkinjo at mvapich.cse.ohio-state.edu Mon Nov 19 
15:53:59 2007 From: perkinjo at mvapich.cse.ohio-state.edu (perkinjo@mvapich.cse.ohio-state.edu) Date: Mon Nov 19 16:52:35 2007 Subject: [mvapich-commit] r1628 - in mvapich/trunk/mpid/ch_gen2: . process Message-ID: <200711192053.lAJKrxxO024747@mvapich.cse.ohio-state.edu> Author: perkinjo Date: 2007-11-19 15:53:57 -0500 (Mon, 19 Nov 2007) New Revision: 1628 Added: mvapich/trunk/mpid/ch_gen2/process/makefile mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.h mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.c mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.h mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_mpirun.c mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_mpirun.h Modified: mvapich/trunk/mpid/ch_gen2/Makefile.in mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h mvapich/trunk/mpid/ch_gen2/viainit.c mvapich/trunk/mpid/ch_gen2/viutil.h Log: Checking in integration of a collection based startup developed by Adam Moody @ LLNL. Other minor changes affecting mpirun_rsh and how its built. Modified: mvapich/trunk/mpid/ch_gen2/Makefile.in =================================================================== --- mvapich/trunk/mpid/ch_gen2/Makefile.in 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/Makefile.in 2007-11-19 20:53:57 UTC (rev 1628) @@ -123,49 +123,26 @@ default_all: default -default: setlinks lib $(MPIRUN) $(IBMGRP) $(PTMALLOC) minidaemon_client +default: setlinks lib $(MPIRUN) $(IBMGRP) $(PTMALLOC) lib: $(VIAOBJECTS) $(MPICHOBJECTS) $(AR) $(LIBNAME) $? 
-minidaemon_client: process/minidaemon.o - cd process &&\ - $(CC) $(MPIRUN_CFLAGS) minidaemon.c -c -o minidaemon.o &&\ - $(CC) $(MPIRUN_CFLAGS) -o minidaemon_client minidaemon.o mpirun_util.o minidaemon_client.c &&\ - install -m 755 minidaemon_client ${top_srcdir}/bin +.EXPORT_ALL_VARIABLES: -mpirun_rsh: process/pmgr_client_mpirun_rsh.o - cd process &&\ - $(CC) getopt.c -c -o getopt.o &&\ - $(CC) getopt1.c -c -o getopt1.o &&\ - $(CC) $(MPIRUN_CFLAGS) -c mpirun_util.c &&\ - $(CC) $(MPIRUN_CFLAGS) -c minidaemon.c &&\ - $(CC) $(MPIRUN_CFLAGS) -c pmgr_client_mpirun_rsh.c &&\ - $(CC) $(MPIRUN_CFLAGS) -o minidaemon_client minidaemon.o mpirun_util.o minidaemon_client.c &&\ - $(CC) $(MPIRUN_CFLAGS) -o mpispawn mpispawn.c mpirun_util.o &&\ - $(CC) $(MPIRUN_CFLAGS) -o mpirun_rsh $(MPIRUN_MAC) minidaemon.o mpirun_util.o mpirun_rsh.c &&\ - install -m 755 minidaemon_client ${top_srcdir}/bin &&\ - install -m 755 mpispawn ${top_srcdir}/bin &&\ - install -m 755 mpirun_rsh ${top_srcdir}/bin &&\ - $(AR) $(LIBNAME) pmgr_client_mpirun_rsh.o minidaemon.o - -(cd $(top_srcdir)/mpid/ch_gen2/process &&\ - ar -d $(LIBNAME) pmgr_client_mpd.o minidaemon.o) +mpirun_rsh: + cd process && $(MAKE) &&\ + install -m 755 mpirun_rsh mpispawn minidaemon_client ${top_srcdir}/bin mpirun_mpd: process/pmgr_client_mpd.o - cd process &&\ - $(CC) getopt.c -c -o getopt.o &&\ - $(CC) getopt1.c -c -o getopt1.o &&\ - $(CC) $(MPIRUN_CFLAGS) -o mpispawn mpispawn.c &&\ - install -m 755 mpispawn ${top_srcdir}/bin &&\ - $(CC) $(MPIRUN_CFLAGS) -o mpirun_rsh $(MPIRUN_MAC) mpirun_rsh.c &&\ - install -m 755 mpirun_rsh ${top_srcdir}/bin &&\ - $(CC) $(MPIRUN_CFLAGS) -c pmgr_client_mpd.c &&\ - $(AR) $(LIBNAME) pmgr_client_mpd.o - -(cd $(top_srcdir)/mpid/ch_gen2/process &&\ - ar -d $(LIBNAME) pmgr_client_mpirun_rsh.o) - cd $(top_srcdir)/mpid/mpd &&\ - make && ln -sf Makefile ch_gen2.mpd &&\ - $(AR) $(LIBNAME) mpdlib.o util.o bnr.o + cd process && $(MAKE) &&\ + install -m 755 mpirun_rsh mpispawn minidaemon_client 
${top_srcdir}/bin \ + $(CC) $(MPIRUN_CFLAGS) -c pmgr_client_mpd.c &&\ + $(AR) $(LIBNAME) pmgr_client_mpd.o + -(cd $(top_srcdir)/mpid/ch_gen2/process && ar -d $(LIBNAME) pmgr_client_mpirun_rsh.o) + cd $(top_srcdir)/mpid/mpd && make &&\ + ln -sf Makefile ch_gen2.mpd &&\ + $(AR) $(LIBNAME) mpdlib.o util.o bnr.o ibmcgrp: ibmcgrp/ibmcgrp.o ibmcgrp/main.o $(CC) $(CFLAGS) -c -o ibmcgrp/main.o ibmcgrp/main.c @@ -192,10 +169,13 @@ VIAFILES = viainit.c viasend.c viarecv.c viapriv.c viaparam.c viutil.c vbuf.c \ mpid_init.c mpid_send.c mpid_recv.c viacheck.c mpid_pack.c \ process/pmgr_client_fork.c process/minidaemon.c process/pmgr_client_mpirun_rsh.c \ - process/mpirun_rsh.c mpid_hsend.c mpid_hrecv.c \ - req.h vbuf.h viapacket.h viapriv.h viutil.h \ + process/common_pmgr_collective.c process/client_pmgr_collective.c \ + process/mpirun_rsh.c process/mpispawn.c mpid_hsend.c mpid_hrecv.c \ + req.h vbuf.h viapacket.h viapriv.h viutil.h process/mpispawn.h \ process/pmgr_client.h process/minidaemon.h mpid_misc.c viaparam.h viaconfig.h viadev.h \ - cmnargs.c dreg.h mpid.h mpid_smpi.h mpid_smpi.c + process/common_pmgr_collective.h process/client_pmgr_collective.h \ + cmnargs.c dreg.h mpid.h mpid_smpi.h mpid_smpi.c \ + process/mpirun_util.c # # Files from the original ADI that we need, but didn't change, Added: mvapich/trunk/mpid/ch_gen2/process/makefile =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/makefile 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/process/makefile 2007-11-19 20:53:57 UTC (rev 1628) @@ -0,0 +1,21 @@ +CFLAGS = $(MPIRUN_CFLAGS) + +ifeq ($(strip $(MPIRUN)), mpirun_rsh) + PMGR_TARGETS = mpirun_rsh mpispawn minidaemon_client update_lib + libmembers = pmgr_collective_client.o pmgr_collective_common.o minidaemon.o + #libmembers = minidaemon.o pmgr_client_mpirun_rsh.o +else + PMGR_TARGETS = update_lib + libmembers = pmgr_client_mpd.o +endif + +all: $(PMGR_TARGETS) + + +minidaemon_client: 
minidaemon.o mpirun_util.o +mpispawn: mpirun_util.o +mpirun_rsh: pmgr_collective_common.o pmgr_collective_mpirun.o \ + minidaemon.o mpirun_util.o $(MPIRUN_MAC) + +update_lib: $(libmembers) + $(AR) $(LIBNAME) $(libmembers) Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.c 2007-11-19 20:53:57 UTC (rev 1628) @@ -186,6 +186,7 @@ int *hostids = NULL; int *hca_type_arr = NULL; int is_homogeneous = 1; + int version; size_t hostname_len = 0; @@ -489,7 +490,7 @@ /* accept incoming connections, read port numbers */ for (i = 0; i < nprocs; i++) { - int version, rank, nread; + int rank, nread; char pidstr[12]; static const int hca_type_len = sizeof(int); int retries, retry_max = 10; @@ -529,7 +530,7 @@ cleanup(); } - if (version != PMGR_VERSION) { + if(version != PMGR_VERSION && version != PMGR_COLLECTIVE) { fprintf(stderr, "mpirun: executable version %d does not match" " our version %d.\n", version, PMGR_VERSION); cleanup(); @@ -547,12 +548,17 @@ cleanup(); } - if (rank < 0 || rank >= nprocs || ( !(use_xlauncher) && plist[rank].state != P_STARTED)) { + if (rank < 0 || rank >= nprocs || ( !(use_xlauncher) && plist[rank].state != P_STARTED)) { fprintf(stderr, "mpirun: invalid rank received. \n"); cleanup(); } plist[rank].control_socket = s1; + /* if using PMGR_COLLECTIVE, we've read all the data we need to for this task for now */ + if(version == PMGR_COLLECTIVE) { + continue; + } + /* 2. 
Find out length of the data */ for(retries = 0, tot_nread = 0; tot_nread < sizeof(hostidlen) && retries < retry_max; retries++) { @@ -618,253 +624,277 @@ first_rank = rank; if (hca_type_arr[first_rank] != hca_type_arr[rank]) is_homogeneous = 0; - } /* at this point, all processes have checked in hostids */ /* cancel the timeout */ alarm(0); - /* Lets write back all hostids */ + if(version == PMGR_COLLECTIVE) { + /* build up an array of file descriptors for pmgr_processops */ + int* fds = (int*) malloc(nprocs*sizeof(int)); - for (i = 0; i < nprocs; i++) { - int nwritten; - nwritten = write(plist[i].control_socket, &is_homogeneous, - sizeof(is_homogeneous)); - if (nwritten != sizeof(is_homogeneous)) { - perror("write"); - cleanup(); - } - nwritten = write(plist[i].control_socket, hostids, - nprocs * hostidlen); - if (nwritten != nprocs * hostidlen ) { - perror("write"); - cleanup(); - } + if(fds == NULL) { + perror("allocating temporary array for socket file descriptors"); + cleanup(); + } + + /* fill it in */ + for(i = 0; i < nprocs; i++) { + fds[i] = plist[i].control_socket; + } + + pmgr_processops(fds, nprocs); + + /* free it off (processops closes each socket before returning control) */ + free(fds); + + for (i = 0; i < nprocs; i++) { + plist[i].state = P_RUNNING; + } } - alarm(1000); - alarm_msg = "Timeout during address exchange.\n"; - /* lets enable the timer again*/ + else { + /* Lets write back all hostids */ + for (i = 0; i < nprocs; i++) { + int nwritten; + nwritten = write(plist[i].control_socket, &is_homogeneous, + sizeof(is_homogeneous)); + if (nwritten != sizeof(is_homogeneous)) { + perror("write"); + cleanup(); + } + nwritten = write(plist[i].control_socket, hostids, + nprocs * hostidlen); + if (nwritten != nprocs * hostidlen ) { + perror("write"); + cleanup(); + } + } - /* Lets read all other information, LID QP,etc..*/ - for (i = 0; i < nprocs; i++) { - int retries, retry_max = 10; - int nread; + alarm(1000); + alarm_msg = "Timeout during address 
exchange.\n"; + /* lets enable the timer again*/ - /* - * protocol: - * We don't need the version number or the rank, - * 0. read address length - * 1. read address itself - * 2. send array of all addresses - */ + /* Lets read all other information, LID QP,etc..*/ + for (i = 0; i < nprocs; i++) { + int retries, retry_max = 10; + int nread; - plist[i].state = P_CONNECTED; + /* + * protocol: + * We don't need the version number or the rank, + * 0. read address length + * 1. read address itself + * 2. send array of all addresses + */ - /* Let us know connection was established - * printf("MPIRUN_RSH: Process rank %d connected\n",rank); - */ + plist[i].state = P_CONNECTED; - /* 1. Find out length of the data */ - for(retries = 0, tot_nread = 0; tot_nread < sizeof(addrlen) && retries < - retry_max; retries++) { - nread = read(plist[i].control_socket, &addrlen, sizeof(addrlen) - - tot_nread); - tot_nread += nread; - } + /* Let us know connection was established + * printf("MPIRUN_RSH: Process rank %d connected\n",rank); + */ - if (tot_nread != sizeof(addrlen)) { - perror("read"); - cleanup(); - } + /* 1. 
Find out length of the data */ + for(retries = 0, tot_nread = 0; tot_nread < sizeof(addrlen) && retries < + retry_max; retries++) { + nread = read(plist[i].control_socket, &addrlen, sizeof(addrlen) - + tot_nread); + tot_nread += nread; + } - if (i == 0) { - global_addrlen = addrlen; - } else if (addrlen != global_addrlen) { - fprintf(stderr, "Address lengths %d and %d do not match\n", - addrlen, global_addrlen); - cleanup(); - } + if (tot_nread != sizeof(addrlen)) { + perror("read"); + cleanup(); + } - if (addrlen == 0) { - goto read_pid; - } + if (i == 0) { + global_addrlen = addrlen; + } else if (addrlen != global_addrlen) { + fprintf(stderr, "Address lengths %d and %d do not match\n", + addrlen, global_addrlen); + cleanup(); + } - if (i == 0) { - /* allocate as soon as we know the address length */ - alladdrs = (int *) malloc(addrlen * nprocs); - if (alladdrs == NULL) { - perror("malloc"); - exit(EXIT_FAILURE); - } - } + if (addrlen == 0) { + goto read_pid; + } - /* 2. Read info from each process */ + if (i == 0) { + /* allocate as soon as we know the address length */ + alladdrs = (int *) malloc(addrlen * nprocs); + if (alladdrs == NULL) { + perror("malloc"); + exit(EXIT_FAILURE); + } + } - /* for byte location */ - alladdrs_char = (char *) &alladdrs[i * addrlen / sizeof(int)]; + /* 2. Read info from each process */ - tot_nread = 0; + /* for byte location */ + alladdrs_char = (char *) &alladdrs[i * addrlen / sizeof(int)]; - while (tot_nread < addrlen) { - nread = read(plist[i].control_socket, - (void *) (alladdrs_char + tot_nread), addrlen - tot_nread); + tot_nread = 0; - if (nread < 0) { - perror("read"); - cleanup(); - } + while (tot_nread < addrlen) { + nread = read(plist[i].control_socket, + (void *) (alladdrs_char + tot_nread), addrlen - tot_nread); - tot_nread += nread; - } + if (nread < 0) { + perror("read"); + cleanup(); + } + tot_nread += nread; + } + read_pid: - /* 3. 
Find out length of the data */ - for(retries = 0, tot_nread = 0; tot_nread < sizeof(pidlen) && retries < - retry_max; retries++) { - nread = read(plist[i].control_socket, &pidlen, sizeof(pidlen) - - tot_nread); - tot_nread += nread; - } + /* 3. Find out length of the data */ + for(retries = 0, tot_nread = 0; tot_nread < sizeof(pidlen) && retries < + retry_max; retries++) { + nread = read(plist[i].control_socket, &pidlen, sizeof(pidlen) - + tot_nread); + tot_nread += nread; + } - if(tot_nread != sizeof(pidlen)) { - perror("read"); - cleanup(); - } + if(tot_nread != sizeof(pidlen)) { + perror("read"); + cleanup(); + } - /*fprintf(stderr, "read Pid lengths %d and %d \n", pidlen, nread);*/ - if (pidlen != pidglen) { - fprintf(stderr, "Pid lengths %d and %d do not match\n", - pidlen, pidglen); - cleanup(); - } + /*fprintf(stderr, "read Pid lengths %d and %d \n", pidlen, nread);*/ + if (pidlen != pidglen) { + fprintf(stderr, "Pid lengths %d and %d do not match\n", + pidlen, pidglen); + cleanup(); + } - if (i == 0) { - /* allocate as soon as we know the pid length */ - allpids = (char *)malloc(pidlen * nprocs); - if (allpids == NULL) { - perror("malloc"); - exit(EXIT_FAILURE); - } - } + if (i == 0) { + /* allocate as soon as we know the pid length */ + allpids = (char *)malloc(pidlen * nprocs); + if (allpids == NULL) { + perror("malloc"); + exit(EXIT_FAILURE); + } + } - tot_nread=0; - while(tot_nread < pidlen) { - nread = read(plist[i].control_socket, - (void*)(allpids+i*pidlen+tot_nread), pidlen - tot_nread); - /*fprintf(stderr, "read length %d \n", nread);*/ - if(nread < 0) { - perror("read"); - cleanup(); - } - tot_nread += nread; - } + tot_nread=0; + while(tot_nread < pidlen) { + nread = read(plist[i].control_socket, + (void*)(allpids+i*pidlen+tot_nread), pidlen - tot_nread); + /*fprintf(stderr, "read length %d \n", nread);*/ + if(nread < 0) { + perror("read"); + cleanup(); + } + tot_nread += nread; + } - plist[i].remote_pid = *((pid_t *)(allpids+i*pidlen)); - } + 
plist[i].remote_pid = *((pid_t *)(allpids+i*pidlen)); + } - /* at this point, all processes have checked in. */ + /* at this point, all processes have checked in. */ - /* cancel the timeout */ - alarm(0); + /* cancel the timeout */ + alarm(0); - /* send ports to all but highest ranking process, as it needs none */ + /* send ports to all but highest ranking process, as it needs none */ #ifdef DEBUG - for (i = 0; i < nprocs; i++) { - for (j = 0; j < addrlen / sizeof(int); j++) { - fprintf(stderr, "%d\t", - alladdrs[i * addrlen / sizeof(int) + j]); - } - fprintf(stderr, "\n"); - } - fprintf(stderr, "\n"); + for (i = 0; i < nprocs; i++) { + for (j = 0; j < addrlen / sizeof(int); j++) { + fprintf(stderr, "%d\t", + alladdrs[i * addrlen / sizeof(int) + j]); + } + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); #endif - out_addrs_len = 3 * nprocs * sizeof(int); - out_addrs = (int *) malloc(out_addrs_len); - if (out_addrs == NULL) { - perror("malloc"); - exit(EXIT_FAILURE); - } + out_addrs_len = 3 * nprocs * sizeof(int); + out_addrs = (int *) malloc(out_addrs_len); + if (out_addrs == NULL) { + perror("malloc"); + exit(EXIT_FAILURE); + } - for (i = 0; i < nprocs; i++) { - /* put hca_lid information at the first beginning */ - out_addrs[i] = alladdrs[i * addrlen / sizeof(int) + i]; + for (i = 0; i < nprocs; i++) { + /* put hca_lid information at the first beginning */ + out_addrs[i] = alladdrs[i * addrlen / sizeof(int) + i]; - /* put host id information in the third round */ - out_addrs[2 * nprocs + i] = - alladdrs[i * addrlen / sizeof(int) + nprocs]; - } + /* put host id information in the third round */ + out_addrs[2 * nprocs + i] = + alladdrs[i * addrlen / sizeof(int) + nprocs]; + } - for (i = 0; i < nprocs; i++) { - int nwritten; + for (i = 0; i < nprocs; i++) { + int nwritten; - /* personalized address information for each process */ - for (j = 0; j < nprocs; j++) { - /* put qp information here */ - if (i == j) - /* No QP is allocated for a process itself, - * If you 
change this, please change viainit.cc:1514 too */ - out_addrs[nprocs + j] = -1; - else - out_addrs[nprocs + j] = - alladdrs[j * addrlen / sizeof(int) + i]; - } + /* personalized address information for each process */ + for (j = 0; j < nprocs; j++) { + /* put qp information here */ + if (i == j) + /* No QP is allocated for a process itself, + * If you change this, please change viainit.cc:1514 too */ + out_addrs[nprocs + j] = -1; + else + out_addrs[nprocs + j] = + alladdrs[j * addrlen / sizeof(int) + i]; + } #ifdef DEBUG - for (j = 0; j < out_addrs_len / sizeof(int); j++) - fprintf(stderr, "%d\t", out_addrs[j]); - fprintf(stderr, "\n"); + for (j = 0; j < out_addrs_len / sizeof(int); j++) + fprintf(stderr, "%d\t", out_addrs[j]); + fprintf(stderr, "\n"); #endif - nwritten = - write(plist[i].control_socket, out_addrs, out_addrs_len); - if (nwritten != out_addrs_len) { - perror("write"); - cleanup(); - } - - if(pidlen != 0) { - nwritten = 0; - nwritten = write(plist[i].control_socket, allpids, nprocs*pidlen); - if (nwritten != nprocs*pidlen) { + nwritten = + write(plist[i].control_socket, out_addrs, out_addrs_len); + if (nwritten != out_addrs_len) { perror("write"); cleanup(); } + + if(pidlen != 0) { + nwritten = 0; + nwritten = write(plist[i].control_socket, allpids, nprocs*pidlen); + if (nwritten != nprocs*pidlen) { + perror("write"); + cleanup(); + } + } + + plist[i].state = P_RUNNING; } - plist[i].state = P_RUNNING; - } - #ifndef USE_VIADEV_BARRIER - /* Not using the VI barrier. Implement the termination - * barrier using the socket network we have already - * established. - */ - process_termination(); + /* Not using the VI barrier. Implement the termination + * barrier using the socket network we have already + * established. 
+ */ + process_termination(); #endif - /* shut down all our ports */ - /*close(s); - for (i = 0; i < nprocs; i++) - close(plist[i].control_socket); - */ + /* shut down all our ports */ + /*close(s); + for (i = 0; i < nprocs; i++) + close(plist[i].control_socket); + */ - /* close all opend sockets */ - for (i = 0; i < nprocs; i++) { - close(plist[i].control_socket); - } + /* close all opend sockets */ + for (i = 0; i < nprocs; i++) { + close(plist[i].control_socket); + } - /* clients have all information now. Just sit and wait for them - to die, which we will detect via sockets or signal from ssh/rsh signal. - */ + /* clients have all information now. Just sit and wait for them + to die, which we will detect via sockets or signal from ssh/rsh signal. + */ - if (use_xlauncher) { - exit(EXIT_SUCCESS); + if (use_xlauncher) { + exit(EXIT_SUCCESS); + } } wait_for_errors(s,&sockaddr,sockaddr_len); Modified: mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/process/mpirun_rsh.h 2007-11-19 20:53:57 UTC (rev 1628) @@ -78,6 +78,7 @@ #include #include "pmgr_client.h" #include "mpirun_util.h" +#include "pmgr_collective_common.h" /* Support for debug prints. */ Added: mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.c 2007-11-19 20:53:57 UTC (rev 1628) @@ -0,0 +1,791 @@ +/* + * PMGR_COLLECTIVE ============================================================ + * This protocol enables MPI to bootstrap itself through a series of collective + * operations. 
The collective operations are modeled after MPI collectives -- + * all tasks must call them in the same order and with consistent parameters. + * + * MPI may invoke any number of collectives, in any order, passing an arbitrary + * amount of data. All message sizes are specified in bytes. + * PMGR_COLLECTIVE ============================================================ + * + * This file implements the interface used by the MPI tasks (clients). + * + * An MPI task should make calls in the following sequenece: + * + * pmgr_init + * pmgr_open + * [collectives] + * pmgr_close + * pmgr_finalize + * + * MPI may invoke any number of collectives, in any order, passing an arbitrary + * amount of data. All message sizes are specified in bytes. + * + * All functions return PMGR_SUCCESS on successful completion. + * + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory. + * Author: Adam Moody + */ + +/* + * Copyright (C) 1999-2001 The Regents of the University of California + * (through E.O. Lawrence Berkeley National Laboratory), subject to + * approval by the U.S. Department of Energy. + * + * Use of this software is under license. The license agreement is included + * in the file MVICH_LICENSE.TXT. + * + * Developed at Berkeley Lab as part of MVICH. + * + * Authors: Bill Saphir + * Michael Welcome + */ + +/* Copyright (c) 2002-2007, The Ohio State University. All rights + * reserved. + * + * This file is part of the MVAPICH software package developed by the + * team members of The Ohio State University's Network-Based Computing + * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda. + * + * For detailed copyright and licensing information, please refer to the + * copyright file COPYRIGHT_MVAPICH in the top level MPICH directory. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pmgr_collective_client.h" + +char *mpirun_hostname; +struct hostent *mpirun_hostent; +int mpirun_port; +int mpirun_socket; +int pmgr_me, pmgr_nprocs, pmgr_id; + +/* tree data structures */ +int pmgr_parent; /* MPI rank of parent */ +int pmgr_parent_s; /* socket fd to parent */ +int* pmgr_child; /* MPI ranks of children */ +int* pmgr_child_s; /* socket fds to children */ +int pmgr_num_child; /* number of children */ +int* pmgr_child_incl;/* number of children each child is responsible for (includes itself) */ +int pmgr_num_child_incl; /* total number of children this node is responsible for */ + +/* set env variable to select which trees to use, if any -- all enabled by default */ +int mpirun_use_trees; /* set by MPIRUN_USE_TREES={0,1} to disable/enable tree algorithms */ +int mpirun_use_gather_tree; /* set by MPIRUN_USE_GATHER_TREE={0,1} to disable/enable gather tree */ +int mpirun_use_bcast_tree; /* set by MPIRUN_USE_BCAST_TREE={0,1} to disable/enable bcast tree */ + +/* + * ============================= + * Utility functions for use by other functions in this file + * ============================= + */ + +/* Reads environment variable, bails if not set */ +char* pmgr_getenv(char* envvar) +{ + char* str = getenv(envvar); + if (str == NULL) { + pmgr_error("Can't read %s", envvar); + exit(1); + } + return str; +} + +/* read size bytes into buf from mpirun_socket */ +int pmgr_read(void* buf, int size) { + return pmgr_read_fd(mpirun_socket, buf, size); +} + +/* write size bytes into mpirun_socket from buf */ +int pmgr_write(void* buf, int size) { + return pmgr_write_fd(mpirun_socket, buf, size); +} + +/* write integer into mpirun_socket */ +int pmgr_write_int(int value) { + return pmgr_write(&value, sizeof(value)); +} + +/* ============================= + * Functions to open/close/gather/bcast the TCP/socket tree. 
+ * ============================= +*/ + +/* connect to given IP:port and return opened socket file descriptor */ +int pmgr_connect(struct in_addr ip, int port) +{ + int sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sockfd < 0) { + perror("socket"); + exit(1); + } + + struct sockaddr_in sockaddr; + sockaddr.sin_family = AF_INET; + sockaddr.sin_addr = ip; + sockaddr.sin_port = port; + + if (connect(sockfd, (struct sockaddr *) &sockaddr, sizeof(sockaddr)) < 0) { + perror("connect"); + exit(1); + } + + return sockfd; +} + +/* open socket tree across MPI tasks */ +int pmgr_open_tree() +{ + /* currently implements a binomial tree */ + + /* initialize parent and children based on pmgr_me and pmgr_nprocs */ + int n = 1; + int max_children = 0; + while(n < pmgr_nprocs) { n <<= 1; max_children++; } + + pmgr_parent = 0; + pmgr_num_child = 0; + pmgr_num_child_incl = 0; + pmgr_child = malloc(max_children * sizeof(int)); + pmgr_child_s = malloc(max_children * sizeof(int)); + pmgr_child_incl = malloc(max_children * sizeof(int)); + + /* find our parent and list of children */ + int low = 0; + int high = pmgr_nprocs - 1; + while (high - low > 0) { + int mid = (high - low) / 2 + (high - low) % 2 + low; + if (low == pmgr_me) { + pmgr_child[pmgr_num_child] = mid; + pmgr_child_incl[pmgr_num_child] = high - mid + 1; + pmgr_num_child++; + pmgr_num_child_incl += (high - mid + 1); + } + if (mid == pmgr_me) { pmgr_parent = low; } + if (mid <= pmgr_me) { low = mid; } + else { high = mid-1; } + } + + /* create a socket to accept connection from parent */ + int sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sockfd < 0) { + perror("socket"); + exit(1); + } + + struct sockaddr_in sin; + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(0); /* bind ephemeral port */ + + if (bind(sockfd, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + perror("binding socket"); + exit(1); + } + + listen(sockfd, 1); + + 
socklen_t len = sizeof(sin); + if (getsockname(sockfd, (struct sockaddr *) &sin, &len) < 0) { + perror("getting sockname"); + exit(1); + } + + char hn[256]; + gethostname(hn, 256); + struct hostent * he = gethostbyname(hn); + struct in_addr ip = * (struct in_addr *) *(he->h_addr_list); + short port = sin.sin_port; + + /* gather socket data to rank 0 */ + int sendcount = sizeof(ip) + sizeof(port); + void* sendbuf = malloc(sendcount); + void* recvbuf = malloc(sendcount * pmgr_nprocs); + + memcpy(sendbuf, &ip, sizeof(ip)); + memcpy((char*)sendbuf + sizeof(ip), &port, sizeof(port)); + + pmgr_gather(sendbuf, sendcount, recvbuf, 0); + + /* + * if i'm not rank 0, accept a connection (from parent) and receive socket + * table + */ + if (pmgr_me != 0) { + socklen_t parent_len; + struct sockaddr parent_addr; + parent_len = sizeof(parent_addr); + pmgr_parent_s = accept(sockfd, (struct sockaddr *) &parent_addr, &parent_len); + pmgr_read_fd(pmgr_parent_s, recvbuf, sendcount * pmgr_nprocs); + } + + /* for each child, open socket connection and forward socket table */ + int i; + for(i=0; i=0; i--) { + pmgr_read_fd(pmgr_child_s[i], (char*)bigbuf + offset, sendcount * pmgr_child_incl[i]); + offset += sendcount * pmgr_child_incl[i]; + } + + /* if i'm not rank 0, send to parent and free temporary buffer */ + if (pmgr_me != 0) { + pmgr_write_fd(pmgr_parent_s, bigbuf, bigcount); + free(bigbuf); + } + + return PMGR_SUCCESS; +} + +/* ============================= + * The mpirun_* functions implement PMGR_COLLECTIVE operations through + * the mpirun process. Typically, this amounts to a flat tree with the + * mpirun process at the root. These functions implement the client side + * of the protocol specified in pmgr_collective_mpirun.c. 
+ * ============================= + */ + +/* + * Perform barrier, each task writes an int then waits for an int + */ +int mpirun_barrier() +{ + /* send BARRIER op code, then wait on integer reply */ + int buf; + + pmgr_write_int(PMGR_BARRIER); + pmgr_read(&buf, sizeof(int)); + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Broadcast, root writes sendcount bytes from buf, + * into mpirun_socket, all receive sendcount bytes into buf + */ +int mpirun_bcast(void* buf, int sendcount, int root) +{ + /* send BCAST op code, then root, then size of data */ + pmgr_write_int(PMGR_BCAST); + pmgr_write_int(root); + pmgr_write_int(sendcount); + + if (pmgr_me == root) pmgr_write(buf, sendcount); + + pmgr_read(buf, sendcount); + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Gather, each task writes sendcount bytes from sendbuf + * into mpirun_socket, then root receives N*sendcount bytes into recvbuf + */ +int mpirun_gather(void* sendbuf, int sendcount, void* recvbuf, int root) +{ + /* send GATHER op code, then root, then size of data, then data itself */ + pmgr_write_int(PMGR_GATHER); + pmgr_write_int(root); + pmgr_write_int(sendcount); + pmgr_write(sendbuf, sendcount); + + if (pmgr_me == root) pmgr_read(recvbuf, sendcount * pmgr_nprocs); + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Scatter, root writes N*sendcount bytes from sendbuf + * into mpirun_socket, then each task receives sendcount bytes into recvbuf + */ +int mpirun_scatter(void* sendbuf, int sendcount, void* recvbuf, int root) +{ + /* send SCATTER op code, then root, then size of data, then data itself */ + pmgr_write_int(PMGR_SCATTER); + pmgr_write_int(root); + pmgr_write_int(sendcount); + + if (pmgr_me == root) pmgr_write(sendbuf, sendcount * pmgr_nprocs); + + pmgr_read(recvbuf, sendcount); + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Allgather, each task writes sendcount bytes from sendbuf + * into mpirun_socket, then receives N*sendcount bytes into recvbuf + */ +int 
mpirun_allgather(void* sendbuf, int sendcount, void* recvbuf) +{ + /* send ALLGATHER op code, then size of data, then data itself */ + pmgr_write_int(PMGR_ALLGATHER); + pmgr_write_int(sendcount); + pmgr_write(sendbuf, sendcount); + pmgr_read (recvbuf, sendcount * pmgr_nprocs); + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Alltoall, each task writes N*sendcount bytes from sendbuf + * into mpirun_socket, then recieves N*sendcount bytes into recvbuf + */ +int mpirun_alltoall(void* sendbuf, int sendcount, void* recvbuf) +{ + /* send ALLTOALL op code, then size of data, then data itself */ + pmgr_write_int(PMGR_ALLTOALL); + pmgr_write_int(sendcount); + pmgr_write(sendbuf, sendcount * pmgr_nprocs); + pmgr_read (recvbuf, sendcount * pmgr_nprocs); + + return PMGR_SUCCESS; +} + +/* + * ============================= + * The pmgr_* collectives are the user interface (what the MPI tasks call). + * ============================= + */ + +/* Perform barrier, each task writes an int then waits for an int */ +int pmgr_barrier() +{ + char c; + void* recvbuf = NULL; + + if (mpirun_use_trees) { + /* gather a character to rank 0 */ + if (pmgr_me == 0) recvbuf = (void*) malloc(sizeof(c) * pmgr_nprocs); + + if (mpirun_use_gather_tree) { + pmgr_gather_tree(&c, sizeof(c), recvbuf); + } + + else { + mpirun_gather(&c, sizeof(c), recvbuf, 0); + } + + if (pmgr_me == 0) free(recvbuf); + + /* broadcast a character from rank 0 */ + if (mpirun_use_bcast_tree) { + pmgr_bcast_tree(&c, sizeof(c)); + } + + else { + mpirun_bcast(&c, sizeof(c), 0); + } + } + + else { + mpirun_barrier(); + } + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Broadcast, root writes sendcount bytes from buf, + * into mpirun_socket, all receive sendcount bytes into buf + */ +int pmgr_bcast(void* buf, int sendcount, int root) { + return mpirun_bcast(buf, sendcount, root); +} + +/* + * Perform MPI-like Gather, each task writes sendcount bytes from sendbuf + * into mpirun_socket, then root receives N*sendcount 
bytes into recvbuf + */ +int pmgr_gather(void* sendbuf, int sendcount, void* recvbuf, int root) { + return mpirun_gather(sendbuf, sendcount, recvbuf, root); +} + +/* + * Perform MPI-like Scatter, root writes N*sendcount bytes from sendbuf + * into mpirun_socket, then each task receives sendcount bytes into recvbuf + */ +int pmgr_scatter(void* sendbuf, int sendcount, void* recvbuf, int root) { + return mpirun_scatter(sendbuf, sendcount, recvbuf, root); +} + +/* + * Perform MPI-like Allgather, each task writes sendcount bytes from sendbuf + * into mpirun_socket, then receives N*sendcount bytes into recvbuf + */ +int pmgr_allgather(void* sendbuf, int sendcount, void* recvbuf) +{ + if (mpirun_use_trees) { + /* gather data to rank 0 */ + if (mpirun_use_gather_tree) { + pmgr_gather_tree(sendbuf, sendcount, recvbuf); + } + + else { + mpirun_gather(sendbuf, sendcount, recvbuf, 0); + } + + /* broadcast data from rank 0 */ + if (mpirun_use_bcast_tree) { + pmgr_bcast_tree(recvbuf, sendcount * pmgr_nprocs); + } + + else { + mpirun_bcast(recvbuf, sendcount * pmgr_nprocs, 0); + } + } + + else { + mpirun_allgather(sendbuf, sendcount, recvbuf); + } + + return PMGR_SUCCESS; +} + +/* + * Perform MPI-like Alltoall, each task writes N*sendcount bytes from sendbuf + * into mpirun_socket, then recieves N*sendcount bytes into recvbuf + */ +int pmgr_alltoall(void* sendbuf, int sendcount, void* recvbuf) { + return mpirun_alltoall(sendbuf, sendcount, recvbuf); +} + +/* + * Opens socket to mpirun launch process, then sends protocol version and rank + * number + */ +int pmgr_open() +{ + struct sockaddr_in sockaddr; + + mpirun_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (mpirun_socket < 0) { + perror("opening mpirun socket"); + exit(1); + } + + mpirun_hostent = gethostbyname(mpirun_hostname); + if (mpirun_hostent == NULL) { + herror("gethostbyname"); + exit(1); + } + + sockaddr.sin_family = AF_INET; + sockaddr.sin_addr = *(struct in_addr *) (*mpirun_hostent->h_addr_list); + 
sockaddr.sin_port = htons(mpirun_port); + + if (connect(mpirun_socket, (struct sockaddr *) &sockaddr, + sizeof(sockaddr)) < 0) { + perror("connect"); + exit(1); + } + + /* we are now connected to the mpirun process */ + + /* + * Exchange information with mpirun. If you make any changes + * to this protocol, be sure to increment the version number + * in the header file. This is to permit compatibility with older + * executables. + */ + + /* send version number, then rank */ + pmgr_write_int(PMGR_COLLECTIVE); + pmgr_write(&pmgr_me, sizeof(pmgr_me)); + + /* open up socket tree, if enabled */ + if (mpirun_use_trees) pmgr_open_tree(); + + return PMGR_SUCCESS; +} + +/* + * Closes the mpirun socket + */ +int pmgr_close() +{ + /* shut down the tree, if enabled */ + if (mpirun_use_trees) pmgr_close_tree(); + + /* send CLOSE op code, then close socket */ + pmgr_write_int(PMGR_CLOSE); + close(mpirun_socket); + + return PMGR_SUCCESS; +} + +/* + * ============================= + * Handle init and finalize + * ============================= + */ + +int pmgr_init(int *argc_p, char ***argv_p, int *np_p, int *me_p, + int *id_p, char ***processes_p) +{ + char *str; + char *str_token; + char **pmgr_processes = NULL; + int i; + setvbuf(stdout, NULL, _IONBF, 0); + char *value; + + /* Get information from environment, not from the argument list */ + + /* mpirun host */ + str = pmgr_getenv("MPIRUN_HOST"); + mpirun_hostname = strdup(str); + mpirun_hostent = gethostbyname(mpirun_hostname); + + if (!mpirun_hostent) { + fprintf(stderr,"gethostbyname failed:: %s: %s (%d)\n", + mpirun_hostname, hstrerror(h_errno), h_errno); + exit(1); + } + + /* mpirun port */ + str = pmgr_getenv("MPIRUN_PORT"); + mpirun_port = atoi(str); + + if (mpirun_port <= 0) { + fprintf(stderr, "Invalid MPIRUN port %s\n", str); + exit(1); + } + + /* number of processes */ + str = pmgr_getenv("MPIRUN_NPROCS"); + pmgr_nprocs = atoi(str); + + if (pmgr_nprocs <= 0) { + fprintf(stderr, "Invalid MPIRUN nprocs %s\n", str); + 
exit(1); + } + + /* rank of current process */ + str = pmgr_getenv("MPIRUN_RANK"); + pmgr_me = atoi(str); + + if (pmgr_me < 0 || pmgr_me >= pmgr_nprocs) { + fprintf(stderr, "Invalid MPIRUN rank %s\n", str); + exit(1); + } + + /* unique of current application */ + str = pmgr_getenv("MPIRUN_ID"); + pmgr_id = atoi(str); + + if (pmgr_id == 0) { + fprintf(stderr, "Invalid application ID %s\n", str); + exit(1); + } + + /* list of hostnames running processes in job */ + if ((value = getenv("NOT_USE_TOTALVIEW")) == NULL) { + pmgr_processes = (char **) calloc((size_t)pmgr_nprocs, sizeof(char*)); + + if (pmgr_processes == NULL) { + fprintf(stderr, "Can't allocate process list\n"); + exit(1); + } + + str = pmgr_getenv("MPIRUN_PROCESSES"); + str = strdup(str); + + if (str == NULL) { + fprintf(stderr, "Can't allocate process list\n"); + exit(1); + } + + for (i = 0; i < pmgr_nprocs; i++) { + if (!str) { + fprintf(stderr, "Invalid MPIRUN process list: '%s' ", + getenv("MPIRUN_PROCESSES")); + exit(1); + } + + str_token = strchr(str, ':'); + if(str_token) *str_token = ' '; + + pmgr_processes[i] = str; + str = strchr(str, ' '); + if (str) { + *str = '\0'; + str++; + } + } + } + + /* MPIRUN_USE_TREES={0,1} disables/enables tree algorithms */ + mpirun_use_trees = 1; + if ((value = getenv("MPIRUN_USE_TREES"))) { + mpirun_use_trees = atoi(value); + } + + /* MPIRUN_USE_GATHER_TREE={0,1} disables/enables gather tree */ + mpirun_use_gather_tree = 1; + if ((value = getenv("MPIRUN_USE_GATHER_TREE"))) { + mpirun_use_gather_tree = atoi(value); + } + + /* MPIRUN_USE_BCAST_TREE={0,1} disables/enables bcast tree */ + mpirun_use_bcast_tree = 1; + if ((value = getenv("MPIRUN_USE_BCAST_TREE"))) { + mpirun_use_bcast_tree = atoi(value); + } + + *np_p = pmgr_nprocs; + *me_p = pmgr_me; + *id_p = pmgr_id; + *processes_p = pmgr_processes; + + return PMGR_SUCCESS; +} + +/* + * No cleanup necessary here. 
+ */ +int pmgr_finalize() +{ + return PMGR_SUCCESS; +} + +/* + * ============================= + * Handle aborts + * ============================= + */ + +int vprint_msg(char *buf, size_t len, const char *fmt, va_list ap) +{ + int n; + + n = vsnprintf(buf, len, fmt, ap); + + if ((n >= len) || (n < 0)) { + /* Add trailing '+' to indicate truncation */ + buf[len - 2] = '+'; + buf[len - 1] = '\0'; + } + + return (0); +} + +/* + * Call into the process spawner, using the same port we were given + * at startup time, to tell it to abort the entire job. + */ +int pmgr_abort(int code, const char *fmt, ...) +{ + int s; + struct sockaddr_in sin; + struct hostent *he; + va_list ap; + char buf [256]; + int len; + + he = gethostbyname(mpirun_hostname); + if (!he) return -1; + + s = socket(AF_INET, SOCK_STREAM, 0); + if (s < 0) return -1; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = he->h_addrtype; + memcpy(&sin.sin_addr, he->h_addr_list[0], sizeof(sin.sin_addr)); + sin.sin_port = htons(mpirun_port); + if (connect(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) return -1; + + va_start(ap, fmt); + vprint_msg(buf, sizeof(buf), fmt, ap); + va_end(ap); + + /* write an abort code (may be destination rank), our rank to mpirun */ + pmgr_write_fd(s, &code, sizeof(code)); + pmgr_write_fd(s, &pmgr_me, sizeof(pmgr_me)); + + /* now length of error string, and error string itself to mpirun */ + len = strlen(buf) + 1; + pmgr_write_fd(s, &len, sizeof(len)); + pmgr_write_fd(s, buf, len); + + close(s); + + return PMGR_SUCCESS; +} Added: mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.h 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_client.h 2007-11-19 20:53:57 UTC (rev 1628) @@ -0,0 +1,75 @@ +/* + * PMGR_COLLECTIVE ============================================================ + * This protocol enables MPI to 
bootstrap itself through a series of collective + * operations. The collective operations are modeled after MPI collectives -- + * all tasks must call them in the same order and with consistent parameters. + * + * MPI may invoke any number of collectives, in any order, passing an arbitrary + * amount of data. All message sizes are specified in bytes. + * PMGR_COLLECTIVE ============================================================ + * + * This file defines the interface used by the MPI tasks (clients). + * + * An MPI task should make calls in the following sequence: + * + * pmgr_init + * pmgr_open + * [collectives] + * pmgr_close + * pmgr_finalize + * + * MPI may invoke any number of collectives, in any order, passing an arbitrary + * amount of data. All message sizes are specified in bytes. + * + * All functions return PMGR_SUCCESS on successful completion. + * + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory. + * Author: Adam Moody +*/ + +#ifndef _PMGR_COLLECTIVE_CLIENT_H +#define _PMGR_COLLECTIVE_CLIENT_H + +#include "pmgr_collective_common.h" + +int pmgr_open (); +int pmgr_close(); + +/* sync point, no task makes it past until all have reached */ +int pmgr_barrier (); + +/* root sends sendcount bytes from buf, each task receives sendcount bytes into buf */ +int pmgr_bcast (void* buf, int sendcount, int root); + +/* each task sends sendcount bytes from sendbuf, root receives N*sendcount bytes into recvbuf */ +int pmgr_gather (void* sendbuf, int sendcount, void* recvbuf, int root); + +/* root sends blocks of sendcount bytes to each task indexed from sendbuf */ +int pmgr_scatter (void* sendbuf, int sendcount, void* recvbuf, int root); + +/* each task sends sendcount bytes from sendbuf and receives N*sendcount bytes into recvbuf */ +int pmgr_allgather(void* sendbuf, int sendcount, void* recvbuf); + +/* each task sends N*sendcount bytes from sendbuf and receives N*sendcount bytes into recvbuf */
+int pmgr_alltoall (void* sendbuf, int sendcount, void* recvbuf); + +/* + * This function is called by each process in the job during + * initialization. Pointers to argc and argv are passed + * in the event that the process manager passed args on + * the command line. + * The following values are filled in: + * *np_p = total number of processes in the job + * *me_p = the rank of this process (zero based) + * *id_p = the global ID associated with this job. + */ +int pmgr_init(int *argc_p, char ***argv_p, + int *np_p, int *me_p, int *id_p, + char ***processes_p); + +int pmgr_finalize(void); + +int pmgr_abort(int code, const char *fmt, ...); + +#endif /* _PMGR_COLLECTIVE_CLIENT_H */ Added: mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.c =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.c 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.c 2007-11-19 20:53:57 UTC (rev 1628) @@ -0,0 +1,74 @@ +/* + * PMGR_COLLECTIVE ============================================================ + * This protocol enables MPI to bootstrap itself through a series of collective + * operations. The collective operations are modeled after MPI collectives -- + * all tasks must call them in the same order and with consistent parameters. + * + * MPI may invoke any number of collectives, in any order, passing an arbitrary + * amount of data. All message sizes are specified in bytes. + * PMGR_COLLECTIVE ============================================================ + * + * This file provides common implementations for + * pmgr_collective_mpirun - the interface used by mpirun + * pmgr_collective_client - the interface used by the MPI tasks + * + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory.
+ * Author: Adam Moody +*/ + +#include +#include +#include +#include +#include "pmgr_collective_common.h" + +/* print message to stderr */ +void pmgr_error(char *fmt, ...) +{ + va_list argp; + fprintf(stderr, "PMGR_COLLECTIVE ERROR: "); + va_start(argp, fmt); + vfprintf(stderr, fmt, argp); + va_end(argp); + fprintf(stderr, "\n"); +} + +/* +if (errno == EINTR) + continue; +else + return(-1); +*/ + +/* write size bytes from buf into fd, retry if necessary */ +int pmgr_write_fd(int fd, void* buf, int size) +{ + int rc; + int n = 0; + char* offset = (char*) buf; + while (n < size) { + rc = write(fd, offset, size - n); + if (rc < 0) { return rc; } + if (rc == 0) { return n; } + offset += rc; + n += rc; + } + return n; +} + +/* read size bytes into buf from fd, retry if necessary */ +int pmgr_read_fd(int fd, void* buf, int size) +{ + int rc; + int n = 0; + char* offset = (char*) buf; + while (n < size) { + rc = read(fd, offset, size - n); + if (rc < 0) { return rc; } + if (rc == 0) { return n; } + offset += rc; + n += rc; + } + return n; +} Added: mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.h =================================================================== --- mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.h 2007-11-05 16:54:39 UTC (rev 1627) +++ mvapich/trunk/mpid/ch_gen2/process/pmgr_collective_common.h 2007-11-19 20:53:57 UTC (rev 1628) @@ -0,0 +1,54 @@ +/* + * PMGR_COLLECTIVE ============================================================ + * This protocol enables MPI to bootstrap itself through a series of collective + * operations. The collective operations are modeled after MPI collectives -- + * all tasks must call them in the same order and with consistent parameters. + * + * MPI may invoke any number of collectives, in any order, passing an arbitrary + * amount of data. All message sizes are specified in bytes. 
+ * PMGR_COLLECTIVE ============================================================ + * + * This file provides common definitions for + * pmgr_collective_mpirun - the interface used by mpirun + * pmgr_collective_client - the interface used by the MPI tasks + * + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory. + * Author: Adam Moody +*/ + +#ifndef _PMGR_COLLECTIVE_COMMON_H +#define _PMGR_COLLECTIVE_COMMON_H + +#if defined(_IA64_) +#undef htons +#undef ntohs +#define htons(__bsx) ((((__bsx) >> 8) & 0xff) | (((__bsx) & 0xff) << 8)) +#define ntohs(__bsx) ((((__bsx) >> 8) & 0xff) | (((__bsx) & 0xff) << 8)) +#endif + +/* PMGR_VERSION for pmgr_collective is PMGR_COLLECTIVE (== 8) */ +#define PMGR_COLLECTIVE 8 + +#define PMGR_SUCCESS 0 + +#define PMGR_OPEN 0 +#define PMGR_CLOSE 1 +#define PMGR_ABORT 2 +#define PMGR_BARRIER 3 +#define PMGR_BCAST 4 +#define PMGR_GATHER 5 +#define PMGR_SCATTER 6 +#define PMGR_ALLGATHER 7 +#define PMGR_ALLTOALL 8 + +/* print message to stderr */ +void pmgr_error(char *fmt, ...); + +/* write size bytes from buf into fd, retry if necessary */ +int pmgr_write_fd(int fd, void* buf, int size); + +/* read size bytes into buf from fd, retry if necessary */ +int pmgr_read_fd (int fd, void* buf, int size); + +#endif /*