#!/bin/csh -f
#
# MPIRUN
#
# This script tries to start jobs on whatever kind of machine you're on.
# Strategy - This program is built with a default device it uses in
# certain ways.  The user can override this default from the command line.
#
# Note that the -f is important to ensure that commands needed by csh are
# not aliased to commands with the same name and different behavior.
#
# Usage:  mpirun [mpirun_options...] PROGNAME [options...]
#         (run "mpirun -mr_h" for the list of mpirun options)
#
# Overall flow:
#   1. parse the -mr_* options, the program name and the program arguments
#   2. determine the architecture ($arch) unless given with -mr_arch
#   3. determine the startup procedure ($machine) unless given with
#      -mr_machine
#   4. turn the program name into an absolute path and find the host name
#   5. for the "execer" and "p4" procedures, pick hosts from a machines file
#   6. run (or, with -mr_t, just print) the startup commands for $machine
#
# Errors detected by this script exit with a non-zero status (2 when the
# machines file cannot be read, 1 otherwise).
#
#set verbose

# Installation-specific locations and launch commands
set MPIR_HOME = /home/lusk/mpich
set EXECER    = /home/lusk/mpich/util/execer
set MEIKO_RUN = prun
set SP1_RUN   = /usr/lpp/euih/eui/cotb0

# Defaults; most of these can be overridden on the command line
set jobid = $$
set progname = ''
set np = 1
set arch = ''
set default_arch = 'alpha'
set machine = ''
set default_device = 'ch_p4'
set cmdLineArgs = ''
set cmdline = ''
set use_execer = 0
set mpirun_verbose = 0
set nolocal = 0
set leavePGFile = 0
set just_testing = 0
set machineFile = ''
set debugger = ""
# Startup procedures handled by the final switch; listed by -mr_h
set Machines = "chameleon meiko paragon p4 execer sp1 ibmspx anlspx ksr sgi_mp ipsc860 i860 inteldelta cray_t3d"
# Passed to spsubmit by the anlspx startup procedure (see -mr_maxtime)
set max_time = 15
#
# Special, system specific values
#
# polling_mode is for systems that can select between polling and
# interrupt-driven operation.  Currently, only IBM POE is so supported
# (TMC CMMD has some support for this choice of mode)
set polling_mode = 1

# ---------------------------------------------------------------------------
# Parse command line arguments
# The ultimate goal is to determine what kind of parallel machine this
# is we are running on.  Then we know how to start jobs...
# ---------------------------------------------------------------------------
set j = 1
while ($j <= $#argv)
    set arg = "$argv[$j]"
    switch ("$arg")
    case -mr_arch:
        @ j++
        if ($j > $#argv) goto MissingValue
        set arch = "$argv[$j]"
        breaksw
    case -mr_np:
        @ j++
        if ($j > $#argv) goto MissingValue
        set np = "$argv[$j]"
        # sed deletes a value made only of digits, so anything left over
        # means $np contains a non-digit.  $nodigits must be quoted: it is
        # empty for every valid number.
        set nodigits = `echo "$np" | sed 's/^[0-9]*$//'`
        if ("$np" == "" || "$nodigits" != "") goto BadNp
        if ($np < 1) goto BadNp
        breaksw
    case -mr_machine:
        @ j++
        if ($j > $#argv) goto MissingValue
        set machine = "$argv[$j]"
        breaksw
    case -mr_machinefile:
        @ j++
        if ($j > $#argv) goto MissingValue
        set machineFile = "$argv[$j]"
        breaksw
    case -mr_device:
        @ j++
        if ($j > $#argv) goto MissingValue
        set default_device = "$argv[$j]"
        breaksw
    case -mr_nolocal:
        set nolocal = 1
        breaksw
    case -mr_h:
        goto PrintHelp
        breaksw
    case -mr_e:
        set use_execer = 1
        breaksw
    case -mr_pg:
        set use_execer = 0
        breaksw
    case -mr_leave_pg:
    case -mr_keep_pg:
        set leavePGFile = 1
        breaksw
    case -mr_v:
        set mpirun_verbose = 1
        breaksw
    case -mr_t:
        set just_testing = 1
        breaksw
    case -mr_dbx:
        set debugger = "dbx"
        breaksw
#   case -mr_xdbx:
#       set debugger = "xdbx"
#       breaksw
    case -mr_nopoll:
        set polling_mode = 0
        breaksw
    case -mr_maxtime:
        @ j++
        if ($j > $#argv) goto MissingValue
        # The rest of the script reads max_time (with the underscore)
        set max_time = "$argv[$j]"
        breaksw
    case -usage:
    case -help:
    case -?:
        # Accept these for help until the program name is provided.
        # Note that "-?" is a pattern: it matches any two-character option.
        if ("$progname" == "") goto PrintHelp
        # Once the program name is known these belong to the program, so
        # there is deliberately no breaksw here: fall through to default.
    default:
        # The first unrecognized argument is assumed to be the name of
        # the program, but only if a file of that name exists.
        # NOTE(review): the test is -e (exists), not -x (executable).
        if ("$progname" == "" && -e "$arg") then
            set progname = "$arg"
        else
            # any following unrecognized arguments are assumed to be
            # arguments to be passed to the program
            if ($use_execer) then
                set cmdLineArgs = "$cmdLineArgs -arg=$arg"
            else
                set cmdLineArgs = "$cmdLineArgs $arg"
            endif
        endif
        breaksw
    endsw
    @ j++
end

# We need at least the program name
if ("$progname" == "") then
    echo Missing: program name
    exit 1
endif

# ---------------------------------------------------------------------------
# Try to find the architecture.  Note that many systems do not have
# arch, and that some systems (RS6000's for example) use "uname -m"
# to return difficult to interpret info (RS6000s return a string of
# digits that encodes the particular RS6000 model).
#
# Eventually we'll want to use "which" to try and find a version of
# arch...
# ---------------------------------------------------------------------------
if ("$arch" == "") then
    if ( -e /bin/arch) then
        set ARCH = `/bin/arch`
    else if ( -e /usr/local/bin/arch) then
        set ARCH = `/usr/local/bin/arch`
    else if ( -e /usr/ucb/arch) then
        set ARCH = `/usr/ucb/arch`
    else
        set ARCH = `uname -s`
        if ("$ARCH" != "AIX") then
            set ARCH = `uname -m`
        endif
    endif

    switch ("$ARCH")
    case sun4:
    case sun4m:
    case sun4c:
        # There are others.  SunOS and Solaris are both treated as "sun4",
        # so the OS release does not need to be examined.
        set arch = "sun4"
        breaksw
    case alpha:
        set arch = "alpha"
        breaksw
    case AIX:
        set arch = rs6000
        breaksw
    case IRIX:
    case sgi:
        set arch = "IRIX"
        breaksw
    case freebsd:
        set arch = "freebsd"
        breaksw
    case paragon:
        set arch = "paragon"
        breaksw
    case ksr:
        set arch = "ksr"
        breaksw
    case IP19:
        set arch = "sgi_mp"
        breaksw
    case ipsc860:
    case i860:
    case ipsc:
        set arch = "ipsc860"
        breaksw
    default:
        echo "Cannot determine machine architecture.  Use the '-mr_arch ARCHITECTURE'"
        echo "flag or the '-mr_machine MACHINE' flag.  Defaulting"
        echo "to $default_arch"
        set arch = $default_arch
        breaksw
    endsw
endif

# ---------------------------------------------------------------------------
# Try to find the machine (i.e. the startup procedure)
# ---------------------------------------------------------------------------
if ("$machine" == "") then
    switch ("$arch")
    case sun4:
        # /dev/elan should only exist on sun4s that are actually
        # MEIKO machines.
        if ( -e /dev/elan ) then
            set machine = "meiko"
        else if ("$default_device" == "chameleon") then
            set machine = "chameleon"
        else if (("$default_device" == "ch_p4") && ($use_execer == 0)) then
            set machine = "p4"
        else
            set machine = "execer"
        endif
        breaksw
    case alpha:
    case IRIX:
    case freebsd:
        if ("$default_device" == "chameleon") then
            set machine = "chameleon"
        else if (("$default_device" == "ch_p4") && ($use_execer == 0)) then
            set machine = "p4"
        else
            set machine = "execer"
        endif
        breaksw
    case rs6000:
        # This only works on the ANL sp system
        hostname | grep spnode > /dev/null
        if ($status == 0) then
            # This is the correct version to use once we're on a node
            set machine = "ibmspx"
        else if (-d /etc/FRAMES && -d /mcs) then
            set machine = "anlspx"
        else if ("$default_device" == "chameleon") then
            set machine = "chameleon"
        else if (("$default_device" == "ch_p4") && ($use_execer == 0)) then
            set machine = "p4"
        else if (-e /usr/lpp/euih/eui) then
            set machine = "sp1"
        else if (-e /usr/bin/poe) then
            # should work for other users
            set machine = "ibmspx"
        else
            set machine = "execer"
        endif
        breaksw
    case paragon:
        set machine = "paragon"
        breaksw
    case ipsc860:
    case i860:
    case ipsc:
        set machine = "ipsc860"
        breaksw
    case ksr:
        set machine = "ksr"
        breaksw
    case sgi_mp:
        set machine = "sgi_mp"
        breaksw
    case cray_t3d:
        set machine = "cray_t3d"
        breaksw
    default:
        # $machine stays empty; the final switch reports it as unsupported.
        echo "Can't determine the type of the machine this is."
        echo "Set it with -mr_machine MACHINE."
        breaksw
    endsw
endif

# ---------------------------------------------------------------------------
# Fill out relative program pathnames
# ---------------------------------------------------------------------------
# Get the current directory
if ($?PWD == 1) then
    set PWD_TRIAL = "$PWD"
else
    set PWD_TRIAL = `pwd`
endif
if ( "$PWD_TRIAL" != "" ) then
    # Prefer the name without the automounter's /tmp_mnt prefix, but only
    # if that name really is a directory.
    set PWD_TRIAL = `pwd | sed -e 's%/tmp_mnt/%/%g'`
    if ( ! -d "$PWD_TRIAL" ) then
        echo "Warning: your default path uses the automounter; this may"
        echo "cause some problems if you use other NFS-connected systems."
        set PWD_TRIAL = `pwd`
    endif
endif

# Everything from the first "/" on is removed; nothing is left exactly when
# the name starts with "/", i.e. when it is already an absolute path.
set firstpart = `echo $progname | sed 's/\/.*//'`
if ("$firstpart" != "") then
    # Relative path
    set progname = "$PWD_TRIAL/$progname"
endif

# Procgroup file used by the p4-based startup procedures ($jobid is $$)
set pgfile = "$PWD_TRIAL/PI$jobid"
# dbx command file used by the p4 startup procedure with -mr_dbx
set dbxfile = "$PWD_TRIAL/PId$jobid"

# Get value of host
# Should really check for hostname first....
if ( $?HOST == 0 ) then
    # Start from an empty value so that the test below is always legal
    set HOST = ""
    if ("$arch" == "ipsc860") then
        set HOST = `hostname`
    else if (-e "`which hostname`") then
        set HOST = "`hostname`"
    endif
    # Note that uname -n may not produce a usable hostname.  Any suggestions?
    if ("$HOST" == "") set HOST = "`uname -n`"
endif

if ($mpirun_verbose) then
    echo "running $progname on $np ${arch} ${machine} processors"
endif

# ---------------------------------------------------------------------------
# For execer and p4, choose the hosts to run on from a machines file.
# Produces $cmdline (execer) or $machinelist (p4).
# ---------------------------------------------------------------------------
if (("$machine" == "execer") || ("$machine" == "p4")) then
    if ($nolocal) then
        # if the job is not to be run locally, then don't include this
        # machine in the list
        set procFound = 0
    else
        # run a local instance
        if ("$machine" == "execer") then
            set cmdline = "$cmdline -host=${HOST} -pgm=${progname} -numprocs=1 $cmdLineArgs"
        endif
        set procFound = 1
    endif

    if ("$machineFile" == "") then
        set machineFile = "${MPIR_HOME}/util/machines/machines.${arch}"
    endif
    if (!(-e "$machineFile" && -r "$machineFile")) then
        echo Cannot read $machineFile. Exiting.
        exit 2
    endif

    if ($nolocal == 0) then
        set machinesfound = (`head -${np} "$machineFile"`)
    else
        # Remove host from the list of available machines....
        set machinesfound = (`grep -v "$HOST" "$machineFile" | head -${np}`)
    endif

    @ nfound = $#machinesfound + $procFound
    if ($nfound < $np) then
        echo "Only $nfound ${arch}'s available. Exiting."
        exit 1
    endif

    # Get the machine list
    set machinelist = ()
    set machineNum = 1
    while ($procFound < $np)
        # The local host is skipped below, so the candidates can run out
        # even though the count above looked sufficient.
        if ($machineNum > $#machinesfound) then
            echo "Only $procFound ${arch}'s available. Exiting."
            exit 1
        endif
        set machineName = "${machinesfound[$machineNum]}"
        if ($mpirun_verbose) then
            echo "running on $machineName"
        endif
        # The local host was already counted above (unless -mr_nolocal)
        if ($nolocal || ("$machineName" != "$HOST")) then
            if ("$machine" == "execer") then
                set cmdline = "$cmdline -host=${machineName} -pgm=${progname} -numprocs=1 $cmdLineArgs"
            else
                set machinelist = ($machinelist $machineName)
            endif
            @ procFound++
        endif
        @ machineNum++
    end
endif

# ---------------------------------------------------------------------------
# Start the job (or, with -mr_t, print the commands that would start it)
# ---------------------------------------------------------------------------
switch ("$machine")
case meiko:
    # The procgroup file asks for np-1 additional local processes
    @ nextra = $np - 1
    if ($just_testing) then
        echo "local $nextra $progname"
        echo ${MEIKO_RUN} -n $np $progname -p4pg $pgfile $cmdLineArgs
    else
        echo "local $nextra $progname" > "$pgfile"
        ${MEIKO_RUN} -n $np $progname -p4pg $pgfile $cmdLineArgs
        if ($leavePGFile) then
            echo "P4 procgroup file is ${pgfile}."
        else
            /bin/rm "$pgfile"
        endif
    endif
    breaksw

case ksr:
    @ nextra = $np - 1
    if ($just_testing) then
        echo "local $nextra $progname"
        echo $progname -p4pg "$pgfile" $cmdLineArgs
    else
        echo "local $nextra $progname" > "$pgfile"
        # The program arguments must stay outside the quotes; otherwise
        # they become part of the procgroup file name.
        $progname -p4pg "$pgfile" $cmdLineArgs
        if ($leavePGFile) then
            echo "P4 procgroup file is ${pgfile}."
        else
            /bin/rm "$pgfile"
        endif
    endif
    breaksw

case sgi_mp:
    @ nextra = $np - 1
    if ($just_testing) then
        echo "local $nextra $progname"
        echo $progname -p4pg "$pgfile" $cmdLineArgs
    else
        echo "local $nextra $progname" > "$pgfile"
        $progname -p4pg "$pgfile" $cmdLineArgs
        if ($leavePGFile) then
            echo "P4 procgroup file is ${pgfile}."
        else
            /bin/rm "$pgfile"
        endif
    endif
    breaksw

case cray_t3d:
    # Untested.  Only does the interactive (non-NQS submission)
    if ($just_testing) then
        echo $progname -npes $np $cmdLineArgs
    else
        $progname -npes $np $cmdLineArgs
    endif
    breaksw

case ibmspx:
    # This only works on SPx running release 2 software and with
    # the high-performance switch.  Note that we need to parameterize
    # the hostlist somehow.
    # See /cave3/vroom-51/bash/sp1/sp2.hosts for MM machine hosts,
    # /sphome/gropp/mpich/examples/test/pt2pt/hostlist for DIS hosts
    # use
    #    MP_EUILIB ip
    # for ip over the switch
    # Other variables to consider:
    #    setenv MP_PMDLOG yes
    #    setenv PWD $PWD_TRIAL
    #    setenv MP_INFOLEVEL 1
    #    setenv MP_INFOLEVEL 20
    #    setenv MP_CSS_INTERRUPT ?
    # Note that if the executable is not on a file system mounted
    # on a node, you may get a strange error message.

    # If on the ANL SPx, use getjid to get the machine list...
    if ("$machineFile" == "" && -e /usr/local/bin/getjid && $?LOGNAME) then
        set machineFile = "/sphome/$LOGNAME/SPnodes.`/usr/local/bin/getjid`"
    endif
    if ($just_testing) then
        echo setenv MP_EUILIB us
        echo setenv MP_RMPOOL 0
        echo setenv MP_HOSTFILE $machineFile
        echo setenv MP_PROCS $np
        echo setenv MP_INFOLEVEL 0
        if ($polling_mode == 0) echo setenv MP_CSS_INTERRUPT yes
        echo poe $progname $cmdLineArgs
    else
        setenv MP_EUILIB us
        setenv MP_RMPOOL 0
        setenv MP_HOSTFILE $machineFile
        setenv MP_PROCS $np
        setenv MP_INFOLEVEL 0
        if ($polling_mode == 0) setenv MP_CSS_INTERRUPT yes
        poe $progname $cmdLineArgs
    endif
    breaksw

case sp1:
    # This is for the old MPL/p or EUI-H environment
    if ($just_testing) then
        echo ${SP1_RUN} $progname $np $cmdLineArgs
    else
        ${SP1_RUN} $progname $np $cmdLineArgs
    endif
    breaksw

case anlspx:
    # This is for the ANL SP1/2, using the ANL "spsubmit" program.
    # Users of "loadleveler" will probably want something like this.
    #
    # Create the job script.  It is a Bourne shell script: unescaped
    # variables are filled in here by csh, \$JID and the backquoted getjid
    # call are left for the job itself.  The here-document text and its
    # terminating "." must stay in column one, and the here-document must
    # not be moved inside an if ... endif (the "if"/"fi" lines it contains
    # would confuse csh when it skips over the block).
    # The trap releases the nodes if the job is interrupted.
    cat > $PWD_TRIAL/PIrun <<.
#! /bin/sh
JID=\`/usr/local/bin/getjid\`
trap "sprelease \$JID; exit 1" 1 2 3 15
cd $PWD_TRIAL
MP_EUILIB=us
MP_RMPOOL=0
MP_HOSTFILE=/sphome/$LOGNAME/SPnodes.\$JID
MP_PROCS=$np
MP_INFOLEVEL=0
export MP_EUILIB
export MP_RMPOOL
export MP_HOSTFILE
export MP_PROCS
export MP_INFOLEVEL
if [ $polling_mode = 0 ] ; then
MP_CSS_INTERRUPT=yes
export MP_CSS_INTERRUPT
fi
/bin/rm -f /sphome/$LOGNAME/job.output
echo "About to run poe ... " >> /sphome/$LOGNAME/job.output
poe $progname $cmdLineArgs >> /sphome/$LOGNAME/job.output 2>&1
echo "Poe exited ..." >> /sphome/$LOGNAME/job.output
sprelease \$JID
exit 0
.
    chmod a+x $PWD_TRIAL/PIrun
    # Determine CAC
    set CAC = `whatcac | cut -d' ' -f 9 | sed -e s/\"//g -e s/://`
    # The answers fed to spsubmit below are printed one per line by the
    # testing branch; keep the two in step.
    if ($just_testing) then
        echo "spsubmit <<."
        echo $CAC
        echo $max_time
        echo $np
        echo B
        echo M
        echo n
        echo $PWD_TRIAL/PIrun
        echo
        echo C
        echo .
        echo spwait
    else
        spsubmit <<.
$CAC
$max_time
$np
B
M
n
$PWD_TRIAL/PIrun

C
.
        spwait
    endif
    breaksw

case paragon:
    if ($just_testing) then
        echo $progname -sz $np $cmdLineArgs
    else
        $progname -sz $np $cmdLineArgs
    endif
    breaksw

case inteldelta:
    # This script must actually be run ON the Delta (more accurately,
    # on a Delta service node), which this does by invoking rsh.
    # If the rsh fails, check your permissions.
    # A similar script could be used on Intel i860 systems
    # UNTESTED
    if ($just_testing) then
        echo 'rsh delta1 mexec -t"'"$np"'" -f ' $progname $cmdLineArgs
    else
        rsh delta1 mexec -t"$np" -f $progname $cmdLineArgs
    endif
    breaksw

case i860:
case ipsc860:
    if ($just_testing) then
        echo "getcube -t $np"
        echo "load $progname $cmdLineArgs"
        echo "waitcube"
        echo "relcube"
    else
        getcube -t $np
        if ($status != 0) then
            echo "Requested number of nodes not available"
        else
            load $progname $cmdLineArgs
            waitcube
            relcube
        endif
    endif
    breaksw

case chameleon:
    if ($just_testing) then
        echo $progname -np $np $cmdLineArgs
    else
        $progname -np $np $cmdLineArgs
    endif
    breaksw

case execer:
    if ($just_testing) then
        echo $EXECER "-jobid=$jobid" $cmdline
    else
        $EXECER "-jobid=$jobid" $cmdline
    endif
    breaksw

case p4:
    # We use this form instead of "local 0" in-case the user is trying to
    # select a second network whose names are not those returned by
    # "hostname".  For example, a system with a DEC Gigiswitch, Myricom
    # network, or IP over the IBM SP2 switch (HPS).
    # NOTE(review): with -mr_nolocal the master line still names $HOST
    # although the program is started on ${machinelist[1]} - confirm.
    if ($just_testing) then
        echo "Procgroup file:"
        echo "$HOST 0 $progname"
    else
        echo "$HOST 0 $progname" > "$pgfile"
    endif

    # With -mr_nolocal the first machine in the list runs the master
    # process (see the rsh below), so its entry is not written again.
    if ($nolocal) then
        set procNum = 2
    else
        set procNum = 1
    endif
    while ($procNum <= $#machinelist)
        if ($just_testing) then
            echo "${machinelist[$procNum]} 1 $progname"
        else
            echo "${machinelist[$procNum]} 1 $progname" >> "$pgfile"
        endif
        @ procNum++
    end

    if ($just_testing == 0) then
        # make sure the procgroup file was written
        if (!(-e "$pgfile" && -r "$pgfile")) then
            echo Failed to write "$pgfile". Exiting.
            exit 1
        endif
        if ($mpirun_verbose) then
            echo Created "$pgfile"
        endif
    endif

    # Command file for the debugger; only needed when really running.
    # The here-document text and its terminator must stay in column one.
    if ("$debugger" != "" && $just_testing == 0) then
        cat > $dbxfile <<.
run -p4pg $pgfile $cmdLineArgs
.
    endif

    if ($just_testing) then
        echo ""
        if ($nolocal) then
            echo rsh ${machinelist[1]} $progname -p4pg "$pgfile" $cmdLineArgs
        else
            echo $progname -p4pg "$pgfile" $cmdLineArgs
        endif
    else
        if ($nolocal) then
            if ("$debugger" != "") then
                rsh ${machinelist[1]} $debugger -sr $dbxfile $progname
            else
                rsh ${machinelist[1]} $progname -p4pg "$pgfile" $cmdLineArgs
            endif
        else
            if ("$debugger" != "") then
                $debugger -sr $dbxfile $progname
            else
                $progname -p4pg "$pgfile" $cmdLineArgs
            endif
        endif

        if ($leavePGFile) then
            echo "P4 procgroup file is ${pgfile}."
        else
            /bin/rm "$pgfile"
            # Do not leave the debugger command file behind either
            if ("$debugger" != "") /bin/rm -f "$dbxfile"
        endif
    endif
    breaksw

default:
    echo "This machine ($machine) is not yet supported."
    echo Exiting.
    exit 1
    breaksw
endsw

exit

# ---------------------------------------------------------------------------
# Targets of the gotos in the argument parser
# ---------------------------------------------------------------------------
PrintHelp:
echo "mpirun [mpirun_options...] PROGNAME [options...]"
echo ""
echo "  mpirun_options:"
echo "    -mr_arch ARCHITECTURE"
echo "            specify the architecture (must have matching"
echo "            machines.ARCHITECTURE file in ${MPIR_HOME}/util/machines)"
echo "            if using the execer"
echo "    -mr_h   This help"
echo "    -mr_machine MACHINE"
echo "            use startup procedure for MACHINE"
echo "            Currently supported:"
foreach machine ($Machines)
    echo "              $machine"
end
echo ""
echo "    -mr_machinefile FILE"
echo "            Take the list of possible machines to run on from the"
echo "            file FILE"
echo "    -mr_device DEVICE"
echo "            specify the device (for example ch_p4 or chameleon) used"
echo "            to choose the startup procedure"
echo "    -mr_np NP"
echo "            specify the number of processors to run on"
echo "    -mr_nolocal"
echo "            don't run on the local machine (only works for "
echo "            p4 and ch_p4 jobs)"
echo "    -mr_e   Use execer to start the program on workstation"
echo "            clusters"
echo "    -mr_pg  Use a procgroup file to start the p4 programs, not execer"
echo "            (default)"
echo "    -mr_leave_pg"
echo "            Don't delete the P4 procgroup file after running"
echo "    -mr_t   Testing - do not actually run, just print what would be"
echo "            executed"
echo "    -mr_v   Verbose - thrown in some comments"
echo "    -mr_dbx Start the first process under dbx where possible"
#echo "    -mr_xdbx Start the first process under xdbx where possible"
echo "    -mr_nopoll"
echo "            Do not use a polling-mode communication."
echo "            Available only on IBM SPx."
echo "    -mr_maxtime TIME"
echo "            Time limit given to spsubmit (anlspx only; default $max_time)"
echo ""
exit 0

# An option that needs a value was the last word on the command line
MissingValue:
echo "Missing value for option $arg"
exit 1

# The value given to -mr_np is not a positive integer
BadNp:
echo np: $np is an invalid number of processors. Exiting.
exit 1