#!/bin/perl

### INSTALL DIRECTIONS:
#
#  1. Install this Perl script, sge-lam, inside the LAM bin directory.
#     Make sure it is executable.
#  2. Modify the LAMHOME variable below to fit your site setup.
#

# Root of the LAM installation. This script runs lamboot and lamhalt from
# $LAMHOME/bin and refers to itself as $LAMHOME/bin/sge-lam.
$LAMHOME="/temp/LAM";

#  3. Create an SGE PE that can be used to submit lam jobs. The following 
#     is an example assuming the scripts exist in /usr/local/lam/bin. 
#     You should replace the queue_list and slots with your site specific 
#     values or set it to "all" to use all the queues.  
#
#        % qconf -sp lammpi 
#        pe_name lammpi
#        queue_list all
#        slots 6
#        user_lists NONE
#        xuser_lists NONE
#        start_proc_args /usr/local/lam/bin/sge-lam start
#        stop_proc_args /usr/local/lam/bin/sge-lam stop
#        allocation_rule 2
#        control_slaves TRUE
#        job_is_first_task TRUE
#
#    NOTE: It is probably easiest to use the qmon GUI to create the PE.
#
#   4. Add a new LAM node process schema into the $LAMHOME/etc area
#      named sge-lam-conf.lamd. This should be a single line that
#      adds the "sge-lam qrsh-local" prefix to the lamd startup.
#
#       % cat /usr/local/lam/etc/sge-lam-conf.lamd
#       /usr/local/lam/bin/sge-lam qrsh-local /usr/local/lam/bin/lamd  
#         $inet_topo $debug $session_prefix $session_suffix
#
#### Submitting SGE JOBS
#
#   Once this is set up, users can submit jobs as normal and should not need to
#   run lamboot on their own. Users need only call mpirun for their MPI programs.
#   Here is an example job:
#
#        % cat lamjob.csh
#        #$ -cwd
#        set path=(/usr/local/lam/bin $path)
#        echo "Starting my LAM MPI job"
#        mpirun C conn-60
#        echo "LAM MPI job done"
#
#
#
#### Comments/Issues email: christopher.duncan@xxxxxxx
#
# END INSTALL


# Script-wide switches:
#   $verbose - when true, "-v" is passed to lamboot/lamhalt
#   $debug   - when true, "-d" is passed to lamboot/lamhalt and debug_print()
#              writes to the SGEDEBUG handle opened below
$verbose=1;
$debug=0;

# close STDIN to avoid stdio race conditions and tty issues
close(STDIN);

# Any true value of $debug enables the debug log; the rest of the script
# tests it with if($debug), so the same test is used here.
if($debug){
	$debugfile="/tmp/sgedebug.$ENV{JOB_ID}.$$";
	if(open(SGEDEBUG,">",$debugfile)){
		select(SGEDEBUG); $|=1;
		# send STDERR to the same file so child diagnostics end up there too
		open(STDERR,">>",$debugfile)
			or print SGEDEBUG "SGE-LAM DEBUG: cannot redirect STDERR to $debugfile: $!\n";
	}else{
		# keep debugging usable: fall back to a duplicate of STDERR
		print STDERR "sge-lam: cannot open debug log $debugfile: $!\n";
		open(SGEDEBUG,">&STDERR");
		select(SGEDEBUG); $|=1;
	}
}

# set output for stderr and stdout to be unbuffered
select(STDERR); $|=1;
select(STDOUT); $|=1;

$lamboot="$LAMHOME/bin/lamboot";
$lamhalt="$LAMHOME/bin/lamhalt";
$sgelamconf="sge-lam-conf.lamd";

# read in the args to figure out our task
$func=shift @ARGV;

$SGE_ROOT=defined($ENV{SGE_ROOT}) ? "$ENV{SGE_ROOT}" : "";

# The SGE architecture string selects the bin directory that holds qrsh.
# Without SGE_ROOT the helper would resolve to "/util/arch", so do not try
# to run it; say what is wrong instead.
if($SGE_ROOT eq ""){
	print STDERR "sge-lam: SGE_ROOT is not set; cannot determine the SGE architecture\n";
	$arch="";
}else{
	$arch=`${SGE_ROOT}/util/arch`;
	$arch="" unless defined($arch);
	chomp($arch);
}
$qrsh="${SGE_ROOT}/bin/${arch}/qrsh";

# add LAM and SGE to path
$ENV{'PATH'}.=":${SGE_ROOT}/bin/${arch}";
$ENV{'PATH'}.=":${LAMHOME}/bin";

debug_print("LAMHOME = $LAMHOME");
debug_print("SGE_ROOT = $SGE_ROOT");
debug_print("PATH = $ENV{PATH}");
debug_print("qrsh = $qrsh");
debug_print("ARGV = \"".join("\" \"",@ARGV)."\"");

# Map each supported sub-command to the lines it announces on STDOUT and
# the routine that carries it out. Anything else gets the usage message.
my %sge_lam_actions=(
	"start" => {
		banner => ["Starting SGE + LAM Integration\n",
		           "\t using tight integration scheme\n"],
		run    => \&start_proc_args,
	},
	"stop" => {
		banner => ["Stoping SGE + LAM Integration\n"],
		run    => \&stop_proc_args,
	},
	"qrsh-remote" => { banner => [], run => \&qrsh_remote },
	"qrsh-local"  => { banner => [], run => \&qrsh_local  },
);

# a missing sub-command is treated like an unknown one
my $sge_lam_action=$sge_lam_actions{ defined($func) ? $func : "" };

unless($sge_lam_action){
	print STDERR "\nusage: $0 {start|stop}\n\n";
	exit(-1);
}

debug_print("func=$func");
foreach my $banner_line (@{ $sge_lam_action->{banner} }){
	print $banner_line;
}
$sge_lam_action->{run}->();


# PE start procedure: convert the SGE PE hostfile into a LAM boot schema
# (bhost) file and replace this process with lamboot.
#
# Reads:   $ENV{PE_HOSTFILE}, $ENV{TMPDIR}, $LAMHOME, $lamboot, $sgelamconf,
#          $verbose, $debug
# Writes:  $ENV{TMPDIR}/lamhostfile, one "<host> cpu=<n>" line per host
# Returns: does not return on success (exec). Dies with a message on STDERR
#          and a non-zero exit status if a file cannot be opened or written,
#          or if lamboot cannot be executed.
sub start_proc_args
{

  # we currently place the LAM host file in the TMPDIR that SGE uses.
  # if we place it elsewhere we need to clean it up
  my $lamhostsfile="$ENV{TMPDIR}/lamhostfile";

  # flags and options for lamboot (-x, -s and -np may be useful in some envs)
  my @lambootargs=("-nn","-ssi","boot","rsh","-ssi","boot_rsh_agent","${LAMHOME}/bin/sge-lam qrsh-remote","-c","$sgelamconf");
  if($verbose){ push(@lambootargs,"-v"); }
  if($debug){ push(@lambootargs,"-d"); }
  push(@lambootargs,"$lamhostsfile");
  debug_print("LAMBOOT ARGS: @lambootargs");

  ### Need to convert the SGE hostfile to a LAM hostfile format
  # open and read the PE hostfile
  my $pehostfile=defined($ENV{PE_HOSTFILE}) ? $ENV{PE_HOSTFILE} : "";
  open(SGEHOSTFILE,"<",$pehostfile)
	or die "sge-lam: cannot read PE hostfile '$pehostfile': $!\n";
  # convert to LAM bhost file format
  my @lamhostslist=();
  while(<SGEHOSTFILE>){
	# split(' ') ignores leading whitespace, so an indented line still
	# yields the host name as the first field
	my ($host,$ncpu)=split(' ');
	# skip blank or truncated lines instead of emitting a bogus " cpu=" entry
	next unless defined($host) && defined($ncpu);
	push( @lamhostslist,"$host cpu=$ncpu");
  }
  close(SGEHOSTFILE);

  # create the new lam bhost file
  open(LAMHOSTFILE,">",$lamhostsfile)
	or die "sge-lam: cannot create LAM hostfile '$lamhostsfile': $!\n";
  print LAMHOSTFILE join("\n",@lamhostslist);
  print LAMHOSTFILE "\n";
  # buffered write errors only surface at close
  close(LAMHOSTFILE)
	or die "sge-lam: cannot write LAM hostfile '$lamhostsfile': $!\n";


  if($debug){ close(SGEDEBUG); }
  exec($lamboot,@lambootargs);
  # exec only returns on failure; do not let the script end with status 0
  die "sge-lam: cannot exec $lamboot: $!\n";
}


# PE stop procedure: replace this process with lamhalt.
#
# Reads:   $lamhalt, $verbose ("-v" is passed when true), $debug ("-d" is
#          passed when true)
# Returns: does not return on success (exec). Dies with a message on STDERR
#          and a non-zero exit status if lamhalt cannot be executed.
sub stop_proc_args{

  my @lamhaltargs=();
  if($verbose){ push(@lamhaltargs,"-v"); }
  if($debug){ push(@lamhaltargs,"-d"); }

  if($debug){ close(SGEDEBUG); }
  # The indirect-object form never goes through the shell, even when the
  # argument list is empty, so a path containing spaces is not word-split.
  exec { $lamhalt } ($lamhalt,@lamhaltargs);
  # exec only returns on failure; do not let the script end with status 0
  die "sge-lam: cannot exec $lamhalt: $!\n";
}


# Run the arguments in @ARGV through "qrsh -inherit -nostdin -V", after
# removing the first "-n" found among them. This sub-command is what
# start_proc_args() configures as lamboot's boot_rsh_agent.
#
# Reads:   @ARGV, $qrsh, $debug
# Returns: does not return on success (exec). Dies with a message on STDERR
#          and a non-zero exit status if qrsh cannot be executed.
sub qrsh_remote
{

  my @myargs=("-inherit","-nostdin","-V",@ARGV);

  # Remove the first argument that is exactly "-n".
  # NOTE(review): the scan stops one short of the end, so the last argument
  # is never examined; presumably that is the remote command -- confirm
  # before widening the range.
  for my $i (0 .. $#myargs-1){
        if($myargs[$i] =~ /^\-n$/){
                splice(@myargs,$i,1);
                last;
        }
  }

  debug_print("QRSH REMOTE CONFIG: @myargs");
  if($debug){ close(SGEDEBUG); }
  exec($qrsh,@myargs);
  # exec only returns on failure; do not let the script end with status 0
  die "sge-lam: cannot exec $qrsh: $!\n";
}


# Run the arguments in @ARGV on the local host through
# "qrsh -inherit -nostdin -V <hostname>", so the command is added to the
# current SGE job.
#
# Reads:   @ARGV, $qrsh, $debug, and the output of /bin/hostname
# Returns: does not return on success (exec). Dies with a message on STDERR
#          and a non-zero exit status if the host name cannot be determined
#          or qrsh cannot be executed.
sub qrsh_local
{
  # we are running a local qrsh to add the lamd into the current job
  # on the local node using the LAM boot schema

  # get the hostname to pass to qrsh
  my $hostname=`/bin/hostname`;
  $hostname="" unless defined($hostname);
  chomp($hostname);
  # an empty host name would shift the first word of the command into the
  # host position of the qrsh command line
  die "sge-lam: /bin/hostname returned no host name\n" if $hostname eq "";

  # tell SGE to add this command into the JOB_ID job by using qrsh -inherit
  # the hostname is not passed in this case in ARGV by lamboot
  my @myargs=("-inherit","-nostdin","-V","$hostname",@ARGV);

  debug_print("QRSH LOCAL CONFIG: @myargs");
  if($debug){ close(SGEDEBUG); }
  exec($qrsh,@myargs);
  # exec only returns on failure; do not let the script end with status 0
  die "sge-lam: cannot exec $qrsh: $!\n";
}


# Write one diagnostic line to the SGEDEBUG handle when $debug is true.
#
# Arguments: a list of strings. They are interpolated as "@_" (joined with
#            the list separator, a single space by default), prefixed with
#            "SGE-LAM DEBUG: " and terminated with a newline.
# Returns:   nothing.
#
# No prototype is declared: every caller passes arguments, which an empty
# "()" prototype would reject for any call compiled after this definition.
sub debug_print
{
  return unless $debug;
  print SGEDEBUG "SGE-LAM DEBUG: @_\n";
  return;
}
