/* RCS identification string for this source file, kept as static data. */
static char dqs_c_qmaster_rcsid[]="$Id: dqs_c_qmaster.c,v 1.1.1.1 1998/08/18 14:39:11 green Exp $";

/*----------------------------------------------------
 * dqs_c_qmaster.c Tom Green Mon Jan 31 10:42:38 1994
 *
 * Copyright 1993
 *
 * SUPER COMPUTER COMPUTATIONS RESEARCH INSTITUTE
 *            FLORIDA STATE UNIVERSITY
 *
 *
 * SCRI representatives make no claims about the
 * suitability of this software for any purpose.
 * It is provided "as is" without express or
 * implied warranty.
 *
 * $Log: dqs_c_qmaster.c,v $
 * Revision 1.1.1.1  1998/08/18 14:39:11  green
 * DQS 3.2.0.5 WIP Import
 *
 * Revision 1.1.1.1  1997/04/10 15:10:31  green
 * DQS 3.1.3.4.1 Distribution
 *
 * Revision 3.15  1996/11/20 23:03:21  nrl
 * Several fixes submitted by or as a result of investigations by
 * Ron Lee, Bodo Bechenback, Guntram Wolski and Frank Dwyyer.
 *
 * Revision 3.14  1996/06/27  01:55:44  nrl
 * changes to accommodate osf gcc
 *
 * Revision 3.13  1996/06/17  02:28:40  nrl
 * Updates from Guntram Wolski, Ron Lee, John Makosky and
 * Bodo Beckebach
 *
 * Revision 3.12  1996/03/22  04:20:03  nrl
 * Added error cataloguing number to all routines
 *
 * Revision 3.11  1996/02/07  13:07:48  nrl
 * Added "process leader" and TMP_FILES link capability
 *
 * Revision 3.9  1995/05/26  19:07:30  nrl
 * Cleaned up signal handling and the notify option with the
 * help of Ron Lee.
 *
 * Revision 3.8  1995/05/14  18:28:49  nrl
 * Plugged one hole in dqs_execd and qmaster handshaking...
 * added gethostbyname calls to overcome problems with some
 * systems
 *
 * Revision 3.7  1995/05/04  18:58:39  nrl
 * Made notify option consistent with documentation, including
 * hard-wallclock overrun case
 *
 * Revision 3.6  1995/05/03  20:07:34  nrl
 * fixed SIGUSR assignments in deliver signal
 *
 * Revision 3.5  1995/01/30  15:21:47  nrl
 * added "tid" verification between execd and qmaster to prevent
 * "ghost" jobs from persisting in visible queue. Changed ERROR messages
 * which were for information only to DEBUG messages.
 *
 * Revision 3.4  1994/06/16  22:55:16  green
 * deliver a SIGTERM rather than a SIGQUIT if parent dies
 *
 * Revision 3.3  1994/06/16  10:45:45  green
 * fixed that pesky little bug where "qdel" deleted ALL active jobs in
 * the queue
 *
 * Revision 3.2  1994/06/12  23:10:53  green
 * deliver a SIGQUIT rather than a SIGUSR1 on impending death sentence.
 *
 * Revision 3.1  1994/06/03  00:25:49  green
 * replaced "DQSX_STR12" with "master_queue_exec_str" in support of MPI
 * mods
 *
 * Revision 3.0  1994/03/07  04:13:24  green
 * 3.0 freeze
 *
 * Revision 1.2  1994/02/24  14:29:19  green
 * added code to nuke jobs at machine reboot
 *
 * still need to add "restart" code
 *
 * patched a potential memory reference bug in dqs_sig_handlers.c
 *
 * Revision 1.1.1.1  1994/02/01  17:57:38  green
 * DQS 3.0 ALPHA
 *
 *--------------------------------------------------*/


#include "h.h"
#include "def.h"
#include "dqs.h"
#include "struct.h"
#include "func.h"
#include "globals.h"
#include "dqs_errno.h"

/************************************************************************/
void dqs_c_qmaster(sfd,request_head)
     int           sfd;
     dqs_list_type **request_head;
     
     /*
       dqs_c_qmaster is used solely by the dqs_execd and provides services to
       requests forwarded by the qmaster.
       
       Some of the services provided include job execution, queue suspension,
       and job signaling.
     */
     
{
  
  int                fd;
  dqs_list_type      listel;
  dqs_list_type      *lp;
  dqs_list_type      *request_list;
  char  errmsg[MAX_STRING_SIZE];
  
  struct passwd      *tmp_pw;                          
  
  DENTER((DQS_EVENT,"dqs_c_qmaster"));
  
  request_list= *request_head;
  
  DPRINTF((DQS_EVENT,"------------------------------------------------------"));
  bzero((char *)&listel,sizeof(listel));
  switch(request_list->type) 
    {
      /*------------------------------------------------------*/
    case JOB_EXECUTION:
      DPRINTF((DQS_EVENT,"===>JOB_EXECUTION: >%d<",
	       request_list->int0));
      
      fd = open (request_list->job->exec_file,
                 O_CREAT|O_WRONLY, 0400);                
      
      if (fd<0)
	{ /* no use going any further */
	  CRITICAL((DQS_EVENT,"DQS_ERROR_0142 error: unable to open %s for writing - aborting()",
		    request_list->job->exec_file));
	  
	  dqs_close_sfd(sfd);
	  
	  *request_head=dqs_free_list(*request_head);
	  DEXITE;
	  return;
	}
      
      bzero((char *)&listel,sizeof(listel));
      listel.status=DQS_ACK;
      (void) dqs_send_list(NULL,NULL,sfd,&listel);
      dqs_close_sfd(sfd);
      
      dqs_writenbytes(fd,request_list->job->script_ptr,request_list->job->script_size);
      close(fd);
      
      tmp_pw = getpwnam (request_list->job->owner);   
      chown (request_list->job->exec_file,             
	     tmp_pw->pw_uid, tmp_pw->pw_gid);  
      
      request_list->job->master_queue_exec_str=dqs_string_insert(NULL,request_list->queue->qname);
      dqs_write_list_to_disk(JOB_DIR,request_list->job->dqs_job_name,request_list,ALL);
      bzero((char *)&listel,sizeof(listel));
      listel.str0=request_list->str0;
      request_list->str0=NULL;
      listel.job=request_list->job;
      request_list->job=NULL;
      listel.queue=request_list->queue;
      listel.queue->master=TRUE;
      request_list->queue=NULL;
      listel.str0=dqs_string_insert(NULL,listel.job->dqs_job_name);
      Job_head=dqs_insert(DQS_STR0,TAIL,Job_head,&listel);
      jobs_to_start=TRUE;
      *request_head=dqs_free_list(*request_head);
      DEXIT;
      return;
      
      /*------------------------------------------------------*/
    case SIGNAL_QUEUE_REQUEST:
      DPRINTF((DQS_EVENT,"===>DELIVER_SIGNAL: %d >%s< jid(s) %d",
	       request_list->int0,request_list->str0,request_list->int1));
      bzero((char *)&listel,sizeof(listel));
      listel.status=DQS_ACK;
      (void) dqs_send_list(NULL,NULL,sfd,&listel);
      dqs_close_sfd(sfd);
      
      lp=Job_head;
      while (lp)
	{
	  if (request_list->int1) /* signal a job */
	    {
	      if (lp->job->job_number==request_list->int1)
		{
		  dqs_execd_deliver_signal(request_list->int0,lp);
		}
	    }
	  else
	    {
	      if (!strcmp(lp->queue->qname,request_list->str0))
		{
		  dqs_execd_deliver_signal(request_list->int0,lp);
		}
	    }
	  lp=lp->next;
	}
      
      *request_head=dqs_free_list(*request_head);
      
      DEXIT;
      return;
      
      /*------------------------------------------------------*/
    default:
      INFO((DQS_EVENT,"DQS_ERROR_0143 CASE unknown list type %d",request_list->type));
      (void) dqs_send_nak(sfd,request_list);
      dqs_close_sfd(sfd);
      *request_head=dqs_free_list(*request_head);
      DEXITE;
      return;
      
    }
  
}

/************************************************************************/
void dqs_execd_deliver_signal(sig,lp)
     u_long32      sig;
     dqs_list_type *lp;
     
{
  
  u_long32              now;
  int notifiable_signal=FALSE;
  
  int    tmp_int, tmp_job_pid;  
  string tmp_str;           
  FILE   *tmp_f;         
  
  DENTER((DQS_EVENT,"dqs_execd_deliver_signal"));
  
  if ((sig==DQS_SIGKILL)||(sig==DQS_SIGSTOP))
    notifiable_signal=TRUE;
  
  tmp_job_pid = (int) lp->job->pid;                            
  sprintf (tmp_str, "%s/local/%s.p%s.%d",                      
	   EXECD_SPOOL_DIR, lp->job->job_name,        
	   lp->job->dqs_job_name, lp->job->pid);      
  if (tmp_f = fopen (tmp_str, "r")) {                          
    bzero((char *)tmp_str, sizeof (tmp_str));                         
    if (fgets (tmp_str, sizeof (tmp_str), tmp_f) &&            
	fclose (tmp_f) == 0)                                   
      tmp_job_pid = atoi (tmp_str);                            
  }                                                            
  ERROR((DQS_EVENT, "DQS_ERROR_0144 (signal delivery) %s(%d %d)",             
	 tmp_str, lp->job->pid, tmp_job_pid));                 
  
  
  if (notifiable_signal &&                                     
      (lp->job->notify && lp->queue->notify) ||                
      (sig == DQS_SIGKILL))                                    
    
    {
      if (lp->job->pending_signal)
	{ /* unhuh, somebody might try to play games */
	  DEXITE;
	  return;
	}
      now=dqs_get_gmt();
      
      
      
      
      if (sig==DQS_SIGKILL)
	{
	  INFO((DQS_EVENT,"DQS_ERROR_0145 NOTIFIABLE SIGNAL JID %s and setting pending signal",
		lp->job->dqs_job_name));
	  
	  
	  (void) dqs_kill (-tmp_job_pid, DQS_SIGUSR2);       
	  
	  if ((tmp_int = fork()) < 0) {                      
	    ERROR((DQS_EVENT,                                
		   "(fork) failed - deliver signal %d to %s on my own",
		   sig, lp->job->dqs_job_name));             
	    sleep (15);                                 
	    ERROR((DQS_EVENT,                                
		   "(fork) delivering signal %d to %s",      
		   sig, lp->job->dqs_job_name));             
	    (void) dqs_kill (-tmp_job_pid, sig);        
	  } else {                                           
	    if (tmp_int == 0) {                              
	      ERROR((DQS_EVENT,                              
		     "(fork %d) OK - queued signal %d for %s",
		     tmp_int, sig, lp->job->dqs_job_name));  
	      sleep (15);                               
	      ERROR((DQS_EVENT,                              
		     "(fork %d) delivering signal %d to %s",
		     tmp_int, sig, lp->job->dqs_job_name));  
	      (void) dqs_kill (-tmp_job_pid, sig);      
	      exit(0);                                       
	    }                                                
	  }                                                  
	}
      else
	{
	  INFO((DQS_EVENT,"DQS_ERROR_0146 NOTIFIABLE SIGNAL JID %s and setting pending signal",
		lp->job->dqs_job_name));
	  
	  
	  (void) dqs_kill (-tmp_job_pid, DQS_SIGUSR1);       
	  
	  if ((tmp_int = fork()) < 0) {                      
	    ERROR((DQS_EVENT,                                
		   "(fork) failed - deliver signal %d to %s on my own",
		   sig, lp->job->dqs_job_name));             
	    sleep (lp->queue->notify);                       
	    (void) dqs_kill (-tmp_job_pid, sig);        
	    
	  } else {                                           
	    if (tmp_int == 0) {                              
	      ERROR((DQS_EVENT,                              
		     "(fork %d) OK - queued signal %d for %s",
		     tmp_int, sig, lp->job->dqs_job_name));  
	      sleep (lp->queue->notify);                     
	      ERROR((DQS_EVENT,                              
		     "(fork %d) delivering signal %d to %s",
		     tmp_int, sig, lp->job->dqs_job_name));  
	      (void) dqs_kill(-tmp_job_pid,DQS_SIGSTOP);
	      exit(0);                                       
	    }                                                
	  }                                                  
	}
    }
  else
    {
      INFO((DQS_EVENT,"DQS_ERROR_0147 NON-NOTIFIABLE SIGNAL JID %s  #%d ",
	    lp->job->dqs_job_name,sig));
      (void) dqs_kill(-tmp_job_pid,sig); 
    }
  
  DEXIT;
  return;
  
}

/************************************************************************/
int dqs_kill(pid,dqs_signal)
     int      pid;
     u_long32 dqs_signal;
     
     /*
       dqs_kill converts the DQS signal number "dqs_signal" to the local
       signal number with dqs_unmap_signal() and hands it to kill() for
       "pid".  The pid is passed through untouched, so a negative value
       addresses a process group, as kill() defines.
       
       Returns whatever kill() returned: zero on success, non-zero on
       failure.
     */
     
{
  int native_sig;
  int rc;
  
  DENTER((DQS_EVENT,"dqs_kill"));
  
  native_sig=dqs_unmap_signal(dqs_signal);
  
  INFO((DQS_EVENT,"DQS_ERROR_0148 delivering signal %d to pid %d",native_sig,pid));
  
  rc=kill(pid,native_sig);
  
  /* leave through the normal exit trace on success, the error one otherwise */
  if (rc == 0)
    {
      DEXIT;
      return(rc);
    }
  
  DEXITE;
  return(rc);
  
}
