/*
 * Copyright 1998-2001, University of Notre Dame.
 * Authors: Jeffrey M. Squyres, Arun Rodrigues, and Brian Barrett with
 *          Kinis L. Meyer, M. D. McNally, and Andrew Lumsdaine
 * 
 * This file is part of the Notre Dame LAM implementation of MPI.
 * 
 * You should have received a copy of the License Agreement for the Notre
 * Dame LAM implementation of MPI along with the software; see the file
 * LICENSE.  If not, contact Office of Research, University of Notre
 * Dame, Notre Dame, IN 46556.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted subject to the conditions specified in the
 * LICENSE file.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * 
 * Additional copyrights may follow.
 * 
 *	Ohio Trollius
 *	Copyright 1996 The Ohio State University
 *	GDB
 *
 *	$Id: lambootagent.c,v 6.16 2001/04/01 22:45:48 jsquyres Exp $
 * 
 *	Function:	- LAM boot agent
 *	Accepts:	- link array
 *			- link array size
 *			- # booted nodes (out)
 *			- # running nodes (out)
 */

#include <lam_config.h>
#include <sfh.h>

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include <laminternal.h>
#include <args.h>
#include <debug.h>
#include <lamnet.h>
#include <net.h>
#include <portable.h>
#include <terror.h>
#include <typical.h>
#include <sfh.h>
#include <boot.h>
#include <etc_misc.h>

/*
 * local variables
 *
 * The four flags are refreshed from the parsed command line options
 * (via opt_taken()) on every call to lambootagent().
 */
static int fl_debug;		/* debug mode (option 'd') */
static int fl_verbose;		/* verbose mode (option 'v') */
static int fl_fast;		/* check for .profile or not (option 'b');
				   handed to inetexec() */
static int fl_close;		/* close local lamd stdout/stderr?
				   (option 's') */
static char buf[256];		/* scratch space for the "-I" argument
				   string built for hboot */

/*
 *	lambootagent
 *
 *	Function:	- runs hboot on every valid NT_BOOT entry of the
 *			  link array, waits for each new node to connect
 *			  back and report its port numbers, then sends the
 *			  complete link table to every booted node
 *	Accepts:	- link array
 *			- link array size
 *			- # booted nodes (out)
 *			- # running nodes (out)
 *	Returns:	- 0 or LAMERROR
 *
 *	On every LAMERROR return taken after the reply socket has been
 *	opened, all sockets and the command vector still held by this
 *	function are released and errno is left as it was when the
 *	failure was detected.
 */
int
lambootagent(struct lamnode *lamnet, int nlamnet, int *nboot, int *nrun)
{
	int		agent_port;	/* port number for replies */
	int		agent_sd = -1;	/* socket for replies */
	int		boot_sd = -1;	/* connection to new node */
	int		cmdc;		/* command vector count */
	int		dlport;
	int		has_batch;	/* non-empty batch job ID? */
	int		i, j;
	int		len;		/* formatted length of buf */
	int		r;
	int		saved_errno;	/* errno kept across cleanup */
	int4		local;		/* local node ID */
	int4		origin;		/* origin node ID */
	char		**cmdv = 0;	/* command vector */
	char		*batchid;	/* batch job ID */
	unsigned char	*p;

	*nboot = 0;
	*nrun = 0;

	if (nlamnet <= 0) {
		return(0);
	}
/*
 * Set the flags.
 */
	fl_debug = opt_taken('d');
	fl_verbose = opt_taken('v');
	fl_fast = opt_taken('b');
	fl_close = opt_taken('s');
/*
 * Allocate a server socket and port.
 */
	agent_port = 0;
	agent_sd = sfh_sock_open_srv_inet_stm(&agent_port);
	if (agent_sd < 0) {
	  show_help("boot", "socket-fail", NULL);
	  return(LAMERROR);
	}
/*
 * Make the socket close on exec.
 */
	if (fcntl(agent_sd, F_SETFD, FD_CLOEXEC) == -1) {
	  show_help(NULL, "system-call-fail", "fcntl (set close-on-exec)", 
		    NULL);
	  goto fail;
	}
/*
 * Find the local node.
 */
	local = NOTNODEID;

	for (i = 0; (i < nlamnet) && (local == NOTNODEID); ++i) {

		if ((lamnet[i].lnd_nodeid != NOTNODEID) &&
				(lamnet[i].lnd_type & NT_ME)) {
			local = i;
		}
	}

	if (local == NOTNODEID) {
	  errno = EINVAL;
	  /* The help message displays the name of the schema file,
             and we don't have that here, so we must print it in the
             invoking function */
	  goto fail;
	}
/*
 * Find the origin node.
 */
	origin = NOTNODEID;

	for (i = 0; (i < nlamnet) && (origin == NOTNODEID); ++i) {

		if ((lamnet[i].lnd_nodeid != NOTNODEID) &&
				(lamnet[i].lnd_type & NT_ORIGIN)) {
			origin = i;
		}
	}
/*
 * Boot all valid links with type NT_BOOT.
 */
	for (i = 0; i < nlamnet; ++i) {
/*
 * Skip nodes that are invalid or already booted.
 */
		if ((lamnet[i].lnd_nodeid == NOTNODEID) ||
				!(lamnet[i].lnd_type & NT_BOOT)) continue;
/*
 * Invoke hboot on the new host.
 */
		cmdc = 0;
		cmdv = 0;
		argvadd(&cmdc, &cmdv, DEFTHBOOT);
		argvadd(&cmdc, &cmdv, "-t");
		argvadd(&cmdc, &cmdv, "-c");
		argvadd(&cmdc, &cmdv, "lam-conf.lam");

		if (fl_debug) {
			argvadd(&cmdc, &cmdv, "-d");
		}
		if (fl_verbose) {
			argvadd(&cmdc, &cmdv, "-v");
		}
/*
 * If remote node, close stdio of processes, unless forced by the
 * command line with -s (see fl_close, above) -- this is useful, for
 * example, in "rsh somenode lamboot hostfile", because we need the
 * hboot/lamd on somenode to close their stdio so that rsh can finish.
 */
		if (i != local || fl_close) {
			argvadd(&cmdc, &cmdv, "-s");
		}
/*
 * If this is under a batch system, pass the -b to both hboot and to
 * the $inet_topo variable.  A NULL batch ID is treated like an empty
 * one instead of being dereferenced.
 */
		batchid = get_batchid();
		if (batchid == NULL) {
			batchid = "";
		}
		has_batch = (batchid[0] != '\0');
		if (has_batch) {
		  argvadd(&cmdc, &cmdv, "-b");
		  argvadd(&cmdc, &cmdv, batchid); 
		}
/*
 * Override the $inet_topo variable.  The string is built in a fixed
 * size buffer, so the write is bounded and an over-long result (e.g.
 * a very long batch ID) is rejected rather than silently truncated
 * or written past the end of the buffer.
 */
		p = (unsigned char *) &lamnet[local].lnd_addr.sin_addr;
		argvadd(&cmdc, &cmdv, "-I");
		len = snprintf(buf, sizeof(buf),
			"%c%s-H %u.%u.%u.%u -P %d -n %d -o %d %s %s%c",
			i == local ? ' ' : '"',
			opt_taken('x') ? "-x " : "",
			(unsigned) p[0], (unsigned) p[1],
			(unsigned) p[2], (unsigned) p[3],
			agent_port,
			i,
			(int) origin,
			(has_batch ? "-b" : " "),
			(has_batch ? batchid : " "),
			i == local ? ' ' : '"');
		if ((len < 0) || ((size_t) len >= sizeof(buf))) {
			errno = E2BIG;
			goto fail;
		}
		argvadd(&cmdc, &cmdv, buf);

		VERBOSE("Executing %s on n%d (%s - %d CPU%s)...\n", 
			DEFTHBOOT, i, lamnet[i].lnd_hname,
			lamnet[i].lnd_ncpus, 
			(lamnet[i].lnd_ncpus > 1) ? "s" : "");

		(*nboot)++;

		if (i == local) {
		        if (fl_debug) {
			  printf("lamboot: attempting to execute \"");
			  for (j = 0; j < cmdc; j++) {
			    if (j > 0)
			      printf(" ");
			    if (strchr(cmdv[j], ' ') != NULL)
			      printf("\"%s\"", cmdv[j]);
			    else
			      printf("%s", cmdv[j]);
			  }
			  printf("\"\n");
			}
			r = _lam_few(cmdv);

			if (r) {
				(*nboot)--;
				errno = r;
				show_help("boot", "fork-fail", cmdv[0], NULL);
				goto fail;
			}
		} else {
			r = inetexec(lamnet[i].lnd_hname, lamnet[i].lnd_uname,
				     cmdv, (fl_debug ? "lamboot" : NULL),
				     fl_fast);

			if (r) {
				(*nboot)--;
				/* inetexec will display errors if it
                                   fails */
				goto fail;
			}
		}
/*
 * The command vector is no longer needed once the launch has been
 * issued; release it now so it is not leaked once per booted node.
 */
		argvfree(cmdv);
		cmdv = 0;
/*
 * Accept a connection from the new host.
 */
		boot_sd = sfh_sock_accept_tmout(agent_sd, LAM_TO_BOOT);
		if (boot_sd < 0) goto fail;
/*
 * Read the new host port numbers.
 */
		if (readcltcoord(boot_sd, &lamnet[i].lnd_bootport,
				&dlport)) goto fail;

		lamnet[i].lnd_addr.sin_port = htons((unsigned short) dlport);
/*
 * Close the host connection.  The descriptor is marked as released
 * before the result is checked so that the cleanup path never closes
 * it a second time.
 */
		r = close(boot_sd);
		boot_sd = -1;
		if (r) goto fail;

		(*nrun)++;
	}

	r = close(agent_sd);
	agent_sd = -1;
	if (r) goto fail;

	if (fl_verbose) {
		nodespin_init("topology");
	}
/*
 * Send link information to all nodes that have been booted.
 */
	for (i = 0; i < nlamnet; ++i) {
/*
 * Skip nodes that are invalid or already booted.
 */
		if ((lamnet[i].lnd_nodeid == NOTNODEID) ||
				!(lamnet[i].lnd_type & NT_BOOT)) continue;

		if (fl_verbose) {
			nodespin_next((int4) i);
		}
/*
 * Connect to the new host.
 */
		boot_sd = sfh_sock_open_clt_inet_stm(
				(unsigned char *) &lamnet[i].lnd_addr.sin_addr,
				lamnet[i].lnd_bootport);
		if (boot_sd < 0) goto fail;
/*
 * Send it the number of links.
 */
		if (writesockint4(boot_sd, (int4) nlamnet)) goto fail;
/*
 * Loop sending info on all the links.
 */
		for (j = 0; j < nlamnet; ++j) {

			if (writecltnbr(boot_sd,
					(lamnet[j].lnd_nodeid == NOTNODEID) ?
					NOTLINKID : j,
					(unsigned char*) 
					&lamnet[j].lnd_addr.sin_addr,
					(int)
					ntohs(lamnet[j].lnd_addr.sin_port),
					lamnet[j].lnd_ncpus))
					goto fail;
		}

		r = close(boot_sd);
		boot_sd = -1;
		if (r) goto fail;
	}

	if (fl_verbose) {
		nodespin_end();
	}

	return(0);

/*
 * Common error exit: release whatever is still held.  errno is saved
 * and restored because the cleanup calls may overwrite it and the
 * invoking function may rely on it to report the failure.
 */
fail:
	saved_errno = errno;

	if (cmdv != 0) {
		argvfree(cmdv);
	}
	if (boot_sd >= 0) {
		(void) close(boot_sd);
	}
	if (agent_sd >= 0) {
		(void) close(agent_sd);
	}

	errno = saved_errno;
	return(LAMERROR);
}
