/*
  Copyright Mission Critical Linux, 2000

  Kimberlite is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  Kimberlite is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Kimberlite; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
  MA 02139, USA.
*/
/*
 *  $Id: diskutils.c,v 1.11 2000/09/13 19:39:03 burke Exp $
 *
 *  Copyright (C) 2000 Mission Critical Linux, LLC
 *
 *  author: Tim Burke <burke@missioncriticallinux.com>
 *  description: Generic utility routines to access shared partition.
 */
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <signal.h>
#include <stdlib.h>
#include <msgsvc.h>

#include <logger.h>
#include <sys/syslog.h>
#include <diskapis.h>
#include <clucfg.h>
#include <parseconf.h>
#include "diskstate.h"
#include "disk_proto.h"

#define SHUTDOWN_CMD "/sbin/shutdown"
#define QUORUMD_DEFAULT_LOGGER_LEVEL LOG_WARNING

static const char *version __attribute__ ((unused)) = "$Id: diskutils.c,v 1.11 2000/09/13 19:39:03 burke Exp $";
/*
 * Strings corresponding to #defines in clusterdefs.h.  Here for
 * convenience for printing out readable state information.
 * Each entry is indexed by its numeric node state value.
 */
char *nodeStateStrings[] = {
  [0] = "Uninitialized",
  [1] = "Invalid",
  [2] = "IOError",
  [3] = "Up",
  [4] = "Down",
};

/* Printable names for the service states, indexed by numeric state value. */
char *serviceStateStrings[] = {
  [0] = "uninitialized",
  [1] = "starting",
  [2] = "running",
  [3] = "stopping",
  [4] = "stopped",
  [5] = "disabling",
  [6] = "disabled",
  [7] = "error",
};

/*
 * When set to zero, shut_myself_down() only logs a warning and returns;
 * no actual shutdown is performed.
 */
static int reallyDoShutdown = 1;

/*
 * File descriptors for the two quorum partitions, filled in by
 * initSharedFD().  Index 0 is the partition getPartitionName() reports
 * as primary, index 1 the shadow.  A value of -1 means "not open".
 */
int sharedPartitionFD[2] = {-1,-1};
/* Set to 1 by initSharedFD() once both descriptors have been opened. */
int sharedPartitionFDinited = 0;

/*
 * Forward routine declarations.
 */

/*
 * openPartition
 * Called to open the shared state partition with appropriate mode
 * (read/write, synchronous).  After opening, a seek to END_OF_DISK is
 * attempted as a check that the partition is large enough.
 *
 * Parameter: name - path of the device special file to open.
 * Returns - (the file descriptor), a value >= 0 on success; a negative
 * value on failure.  No descriptor is left open on failure.
 */
int openPartition(char *name) {
	int fd;
	off_t pos;

	/*
	 * Open for synchronous writes to insure all writes go directly
	 * to disk.
	 */
	fd = open(name, O_RDWR | O_SYNC);
	if (fd < 0) {
	    return(fd);
	}
	// Check to verify that the partition is large enough.
	pos = lseek(fd, END_OF_DISK, SEEK_SET);
	if (pos != (off_t)(END_OF_DISK)) {
	  clulog(LOG_CRIT, "openPartition: cant seek to offset %d, retval=%d.\n",
			(int)(END_OF_DISK), (int)pos);
	  clulog(LOG_CRIT, "openPartition: partition may be too small.\n");
#ifdef DEBUG
	  perror("Seek failure");
#endif
	  /*
	   * Release the descriptor; the caller only sees -1 and has no
	   * way to close it.  Done after perror() so errno from the
	   * failed seek is still intact when it is reported.
	   */
	  close(fd);
	  return(-1);
	}
	return(fd);
}

/*
 * closePartition
 * Closes the shared state disk partition.
 * Returns - value from close syscall.
 */
int closePartition(int *fd) {
	int retval;

	if (*fd < 0) {
	    clulog(LOG_ERR, "ERROR: closePartition called when partition is not open.\n");
	    return(-1);
	}
	retval = close(*fd);
	*fd = -1;
	if (retval < 0) {
	    clulog(LOG_ERR, "ERROR: closePartition failed.\n");
	}
	return(retval);
}

/*
 * validatePartitionName
 * Sanity checks the device special file that is supposed to represent
 * the shared partition:
 *   - it must be possible to stat() it,
 *   - it must be a character special file,
 *   - openPartition() must succeed on it (read/write access and size
 *     check); the descriptor is closed again right away.
 *
 * Returns: 0 - success, 1 - failure
 */
int validatePartitionName(char *name) {
	struct stat sb;
	int probe_fd;

	if (stat(name, &sb) < 0) {
		clulog(LOG_ERR, "Unable to stat %s.\n", name);
#ifdef DEBUG
		perror("stat");
#endif
		return(1);
	}

	/* Only character special files are accepted. */
	if (!S_ISCHR(sb.st_mode)) {
		clulog(LOG_ERR, "%s is not a character special file.\n", name);
		return(1);
	}

	/* Verify read/write permission by actually opening the device. */
	probe_fd = openPartition(name);
	if (probe_fd < 0) {
		clulog(LOG_ERR, "Unable to open %s read/write.\n", name);
		return(1);
	}
	closePartition(&probe_fd);

	return(0);
}

/*
 * Writes the shared state disk header out to the shared partition via
 * diskRawWriteShadow(), passing the byte offset of the check_sum field
 * within the header.
 *
 * The header's magic number is verified first; a header with a bad
 * magic number is never written.
 *
 * Returns: -1 on a bad magic number, otherwise the return value of
 * diskRawWriteShadow() (0 on success).
 */
int writeHeader(SharedStateHeader *hdr) {
	ulong checksum_offset;

	assert_clu_lock_held("writeHeader");

	// Paranoia check: only a properly tagged header may go to disk.
	if (hdr->magic_number == SHARED_STATE_MAGIC_NUMBER) {
		checksum_offset = (ulong)&((SharedStateHeader *)0)->check_sum;
		return diskRawWriteShadow(OFFSET_HEADER, (char *)hdr,
					  sizeof(SharedStateHeader),
					  checksum_offset);
	}

	clulog(LOG_ERR, "writeHeader: invalid magic# 0x%lx\n",
		hdr->magic_number);
	return(-1);
}

/*
 * Reads in the shared state disk header from the shared partition via
 * diskRawReadShadow(), stuffing the results into the passed data struct,
 * then validates the magic number.
 *
 * Returns: 0 on success; the nonzero status from diskRawReadShadow() if
 * the read fails; -1 if the magic number is invalid.
 */
int readHeader(SharedStateHeader *hdr) {
	ulong checksum_offset = (ulong)&((SharedStateHeader *)0)->check_sum;
	int status;

	/* NOTE(review): meaning of the trailing "1" argument is defined by
	 * diskRawReadShadow(); it is not visible from this file. */
	status = diskRawReadShadow(OFFSET_HEADER, (char *)hdr,
				   sizeof(SharedStateHeader),
				   checksum_offset, 1);
	if (status != 0) {
		clulog(LOG_ERR, "readHeader: bad ret %d from diskRawReadShadow\n", status);
		return(status);
	}

	if (hdr->magic_number == SHARED_STATE_MAGIC_NUMBER) {
		return(0);
	}

	clulog(LOG_ERR, "readHeader: invalid magic# 0x%lx\n",
		hdr->magic_number);
	return(-1);
}

/*
 * Same as readHeader (above), but this one does it without using the
 * shadow copy.  The only time this gets called is in the configuration
 * check where it looks to see if the quorum partition has been initialized.
 * This needs to be operational in cases where the cluster has not yet
 * been initialized; so it needs to gracefully fail; rather than being
 * killed off because of locking failures.
 *
 * The header is read directly from sharedPartitionFD[0]; the shared
 * descriptors are opened via initSharedFD() if that has not happened yet.
 *
 * Stuffing the results into the passed data struct.
 * Returns: -1 on error (cannot open, cannot seek, short/failed read, or
 * invalid magic number), 0 on success.
 */
int readHeaderNoshaddowUnlocked(SharedStateHeader *hdr) {
	int ret;

        if(!sharedPartitionFDinited) {
		if (initSharedFD() != 0) {
			clulog(LOG_ERR, "readHeaderNoshaddowUnlocked: can't open quorum partitions.\n");
			return(-1);
		}
	}
	ret = lseek(sharedPartitionFD[0], OFFSET_HEADER, SEEK_SET);
	if (ret != OFFSET_HEADER) {
	   	 clulog(LOG_ERR, 
		"readHeaderNoshaddowUnlocked: can't seek to offset %d.\n",
			(int)OFFSET_HEADER);
		return(-1);
	}
	ret = diskRawRead(sharedPartitionFD[0], (char *)hdr, sizeof(SharedStateHeader));
	if(ret != sizeof(SharedStateHeader)) {
		clulog(LOG_ERR, "readHeaderNoshaddowUnlocked: bad ret %d from diskRawRead\n", ret);
		/*
		 * Always report failure as -1.  Returning the raw byte
		 * count would turn a zero-length read into "success" (0)
		 * and a partial read into an undocumented positive value.
		 */
		return(-1);
	}	
	if (hdr->magic_number != SHARED_STATE_MAGIC_NUMBER) {
		/*
		 * Logged at debug level only: an uninitialized partition
		 * is an expected outcome for this probe.
		 */
		clulog(LOG_DEBUG, "readHeaderNoshaddowUnlocked: invalid magic# 0x%lx\n",
			hdr->magic_number);
		return(-1);
	}

	return(0);
}

/*
 * initializePartition
 * Called to initialize a partition representing cluster shared state.
 * Writes a fresh header (magic number, version, empty node names,
 * description, timestamp, local node id) and then initializes the
 * service/node status area, the lock blocks and the configuration
 * database.  All steps are attempted even if an earlier one fails.
 *
 * Parameter: prompt - when nonzero, the user is asked on stdout/stdin to
 *            confirm before anything is written.
 * Returns - 0 on success, 1 if the user declined or the prompt could not
 *           be read, -1 if any initialization step failed.
 *
 * in_initializePartition is set for the duration of the call and cleared
 * on every return path.
 */
int in_initializePartition = 0; /* turns off lock audits */

int initializePartition(int prompt) {
	SharedStateHeader hdr;
	int retval;
	char inputLine[132]; 
	int nodeNumber = cluGetLocalNodeId();
	int errors = 0;

	in_initializePartition = 1;

	if (prompt) {
	    char c;
	    printf("\nWARNING, you have requested initialization of the quorum partitions.\n");
	    printf("this should only be performed while NONE of the cluster members\n");
	    printf("are actively running the cluster daemons.\n");
	    printf("\nAre you sure you wish to proceed? [y/n] ");
	    fflush(stdout);	/* prompt has no trailing newline */
	    /*
	     * Field width is bounded to sizeof(inputLine) - 1 so that an
	     * overlong answer cannot overrun the stack buffer.
	     */
	    if (fscanf(stdin, "%131s", inputLine) < 1) {
		clulog(LOG_ERR, "Error scanning input prompt.\n");
		in_initializePartition = 0;
		return(1);
	    }
	    /*
	     * %s skips leading whitespace, so the first character of the
	     * token is always the first character the user typed.
	     */
	    c = inputLine[0];
	    if ((c != 'y') && (c != 'Y')) {
		clulog(LOG_ERR, "Partition initialization aborting.\n");
		in_initializePartition = 0;
		return(1);
	    }
	}

	/*
	 * Zero the whole header first so that no uninitialized stack
	 * contents (unset fields, padding) end up on the shared disk.
	 * This also clears the nodenames table.
	 */
	memset(&hdr, 0, sizeof(hdr));
	hdr.magic_number = SHARED_STATE_MAGIC_NUMBER;
	hdr.version = SHARED_STATE_LATEST_VERSION;
	strcpy(hdr.description, "Test description field");
	time(&hdr.timestamp);
	hdr.updateNodenum = nodeNumber;

	clulog(LOG_DEBUG, "Writing out the following disk header:\n");
	printSharedStateHeader(&hdr);
	retval = writeHeader(&hdr);
	if (retval == 0)
		clulog(LOG_DEBUG, "Partition header successfully initialized.\n");
	else {
		clulog(LOG_ERR, "Failed to initialize partition header.\n");
		errors++;
	}
	/*
	 * But wait, there's more!  The header is only the first of the
	 * data structures residing on the disk; now initialize the
	 * service descriptions and node status.
	 */
	retval = initializePartitionServiceState();
	if (retval == 0)
		clulog(LOG_DEBUG, "Service descriptions & node status successfully initialized.\n");
	else {
		clulog(LOG_ERR, "Failed to initialize service descriptions and node status.\n");
		errors++;
	}
        /*
	 * Initialize a node's lock state.
	 */

	retval = initializePartitionLockBlocks();
	if (retval == 0)
	    clulog(LOG_DEBUG, "Partition lock blocks successfully initialized.\n");
	else {
	    clulog(LOG_ERR, "Failed to initialize partition lock blocks.\n");
	    errors++;
	}
        /*
	 * The not-so-final on-disk structure we need to initialize is the one
	 * representing the configuration "database".
	 */
	retval = initializeConfigDatabase();
	if (retval == 0)
	    clulog(LOG_DEBUG, "Configuration database successfully initialized.\n");
	else {
	    clulog(LOG_ERR, "Failed to initialize configuration database.\n");
	    errors++;
	}
	/*
	 * NOTE(review): the netblock initialization below is skipped by
	 * this goto and is therefore dead code.  It is preserved as found;
	 * confirm whether it was disabled on purpose before removing the
	 * goto or the code.
	 */
	goto done;


        /*
	 * The final on-disk structure we need to initialize is the one
	 * representing the session id.
	 */
	retval = initializePartitionNetBlock();
	if (retval == 0)
	    clulog(LOG_DEBUG, "Netblock successfully initialized.\n");
	else {
	    clulog(LOG_ERR, "Failed to initialize netblock.\n");
	    errors++;
	}

  done:
	in_initializePartition = 0;
	if (errors > 0)
	    return(-1);
	return(0);	
}

/*
 * Debug routine.
 * Dumps every field of the given shared state header to the log at
 * LOG_DEBUG level, flagging an invalid magic number and listing only the
 * node name slots that are non-empty.
 */
void printSharedStateHeader(SharedStateHeader *hdr) {
	int slot;
	int named = 0;

	clulog(LOG_DEBUG, "----- Shared State Header ------\n");
	clulog(LOG_DEBUG, "Magic# = 0x%lx",hdr->magic_number);
	if (hdr->magic_number != SHARED_STATE_MAGIC_NUMBER)
		clulog(LOG_DEBUG, " INVALID");
	clulog(LOG_DEBUG, "\nVersion = %d\n", hdr->version);

	/* Empty slots (leading NUL) are skipped. */
	for (slot = 0; slot < MAX_NODES; slot++) {
		if (hdr->nodenames[slot][0] == '\0')
			continue;
		clulog(LOG_DEBUG, "Nodename = %s", 
			hdr->nodenames[slot]);
		named++;
	}
	if (named == 0)
		clulog(LOG_DEBUG, "No nodenames have been specified yet.\n");

	clulog(LOG_DEBUG, "Description = %s\n",hdr->description);
	/* ctime() supplies the trailing newline itself. */
	clulog(LOG_DEBUG, "Updated on %s", ctime(&hdr->timestamp));
	clulog(LOG_DEBUG, "Updated by node %d\n", hdr->updateNodenum);
	clulog(LOG_DEBUG, "--------------------------------\n");
}

/*
 * Debug routine to validate that the offsets are sufficient and do not
 * overlap.
 *
 * Compares the sizes of the on-disk structures against the space
 * reserved for them.  A structure that fills or exceeds its reserved
 * space is logged as an error; a structure of 512 bytes or more is only
 * logged as a warning.  When an error was found, all sizes and computed
 * offsets are dumped at LOG_DEBUG level.
 *
 * Returns: 0 if no overlap was detected, 1 otherwise.
 */
int offsetParanoiaCheck(void) {
	size_t size_header = sizeof(SharedStateHeader);
	size_t size_status = sizeof(NodeStatusBlock);
	size_t size_service = sizeof(DiskServiceBlock);
	size_t size_lock = sizeof(DiskLockBlock);
	int i;
	int err = 0;

	if (size_header >= OFFSET_FIRST_STATUS_BLOCK) {
		clulog(LOG_CRIT, "ERROR: Header overlaps status area!\n");
		err = 1;
	}
	if (size_status >= SPACE_PER_STATUS_BLOCK) {
		clulog(LOG_CRIT, "ERROR: Status structure overlap!\n");
		err = 1;
	}
	if (size_service >= SPACE_PER_SERVICE_BLOCK) {
		clulog(LOG_CRIT, "ERROR: Service structure overlap!\n");
		err = 1;
	}
	/*
	 * All values printed with %d below are explicitly converted to
	 * int: size_t (and possibly the offset macros) are wider than int
	 * on LP64, and a mismatched variadic argument is undefined.
	 */
	if (size_header >= 512) {
		clulog(LOG_ERR, "Warning, header exceeds 512 bytes, %d.\n", 
			(int)size_header);
	}
	if (size_status >= 512) {
		clulog(LOG_ERR, "Warning, status block exceeds 512 bytes, %d.\n", 
			(int)size_status);
	}
	if (size_lock >= 512) {
		clulog(LOG_ERR, "Warning, lock block exceeds 512 bytes, %d.\n", 
			(int)size_lock);
	}
	if (size_service >= 512) {
		clulog(LOG_ERR, "Warning, service block exceeds 512 bytes, %d.\n", 
			(int)size_service);
	}
	if (err) {
		clulog(LOG_DEBUG, "Shared State Header is %d bytes.\n", (int)size_header);
		clulog(LOG_DEBUG, "Status Block is %d bytes.\n", (int)size_status);
		clulog(LOG_DEBUG, "Service Block is %d bytes.\n", (int)size_service);
		clulog(LOG_DEBUG, "Lock Block is %d bytes.\n", (int)size_lock);
		clulog(LOG_DEBUG, "Offsets: Header = %d\n", (int)(OFFSET_HEADER));
		for (i=0; i<MAX_NODES; i++) {
			clulog(LOG_DEBUG, "Offsets: Status[%d] = %d\n", i,
				(int)(OFFSET_FIRST_STATUS_BLOCK + 
				(i * SPACE_PER_STATUS_BLOCK)));
		}
		for (i=0; i<MAX_SERVICES; i++) {
			clulog(LOG_DEBUG, "Offsets: Service[%d] = %d\n", i,
				(int)(OFFSET_FIRST_SERVICE_BLOCK + 
				(i * SPACE_PER_SERVICE_BLOCK))
			);
		}
	}
	return(err);
}

/*
 * initSharedFD
 * Opens both quorum partitions (primary and shadow) and stores the
 * descriptors in sharedPartitionFD[].  For each partition the name is
 * retrieved from the cluster config file, validated, and then opened.
 * Does nothing if the descriptors have already been initialized.
 *
 * Returns: 0 - success (sharedPartitionFDinited is set),
 *         -1 - failure.  On failure any descriptor opened by this call
 *              is closed again, so a later retry does not leak it.
 */
int initSharedFD(void)
{
    char partitionName[2][MAXPATHLEN];
    int i;

    if(!sharedPartitionFDinited) {
	for(i = 0; i < 2; i++) {
	    if (getPartitionName(i, partitionName[i]) != 0) {
			clulog(LOG_CRIT, "initSharedFD: unable to get partition name from config file.\n");
			goto fail;
	    }
	    /*
	     * Perform some validation check on the specified
	     * partition to weed out configuration errors.
	     */
	    if (validatePartitionName(partitionName[i]) != 0) {
		clulog(LOG_CRIT, "initSharedFD: unable to validate partition %s. Configuration error?\n", partitionName[i]);
		goto fail;
	    }
	    sharedPartitionFD[i] = openPartition(partitionName[i]);
	    if (sharedPartitionFD[i] < 0) {
		clulog(LOG_EMERG, "initSharedFD: unable to open partition %s.\n", partitionName[i]);
		sharedPartitionFD[i] = -1;
		goto fail;
	    }
	}
    }
    sharedPartitionFDinited = 1;
    return 0;

fail:
    /* Undo the partitions that were opened before the failing one. */
    while (--i >= 0) {
	closePartition(&sharedPartitionFD[i]);
    }
    return(-1);
}
	    


/*
 * Retrieve a string representing the name of the shared state disk
 * partition.  There are 2 partitions (for "mirroring" purposes).
 * The "which" parameter designates which one is returned in the "name"
 * parameter: 0 selects the local node's primary quorum partition, any
 * other value selects the shadow.  The caller must supply a buffer large
 * enough for the configured name; no length check is done here.
 * Returns 0 on success, 1 on failure.
 */
int getPartitionName(int which, char *name) {
    CluCfg *config;
    const char *selected;

    config = get_clu_cfg(NULL);
    if (config == NULL) {
        clulog(LOG_ERR, "getPartitionName: unable to get info from clu_cfg.\n");
	return(1);
    }

    selected = (which == 0)
	? config->nodes[config->lid].quorumPartitionPrimary
	: config->nodes[config->lid].quorumPartitionShadow;
    strcpy(name, selected);

    free(config);
    return(0);
}

/*
 * Called to retrieve the verbose diagnostic print level
 * from the bootstrap config file. Then it sets the logging level accordingly.
 *
 * If no level is configured, or the configured level lies outside
 * 0..LOG_DEBUG, QUORUMD_DEFAULT_LOGGER_LEVEL is used instead.
 *
 * Returns: current logging level.
 */
int getVerboseLevel() {
    int verbLvl; 
    int oldLevel;
    char *param = NULL;

    verbLvl = QUORUMD_DEFAULT_LOGGER_LEVEL;
    if (CFG_Get((char *) CFG_DISK_VERBOSE, NULL, &param) != CFG_OK) {
        clulog(LOG_DEBUG, "getVerboseLevel: no logging level specified.\n");
	goto endparams;
    }
    if (param) {
	verbLvl = atoi(param);
    }
    /*
     * Reject values on either side of the valid range.  (The two halves
     * must be OR-ed; a value cannot be below 0 and above LOG_DEBUG at
     * the same time.)
     */
    if ((verbLvl < 0) || (verbLvl > LOG_DEBUG)) {
        clulog(LOG_DEBUG, "getVerboseLevel: logging level %d out of range.\n", verbLvl);
        verbLvl = QUORUMD_DEFAULT_LOGGER_LEVEL;
    }

endparams:
    oldLevel = clu_set_loglevel(verbLvl);
    if (oldLevel != verbLvl) {
        clulog(LOG_DEBUG, "getVerboseLevel: Changed logging level from %d to %d.\n", oldLevel,  verbLvl);
    }
    else {
        clulog(LOG_DEBUG, "getVerboseLevel: set logging level to %d.\n", verbLvl);
    }
    return(verbLvl);

}

/*
 * shut_myself_down()
 * Called when the cluster is in an inconsistent state and an immediate
 * shutdown is warranted.  Typically this is done in cases where the
 * node will inevitably get shot by the partner node, but we hope to
 * successfully stop some services prior to getting shot in the interests
 * of expediting startup (i.e. saving fsck time).
 *
 * This gets called when:
 * - We are unable to perform IOs to the quorum disk
 * - In shoot_partner if the power switch returns successful status, but
 *   subsequent failure on the power cycle command.
 * - We shot the partner, but it continued to issue IO operations.
 *
 * The calling process is replaced by "SHUTDOWN_CMD -r now <reason>", so
 * on success this routine does not return.  It does return when
 * shutdowns are disabled (reallyDoShutdown == 0) or when the exec fails;
 * the latter is logged.
 */
void shut_myself_down(char *reason) {
    if (reallyDoShutdown == 0) {
        clulog(LOG_WARNING, "shut_myself_down: Shutdown disabled!\n");
        return;
    }
    clulog(LOG_EMERG, "shut_myself_down: %s", reason);
    /*
     * The argument list terminator must be a null *pointer*; a bare NULL
     * may be a plain integer 0, which is not guaranteed to be passed
     * correctly through a variadic call.
     */
    execl (SHUTDOWN_CMD, SHUTDOWN_CMD, "-r", "now", reason, (char *)NULL);
    // Only reached if the exec itself failed.
    clulog(LOG_EMERG, "shut_myself_down: unable to exec %s.\n", SHUTDOWN_CMD);
}                  

/*
 * Called to determine if the calling node is currently a cluster member.
 * The node counts as a member only when its own entry in the on-disk
 * node states is NODE_UP.  Whenever the local node id or the node states
 * cannot be obtained, the conservative answer "not a member" is given.
 *
 * Returns 0 - not currently a cluster member.
 *         1 - currently a cluster member.
 */
int cluster_member_check() {
    SharedDiskNodeStates diskStates;
    int localId;

    localId = cluGetLocalNodeId();
    if (localId < 0) {
	/* Unable to determine this node's ID: consider ourselves down. */
	clulog(LOG_DEBUG, "cluster_member_check: cluGetLocalNodeId failed, consider myself down.\n");
	return(0);
    }

    if (cluGetDiskNodeStates(&diskStates) != 0) {
	/*
	 * Unable to determine node status.  This probably means that
	 * the quorumd isn't currently running: consider ourselves down.
	 */
	clulog(LOG_DEBUG, "cluster_member_check: cluGetDiskNodeStates failed, consider myself down.\n");
	return(0);
    }

    if (diskStates.states[localId] != NODE_UP) {
	clulog(LOG_DEBUG, "cluster_member_check: cluGetDiskNodeStates says I'm DOWN.\n");
	return(0);
    }

    clulog(LOG_DEBUG, "cluster_member_check: cluGetDiskNodeStates says I'm UP.\n");
    return(1);
}                  

/*
 * consider_shutdown()
 * Called on inability to access the shared state disk partition.
 * A check is made to see if the node is currently a cluster member.
 * If so, then a reboot is warranted and shut_myself_down() is invoked;
 * otherwise no reboot is performed and the reason is only logged.
 *
 * An example scenario where no reboot is performed would be cases where
 * utilities are using the disk library to access configuration or status
 * monitoring information.  In this situation there could be an error whereby
 * a configuration setting is wrong such that the designation of the device
 * special file for the shared state partition is bogus.  Here the correct
 * behavior on inability to access the device is to return an error
 * indication to the caller; rather than rebooting.
 *
 * Return values:
 * 0 - no shutdown needed.
 * 1 - shutdown needed.  Normally the routine does not return in this
 *     case; it does if shut_myself_down() itself returns.
 */
int consider_shutdown(char *reason) {
    if (cluster_member_check() != 1) {
	clulog(LOG_DEBUG, "consider_shutdown: not a cluster member, so don't reboot.\n");
	clulog(LOG_DEBUG, "consider_shutdown: %s\n", reason);
	return(0);
    }

    clulog(LOG_DEBUG, "consider_shutdown: cluster member, reboot.\n");
    shut_myself_down(reason);
    return(1);
}                  
