/*
  Copyright Mission Critical Linux, 2000

  Kimberlite is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  Kimberlite is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Kimberlite; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
  MA 02139, USA.
*/
/*
 *  $Id: diskconfigdb.c,v 1.11 2000/09/28 16:19:32 lawrence Exp $
 *
 *  Copyright (C) 2000 Mission Critical Linux, LLC
 *
 *  author: Tim Burke <burke@missioncriticallinux.com>
 *  description: Configuration interface.
 *
 * diskconfigdb.c
 *
 * This file implements the routines used to store the cluster configuration
 * file/database on the shared disk.  Previously we had /etc/cluster.cfg, but
 * it would be problematic to keep this file in sync on all cluster members.
 * Rather we store the cluster configuration information on the shared state
 * disk partition.
 *
 * Here the configuration "database" is considered to be a binary "blob". So
 * the services offered are to save and retrieve the database as a whole.
 * There are no facilities to change "records" within the "database".
 */
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <unistd.h>
#include <sys/mman.h>
#include <signal.h>
#include <errno.h>
#include <stdlib.h>

#include "diskstate.h"
#include "disk_proto.h"
#include <logger.h>
#include <sys/syslog.h>


/* Revision keyword string; marked unused so the compiler does not warn about it. */
static const char *version __attribute__ ((unused)) = "$Revision: 1.11 $";

/* Non-zero once initDatabaseSubsys() has completed successfully. */
static int subsysInitialized = 0;
/*
 * Descriptor for /dev/zero, opened in initDatabaseSubsys().  It is the
 * backing descriptor for the mmap()ed scratch buffers used when reading
 * the database.
 */
static int zfd = -1;

/*
 * Offsets of the database header and of the database data.  Assigned from
 * OFFSET_CONFIG_DB_HEADER and OFFSET_CONFIG_DB_DATA in initDatabaseSubsys().
 */
static off_t offsetDBHdr;
static off_t offsetDBData;
/*
 * Shared partition descriptors, defined outside this file.  This file
 * indexes sharedPartitionFD[] with 0 and 1; log messages below refer to
 * them as the primary and shadow partitions.
 */
extern int sharedPartitionFDinited;
extern int sharedPartitionFD[];

/* Defined outside this file; not referenced by the code visible here. */
extern int preferredReadPartition;

/* Forward declarations for routines defined later in this file. */
static int initDatabaseSubsys(void);
static void initHeader(DiskConfigDBHeader *dbHdr);
static int writeDatabaseHeader(int partition, DiskConfigDBHeader *hdr);
static int readDatabaseHeaderPartition(int partition, DiskConfigDBHeader *hdr);
static int readDatabaseHeader(DiskConfigDBHeader *hdr);
int writeDatabasePartition(int partition, char *data, ssize_t length);
int diskFullCheckDatabasePartition(int partition, char *data, ssize_t max_length, int *check_sum, ssize_t *size);
int diskLseekRawReadChecksumDatabase(int partition, off_t readOffset, char *data, int length, ulong chksum_expected);
/* Not defined in this file. */
int diskRawWritePartition(int partition, off_t writeOffset, char *buf, int len, ulong chksum_off);
int readDatabase_quorumd(void);
/*
 * Deliberately stores through a null pointer to force a crash at a point
 * that should be unreachable.  The expansion already ends in a semicolon.
 */
#define FAULT() *(int *)0 = 0;
/*
 * Called to initialize subsystem state variables.
 *
 * Opens /dev/zero (the backing descriptor for the mmap()ed scratch
 * buffers used by the read paths), records the on-disk offsets of the
 * database header and data, and calls initSharedFD() when the shared
 * partition descriptors have not been set up yet.
 *
 * Calling this again after a successful initialization is a no-op.
 *
 * Returns: 0 on success, -1 if /dev/zero cannot be opened or
 *          initSharedFD() fails.
 */
static int initDatabaseSubsys(void) {

    if (subsysInitialized) {
	clulog(LOG_DEBUG,"initDatabaseSubsys: already initialized.\n");
	return(0);
    }
    /*
     * Only open /dev/zero when no descriptor is held already, so that a
     * re-initialization does not leak the previous descriptor.
     */
    if (zfd == -1) {
	zfd = open("/dev/zero", O_RDWR);
	if (zfd == -1) {
	    clulog(LOG_ERR, "initDatabaseSubsys: unable to open /dev/zero.\n");
	    return(-1);
	}
    }
    offsetDBHdr = OFFSET_CONFIG_DB_HEADER;
    offsetDBData = OFFSET_CONFIG_DB_DATA;

    if (!sharedPartitionFDinited) {
	if (initSharedFD()) {
	    /* Do not hold /dev/zero open while the subsystem is unusable. */
	    close(zfd);
	    zfd = -1;
	    return -1;
	}
    }

    subsysInitialized = 1;
    return(0);
}

/*
 * Called to release resources obtained in initDatabaseSubsys.
 * 
 * Returns: 0 on success.
 */
static int closeDatabaseSubsys(void) {
    int retval;

    if (subsysInitialized == 0) {
	clulog(LOG_DEBUG,"closeDatabaseSubsys: Subsystem not open.\n");
	return(0);
    }
    subsysInitialized = 0;
    return(retval);
}

/*
 * Reset a database header to its pristine state: every byte is cleared,
 * then the magic number and the latest format version are stamped in.
 * The length is left at 0, i.e. the header describes an empty database.
 */
static void initHeader(DiskConfigDBHeader *dbHdr) {
    memset(dbHdr, 0, sizeof(*dbHdr));
    dbHdr->length = 0;
    dbHdr->version = CONFIG_DB_LATEST_VERSION;
    dbHdr->magic_number = CONFIG_DB_MAGIC_NUMBER;
}

/*
 * Store an in-memory database header at the header offset of the given
 * partition.  The byte offset of the header's check_sum member is passed
 * along to diskRawWritePartition().
 *
 * Returns: the value returned by diskRawWritePartition(); callers in this
 *          file treat 0 as success.
 */
static int writeDatabaseHeader(int partition, DiskConfigDBHeader *hdr) {
	/* Byte offset of the check_sum member within the header. */
	ulong chksumOffset = (ulong)&((DiskConfigDBHeader *)0)->check_sum;
	int status;

	status = diskRawWritePartition(partition, offsetDBHdr, (char *)hdr,
				       sizeof(DiskConfigDBHeader), chksumOffset);
	return status;
}

/*
 * Load the database header stored on one partition into *hdr and
 * sanity-check it.
 *
 * Returns: 0 when the header was read and looks valid; the non-zero code
 *          from diskLseekRawReadChecksum() when the read fails; -1 when
 *          the magic number is wrong or the recorded length is negative.
 */
static int readDatabaseHeaderPartition(int partition, DiskConfigDBHeader *hdr)
{
	/* Byte offset of the check_sum member within the header. */
	ulong chksumOffset = (ulong)&((DiskConfigDBHeader *)0)->check_sum;
	int status;

	status = diskLseekRawReadChecksum(partition, offsetDBHdr, (char *)hdr,
					  sizeof(DiskConfigDBHeader), chksumOffset);
	if (status != 0) {
		clulog(LOG_ERR,"readDatabaseHeaderPartition: bad ret %d from diskLseekRawReadChecksum\n", status);
		return status;
	}

	if (hdr->magic_number != CONFIG_DB_MAGIC_NUMBER) {
		clulog(LOG_ERR, "readDatabaseHeaderPartition: invalid magic # %lx\n",
		       hdr->magic_number);
		return -1;
	}

	if (hdr->length < 0) {
		clulog(LOG_ERR, "readDatabaseHeaderPartition: invalid length %ld.\n",
		       (long)hdr->length);
		return -1;
	}

	return 0;
}
/*
 * Read the database header, trying partition 0 first and falling back to
 * partition 1 only when partition 0 cannot supply a valid header.
 *
 * Returns: 0 on success, otherwise the failure code from the partition 1
 *          attempt.
 */
static int readDatabaseHeader(DiskConfigDBHeader *hdr)
{
	int status = readDatabaseHeaderPartition(0, hdr);

	if (status == 0)
		return 0;

	return readDatabaseHeaderPartition(1, hdr);
}

#ifdef notdef // Not called anywhere
/*
 * Debug helper: logs the magic number, version and length of a database
 * header at LOG_DEBUG level.  Compiled out unless "notdef" is defined.
 *
 * NOTE(review): other code in this file logs hdr->length with %ld after a
 * cast to long; the %d used here may not match the member's type.  Confirm
 * before re-enabling this routine.
 */
static void printDatabaseHeader(DiskConfigDBHeader *hdr) {

    clulog(LOG_DEBUG, "------ Config DB Header ------------\n");
    clulog(LOG_DEBUG, "magic# = 0x%lx\n", hdr->magic_number);
    clulog(LOG_DEBUG, "version = %d\n", hdr->version);
    clulog(LOG_DEBUG, "length = %d\n", hdr->length);
    clulog(LOG_DEBUG, "------------------------------\n");
}
#endif // notdef - Not called anywhere


/*
 * Report how many bytes of data the configuration database currently
 * holds, as recorded in the on-disk header.  Initializes the subsystem
 * on first use.
 *
 * Return values:
 *	-1 - The subsystem could not be initialized, no valid header could
 *	     be read (for example the shared state partition was never
 *	     initialized, or an IO error occurred), or the header carries
 *	     a negative length.
 *	0  - The database is initialized but empty.
 *	greater than 0 - the length of the database contents.
 */
ssize_t getDatabaseLength(void) {
	DiskConfigDBHeader hdr;

	if (!subsysInitialized && initDatabaseSubsys() != 0) {
		clulog(LOG_EMERG, "getDatabaseLength: Subsystem init failure.\n");
		return -1;
	}

	if (readDatabaseHeader(&hdr) != 0) {
		clulog(LOG_EMERG, "getDatabaseLength: unable to read header.\n");
		return -1;
	}

	/* Sanity check on the recorded length. */
	if (hdr.length < 0) {
		clulog(LOG_EMERG, "getDatabaseLength: bogus length %ld\n",(long)hdr.length);
		return -1;
	}

	return (ssize_t)hdr.length;
}
/*
 * Fetch the data checksum recorded in the configuration database header.
 * The "management layer" can compare it against the value it saw when a
 * management session began to tell quickly whether the database has been
 * modified since.  Initializes the subsystem on first use.
 *
 * Return values:
 *	0 - The subsystem could not be initialized, no valid header could
 *	    be read (for example the shared state partition was never
 *	    initialized, or an IO error occurred), or the header carries a
 *	    negative length.
 *	Otherwise the checksum value stored in the header is returned.
 */
ulong getDatabaseChecksum(void) {
	DiskConfigDBHeader hdr;

	if (!subsysInitialized && initDatabaseSubsys() != 0) {
		clulog(LOG_EMERG, "getDatabaseChecksum: Subsystem init failure.\n");
		return 0;
	}

	if (readDatabaseHeader(&hdr) != 0) {
		clulog(LOG_EMERG, "getDatabaseChecksum: unable to read header.\n");
		return 0;
	}

	/* Sanity check on the recorded length. */
	if (hdr.length < 0) {
		clulog(LOG_EMERG, "getDatabaseChecksum: bogus length %ld\n",(long)hdr.length);
		return 0;
	}

	return hdr.db_check_sum;
}

/*
 * Write the configuration database out to disk.
 *
 * The same data is written to partition 0 and then to partition 1; each
 * partition gets the data followed by a fresh header recording the length
 * and checksum.  Passing a length of 0 clears the database (only the
 * header is rewritten).  The caller must hold the cluster lock; this is
 * asserted via assert_clu_lock_held().
 *
 * Returns: the number of bytes written (== length) on success;
 *	    -1 if the shared partition descriptors cannot be set up or
 *	       writing either partition fails;
 *	    -2 if the subsystem cannot be initialized;
 *	    -3 on a parameter error (negative length, NULL data with a
 *	       non-zero length, or length above SPACE_FOR_CONFIG_DATABASE).
 */
ssize_t writeDatabase(char *data, ssize_t length)
{
	int partition;

	/* ssize_t has no portable printf conversion here; log it as long. */
	clulog(LOG_DEBUG, "writeDatabase: length=%ld.\n", (long)length);
	// Paranoia checks
	if (length < 0 || (length > 0 && data == NULL)) {
		clulog(LOG_ERR, "writeDatabase: invalid buffer or length %ld.\n",
		       (long)length);
		return(-3);
	}
	if (length > SPACE_FOR_CONFIG_DATABASE) {
		clulog(LOG_ERR, "writeDatabase: length %ld exceeds max of %ld.\n",
		       (long)length, (long)SPACE_FOR_CONFIG_DATABASE);
		return(-3);
	}
	if (subsysInitialized == 0) {
		if (initDatabaseSubsys() != 0) {
			clulog(LOG_ERR, "writeDatabase: Subsystem init failure.\n");
			return(-2);
		}
	}
	/*
	 * The subsystem may be initialized while the shared descriptors have
	 * been torn down again (see getDatabaseFromPartition), so check them
	 * separately.
	 */
	if (!sharedPartitionFDinited) {
		if (initSharedFD())
			return -1;
	}

	assert_clu_lock_held("writeDatabase");

	/* Partition 0 first, then partition 1; stop at the first failure. */
	for (partition = 0; partition < 2; partition++) {
		if (writeDatabasePartition(partition, data, length) != 0)
			return -1;
	}
	return length;
}
/*
 * Write the database data and then its header to a single partition.
 *
 * partition - index into sharedPartitionFD[] (0 or 1 in this file).
 * data      - the bytes to store.
 * length    - number of bytes in data; 0 writes only the header.
 *
 * The header is written after the data and records the length together
 * with the checksum computed by clu_byte_check_sum().
 *
 * Returns: 0 on success, -1 if the seek, a data write or the header
 *          write fails.
 */
int writeDatabasePartition(int partition, char *data, ssize_t length)
{
	off_t retval_seek;
	ssize_t write_ammount;
	ssize_t retval_write;
	ssize_t resid;
	DiskConfigDBHeader dbHdr;
	char *dataPtr = data;

	/*
	 * Now write out the data.  The current raw IO bounceIO implementation
	 * here limits to MAX_BOUNCEIO_LENGTH which is the page length of 4K.
	 * Consequently the write of the configuration database must be broken
	 * up into chunks.
	 */
	retval_seek = lseek(sharedPartitionFD[partition], offsetDBData, SEEK_SET);
	if (retval_seek != offsetDBData) {
		/* off_t is not necessarily long; cast to match %ld. */
		clulog(LOG_ERR, "writeDatabasePartition: cant seek to offset %ld.\n",
		       (long)offsetDBData);
		return(-1);
	}

	resid = length;
	while (resid > 0) {
		if (resid >= MAX_BOUNCEIO_LENGTH)
			write_ammount = MAX_BOUNCEIO_LENGTH;
		else
			write_ammount = resid;

		retval_write = diskRawWrite(sharedPartitionFD[partition], dataPtr, write_ammount);
		if (retval_write != write_ammount) {
			clulog(LOG_ERR, "writeDatabasePartition: data write returned %ld.\n",
			       (long)retval_write);
			return(-1);
		}
		dataPtr += write_ammount;
		resid -= write_ammount;
	}
	/*
	 * Now update the header to indicate the number of bytes written.
	 */
	initHeader(&dbHdr);
	dbHdr.length = length;
	dbHdr.db_check_sum = clu_byte_check_sum(data, length);
	if (writeDatabaseHeader(partition, &dbHdr) != 0)
		return(-1);

	return 0;
}

/*
 * Part of the read/repair facility: pulls the entire database into a
 * throw-away buffer by calling readDatabase().  The read is attempted
 * only when clu_try_lock() reports LOCK_SUCCESS; when the lock is not
 * obtained nothing is read and 0 is returned.
 *
 * Returns: 0 when the read succeeded or was skipped, -1 when the scratch
 *          buffer cannot be allocated or readDatabase() reports an error.
 */
int readScanWholeDatabase(void) {
	int status = 0;
	char *scratch = malloc(SPACE_FOR_CONFIG_DATABASE);

	if (scratch == NULL) {
		clulog(LOG_ERR, "readScanWholeDatabase: Unable to malloc.\n");
		return -1;
	}

	if (clu_try_lock() == LOCK_SUCCESS) {
		status = readDatabase(scratch, SPACE_FOR_CONFIG_DATABASE);
		clu_un_lock();
	}

	free(scratch);

	return (status < 0) ? -1 : 0;
}

/*
 * Read in the configuration database off the shared disk and
 * populate the user's buffer with the data.  This of course requires
 * that the user's buffer is big enough to hold the contents of the
 * database.  In order to facilitate this, the user can first call
 * getDatabaseLength().  Just to avoid any buffer overflows, the max_length
 * parameter describes the user buffer size.
 * Note: the database is read in as a single "blob", there are no facilities
 * for retrieving a portion of the database (i.e. records).
 *
 * Both partitions are read and verified.  When only one of them is good,
 * or both are good but their checksums differ, the database is rewritten
 * on both partitions from the good copy (partition 0 wins when both are
 * good) via writeDatabase().  A failed repair is logged but the good data
 * is still returned.  The caller must hold the cluster lock.
 *
 * Returns: the number of bytes copied into data_in on success;
 *	    -1 on invalid arguments, mapping failure, or when neither
 *	       partition holds a valid copy;
 *	    -2 if the subsystem cannot be initialized.
 */

/*
 * Round x up to a multiple of the page size.  Relies on a variable named
 * pageSize being in scope at the point of use.
 */
#define roundpage(x) (((x) + (pageSize - 1)) & ~(pageSize - 1))

ssize_t readDatabase(char *data_in, ssize_t max_length)
{
	ssize_t ret, size[2];
	ssize_t result = -1;
	int i, check_ret[2], check_sum[2];
	char *data[2];
	int pageSize, good_part;
	size_t mmap_size;

	pageSize = sysconf(_SC_PAGESIZE);
	if (subsysInitialized == 0) {
		if (initDatabaseSubsys() != 0) {
			clulog(LOG_DEBUG, "readDatabase: Subsystem init failure.\n");
			return(-2);
		}
	}

	assert_clu_lock_held("readDatabase");

	/* A zero-length mmap() fails anyway; reject bad arguments up front. */
	if (data_in == NULL || max_length <= 0) {
		clulog(LOG_ERR, "readDatabase: invalid buffer or length.\n");
		return -1;
	}

	/* One page-aligned scratch mapping per partition. */
	mmap_size = roundpage(max_length);
	for (i = 0; i < 2; i++) {
		data[i] = mmap(0, mmap_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, zfd, 0);
		if (data[i] == MAP_FAILED) {
			if (i)
				munmap(data[0], mmap_size);
			return -1;
		}
		check_ret[i] = diskFullCheckDatabasePartition(i, data[i], max_length, &check_sum[i], &size[i]);
	}

	if (check_ret[0] && check_ret[1]) {
		/* No usable copy; fall through to the common cleanup. */
		clulog(LOG_CRIT, "readDatabase: data corrupt on primary and shadow partitions.\n");
		goto out;
	}

	if (!check_ret[0] && !check_ret[1] && (check_sum[0] == check_sum[1])) {
		/* Both copies are valid and agree. */
		good_part = 0;
	} else if (!check_ret[0]) {
		/*
		 * Partition 0 is valid; partition 1 is either bad or holds a
		 * different checksum.  Repair from partition 0.
		 */
		ret = writeDatabase(data[0], size[0]);
		if (ret != size[0])
			clulog(LOG_CRIT, "readDatabase: unable to fix database\n");
		good_part = 0;
	} else {
		/* Partition 0 is bad and partition 1 is valid. */
		ret = writeDatabase(data[1], size[1]);
		if (ret != size[1])
			clulog(LOG_CRIT, "readDatabase: unable to fix database\n");
		good_part = 1;
	}

	memcpy(data_in, data[good_part], size[good_part]);
	result = size[good_part];

  out:
	/* Every exit past the mapping loop releases both scratch mappings. */
	for (i = 0; i < 2; i++)
		munmap(data[i], mmap_size);
	return result;
}
/*
 * Read and verify the database stored on one partition.
 *
 * The header is read first.  When it records a non-zero length, the data
 * is read into `data` and checked against the checksum in the header.  On
 * success *size receives the recorded length and *check_sum the recorded
 * data checksum.  The outputs are not written on failure.
 *
 * Returns: 0 on success; -1 when the recorded length exceeds max_length;
 *          otherwise the non-zero code from the header or data read.
 */
int diskFullCheckDatabasePartition(int partition, char *data, ssize_t max_length, int *check_sum, ssize_t *size)
{
	DiskConfigDBHeader dbHdr;
	int status;

	status = readDatabaseHeaderPartition(partition, &dbHdr);
	if (status != 0)
		return status;

	/* An empty database has no data to read or verify. */
	if (dbHdr.length != 0) {
		if (dbHdr.length > max_length) {
			clulog(LOG_DEBUG, "diskFullCheckDatabasePartition: user buffer is too small.\n");
			return -1;
		}

		status = diskLseekRawReadChecksumDatabase(partition, offsetDBData, data,
							  dbHdr.length, dbHdr.db_check_sum);
		if (status != 0)
			return status;
	}

	*size = dbHdr.length;
	*check_sum = dbHdr.db_check_sum;
	return 0;
}
/*
 * Seek to readOffset on the given partition, read `length` bytes into
 * `data` in chunks of at most MAX_BOUNCEIO_LENGTH, and compare the
 * checksum of what was read against chksum_expected.
 *
 * Returns: SHADOW_SUCCESS when the data was read and the checksum
 *          matches; SHADOW_NO_ACCESS when the seek or a read fails;
 *          SHADOW_BAD_CHECKSUM when the checksum does not match.
 */
int diskLseekRawReadChecksumDatabase(int partition, off_t readOffset, char *data, int length, ulong chksum_expected)
{
	char *dataPtr;
	off_t retval_seek;
	ssize_t read_ammount;
	ssize_t retval_read;
	ssize_t resid;

	/*
	 * Now read in the data.  The current raw IO bounceIO implementation
	 * here limits to MAX_BOUNCEIO_LENGTH which is the page length of 4K.
	 * Consequently the read of the configuration database must be broken
	 * up into chunks.
	 */

	retval_seek = lseek(sharedPartitionFD[partition], readOffset, SEEK_SET);
	if (retval_seek != readOffset) {
		/* Report the offset that was actually requested. */
		clulog(LOG_DEBUG, "diskLseekRawReadChecksumDatabase: cant seek to offset %ld.\n",
		       (long)readOffset);
		return(SHADOW_NO_ACCESS);
	}
	resid = length;
	dataPtr = data;
	while (resid > 0) {
		if (resid >= MAX_BOUNCEIO_LENGTH)
			read_ammount = MAX_BOUNCEIO_LENGTH;
		else
			read_ammount = resid;

		retval_read = diskRawRead(sharedPartitionFD[partition], dataPtr, read_ammount);
		if (retval_read != read_ammount) {
			clulog(LOG_DEBUG, "diskLseekRawReadChecksumDatabase: read returned %ld.\n",
			       (long)retval_read);
			return(SHADOW_NO_ACCESS);
		}
		dataPtr += read_ammount;
		resid -= read_ammount;
	}

	if (chksum_expected != clu_byte_check_sum(data, length)) {
		clulog(LOG_EMERG,"diskLseekRawReadChecksumDatabase: bad check sum.\n");
		return SHADOW_BAD_CHECKSUM;
	}
	return SHADOW_SUCCESS;
}
/*
 * Lay down the initial on-disk representation of the cluster
 * configuration "database".  The database consists of a header plus a
 * data area; initializing it means writing, to partition 0 and then to
 * partition 1, a header whose length and data checksum are both 0.
 * Initializes the subsystem on first use.
 *
 * Returns: 0 on success, -1 if a header write fails, -2 if the subsystem
 *          cannot be initialized.
 */
int initializeConfigDatabase(void) {
    DiskConfigDBHeader emptyHdr;
    int part;

    if (!subsysInitialized && initDatabaseSubsys() != 0) {
        clulog(LOG_DEBUG, "initializeConfigDatabase: Subsystem init failure.\n");
        return -2;
    }

    /* Build a header describing an empty database. */
    initHeader(&emptyHdr);
    emptyHdr.length = 0;
    emptyHdr.db_check_sum = 0;

    for (part = 0; part < 2; part++) {
        if (writeDatabaseHeader(part, &emptyHdr) == 0)
            continue;
        clulog(LOG_DEBUG, "initializeConfigDatabase: failed to write header.\n");
        return -1;
    }

    clulog(LOG_DEBUG, "initializeConfigDatabase: successfully initialized.\n");
    return 0;
}


/*
 * Close the partition descriptors opened for a bootstrap read and put the
 * shared partition bookkeeping back into its "not initialized" state.
 * The slots are reset to 0, the value this routine has always left behind.
 */
static void releaseBootstrapPartitions(void) {
    int i;

    for (i = 0; i < 2; i++) {
        if (sharedPartitionFD[i] >= 0)
            close(sharedPartitionFD[i]);
        sharedPartitionFD[i] = 0;
    }
    sharedPartitionFDinited = 0;
}

/*
 * Bootstrapping routine called to retrieve the contents of the
 * configuration database from the specified partition.  This is in
 * contrast to the normal access routines above which retrieve
 * configuration parameters (i.e. partition names) from the configuration
 * contents.
 *
 * partitionName - name of the partition to read; checked with
 *                 validatePartitionName() and opened with openPartition().
 * buffer        - caller-owned destination for the database contents.
 * length        - size of buffer in bytes.
 *
 * The buffer is never reallocated here: it is passed by value, so the
 * caller could not learn about a moved block.  When the database is
 * larger than `length` the call fails; the caller can retry with a larger
 * buffer.  All descriptors and subsystem state set up by this routine are
 * torn down again before it returns.
 *
 * Returns: the number of bytes copied into buffer (0 for an empty
 *          database) on success;
 *          -1 if the partition cannot be validated, opened or read, the
 *             database fails verification, or the buffer is too small;
 *          -2 if the subsystem cannot be initialized.
 */
int getDatabaseFromPartition(char* partitionName, char* buffer, ssize_t length) {
    ssize_t size;
    ssize_t dbLength;
    int check_ret, check_sum;
    char *data;
    int pageSize;
    size_t mmap_size;

    pageSize = sysconf(_SC_PAGESIZE);

    /*
     * First perform all of the subsystem initializations and partition
     * opens which normally get done automatically by reaching into the
     * config parameters for settings.
     */
    if (validatePartitionName(partitionName) != 0) {
        clulog(LOG_CRIT, 
           "getDatabaseFromPartition: unable to validate partition %s."
           " Configuration error?\n", 
           partitionName);
        return(-1);
    }
    sharedPartitionFD[0] = openPartition(partitionName);
    sharedPartitionFD[1] = openPartition(partitionName);
    if (sharedPartitionFD[0] < 0) {
        clulog(LOG_EMERG, "getDatabaseFromPartition: unable to open partition %s.\n", 
           partitionName);
        /* The second open may have succeeded; do not leak it. */
        releaseBootstrapPartitions();
        return(-1);
    }
    sharedPartitionFDinited = 1;

    if (subsysInitialized == 0) {
        if (initDatabaseSubsys() != 0) {
            clulog(LOG_DEBUG, "getDatabaseFromPartition: Subsystem init failure.\n");
            // Unwind initialization
            releaseBootstrapPartitions();
            return(-2);
        }
    }

    dbLength = getDatabaseLength();
    if (dbLength < 0) {
        /* getDatabaseLength() has already logged the reason. */
        releaseBootstrapPartitions();
        closeDatabaseSubsys();
        return(-1);
    }
    if (dbLength > length || (dbLength > 0 && buffer == NULL)) {
        clulog(LOG_ERR,
           "getDatabaseFromPartition: database length %ld does not fit"
           " in buffer of %ld bytes.\n",
           (long)dbLength, (long)length);
        releaseBootstrapPartitions();
        closeDatabaseSubsys();
        return(-1);
    }

    /*
     * mmap() rejects a zero-length request, so map at least one page even
     * when the database is empty; the header still gets verified below.
     */
    mmap_size = roundpage(dbLength);
    if (mmap_size == 0)
        mmap_size = pageSize;
    data = mmap(0, mmap_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, zfd, 0);
    if(data == MAP_FAILED) {
        // Unwind initialization
        releaseBootstrapPartitions();
        closeDatabaseSubsys();
        return(-1);
    }
    check_ret = diskFullCheckDatabasePartition(0, data, 
                                   dbLength, &check_sum, &size);
    // Unwind initialization; the mapping stays valid until munmap().
    releaseBootstrapPartitions();
    closeDatabaseSubsys();
    if(check_ret) {
        munmap(data, mmap_size);
        return(-1);
    }
    if (size > 0)
        memcpy(buffer, data, size);
    munmap(data, mmap_size);
    return (size);
}
