/*
 * <state.c>
 * Central noflushd state machine, and random accumulated crap.
 * 
 * Copyright (C) 2000, 2001 Daniel Kobras <kobras@linux.de>
 * 
 * except the sync functions which are
 * 
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> and Daniel Kobras
 * 
 * Pavel Machek's work was sponsored by SuSE.
 * 
 * $Id: state.c,v 1.27 2004/04/08 20:13:44 nold Exp $
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */             

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "state.h"
#include "bug.h"
#include "diskhelper.h"
#include "disk_stat.h"
#include "disk_info.h"
#include "intr_stat.h"
#include "part_info.h"
#include "timeout.h"
#include "kupdate.h"
#include "noflushd.h"

#include <stdlib.h>
#include <fcntl.h>
#include <time.h>
#include <string.h>
#include <mntent.h>
#include <sys/stat.h>
#include <sys/mount.h>

/* Handle on the partition listing; obtained from part_info_init() in
 * devname_init() and shared by all functions in this file that walk
 * disks and partitions. */
static part_info_t part;
/* Heap buffer that starts with the device directory including its
 * trailing '/' (e.g. "/dev/"). devname_get() resizes it and writes the
 * requested name right behind the directory part. */
static char *dev_prefix;
/* Length of the directory part at the start of dev_prefix, i.e. the
 * offset at which devname_get() places the device name. */
static int plen;

/* Fixup name from /proc/partitions (i.e. from part_info_get_name())
 * to yield a valid absolute file name.
 *
 * name: device name relative to the device directory.
 *
 * Returns a pointer to the shared dev_prefix buffer holding the device
 * directory followed by name. The buffer is reused (and possibly moved)
 * by every call, so the returned string may be used until the next call
 * to devname_get() only, and must not be freed by the caller.
 *
 * Calls BUG() if the device directory has not been set up yet, and
 * terminates the program if the buffer cannot be enlarged.
 */
char *devname_get(char *name)
{
	char *grown;

	if (!dev_prefix)
		BUG("Device name requested without prefix.");

	/* Resize via a temporary: assigning realloc()'s result straight to
	 * dev_prefix would lose the buffer and lead to a write through a
	 * NULL-based pointer when the allocation fails. */
	grown = realloc(dev_prefix, strlen(name) + plen + 1);
	if (!grown) {
		ERR("Out of memory building device name for %s", name);
		exit(1);
	}
	dev_prefix = grown;

	/* The directory part occupies the first plen bytes and is preserved
	 * by realloc(); only the name behind it is replaced. */
	strcpy(dev_prefix + plen, name);

	return dev_prefix;
}
	
/* Check for oldstyle or devfs entries in /proc/partitions and set
 * dev_prefix accordingly, so that dev_prefix + pname yields a valid
 * fs name to the device node.
 *
 * Order of attempts:
 *  1. If /proc/mounts lists a filesystem of type "devfs", its mount
 *     point plus '/' becomes the prefix.
 *  2. Otherwise, if part_info_is_devfs() reports devfs naming, give up
 *     with an error, as there is no directory to resolve names in.
 *  3. Otherwise assume "/dev/" and verify the guess by stat()ing the
 *     first non-meta disk found in the partition listing.
 *
 * Sets dev_prefix and plen on success. Exits the program if memory runs
 * out, if devfs is required but not mounted, or if no disk is found;
 * calls BUG() if the "/dev/" guess does not lead to a block device.
 */
static void set_path_fixup(void)
{
	char *dev;
	FILE *mnt;
	struct mntent *mntent = NULL;
	struct stat st;
	
	/* Try to determine devfs mountpoint. */
	mnt = fopen("/proc/mounts", "r");
	if (mnt) {
		while ((mntent = getmntent(mnt)))
			if (!strcmp(mntent->mnt_type, "devfs"))
				break;

		if (mntent) {
			/* Room for the mount point, the trailing '/' and
			 * the terminating NUL. Copy while the stream is
			 * still open, so the entry is certainly valid. */
			dev_prefix = malloc(strlen(mntent->mnt_dir) + 2);
			if (!dev_prefix) {
				ERR("Out of memory while setting up device prefix");
				exit(1);
			}
			strcpy(dev_prefix, mntent->mnt_dir);
			strcat(dev_prefix, "/");
		}
		fclose(mnt);
	} else {
		/* Without the mount table we cannot find a devfs mount
		 * point; carry on with the remaining checks. */
		DEBUG("Unable to read /proc/mounts, assuming devfs is not mounted");
	}

	/* Only the pointer value is tested here; the entry itself is not
	 * accessed anymore after the stream has been closed. */
	if (mntent) {
		plen = strlen(dev_prefix);
		DEBUG("Detected devfs at %s", dev_prefix);
		return;
	}
	
	if (part_info_is_devfs()) {
		ERR("Your kernel is configured with devfs, but devfs is not\n"
		    "mounted anywhere. This means noflushd cannot work.\n"
		    "Please consult the noflushd README for details");
		exit(1);
	}
	
	/* Looks like oldstyle. Try default name. */
	dev_prefix = strdup("/dev/");
	if (!dev_prefix) {
		ERR("Out of memory while setting up device prefix");
		exit(1);
	}
	plen = strlen(dev_prefix);

	/* Pick the first disk that is not a meta device as probe target. */
	part_info_reset(part);
	do {
		if (!part_info_disk_next(part)) {
			ERR("No valid disks found");
			exit(1);
		}
	} while (IS_META(part_info_get_major(part),
	                 part_info_get_minor(part)));
		
	dev = devname_get(part_info_get_name(part));
	
	/* Check: If we can access a block dev via the constructed name,
	 * our guess was probably correct.
	 */
	DEBUG("Probing for valid blkdev at %s", dev);
	if (!stat(dev, &st) && S_ISBLK(st.st_mode)) {
		DEBUG("Detected oldstyle dev at %s", dev_prefix);
		return;
	}
	
	BUG("Unable to determine device dir at %s", dev);
}

/* Set up the file-level state used for device name handling: obtain the
 * partition info handle and work out the device directory prefix.
 *
 * Meant to run before daemonizing, so that failures are reported on the
 * console rather than via syslog. Exits the program if the partition
 * info cannot be obtained. (These helpers arguably belong elsewhere,
 * but no better home has been found so far.)
 */
void devname_init(void)
{
	if (!(part = part_info_init())) {
		ERR("Error reading /proc/partitions");
		exit(1);
	}

	/* Needs a valid 'part' to probe for a disk. */
	set_path_fixup();
}

/* Flush pending writes of a single device node.
 *
 * name: absolute path of the device node to sync.
 *
 * Returns 1 if the node could be opened for writing and fsync()
 * succeeded, 0 otherwise.
 */
static int sync_part(char *name)
{
	int fd;
	int synced;

	DEBUG("Syncing %s", name);

	fd = open(name, O_WRONLY);
	if (fd == -1)
		return 0;

	/* fsync() reports success with 0. */
	synced = (fsync(fd) == 0);
	close(fd);

	return synced;
}

/* Sync the disk that 'part' currently points at, followed by every
 * further entry part_info_part_next() yields for it.
 *
 * Returns the number of entries that were synced successfully.
 */
static int sync_current_disk(void)
{
	int synced = 0;

	for (;;) {
		/* The current entry is always attempted before advancing. */
		synced += sync_part(devname_get(part_info_get_name(part)));
		if (!part_info_part_next(part))
			break;
	}

	return synced;
}

/* Sync the disk described by di.
 *
 * Looks up the disk in the partition listing by major/minor number and
 * syncs it. If the disk is not listed, nothing is synced. If the disk
 * is listed but not a single sync attempt on it succeeds, the disk is
 * marked read-only via disk_info_mark_ro().
 */
static void sync_disk(disk_info_t di)
{
	int found = 0;

	part_info_reset(part);

	/* Position 'part' on the entry matching di. */
	while (part_info_disk_next(part)) {
		if (part_info_get_major(part) != di->major)
			continue;
		if (part_info_get_minor(part) != di->minor)
			continue;
		found = 1;
		break;
	}

	if (!found) {
		DEBUG("(%d, %d): No such disk.", di->major, di->minor);
		return;
	}

	if (sync_current_disk() == 0)
		disk_info_mark_ro(di);
}
		
/* Sync every disk in the partition listing except those known to be
 * spun down or not writable.
 *
 * head: start of the disk_info list.
 *
 * Disks that appear in the partition listing but not on the disk_info
 * list are synced as well.
 */
static void sync_spinning_disks(disk_info_t head)
{
	part_info_reset(part);

	while (part_info_disk_next(part)) {
		int major = part_info_get_major(part);
		int minor = part_info_get_minor(part);
		disk_info_t di = head;

		/* Find the list entry for this disk; NULL if there is none. */
		while (di && !(di->major == major && di->minor == minor))
			di = di->next;

		/* Leave alone what is stopped or read-only; everything
		 * else, including unknown disks, gets synced. */
		if (!di || (di->state != DISK_STATE_STOPPED && di->is_rw))
			sync_current_disk();
	}
}

/* Determine the smallest regular timeout among the disks on the list.
 *
 * di: start of the disk_info list.
 *
 * Timeouts for which NFD_TO_IS_REGULAR() is false are not considered.
 * Returns the smallest remaining timeout, or 60 if there is none (or
 * if the smallest one is 0).
 */
static int get_min_timeout(disk_info_t di)
{
	int shortest = 0;	/* 0 means "nothing found yet" */

	while (di) {
		int cur = timeout_get(di->timeouts);

		if (NFD_TO_IS_REGULAR(cur) &&
		    (shortest == 0 || cur < shortest))
			shortest = cur;

		di = di->next;
	}

	if (shortest == 0)
		return 60;

	return shortest;
}
		
/* Process all pending requests to advance the timeouts.
 *
 * di: start of the disk_info list.
 *
 * For every pending request counted in advance_timeout, the default
 * timeout and the timeout of each disk on the list are advanced once.
 * Disks whose new timeout satisfies NFD_TO_DO_SKIP() become ignored;
 * other disks are set to spinning with their remaining time adjusted
 * by the value timeout_advance() returned, unless they are already
 * stopped, in which case their state is left alone.
 */
static void advance_timeouts(disk_info_t di)
{
	disk_info_t cur;

	while (advance_timeout) {
		advance_timeout--;
		timeout_advance_default();
		/* Walk the list from its head on every pass. Iterating
		 * with the parameter itself would leave it at the list
		 * end after the first pass, so further pending requests
		 * would advance the default timeout only. */
		for (cur = di; cur; cur = cur->next) {
			int delta = timeout_advance(&cur->timeouts);
			if (NFD_TO_DO_SKIP(timeout_get(cur->timeouts))) {
				cur->state = DISK_STATE_IGNORED;
				cur->time_left = 0;
				DEBUG("Timeout update. Ignoring disk %s",
				      cur->name);
			} else {
				/* Don't touch state of already spun down
				 * disks regardless of new timeout. */
				if (cur->state == DISK_STATE_STOPPED)
					continue;
				cur->state = DISK_STATE_SPINNING;
				cur->time_left += delta;
				DEBUG("Timeout update. Disk %s, timeout %d, "
				      "left %d", 
				      cur->name, timeout_get(cur->timeouts),
				      cur->time_left);
			}
		}
		
	}
}

/* Spin down a disk if its idle time is used up.
 *
 * ds: disk statistics handle, refreshed after syncing.
 * di: disk to examine.
 *
 * A writable disk is synced before the spindown. If the statistics
 * then show anything but our own writes, the spindown is called off
 * and the disk's remaining time is reset to its full timeout.
 *
 * Returns 0 if the time is not up yet or the spindown was called off,
 * otherwise the result of spindown() (1 iff the disk is properly spun
 * down).
 */
static int try_spindown(disk_stat_t ds, disk_info_t di)
{
	disk_stat_flags seen;
	int pass;

	DEBUG("Disk %s, Time left %d.", di->name, di->time_left);

	if (di->time_left <= 0) {
		INFO("Spinning down %s.", di->name);

		/* A sync may take long enough for new dirty data to pile
		 * up, so a second pass follows, which should be quick. */
		if (di->is_rw) {
			for (pass = 0; pass < 2; pass++)
				sync_disk(di);
		}

		disk_stat_update(ds);

		/* Writes are ours (the sync above) and thus masked out;
		 * any other deviation from "valid" aborts the spindown. */
		seen = disk_stat_check(ds, di->major, di->minor);
		if ((seen & ~DISK_STAT_WRITES) == DISK_STAT_VALID)
			return spindown(di);

		INFO("Spindown of %s cancelled.", di->name);
		di->time_left = timeout_get(di->timeouts);
	}

	/* Either the spindown time is not reached, or it was cancelled. */
	return 0;
}

/* Evaluate disk activity since the last check and update each disk's
 * state accordingly.
 *
 * di:       start of the disk_info list.
 * ds:       disk statistics handle; refreshed at the start of the call.
 * interval: time passed since the previous check, in the same unit as
 *           the disks' time_left (the caller passes seconds).
 *
 * Per disk:
 *  - ignored:  nothing is done.
 *  - spinning: reads reset the remaining time to the full timeout;
 *              otherwise the remaining time is reduced by interval and,
 *              if the interrupt check reports idleness, a spindown is
 *              attempted. Disks without statistics become ignored.
 *  - stopped:  any read or write marks the disk as spinning again and
 *              triggers a sync of that disk. Missing statistics also
 *              put the disk back to spinning.
 *
 * Returns new noflushd state depending on number of rw disks
 * spinning/stopped: NFD_STATE_SPINNING if no rw disk is stopped,
 * NFD_STATE_STOPPED if rw disks are stopped and none is spinning,
 * NFD_STATE_PARTIAL otherwise.
 */
static nfd_state_t check_io(disk_info_t di, disk_stat_t ds, int interval)
{
	int rw_spinning=0, rw_stopped=0;
	int irq_is_idle;
	disk_stat_flags io_flags;

	disk_stat_update(ds);
	/* Racy, but just about as close as we can reasonably get. */
	irq_is_idle = intr_stat_check_idleness();
	
	for (; di; di=di->next) {
		io_flags=disk_stat_check(ds, di->major, di->minor);
		switch (di->state) {
		case DISK_STATE_IGNORED:
			break;
		case DISK_STATE_SPINNING:
			if (io_flags==DISK_STAT_INVALID) {
				INFO("No stats for %s. Ignoring.", di->name);
				di->state = DISK_STATE_IGNORED;
				break;
			}
			if (io_flags & DISK_STAT_READS) {
				/* Disk was read from: restart countdown. */
				di->time_left=timeout_get(di->timeouts);
			} else {
				/* time_left is int, so potentially diving a
				 * little bit into negative should be okay. */
				if (di->time_left > 0)
					di->time_left-=interval;
				if (irq_is_idle && try_spindown(ds, di)) 
					di->state=DISK_STATE_STOPPED;
			}
			break;
		case DISK_STATE_STOPPED:
			if (io_flags==DISK_STAT_INVALID) {
				ERR("No stats for stopped disk %s", di->name);
				di->state=DISK_STATE_SPINNING;
				break;
			}
			if (io_flags & (DISK_STAT_READS|DISK_STAT_WRITES)) {
				time_t now, delta;

				di->time_left=timeout_get(di->timeouts);
				di->state=DISK_STATE_SPINNING;
				
				/* Absolute difference, so a clock that was
				 * set back does not yield a negative time. */
				now = time(NULL);
				if (now > di->spundown_at)
					delta = now - di->spundown_at;
				else
					delta = di->spundown_at-now;
				
				/* time_t need not be long; convert explicitly
				 * to match the %ld conversion. */
				INFO("Spinning up %s after %ld minutes.",
				     di->name, (long)(delta/60));

				sync_disk(di);
			}
			break;
		case DISK_STATE_UNINITIALISED:
		default:
			BUG("Illegal disk state %d on %s (%d, %d)", 
				di->state, di->name, di->major, di->minor);
		}

		/* Ignored disks are treated as spinning. */
		if (di->state==DISK_STATE_STOPPED)
			rw_stopped+=di->is_rw;
		else
			rw_spinning+=di->is_rw;
	}
	
	DEBUG("rw disks: %d stopped, %d spinning", rw_stopped, rw_spinning);

	if (!rw_stopped)
		return NFD_STATE_SPINNING;
	if (!rw_spinning)
		return NFD_STATE_STOPPED;

	return NFD_STATE_PARTIAL;
}

/* Main loop of the daemon; never returns.
 *
 * head: start of the disk_info list.
 * stat: disk statistics handle passed on to check_io().
 *
 * Each round processes pending timeout changes, stops kupdate, checks
 * disk i/o and then sleeps. While all rw disks are spinning, kupdate is
 * started again; otherwise the spinning disks are synced from here at
 * kupdate's interval. The sleep ends early once advance_timeout is set.
 */
void nfd_daemon(disk_info_t head, disk_stat_t stat)
{
	long left, interval;
	long sync_left = 0;	/* time until our next emulated sync */
	time_t now, last = time(NULL);
	nfd_state_t nfd_state = NFD_STATE_UNINITIALISED;

	for (;;) {
		advance_timeouts(head);
		/* This is nasty but the only way to eliminate a race
		 * between checking i/o stats and spindown. */
		kupdate_stop(kupdate);

		/* Elapsed time since the previous round, kept positive
		 * even if the clock went backwards. */
		now = time(NULL);
		interval = (now > last) ? now - last : last - now;
		last = now;

		DEBUG("Check interval %ld", interval);
		nfd_state = check_io(head, stat, interval);

		switch (nfd_state) {
		case NFD_STATE_SPINNING:
			kupdate_start(kupdate);
			/* All disks spinning - poll 60 times minimum before
			 * spindown. */
			sync_left = 0;
			left = get_min_timeout(head) / 60;
			break;
		/* XXX: The syncing code recognizes hotplugged disks now, but
		 *      we do not yet put them on the disk_info list. Don't
		 *      optimize the stopped case therefore, as one of these
		 *      alien disks might be out there. This will be handled
		 *      by rebuilding the disk_info list someday, but for now
		 *      we settle with the easy solution.
		 */
		case NFD_STATE_STOPPED:
		case NFD_STATE_PARTIAL:
			/* We emulate kupdate - use its wakeup interval
			 * for sync calls, but keep (at most) default polling
			 * interval for i/o checks. */
			if (sync_left <= 0) {
				sync_spinning_disks(head);
				sync_left = kupdate_get_interval(kupdate);
			}
			left = get_min_timeout(head) / 60;
			if (left > sync_left)
				left = sync_left;
			sync_left -= left;
			break;
#if 0
		/* Temporarily disabled. See above. */
		case NFD_STATE_STOPPED:
			/* Poll for spinup every 5 seconds. */
			left = 5;
			sync_left = 0;
			break;
#endif
		case NFD_STATE_UNINITIALISED:
		default:
			BUG("Illegal state");

		}

		DEBUG("State %d, sleeping %ld seconds", nfd_state, left);

		/* sleep() hands back the unslept remainder when it gets
		 * interrupted; resume unless a timeout change is pending. */
		while (!advance_timeout) {
			left = sleep(left);
			if (!left)
				break;
		}
	}
}
