/**
 * @file    read_data.c
 * @brief   File input routines.
 *
 *          Routines for reading data files into ygraph.
 *
 * @author  Denis Pollney
 * @date    1 Oct 2001
 *
 * @par Copyright (C) 2001-2002 Denis Pollney
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 * @par
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 * @par
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include<string.h>
#include<math.h>
#include<stdio.h>
#include<zlib.h>
#include<bzlib.h>

#include "ygraph.h"

/**
 * @brief    Create a DataSet structure and initialise it to some default
 *           values.
 *
 *           When a file name is given, the directory, display name and
 *           full path of the DataSet are derived from it. Names that are
 *           not absolute are prefixed with global_working_directory. The
 *           special name INTERNAL_STDIN_STR gets the display name "stdin"
 *           and is never prefixed with the working directory.
 *
 * @param    fname  The name to be given to the DataSet, or NULL to leave
 *                  the name, path and directory fields unset.
 * @returns  A pointer to an empty DataSet initialised to default values.
 * @note     The returned DataSet must be freed with data_set_free(). All
 *           three string fields are either NULL or heap-allocated, so
 *           they can safely be passed to g_free().
 */
DataSet*
data_set_init(gchar* fname)
{
  DataSet* data_set;
  gboolean is_stdin;

  data_set = g_malloc(sizeof(DataSet));

  data_set->name = NULL;
  data_set->fname = NULL;
  data_set->dir = NULL;

  data_set->type = YG_DATAFILE;

  if (fname != NULL)
    {
      is_stdin = (g_strcasecmp(fname, INTERNAL_STDIN_STR) == 0);

      /* g_dirname() returns a newly allocated string. */
      data_set->dir = g_dirname(fname);

      /*
       * The name must always be heap-allocated because data_set_free()
       * releases it with g_free(); a string literal here would be
       * undefined behaviour at free time.
       */
      if (is_stdin)
        data_set->name = g_strdup("stdin");
      else
        data_set->name = g_strdup(g_basename(fname));

      if (!is_stdin && !g_path_is_absolute(fname))
        data_set->fname = g_strjoin(G_DIR_SEPARATOR_S,
                                    global_working_directory, fname, NULL);
      else
        data_set->fname = g_strdup(fname);
    }

  data_set->nframes = 0;

  /*
   * Ranges start out inverted (min > max) so that the first MIN/MAX
   * update against real data replaces them.
   */
  data_set->x_range[0] = G_MAXDOUBLE;
  data_set->x_range[1] = -G_MAXDOUBLE;
  data_set->y_range[0] = G_MAXDOUBLE;
  data_set->y_range[1] = -G_MAXDOUBLE;
  data_set->scale = DEFAULT_DATA_SCALE_FACTOR;
  data_set->frame = g_array_new(FALSE, FALSE, sizeof(Frame*));
  data_set->cmpt_set = g_array_new(FALSE, FALSE, sizeof(gint));
  data_set->scale_entry_field = NULL;

  return data_set;
}

/**
 * @brief    Release a DataSet together with the storage it holds directly.
 *
 *           Frees the frame and component arrays, the three string fields
 *           (directory, name and file path) and finally the DataSet
 *           structure itself.
 *
 * @param    data_set  A pointer to the DataSet to be freed.
 * @note     NOTE(review): only the array of Frame pointers is released
 *           here; the Frame objects it points to are not passed to
 *           frame_free(). Confirm who owns them before changing this.
 */
void
data_set_free(DataSet* data_set)
{
  /* Containers first ... */
  g_array_free(data_set->cmpt_set, TRUE);
  g_array_free(data_set->frame, TRUE);

  /* ... then the strings ... */
  g_free(data_set->dir);
  g_free(data_set->name);
  g_free(data_set->fname);

  /* ... and last the structure itself. */
  g_free(data_set);
}

/**
 * @brief    Set default values for a single frame of a data set.
 *
 * @returns  A pointer to a newly allocated Frame initialised with default
 *           values
 * @note     The returned Frame must be freed.
 */
Frame*
frame_init(void)
{
  Frame* frame;

  frame = g_malloc(sizeof(Frame));
  frame->npoints = 0;
  frame->time = UNINITIALISED_TIME;
  frame->x_range[0] = G_MAXDOUBLE;
  frame->x_range[1] = -G_MAXDOUBLE;
  frame->y_range[0] = G_MAXDOUBLE;
  frame->y_range[1] = -G_MAXDOUBLE;
  frame->xy_data = g_array_new(FALSE, FALSE, sizeof(gdouble*));

  return frame;
}

/**
 * @brief    De-allocate a frame and its contents.
 *
 *           Every element of the frame's xy_data array is a pointer to a
 *           separately allocated pair of doubles (see
 *           frame_append_data_point()). Freeing the GArray alone releases
 *           only the pointer storage, so each point is freed first.
 *
 * @param    frame  A pointer to the Frame to be freed. NULL is ignored.
 */
void
frame_free(Frame* frame)
{
  guint i;

  if (frame == NULL)
    return;

  if (frame->xy_data != NULL)
    {
      /* Release the individual (x, y) pairs before the pointer array. */
      for (i=0; i<frame->xy_data->len; ++i)
        g_free(g_array_index(frame->xy_data, gdouble*, i));

      g_array_free(frame->xy_data, TRUE);
    }

  g_free(frame);
}

/**
 * @brief    Store a time value in a frame.
 *
 * @param    frame  A pointer to the frame whose time is to be set.
 * @param    time   The time value to store.
 */
void
frame_set_time(Frame* frame, gdouble time)
{
  /* Plain field store; no validation or re-ordering happens here. */
  frame->time = time;
}

/**
 * @brief    Set the name field of a data set from a label string.
 *
 *           The label text is taken from the position reported by
 *           find_equal() and stripped of surrounding whitespace before
 *           being copied into the DataSet. If find_equal() reports a
 *           negative position, the name is set to NULL.
 *
 * @param    data_set  A pointer to the DataSet to be named.
 * @param    label     A name string; it is modified in place by the
 *                     whitespace stripping.
 * @note     find_equal() is presumably returning the offset of the text
 *           following the '=' sign (this is how the time-indicator parsing
 *           in dataset_read_from_file() uses it) -- confirm against its
 *           definition.
 * @note     NOTE(review): the previous name is not released here; confirm
 *           the ownership of data_set->name before adding a g_free().
 */
void
dataset_set_label(DataSet* data_set, gchar* label)
{
  gint v;

  v = find_equal(label, strlen(label));
  if (v > -1)
    /*
     * Strip and copy the text after the '='. (The previous code stripped
     * an uninitialised pointer here, which was undefined behaviour.)
     */
    data_set->name = g_strdup(g_strstrip(label + v));
  else
    data_set->name = NULL;
}

/**
 * @brief    Append a frame to a dataset.
 *
 *           The new Frame is inserted into the DataSet at the position
 *           given by its time value, and the DataSet's x and y ranges are
 *           widened to include the frame's ranges. A frame whose time is
 *           still UNINITIALISED_TIME is given the current frame count as
 *           its time.
 *
 *           If a frame whose time lies within TIME_EPSILON (inclusive) of
 *           the new frame's time is already present, the new frame is
 *           NOT stored: the function returns without touching the frame
 *           list, the ranges or the frame count, and the frame is not
 *           freed.
 *
 * @param    data_set  A pointer to the DataSet.
 * @param    frame     A pointer to the frame which is to be appended.
 */
void
dataset_append_frame(DataSet* data_set, Frame* frame)
{
  Frame* cur_frame;
  gdouble time_p_eps;
  gdouble time_m_eps;
  gboolean inserted = FALSE;
  gint i;

  /*
   * If time values have not been specified, set the time according
   * to the frame number.
   */
  if (frame->time == UNINITIALISED_TIME)
    frame->time = data_set->nframes;

  time_p_eps = frame->time + TIME_EPSILON;
  time_m_eps = frame->time - TIME_EPSILON;

  if (data_set->frame == NULL)
    data_set->frame = g_array_new(FALSE, FALSE, sizeof(Frame*));

  /*
   * Fit the frame into the current list according to its time value.
   */
  if (data_set->nframes > 0)
    {
      cur_frame = g_array_index(data_set->frame, Frame*, data_set->nframes-1);

      /*
       * Fast path: a frame later than the last stored one is appended
       * below without scanning. Otherwise look for its slot.
       */
      if (!(cur_frame->time < time_m_eps))
        {
          for (i=0; i<data_set->nframes; ++i)
            {
              cur_frame = g_array_index(data_set->frame, Frame*, i);
              /*
               * If a frame at the same time value (up to epsilon) already
               * exists, then don't replace it. The bounds are inclusive
               * so that a time lying exactly on the epsilon boundary is
               * classified one way or the other instead of falling
               * between the two tests.
               */
              if ((cur_frame->time <= time_p_eps) &&
                  (cur_frame->time >= time_m_eps))
                return;

              if (cur_frame->time > time_p_eps)
                {
                  g_array_insert_val(data_set->frame, i, frame);
                  inserted = TRUE;
                  break;
                }
            }
        }
    }

  /*
   * Anything not placed above goes at the end. This also covers times
   * that compare false against everything (NaN), so the frame count
   * incremented below always matches the array length.
   */
  if (!inserted)
    g_array_append_val(data_set->frame, frame);

  data_set->x_range[0] = MIN(data_set->x_range[0], frame->x_range[0]);
  data_set->x_range[1] = MAX(data_set->x_range[1], frame->x_range[1]);
  data_set->y_range[0] = MIN(data_set->y_range[0], frame->y_range[0]);
  data_set->y_range[1] = MAX(data_set->y_range[1], frame->y_range[1]);

  data_set->nframes++;
}

/**
 * @brief    Return a string corresponding to the label indicator.
 *
 * @returns  A newly allocated copy of LABEL_INDICATOR_STRING, the string
 *           which is used in input files to label a new data set.
 * @note     The returned string must be released with g_free().
 */
gchar*
label_indicator_set(void)
{
  /*
   * g_strdup() replaces the manual allocate-and-format sequence, whose
   * assertion on the formatted length would abort for an empty indicator.
   */
  return g_strdup(LABEL_INDICATOR_STRING);
}

/**
 * @brief    Add one (x, y) point to a frame and widen its data ranges.
 *
 *           The point is stored as a freshly allocated pair of doubles
 *           whose pointer is appended to the frame's xy_data array. The
 *           frame's x and y ranges are extended to include the new values
 *           and the point count is incremented.
 *
 * @param    frame  The frame to which the point is to be appended.
 * @param    x_val  The coordinate x-value of the point.
 * @param    y_val  The coordinate y-value of the point.
 */
void
frame_append_data_point(Frame* frame, gdouble x_val, gdouble y_val)
{
  gdouble* xy = g_malloc(2*sizeof(gdouble));

  xy[0] = x_val;
  xy[1] = y_val;

  /* Widen the bounding ranges to take in the new point. */
  frame->x_range[0] = MIN(frame->x_range[0], x_val);
  frame->x_range[1] = MAX(frame->x_range[1], x_val);
  frame->y_range[0] = MIN(frame->y_range[0], y_val);
  frame->y_range[1] = MAX(frame->y_range[1], y_val);

  /* The array stores the pointer, not the pair itself. */
  g_array_append_val(frame->xy_data, xy);
  frame->npoints += 1;
}

/**
 * @brief    Determine whether a file has already been loaded.
 *
 *           The given name is turned into a full path (names that are not
 *           absolute are prefixed with global_working_directory) and
 *           compared, with an exact string comparison, against the fname
 *           field of each DataSet in the global_data_set_list. Data sets
 *           without a file name are skipped.
 *
 * @param    fname  The name of the file to be compared.
 * @returns  NEW_DATA_SET if the file is new (or no list exists yet), or
 *           the index of the first matching data set within the
 *           global_data_set_list if the file has already been loaded.
 */
gint
duplicate_file_check(gchar* fname)
{
  gchar* wanted_path;
  gint match = NEW_DATA_SET;
  guint idx;

  if (g_path_is_absolute(fname))
    wanted_path = g_strdup(fname);
  else
    wanted_path = g_strjoin(G_DIR_SEPARATOR_S, global_working_directory,
                            fname, NULL);

  if (global_data_set_list != NULL)
    {
      for (idx=0; idx<global_data_set_list->len; ++idx)
        {
          DataSet* candidate = g_array_index(global_data_set_list,
                                             DataSet*, idx);

          if (candidate->fname == NULL)
            continue;

          if (strcmp(candidate->fname, wanted_path) == 0)
            {
              match = idx;
              break;
            }
        }
    }

  /* Single exit: the temporary path is released exactly once. */
  g_free(wanted_path);

  return match;
}

/**
 * @brief    Turn a "xxx yyy" string into x and y (double) coordinate values.
 *
 * @param    s      The string to be translated.
 * @param    x_val  The x-value which will be set.
 * @param    y_val  The y-value which will be set.
 */
void
point_read(gchar* s, gdouble* x_val, gdouble* y_val)
{
  gchar x_str[100];
  gchar y_str[100];
  gint ierr;
  const gdouble nan_value = 42.0;

  if (global_x_column==global_y_column)
  {
    ierr = sscanf(s, global_column_format_string, x_str);
    if (ierr==1)
    {
      if (check_for_nan(x_str))
        *x_val = nan_value;
      else
        *x_val = str_to_double(x_str);
      *y_val = *x_val;
    }
  }
  else
  {
    ierr = sscanf(s, global_column_format_string, x_str, y_str);
    if (ierr==2)
    {
      if (global_x_column<global_y_column)
      {
        if (check_for_nan(x_str))
          *x_val = nan_value;
        else
          *x_val = str_to_double(x_str);
        if (check_for_nan(y_str))
          *y_val = nan_value;
        else
          *y_val = str_to_double(y_str);
      }
      else
      {
        if (check_for_nan(y_str))
          *x_val = nan_value;
        else
          *x_val = str_to_double(y_str);
        if (check_for_nan(x_str))
          *y_val = nan_value;
        else
          *y_val = str_to_double(x_str);
      }
    }
  }
}

/**
 * @brief    Add a dataset to the global_data_set_list.
 *
 *           The global list is created on first use. For data sets of
 *           type YG_DATAFILE the list is first searched (via
 *           duplicate_file_check()) for an entry loaded from the same
 *           file; if one is found, that entry is freed and its slot is
 *           reused for the new data set. In every other case the data set
 *           is appended to the end of the list.
 *
 * @param    data_set  The DataSet to be appended.
 * @returns  The index of the data set within the global list.
 */
gint
global_data_set_list_append(DataSet* data_set)
{
  DataSet* replaced;
  gint slot = NEW_DATA_SET;

  if (global_data_set_list == NULL)
    global_data_set_list = g_array_new(FALSE, FALSE, sizeof(DataSet*));

  /* Only file-backed data sets can be duplicates of one another. */
  if (data_set->type == YG_DATAFILE)
    slot = duplicate_file_check(data_set->fname);

  if (slot != NEW_DATA_SET)
    {
      /* Same file loaded again: drop the old entry and reuse its slot. */
      replaced = g_array_index(global_data_set_list, DataSet*, slot);
      data_set_free(replaced);
      array_index_set_val(global_data_set_list, DataSet*, slot, data_set);

      return slot;
    }

  g_array_append_val(global_data_set_list, data_set);

  return global_data_set_list->len-1;
}

/*
 * Frank (knarf): mainly copied from zlib
 *
 * Line reader for bzip2 streams, in the manner of fgets()/gzgets().
 * Bytes are read one at a time with BZ2_bzread() until a newline has been
 * stored, len-1 bytes have been stored, or a read does not deliver a
 * byte. The result is always NUL-terminated.
 *
 * Returns buf, or Z_NULL if buf is NULL, len is not positive, or nothing
 * could be read although there was room for at least one byte.
 */
char*
bzgets(BZFILE *file, char *buf, int len)
{
  char *cursor = buf;

  if (buf == Z_NULL || len <= 0)
    return Z_NULL;

  while (--len > 0)
    {
      if (BZ2_bzread(file, cursor, 1) != 1)
        break;

      /* Keep the newline in the buffer, then stop. */
      if (*cursor++ == '\n')
        break;
    }

  *cursor = '\0';

  if (cursor == buf && len > 0)
    return Z_NULL;

  return buf;
}
    
/**
 * @brief    Open a file and read in a set of data.
 *
 *           The requested input is read into a new DataSet which is
 *           added to the global_data_set_list. Four kinds of input are
 *           recognised from the name:
 *            - INTERNAL_STDIN_STR reads from standard input;
 *            - a name starting with '<' runs the rest of the name as a
 *              command via popen() and reads its output;
 *            - a name ending in ".bz2" (case-insensitive) is read
 *              through the bzip2 library;
 *            - anything else is read through zlib, so that gzipped files
 *              can be read just as normal files.
 *
 *           Lines that are empty or start with one of the
 *           COMMENT_DELIMETERS separate frames; comment lines may also
 *           carry a time value or a label. All other lines are parsed
 *           as data points by point_read().
 *
 * @param    fname  The name of the file to be read. It is not modified.
 * @param    skip   Frame stride: data is kept only for frames whose
 *                  number is a multiple of this value. Values smaller
 *                  than 1 are treated as 1.
 * @returns  The index number of the loaded DataSet within the
 *           global_data_set_list, or FAIL if the file cannot be read
 *           or contains no data.
 *
 * @note     The time of the new frame is merged into the global_time_list
 *           during the course of this function. Maybe it would be better
 *           for global_data_set_list_append() to do this?
 * @note     Comment lines are lower-cased in place before being examined,
 *           so a label read from the file is stored in lower case.
 * @note     NOTE(review): point_read() gives no failure indication; when
 *           it leaves x/y unwritten for a data line, the previous values
 *           are appended again.
 */
gint
dataset_read_from_file (gchar* fname, gint skip)
{
  FILE* fp = NULL;        /* stdin or pipe */
  BZFILE* bzfp = NULL;    /* bzip2 stream */
  gzFile gzfp = NULL;     /* gzip or plain file via zlib */
  gchar* label_indicator_str = NULL;
  gchar* s = NULL;
  gchar* line = NULL;
  gint v = -1;
  gdouble frame_time = 0;
  gdouble x_val = -G_MAXDOUBLE;
  gdouble y_val = -G_MAXDOUBLE;
  DataSet* data_set = NULL;
  Frame* cur_frame = NULL;
  size_t fname_len;
  gint label_indicator_len;
  gint time_indicator_len;
  gint data_set_idx;
  gint cur_frame_nbr = 0;
  gint read_mode = READ_COMMENT;
  gint use_pipe = 0;
  gint use_bzip = 0;
  gint use_stdin = 0;
  gboolean is_comment;

  /* The stride is used as a modulus below; zero would divide by zero. */
  if (skip < 1)
    skip = 1;

  if (g_strcasecmp(fname, INTERNAL_STDIN_STR)==0)
    use_stdin = 1;
  else if (fname[0] == '<')
    /* Looking for a pipe */
    use_pipe = 1;
  else
    {
      /*
       * Do I want to use bzip2 or gzip? Compare the last four characters
       * without touching the caller's string.
       */
      fname_len = strlen(fname);
      if ((fname_len >= 4) &&
          (g_strcasecmp(fname + fname_len - 4, ".bz2")==0))
        use_bzip = 1;
    }

  /*
   * Try to open the file, and print an error message if there is a problem.
   */
  if (use_stdin)
    fp = stdin;
  else if (use_bzip)
    bzfp = BZ2_bzopen(fname, "r");
  else if (use_pipe)
    fp = popen(fname + 1, "r");
  else
    gzfp = gzopen(fname, "r");

  if ((fp == NULL) && (bzfp == NULL) && (gzfp == NULL))
    {
      gchar* cant_open_msg = g_strdup_printf("Could not open %s", fname);
      message_dialog(cant_open_msg);
      g_free(cant_open_msg);
      return FAIL;
    }

  /* Allocated only after a successful open so the error path owns nothing. */
  label_indicator_str = label_indicator_set();
  label_indicator_len = strlen(label_indicator_str);
  time_indicator_len = strlen(TIME_INDICATOR_STRING);

  data_set = data_set_init(fname);
  cur_frame = frame_init();

  /*
   * Read through each line of the input file.
   */
  s = g_malloc(MAX_LINE_LENGTH);
  for (;;)
    {
      if (fp != NULL)
        line = fgets(s, MAX_LINE_LENGTH, fp);
      else if (bzfp != NULL)
        line = bzgets(bzfp, s, MAX_LINE_LENGTH);
      else
        line = gzgets(gzfp, s, MAX_LINE_LENGTH);

      if (line == NULL)
        break;

      g_strstrip(s);

      /*
       * A blank line delimits frames like a comment does, but has no
       * text after its first character to examine.
       */
      is_comment = (s[0] != '\0') &&
                   (strchr(COMMENT_DELIMETERS, s[0]) != NULL);

      if (is_comment || (s[0] == '\0'))
        {
          if (read_mode == READ_DATA)
            ++cur_frame_nbr;
          read_mode = READ_COMMENT;

          g_strdown(s);

          /*
           * Handle comment lines.
           * Assume frames are delimited by at least one comment line, eg. the
           * time line in cactus files.
           */
          if ((cur_frame->npoints > 0) && ((cur_frame_nbr-1)%skip == 0))
            {
              dataset_append_frame(data_set, cur_frame);
              cur_frame = frame_init();
            }

          if (is_comment)
            {
              /*
               * If the 'comment' line is indicating a time value, store it
               * with the frame. The strip of s+1 is done in place, so the
               * label test below also sees the text without leading
               * whitespace.
               */
              if (!g_strncasecmp(g_strstrip(s+1), TIME_INDICATOR_STRING,
                                 time_indicator_len))
                {
                  v = find_equal(s, strlen(s));
                  if (v > -1)
                    {
                      frame_time = str_to_double((gchar*)g_strstrip(s+v));
                      frame_set_time(cur_frame, frame_time);
                    }
                }

              /*
               * If the 'comment' line is indicating a label value, store
               * it with the dataset.
               */
              if (!g_strncasecmp(s+1, label_indicator_str,
                                 label_indicator_len))
                dataset_set_label(data_set, s+1+label_indicator_len);
            }
        }
      else
        {
          /*
           * Otherwise, read in a data line as an 'x y' pair.
           */
          read_mode = READ_DATA;

          if (cur_frame_nbr%skip == 0)
            {
              point_read(s, &x_val, &y_val);
              frame_append_data_point(cur_frame, x_val, y_val);
            }
        }
    }
  g_free(label_indicator_str);
  g_free(s);

  /* Standard input is left open; everything else is closed by its owner. */
  if (use_pipe)
    pclose(fp);
  else if (bzfp != NULL)
    BZ2_bzclose(bzfp);
  else if (gzfp != NULL)
    gzclose(gzfp);

  /*
   * Don't forget the last frame ... and release the scratch frame if it
   * never received any data.
   */
  if (cur_frame->npoints > 0)
    dataset_append_frame(data_set, cur_frame);
  else
    frame_free(cur_frame);
  cur_frame = NULL;

  /*
   * If no data has been loaded (eg. if the file is an unreadable format),
   * clean up and go home.
   */
  if (data_set->nframes == 0)
    {
      data_set_free(data_set);

      message_dialog("File contains no data");
      return FAIL;
    }

  /*
   * Add the new data set to the global list.
   */
  data_set_idx = global_data_set_list_append(data_set);

  /*
   * Add the time values contained in the new data set to the global list,
   * and update the number of frames.
   */
  if (global_time_list == NULL)
    time_list_build();
  else
    time_list_merge(data_set);

  global_last_frame = global_time_list->len - 1;

  return data_set_idx;
}

/**
 * @brief    Merge time values with the global_time_list.
 *
 *           Time values of frames in a DataSet are merged with the set
 *           of times already known by the global_time_list, preserving
 *           ordering. Both the frames and the list are walked once, front
 *           to back; a frame time is inserted unless the list entry at
 *           the insertion position lies within twice TIME_EPSILON of it.
 *
 * @param    data_set  The new DataSet.
 * @note     global_time_list must already exist; it may be empty.
 */
void
time_list_merge(DataSet* data_set)
{
  Frame* frame;
  gdouble frame_time;
  gint j;
  guint k;

  k = 0;
  for (j=0; j<data_set->nframes; ++j)
    {
      frame = g_array_index(data_set->frame, Frame*, j);
      frame_time = frame->time;

      /*
       * Advance to the first list entry that is not clearly earlier than
       * this frame. The bound on k is tested before the entry is read, so
       * the list is never indexed at or past its length.
       *
       * The epsilon is here so that if a pair of times differ only by
       * machine roundoff error, then they will not get counted twice.
       */
      while ((k < global_time_list->len) &&
             (frame_time >
              g_array_index(global_time_list, gdouble, k) + TIME_EPSILON))
        ++k;

      /*
       * Past the end there is nothing to compare against: the time is
       * new and goes at the end. Otherwise insert only if it differs
       * from the entry found.
       */
      if ((k >= global_time_list->len) ||
          (fabs(g_array_index(global_time_list, gdouble, k) - frame_time)
           > TIME_EPSILON*2))
        g_array_insert_val(global_time_list, k, frame_time);
    }
}

/**
 * @brief    Build a new global time list from scratch.
 *
 *           Any existing global_time_list is discarded and a new one is
 *           created. The frame times of the first loaded data set are
 *           copied in directly; the times of every further data set are
 *           merged in with time_list_merge().
 *
 * @note     If global_data_set_list does not exist, nothing is done and
 *           the current global_time_list is left untouched. If it exists
 *           but is empty, the result is an empty time list.
 */
void
time_list_build(void)
{
  DataSet* data_set;
  Frame* frame;
  guint i;
  gint j;

  if (global_data_set_list == NULL)
    return;

  if (global_time_list != NULL)
    g_array_free(global_time_list, TRUE);

  global_time_list = g_array_new(FALSE, FALSE, sizeof(gdouble));

  /* With no data sets there is no first element to read below. */
  if (global_data_set_list->len == 0)
    return;

  /*
   * For the first data set, just grab all of the times and put them in
   * the global_time_list.
   */
  data_set = g_array_index(global_data_set_list, DataSet*, 0);
  for (j=0; j<data_set->nframes; ++j)
    {
      frame = g_array_index(data_set->frame, Frame*, j);
      g_array_append_val(global_time_list, frame->time);
    }

  /*
   * Loop through the rest of the data sets and merge their frame times
   * with the global time list.
   */
  for (i=1; i<global_data_set_list->len; ++i)
    {
      data_set = g_array_index(global_data_set_list, DataSet*, i);
      time_list_merge(data_set);
    }
}

/**
 * @brief    Test whether a string contains a NaN marker.
 *
 *           The string is converted to lower case in place and then
 *           searched for each entry of the nan_string table in turn.
 *
 * @param    str  The string to be checked; it is modified (lower-cased).
 * @returns  TRUE if the string contains one of the tokens listed
 *           in nan_string, FALSE otherwise.
 */
int
check_for_nan(gchar* str)
{
  gint idx = 0;

  g_strdown(str);

  /* The table is walked until its terminating null entry. */
  while (nan_string[idx])
    {
      if (strstr(str, nan_string[idx]) != NULL)
        return TRUE;
      ++idx;
    }

  return FALSE;
}
