/* Mode: C */

/* analyze-index.c

   A command line utility for analyzing FramerD index files
   Originally implemented by Ken Haase in the Machine Understanding Group
     at the MIT Media Laboratory.

   Copyright (C) 1994-2001 Massachusetts Institute of Technology
   Copyright (C) 2001-2002 beingmeta, inc. (A Delaware Corporation)

   This program comes with absolutely NO WARRANTY, including implied
   warranties of merchantability or fitness for any particular purpose.

    Use, modification, and redistribution of this program is permitted
    under the terms of either (at the developer's discretion) the GNU
    General Public License (GPL) Version 2, the GNU Lesser General Public
    License.

    This program is based on the FramerD library released in Fall 2001 by
    MIT under both the GPL and the LGPL licenses, both of which accompany
    this distribution.  Subsequent modifications by beingmeta, inc. are
    also released under both the GPL and LGPL licenses (at the developer's
    discretion).
*/

/* Revision-control identification string (a CVS "$Id$" keyword
   expansion; see the CVS log at the end of this file).  Nothing in the
   visible source reads it -- presumably it is kept so the revision can
   be recovered from a compiled binary. */
static char vcid[] = "$Id: analyze-index.c,v 1.11 2002/06/29 01:25:58 haase Exp $";

#include <framerd/indextools.h>

/* lowhist[k] counts the keys that have exactly k values, for k from 0
   through 9; main() does not record keys with ten or more values here. */
static unsigned int lowhist[10]={0,0,0,0,0,0,0,0,0,0};
/* max_values is the largest per-key value count main() has seen;
   total_values is the sum of the per-key value counts over all keys. */
static int max_values=0, total_values=0;

/* One row of the values-per-key histogram: n_keys is the number of keys
   that each have exactly n_values values. */
struct RESULTS { int n_values, n_keys; };

/* qsort() comparator ordering RESULTS rows by ascending n_values.
     v1, v2  pointers to the two struct RESULTS rows being compared
   Returns -1, 0 or 1 when *v1 has fewer, the same number of, or more
   values than *v2.
   The parameters are const-qualified so that the function has exactly
   the type qsort() calls through; calling a function through a pointer
   to an incompatible function type is undefined behaviour. */
static int sort_results(const void *v1,const void *v2)
{
  const struct RESULTS *r1=v1, *r2=v2;
  if (r1->n_values < r2->n_values) return -1;
  else if (r1->n_values > r2->n_values) return 1;
  else return 0;
}

/* Writes the two accepted command-line forms to stderr and then
   terminates the process with exit status 1; it does not return. */
void show_usage()
{
  static const char usage_text[]=
    "Usage: analyze-index <index>\n"
    "       analyze-index <index> [histogram]\n";
  fputs(usage_text,stderr);
  exit(1);
}

/* Formats TICK as "YYYY-MM-DDTHH:MM:SS" into BUF and returns BUF.
     buf   destination buffer
     size  capacity of BUF in bytes; the output is truncated to fit and
           is NUL-terminated whenever SIZE is positive
     tick  the time to format; it is split into calendar fields by
           fd_breakup_time(), called with a third argument of 0
   When BUF is NULL or SIZE is not positive nothing is written and BUF
   is returned unchanged. */
static char *make_isotime(char *buf,int size,time_t tick)
{
  struct tm tptr;
  int year;
  if ((buf == NULL) || (size <= 0)) return buf;
  fd_breakup_time(&tptr,tick,0);
  /* Year values above 100 are printed as they are; smaller ones are
     taken as an offset from 1900.
     NOTE(review): a standard struct tm holds years since 1900, in which
     case dates after 2000 would print as a three digit year -- confirm
     what fd_breakup_time() stores in tm_year. */
  if (tptr.tm_year > 100) year=tptr.tm_year;
  else year=tptr.tm_year+1900;
  /* Bounded by SIZE, which the previous sprintf() call ignored. */
  snprintf(buf,(size_t)size,"%4d-%02d-%02dT%02d:%02d:%02d",
           year,
           tptr.tm_mon+1,
           tptr.tm_mday,
           tptr.tm_hour,tptr.tm_min,tptr.tm_sec);
  return buf;
}

/* Entry point: reports statistics about the FramerD file index named by
   argv[1] on stderr and, when exactly two arguments are given, writes a
   histogram of values-per-key to the file named by argv[2].

   The report covers the creation, repack and modification times, the
   version id, the number of keys and slots, how many keys sit in the
   first slot probed for their hash versus how long the probe chains are
   for the rest, and the distribution of the number of values per key.

   The histogram file holds one line per distinct value count, sorted by
   ascending count, in the form "<n_values>\t<n_keys>".

   Returns 0 on success.  Exits with status 1 when no index argument is
   given, when the histogram file cannot be created, when the index file
   is missing or cannot be opened, or when the index has too few slots
   for the probe arithmetic.

   The input stream and the arrays allocated here are not released
   explicitly before returning. */
int
main(int argc, char *argv[])
{
  FILE *in;
  struct FD_ASSOC *assocs;
  struct FD_FILE_INDEX *ix;
  FILE *histogram=NULL;
  int i=0, n_slots=0, n_keys=0, n_misses=0, chain_max=0, chain_sum=0;
  /* Zeroed so the report below never reads an indeterminate value,
     whatever the metadata reader chooses to fill in. */
  time_t creation_time=0, repack_time=0, change_time=0;
  int major_version=0, minor_version=0;
  char timebuf[128];
  fd_lisp metadata;

  /* argv[1] is used unconditionally below, so refuse to run without it. */
  if (argc < 2) show_usage();

  if (argc == 3) {
    histogram=fd_fopen(argv[2],"w");
    if (histogram == NULL) {
      perror(_("Can't write histogram file")); exit(1);}}

  fd_initialize_framerd();

  if (fd_file_existsp(argv[1]) == 0) {
    fd_warn("The file %s does not exist",argv[1]);
    exit(1);}

  /* The index is opened only long enough to read its slot count. */
  ix=(struct FD_FILE_INDEX *)fd_open_index(argv[1]);
  if (ix == NULL) {
    fd_warn("The file %s could not be opened as an index",argv[1]);
    exit(1);}
  n_slots=ix->size; fd_close_index((fd_index)ix);

  /* Everything else is read from the file as a plain binary stream. */
  in=fd_fopen(argv[1],"rb");
  if (in == NULL) {
    fd_warn("The file %s could not be opened for reading",argv[1]);
    exit(1);}

  /* The returned metadata value is stored but not used further here. */
  metadata=fd_read_file_index_metadata
    (in,&major_version,&minor_version,&creation_time,&repack_time,&change_time);
  if (creation_time == 0)
    fprintf(stderr,_("The file index %s is prehistoric\n"),argv[1]);
  else fprintf(stderr,_("The file index %s was created at %s\n"),
               argv[1],make_isotime(timebuf,(int)sizeof(timebuf),creation_time));

  if (repack_time == 0)
    fprintf(stderr,_("The file index %s has no repack time information\n"),argv[1]);
  else fprintf(stderr,_("The file index %s was last repacked at %s\n"),
               argv[1],make_isotime(timebuf,(int)sizeof(timebuf),repack_time));

  /* Tests change_time, the value the else branch prints; testing
     repack_time here would merely repeat the check above. */
  if (change_time == 0)
    fprintf(stderr,_("The file index %s has no useful modification time information\n"),argv[1]);
  else fprintf(stderr,_("The file index %s was last modified at %s\n"),
               argv[1],make_isotime(timebuf,(int)sizeof(timebuf),change_time));

  fprintf(stderr,_("The file index %s has version id %d:%d\n"),
          argv[1],major_version,minor_version);

  assocs=fd_read_assocs_from_index(in,&n_keys,NULL,0,0,-1,argv[1]);
  fprintf(stderr,_("The index %s stores %d keys in %d slots\n"),
          argv[1],n_keys,n_slots);

  /* The probe arithmetic below takes remainders by n_slots and by
     n_slots-2, so fewer than three slots would divide by zero (or by a
     negative number). */
  if ((n_keys > 0) && (n_slots < 3)) {
    fprintf(stderr,_("The index %s has too few slots (%d) to analyze\n"),
            argv[1],n_slots);
    exit(1);}

  /* For every key, replay the probe sequence (start at hash%n_slots,
     step by chain_width) until the slot recorded for the key is
     reached; the number of steps taken is the key's chain length. */
  for (i=0; i < n_keys; i++) {
    int chain_length=0;
    int probe=assocs[i].hash%n_slots;
    int chain_width=((assocs[i].hash)%(n_slots-2))+1;
    int size=assocs[i].n_values;
    /* Bounded by n_slots so that an entry whose recorded slot is never
       produced by this sequence cannot keep the loop running forever. */
    while ((probe != assocs[i].index) && (chain_length < n_slots)) {
      probe=(probe+chain_width)%n_slots; chain_length++;}
    if (probe != assocs[i].index)
      fprintf(stderr,
              _("Warning: key %d was not found along its probe chain\n"),i);
    if (chain_length > chain_max) chain_max=chain_length;
    chain_sum=chain_sum+chain_length;
    if (chain_length) n_misses++;
    if (size > max_values) max_values=size;
    total_values=total_values+size;
    /* lowhist has ten entries; ignore counts outside 0..9. */
    if ((size >= 0) && (size < 10)) lowhist[size]++;}

  if (n_misses) {
    fprintf(stderr,
            _("Of the %d keys; %d (%4.2f%%) are direct hits\n"),
            n_keys, n_keys-n_misses,
            (((double)(n_keys-n_misses))*100.0)/((double)n_keys));
    /* chain_sum counts steps, so adding one per miss gives the number
       of slots examined. */
    fprintf(stderr,
            _("The %d misses average chains of %4.2f elements (max=%d)\n"),
            n_misses,(((double)chain_sum+n_misses)/((double)n_misses)),
            chain_max);}
  else fprintf(stderr,
               _("The index contains %d keys, all of which are direct hits\n"),
               n_keys);
  if (n_keys)
    fprintf(stderr,
            _("These keys refer to %d values, making %g references on average\n"),
            total_values,(((double)total_values)/n_keys));
  fprintf(stderr,
          _("The most values associated with a key is %d\n"),max_values);
  for (i=0; i < 10; i++) {
    /* Report 0% rather than dividing by zero for an index with no keys. */
    double percent=((n_keys > 0) ? ((100.0*(double)lowhist[i])/n_keys) : 0.0);
    /* The cast makes the argument match the %d conversion. */
    fprintf(stderr,
            _("  %f%% (%d) of the keys have %d values;\n"),
            percent,(int)lowhist[i],i);}

  if (histogram) {
    int j=0;
    struct FD_HASHTABLE table; fd_pair *elements;
    struct RESULTS *results;
    /* Tally, for each distinct value count, how many keys have it. */
    fd_init_hashtable(&table,n_keys/8);
    for (i=0; i < n_keys; i++) {
      fd_lisp nv=FD_LISPFIX(assocs[i].n_values);
      fd_hashtable_increment(&table,nv,1);}
    /* Copy the occupied table slots into a flat array: the car of each
       entry is the value count, the cdr the number of keys tallied. */
    elements=table.table;
    results=fd_malloc(sizeof(struct RESULTS)*table.n_keys);
    for (i=0, j=0; i < table.n_slots; i++) {
      if (elements[i]) {
        results[j].n_keys=FD_FIXLISP(elements[i]->cdr);
        results[j].n_values=FD_FIXLISP(elements[i]->car); j++;}}
    qsort(results,table.n_keys,sizeof(struct RESULTS),
          (int (*)(const void *,const void *))sort_results);
    for (i=0; i < table.n_keys; i++)
      fprintf(histogram,"%d\t%d\n",results[i].n_values,results[i].n_keys);
    fd_fclose(histogram);}
  return 0;
}









/* File specific stuff */

/* The CVS log for this file
   $Log: analyze-index.c,v $
   Revision 1.11  2002/06/29 01:25:58  haase
   Made dbtest relocatable

   Revision 1.10  2002/06/03 21:51:21  haase
   Progress reports now provide more context

   Revision 1.9  2002/04/22 14:23:08  haase
   Added extended metadata to file pools and indices

   Revision 1.8  2002/04/10 03:02:10  haase
   Added version information to file pools and indices

   Revision 1.7  2002/04/03 01:33:09  haase
   Moved indextools out of FD_SOURCE core

   Revision 1.6  2002/04/02 21:39:32  haase
   Added log and emacs init entries to C source files

*/

/* Emacs local variables
;;;  Local variables: ***
;;;  compile-command: "cd ../..; make" ***
;;;  End: ***
*/
