#!/usr/bin/perl -w

# Optional first command-line argument: the rules file or directory that
# readscores() reads scores from.  readscores() substitutes "../rules"
# when no argument is given.
my $argcffile = shift @ARGV;

# Names of tests that read_ranges() marks as non-mutable
# ($is_mutatable{NAME} = 0).  Several names (for example RAZOR_CHECK and
# NO_MX_FOR_FROM) occur in both groups below; each name is only used as a
# hash key, so the repeats are harmless.
my @unmutated_tests = qw(
SIRCAM_SIGNATURE
BADTRANS_WORM

USER_IN_ALL_SPAM_TO
USER_IN_MORE_SPAM_TO
USER_IN_WHITELIST_TO
A_FROM_IN_AUTO_WLIST
USER_IN_WHITELIST
USER_IN_BLACKLIST

RAZOR_CHECK
RCVD_IN_RELAYS_ORDB_ORG
RCVD_IN_OSIRUSOFT_COM
X_OSIRU_SPAM_SRC
X_OSIRU_SPAMWARE_SITE
NO_MX_FOR_FROM

YR_MEMBERSHIP_EXCH
X_UIDL_SPAMSIGN
X_OSIRU_SPAMWARE_SITE
X_OSIRU_SPAM_SRC
WEB4PORNO_URL
USER_IN_WHITELIST_TO
USER_IN_WHITELIST
USER_IN_MORE_SPAM_TO
USER_IN_BLACKLIST
USER_IN_ALL_SPAM_TO
TO_INVESTORS
STAINLESS_STEEL
SPAM_PHRASES_100
SPAM_PHRASES_040
SPAM_PHRASES_020
SPAM_FORM_INPUT
SHOES_GUY
SEXY_PICS
REMOVE_ES_04
REMOVE_ES_03
REMOVE_ES_02
REMOVE_ES_01
RCVD_IN_VISI
RCVD_IN_RSS
RCVD_IN_RFCI
RCVD_IN_RELAYS_ORDB_ORG
RCVD_IN_RBL
RCVD_IN_OSIRUSOFT_COM
RCVD_IN_ORBS
RCVD_IN_DUL
RCVD_IN_BL_SPAMCOP_NET
RAZOR_CHECK
Q_FOR_SELLER
PRINT_OUT_AND_FAX
PORN_2
PENIS_ENLARGE
NO_MX_FOR_FROM
NIGERIAN_SCAM_5
NIGERIAN_SCAM
MAILMAN_CONFIRM
LASER_PRINTER
JUST_MAILED_PAGE
INTERNET_TERROR_RANT
HUNZA_DIET_BREAD
HTTP_CTRL_CHARS_HOST
FROM_FORGED_HOTMAIL
FREQ_SPAM_PHRASE
FREEWEBHOSTINGCENTRAL
FREEWEBCO_NET_URL
EXCUSE_ES_03
EXCUSE_ES_02
EXCUSE_9
EXCUSE_5
EXCUSE_18
E_WEBHOSTCENTRAL_URL
EU_200_32_CE
EMAIL_HARVEST
EGP_HTML_BANNER
DIFF_C_PATCH
CLICK_TO_REMOVE_MAILTO
CLICKSFORMONEY_NET
BUGGY_CGI_ES_2
BUGGY_CGI_ES
BUGGY_CGI_DE_3
BUGGY_CGI_DE_2
BUGGY_CGI_DE
BRAND_NEW_PAGER
ANOTHER_NET_AD
A_HREF_TO_IP
25FREEMEGS_URL


  );

# Neither of these two values is referenced anywhere else in this script.
my ($threshold, $iterlimit) = (5, 0);

# Per-message state filled in by readlogs(), keyed by message number.
my (%is_spam, %tests_hit);

# Message counters, also set by readlogs().
my ($total, $totspam, $totnonspam);

# Hash ref of test name => score, populated by readscores().
my $scores;
readscores();
my $origscores = $scores;   # second reference to the same hash

print "Reading per-message hit stat logs and scores...\n";
readlogs();
read_ranges();

print "Writing logs and current scores as C code...\n";
writescores_c();
writetests_c();

exit 0;


# Read the per-message hit logs "spam.log" and "nonspam.log" from the
# current directory.
#
# A usable line has the shape: one character, whitespace, a field made of
# digits and minus signs, whitespace, one whitespace-free field, and then a
# comma-separated list of the tests that fired.  Lines of any other shape
# are reported with a warning and skipped.  Test names that have no entry
# in $scores are warned about and dropped from the list.
#
# Results are stored in file-level variables:
#   $tests_hit{N}  - array ref of the known tests hit by message N
#   $is_spam{N}    - 1 for messages from spam.log, 0 for nonspam.log
#   $totspam, $totnonspam, $total - message counts
#
# Dies if either log cannot be opened, instead of silently producing an
# empty or half-populated data set.
sub readlogs {
  my $count = 0;
  $totspam = $totnonspam = 0;

  foreach my $file ("spam.log", "nonspam.log") {
    open (my $in, '<', $file) or die "cannot read $file: $!\n";

    while (my $line = <$in>) {
      # The trailing capture picks up the test list directly, so the
      # post-match variable $' is not needed.
      my ($list) = $line =~ /^.\s+[-\d]+\s+\S+\s*(.*)/;
      if (!defined $list) { warn "bad line: $line"; next; }

      $list =~ s/,,+/,/g;
      $list =~ s/^\s+//;
      $list =~ s/\s+$//;

      my @tests = ();
      foreach my $tst (split (/,/, $list)) {
        next if ($tst eq '');
        if (!defined $scores->{$tst}) {
          warn "unknown test in $file, ignored: $tst\n";
          next;
        }
        push (@tests, $tst);
      }

      $tests_hit{$count} = \@tests;

      if ($file eq "spam.log") {
        $totspam++;
        $is_spam{$count} = 1;
      } else {
        $totnonspam++;
        $is_spam{$count} = 0;
      }
      $count++;
    }
    close $in;
  }
  $total = $count;
}


# Load test names and scores into the file-level hash ref $scores.
#
# The source is $argcffile, which defaults to "../rules".  If it is a
# directory, every file in it matching [0-9]*.cf is read; otherwise it is
# read as a single file.  In each file, text from "#" to the end of the
# line is discarded, then:
#   header|body|rawbody|full|uri NAME ...  gives NAME a score of 1 unless
#                                          NAME already has a score
#   score NAME VALUE                       sets NAME's score to VALUE
#
# A file that cannot be opened is reported with a warning and skipped.
sub readscores {
  $scores = { };

  if (!defined $argcffile) { $argcffile = "../rules"; }

  my @files = (-d $argcffile)
      ? glob ("$argcffile/[0-9]*.cf")
      : ($argcffile);

  foreach my $cffile (@files) {
    print "Reading scores from \"$cffile\"...\n";

    my $in;
    if (!open ($in, '<', $cffile)) {
      warn "cannot read $cffile\n";
      next;             # nothing to read from an unopened handle
    }

    while (my $line = <$in>) {
      $line =~ s/#.*$//g;
      $line =~ s/^\s+//;
      $line =~ s/\s+$//;

      if ($line =~ /^(?:header|body|rawbody|full|uri)\s+(\S+)\s+/) {
        # Default score.  Tested with defined() rather than ||= so that an
        # explicit "score NAME 0" seen earlier is not replaced by 1.
        $scores->{$1} = 1 unless defined $scores->{$1};
      } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
        $scores->{$1} = $2;
      }
    }
    close $in;
  }
}

# Write the current scores for the C side: tmp/scores.data (one record per
# test) and tmp/scores.h (array declarations sized to the number of tests).
#
# tmp/scores.data starts with "N<count>", followed for each test, in sorted
# name order, by the lines:
#   .<index>  n<name>  b<score>  m<is_mutatable>  l<range_lo>  h<range_hi>
#
# Side effects on file-level hashes:
#   %is_mutatable  - tests with no entry yet are set to 1
#   %range_lo/hi   - tests that already had an %is_mutatable entry get both
#                    bounds set to their current score; missing bounds
#                    default to 0.1 and 1.5
#   %score_c_index - maps each test name to its index in the C arrays
#
# Dies if either output file cannot be opened or closed.
sub writescores_c {
  # Do not pad $size: extra genes make the GA take a lot longer and mess up
  # the scores output - crh
  my $size = scalar (keys %{$scores});

  open (my $dat, '>', 'tmp/scores.data')
      or die "cannot write tmp/scores.data: $!\n";
  print $dat "N$size\n";

  my $count = 0;
  foreach my $name (sort keys %{$scores}) {
    if (!defined $is_mutatable{$name}) {
      $is_mutatable{$name} = 1;
    } else {
      $range_lo{$name} = $range_hi{$name} = $scores->{$name};
    }

    # NOTE(review): ||= also replaces a bound that is exactly 0, not only a
    # missing one - confirm whether that is intended.
    $range_lo{$name} ||= 0.1;
    $range_hi{$name} ||= 1.5;

    print $dat ".".$count."\n";
    print $dat "n".$name."\n";
    print $dat "b".$scores->{$name}."\n";
    print $dat "m".$is_mutatable{$name}."\n";
    print $dat "l".$range_lo{$name}."\n";
    print $dat "h".$range_hi{$name}."\n";

    $score_c_index{$name} = $count;
    $count++;
  }

  # Buffered write errors only surface at close time.
  close ($dat) or die "error writing tmp/scores.data: $!\n";

  open (my $out, '>', 'tmp/scores.h')
      or die "cannot write tmp/scores.h: $!\n";
  print $out "

int num_scores;
unsigned char is_mutatable[$size]; 	/* er, is_mutable I think ;) */
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
double scores[$size];
char *score_names[$size];

/* readscores() is defined in tests.h */

";
  close ($out) or die "error writing tmp/scores.h: $!\n";
}

# Write the per-message hit data for the C side: tmp/tests.h (counts, array
# declarations and the C source held in this script's DATA section) and
# tmp/tests.data (one record per message).
#
# Each record in tmp/tests.data consists of the lines:
#   .<message number>
#   n<number of test hits>
#   s<is_spam flag>
#   t<score index>        (one line per test hit)
#
# Reads the file-level $total, $totspam, $totnonspam, %tests_hit, %is_spam
# and %score_c_index, so readlogs() and writescores_c() must have run.
#
# Dies if either output file cannot be opened or closed.  A test that has
# no entry in %score_c_index is warned about and left out of the record,
# so no "t" line with an empty index is written.
sub writetests_c {
  # Width of the C tests_hit[][] rows: one more than the largest number of
  # hits on any single message.
  my $max_hits_per_msg = 0;
  for (my $msg = 0; $msg < $total; $msg++) {
    my $hits = scalar (@{$tests_hit{$msg}}) + 1;
    if ($hits > $max_hits_per_msg) { $max_hits_per_msg = $hits; }
  }

  open (my $top, '>', 'tmp/tests.h')
      or die "cannot write tmp/tests.h: $!\n";
  print $top "

int num_tests = $total;
int num_spam = $totspam;
int num_nonspam = $totnonspam;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$total];
unsigned char is_spam[$total];
unsigned short tests_hit[$total][$max_hits_per_msg];

";
  # Append the C loader functions stored after __DATA__.
  print $top join ('', <DATA>);
  close ($top) or die "error writing tmp/tests.h: $!\n";

  open (my $dat, '>', 'tmp/tests.data')
      or die "cannot write tmp/tests.data: $!\n";

  for (my $msg = 0; $msg < $total; $msg++) {
    print $dat ".".$msg."\n";

    my $out = "s".$is_spam{$msg}."\n";

    my $num_tests_hit = 0;
    foreach my $test (@{$tests_hit{$msg}}) {
      next if ($test eq '');

      if (!defined $score_c_index{$test}) {
        warn "test with no C index: $test\n";
        next;           # an empty index would be read back as test 0
      }

      $num_tests_hit++;
      $out .= "t".$score_c_index{$test}."\n";

      if ($num_tests_hit >= $max_hits_per_msg) {
        die "Need to increase \$max_hits_per_msg";
      }
    }

    # The hit count goes first: the C reader resets its write position when
    # it sees the "n" line.
    print $dat "n".$num_tests_hit."\n".$out;
  }
  close ($dat) or die "error writing tmp/tests.data: $!\n";
}

# Rebuild %is_mutatable and load score ranges from tmp/ranges.data.
#
# %is_mutatable is reset so that it holds only the names listed in
# @unmutated_tests, each mapped to 0.  If tmp/ranges.data is missing,
# "make tmp/ranges.data" is run first.  Each line of the form
# "LO HI MUTABLE NAME" then sets $range_lo{NAME} and $range_hi{NAME}; a
# MUTABLE value of 0 also sets $is_mutatable{NAME} to 0.  Lines of any
# other form are ignored.
#
# Dies if tmp/ranges.data cannot be opened.
sub read_ranges {
  %is_mutatable = map { ($_ => 0) } @unmutated_tests;

  system ("make tmp/ranges.data") unless -f 'tmp/ranges.data';

  # read ranges, and mutatableness, from ranges.data.
  open (my $ranges, "<tmp/ranges.data")
      or die "need to run score-ranges-from-freqs first!";

  while (defined (my $line = <$ranges>)) {
    next unless $line =~ /^(\S+) (\S+) (\d+) (\S+)$/;
    my ($lo, $hi, $mutable, $test) = ($1, $2, $3, $4);

    $range_lo{$test} = $lo + 0;
    $range_hi{$test} = $hi + 0;
    $is_mutatable{$test} = 0 if ($mutable + 0) == 0;
  }
  close $ranges;
}

__DATA__

/*
 * Load the per-message test results from tmp/tests.data into the
 * num_tests_hit[], is_spam[] and tests_hit[][] arrays.
 *
 * Each line is a one-character command followed by a decimal integer:
 *   .N  the lines that follow describe message N
 *   nN  number of test hits for the message; restarts the hit slot index
 *   sN  spam flag of the message
 *   tN  index of a test that hit, stored in the next free slot
 * Lines starting with any other character are ignored.
 *
 * Exits with status 1 if the data file cannot be opened.
 */
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;
  int nmsgs = 0;	/* number of '.' records seen */

  if (fin == NULL) {
    perror ("tmp/tests.data");
    exit (1);
  }

  while (fgets (buf, sizeof buf, fin) != NULL) {
    char cmd;
    long arg;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);

    if (cmd == '.') {
      file = arg;
      nmsgs++;

    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;

    } else if (cmd == 's') {
      is_spam[file] = arg;

    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;
    }
  }
  fclose(fin);

  /* Counting records reports 0, not 1, for an empty data file. */
  printf ("Read test results for %d messages.\n", nmsgs);
}

/*
 * Load the score table from tmp/scores.data.
 *
 * Each line is a one-character command followed by its argument:
 *   N<int>     total number of scores, stored in num_scores
 *   .<int>     the lines that follow describe score slot <int>
 *   n<string>  test name, stored in score_names[]
 *   b<number>  stored in bestscores[]
 *   m<int>     stored in is_mutatable[]
 *   l<number>  stored in range_lo[]
 *   h<number>  stored in range_hi[]
 * Lines starting with any other character are ignored.
 *
 * Exits with status 1 if the data file cannot be opened.
 */
void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  if (fin == NULL) {
    perror ("tmp/scores.data");
    exit (1);
  }

  while (fgets (buf, sizeof buf, fin) != NULL) {
    char cmd;
    long arg;
    double argf;	/* double: the destination arrays are double */
    char *str, *white;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argf = strtod (buf+1, NULL);
    str = buf+1;

    /* Strip the newline fgets leaves at the end of the line. */
    if ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;

    } else if (cmd == 'N') {
      num_scores = arg;

    } else if (cmd == 'b') {
      bestscores[snum] = argf;

    } else if (cmd == 'l') {
      range_lo[snum] = argf;

    } else if (cmd == 'h') {
      range_hi[snum] = argf;

    } else if (cmd == 'n') {
      score_names[snum] = strdup (str);	/* never freed */

    } else if (cmd == 'm') {
      is_mutatable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}

