#!/usr/bin/env perl

# (c) Copyright 2002, 2003, 2004, 2005 Stijn van Dongen
# (c) Copyright 2004 Jason Stajich (column output parsing)
#
# Other (C)ontributors;
#     Dinakarpandian Deendayal      (general comments)
#     Abel Ureta-Vidal              (general comments)
#     Daniel Lundin                 (regexp tweak, warning)
#
# You can redistribute and/or modify this program under the terms of the GNU
# General Public License;  either version 2 of the License or (at your option)
# any later version.

#
# TODO
#
#  -  Infer hsp_len from long output (?)
#  -  (Perhaps) enable section parsing ala tribe, for finer control.   
#  -  $:: usage is arbitrary and ugly. pipe down.
#  -  Is the m9/seenlfgt combination correct?
#  -  Check whether m9 output state is correct if errors occur.

#  -  Check values; check correct usage --key=val and --key
#     (by associating type with key). Or just be sane, use Getopt::Long.
# _____________________________________
# general Getopt::Long / GNU style rant:
# --size=24 and --size 24 should *NOT* mean the same. It's hideous
# to have those syntaxes collapse. Pick one, not two.
# What I find reasonable is to have --verbose mean default verbosity,
# and --verbose=babbling or --verbose=richter-10 etc set the variants.
# then -verbose babbling would be the two-token invocation.
# I might give in though, as some stances are in vain or even in vanity.


use strict;

$::mode_sort = 'o';     # a lphabetical
                        # o ccurrence

$::mode_score = 'e';    # e value
                        # b it

$::mode_parse  = 'long';   # or 'm9'.

$::stream_out  =  "";      # don't stream by default.
$::name_out    =  '-';

$::bcut = 0;
$::ecut = 0;
$::rcut = 0;

my $blastfix = "";
my $addfix = "";

my $user_tabfile = "";

my %stream_modes = qw ( abc 1 123 1 packed 1 );

my $obase = "";

while ($ARGV[0] =~ /^--/) {
   my $arg = shift @ARGV;
   my ($val, $key) = ("", "");
   if ($arg =~ /^--(.*?)(=(.*))?$/) {
      $key = $1;
      $val = $2 ? $3 : "";
   }
   else {
      print "Arguments must be in <--key=val> or <--key> format\n";
      exit(1);
   }

   if ($key eq 'sort') {
      $::mode_sort = $val;
      if ($::mode_sort !~ /^[ao]$/) {
         die "unknown sort mode <$::mode_sort>\n";
      }
   }

   elsif ($key eq 'm9')          { $::mode_parse= 'm9'; }
   elsif ($key eq 'abc')         { $::mode_parse= 'abc'; }
   elsif ($key eq 'ecut')        { $::ecut      = $val; }
   elsif ($key eq 'bcut')        { $::bcut      = $val; }
   elsif ($key eq 'rcut')        { $::rcut      = $val; }
   elsif ($key eq 'tab')         { $user_tabfile= $val; }
   elsif ($key eq 'xo-dat')      { $addfix      = $val; }
   elsif ($key eq 'xi-dat')      { $blastfix    = $val; }
   elsif ($key eq 'base')        { $obase       = $val; }

   elsif ($key eq 'stream') {
      $::stream_out  = $val;
      if (!defined($stream_modes{$val})) {
         die "unknown streaming mode <$val>";
      }
   }
   elsif ($key eq 'out') {
      $::name_out  = $val;
   }
   elsif ($key eq 'abc-out') {
      $::name_out  = $val;
      $::stream_out = 'abc';
   }

   elsif ($key eq 'score') {
      $::mode_score = $val;
      if ($::mode_score !~ /^[ebr]$/) {
         die "unknown sort mode <$::mode_score>\n";
      }
   }
   elsif ($key eq 'help') {
      help();
   }
   else {
      die "unknown argument $arg\n";
   }
}

if (!@ARGV) {
   help();
   exit(0);
}

my $fname = shift || die "please submit name of blast file\n";
$obase = $fname unless $obase;

if ($blastfix) {
   $obase = $fname;
   if ($fname =~ /\Q.$blastfix\E$/) {
      $obase =~ s/\Q.$blastfix\E$//;
   }
   else {
      $fname .= ".$blastfix";
   }
}
if ($addfix) {
   $obase .=  ".$addfix";
}

my ($gix, $giy);
$::seenlft = {};
$::seenrgt = {};
my $tagTocc = {};
my $me = "[$0] ";
my $lc = 0;
my $fh = \*STDIN;

if ($fname ne '-') {
   open(F_BLAST, "<$fname") || die "cannot open $fname\n";
   $fh = \*F_BLAST;
}

$::TAB_user = {};

if ($user_tabfile) {
   read_tab($user_tabfile, $::TAB_user);
}


$::f_raw = undef;
my $f_err = undef;

if ($::stream_out) {
   if (!$::name_out || $::name_out eq '-') {
      open(F_RAW, ">$::name_out") || die "cannot open $::name_out";
      $::f_raw = \*F_RAW;
   }
   else {
      $::f_raw = \*STDOUT;
   }
   $f_err = \*STDERR;

   if ($::stream_out eq 'abc') {
      # print $::f_raw "#aa# (stream abc cookie)\n";
   }
   elsif ($::stream_out eq '123') {
      # print $::f_raw "#11# (stream abc cookie)\n";
   }
   elsif ($::stream_out eq 'packed') {
      # my $sentinel = 2+4+32+128;
      my $cookie     =  0x777700;      # 00000000111111111111111100000000
      # print $::f_raw "L", $cookie;
   }
}
else {
   open(F_RAW, ">$obase.raw") || die "cannot open $obase.raw\n";
   $::f_raw = \*F_RAW;
   open(F_ERR, ">$obase.err") || die "cannot open $obase.err\n";
   $f_err = \*F_ERR;
}


$::ID = 0;

if ($::mode_parse eq 'long') {
   munge_long($fh, $::f_raw);
}
elsif ($::mode_parse eq 'm9' || $::mode_parse eq 'abc') {
   munge_linewise($fh, $::f_raw, $::mode_parse);
}
else {
   die "Unknown parse mode $::mode_parse\n";
}

close $::f_raw;
$::CT = scalar keys %$::seenrgt;


my $alnum = 0;
my $occTmisc = {};

if (!$user_tabfile && !$::stream_out) {
   open(F_HDR, ">$obase.hdr") || die "cannot open $obase.hdr\n";    # header
   open(F_MAP, ">$obase.map") || die "cannot open $obase.idx\n";    # map file
   open(F_TAB, ">$obase.tab") || die "cannot open $obase.idx\n";    # indices
   print F_TAB "#<mapped index> <tag>\n";
   if ($::mode_sort eq 'o') {
      print F_TAB "# sort mode is by occurrence\n";
   }
   elsif ($::mode_sort eq 'a') {
      print F_TAB "# sort mode is alphabetical\n";
   }
   if ($::mode_sort eq 'a') {
      for (sort {$::a cmp $::b; } keys %$tagTocc) {
         print F_TAB "$alnum $_\n";
         $occTmisc->{$tagTocc->{$_}} = [ $alnum, $_ ];
         $alnum++;
      }
      print STDERR "Index [$obase.tab] is sorted by alphabetic order\n";
   }
   elsif ($::mode_sort eq 'o' || 1) {
      for (sort {$tagTocc->{$::a} <=> $tagTocc->{$::b}; } keys %$tagTocc) {
         print F_TAB "$alnum $_\n";
         $occTmisc->{$tagTocc->{$_}} = [ $alnum, $_ ];
         $alnum++;
      }
      print STDERR "Index [$obase.tab] is sorted by occurrence order\n";
      print STDERR "Primary and secondary occurrences are considered equal\n";
   }
   my $ct = keys %$occTmisc;
   print F_MAP "(mclheader\nmcltype matrix\ndimensions $ct", 'x',
               "$ct\n)\n(mclmatrix\nbegin\n";
   for (sort {$::a <=> $::b; } keys %$occTmisc) {
      # print F_MAP "$_ ", $occTmisc->{$_}[0], " ", $occTmisc->{$_}[1], "\n";
      print F_MAP  "$_ $occTmisc->{$_}[0] \$\n";
   }
   print F_MAP ")\n";

   print F_HDR "(mclheader\nmcltype matrix\ndimensions ";
   print F_HDR $::ID . 'x' . $::ID;
   print F_HDR "\n)\n";
   close F_TAB;
   close F_HDR;
   close F_MAP;
}


my $n_err = 0;
for (sort keys %$::seenrgt) {
   if (!$::seenlft->{$_}) {
      print $f_err "secondary element $_ not seen as primary element\n";
      print $f_err "emergency measure: added the element to the primary list\n";
      $n_err++;
   }
}

if ($n_err) {
   print STDERR $me, "$n_err secondary elements not seen as primary element\n";
   print STDERR $me, "I added all of them\n";
   print STDERR $me, "There were $::CT elements in all\n";
}
else {
   print STDERR $me,
   "all secondary elements were also seen as primary elements (check ok)\n";
}





sub munge_linewise {

   my $fh_in   =  shift;
   my $fh_raw  =  shift;
   my $mode    =  shift;

   my $gix_prev = "";

   while (<$fh_in>) {

      next if /^#/;
      chomp;
      my $sc_abc = 0;

      my ($gix, $giy, $percent_id, $hsp_len, $mismatches, $gapsm,
          $qstart, $qend,$hstart, $hend, $e, $b)
          =
          split
      if $mode eq 'm9';
                              # yes, ugly and lazy.
      ($gix, $giy, $sc_abc) = split if $mode eq 'abc';

      my $s = 0;

      my $idx = getid($gix, 1);
      next unless $idx >= 0;

      if ($gix_prev ne $gix) {
          if (!$::stream_out) {
             if ($gix_prev) {
               print $fh_raw "\$\n";
             }
             if( $idx >= 0 ) {
               print $fh_raw "$idx ";
             }
         }
      }
      my $idy = getid($giy, 0);

      $s
      =     $mode eq 'm9'
         ?  getscore($e, $b, $hsp_len)
         :  $sc_abc;

      if ($idy >= 0) {
          if (!$::stream_out) {
             print $fh_raw "$idy:$s ";
          }
          else {
             print $fh_raw "$gix\t$giy\t$s\n";
          }
      }
      $gix_prev = $gix;
   }

   if ($gix_prev) {
      print $fh_raw "\$\n" if !$::stream_out;
   }
}


sub munge_long {

   my $fh_in = shift;
   my $fh_raw = shift;

   my $need_query = 1;
   my $need_hits = 2;
   my $need_gi = 3;

   my $state = $need_query;

   while (<$fh_in>) {
      $lc++;
      chomp;
      if (/^Query=\s+gi\|(\d+(_\d+)?)/ || /^Query=\s+(\S+).*$/) {
# warn "STATE query $1\n";
         if ($state != $need_query) {
            print STDERR "Unexpected 'Query=' line\n";
         }
         $gix = $1;
         my $idx = getid($gix, 1);

         if ($idx >= 0) {
            print $fh_raw "$idx " if !$::stream_out;
         }
         $state = $need_hits;
      }
      elsif (/^Query=/) {
         print STDERR "Query string not recognized: $_\n";
      }
      elsif
      ($state == $need_hits && /sequences producing significant alignments/i) {
         $state = $need_gi;
# warn "STATE significant\n";
      }
      elsif ($state == $need_hits && /no hits found/i) {
         print STDERR "no hits found for gi $gix\n";
         print $fh_raw "\$\n" if !$::stream_out;
         $state = $need_query;
      }
      elsif
      (  $state == $need_gi
      && ! /^>/
      && (/^gi\|(\d+(_\d+)?)/ || /^(\S+)\s+.*$/)
      )
      {  $giy = $1;
         my $idy = getid($giy, 0);
         my ($s, $b, $e);

         if (/(\S+)\s+(\S+)\s*$/) {
            $b = +$1;
            $e = +$2;
         }
         else {
            print STDERR "no scores in line $lc [$_]!\n";
            next;
         }

         $s = getscore($e, $b, 0);

         if ($idy >= 0) {               # fixme, void or explain.
            if (!$::stream_out) {
               print $fh_raw "$idy:$s ";
            }
            else {
               print $fh_raw "$giy\t$gix\t$s\n";
            }
         }
      }
      elsif (/^\s*$/) {
         # paragraph skip does not change state, including the $need_gi case.
      }
      elsif (/(Statistics|Parameters):/) {
         $state = $need_query;
         # this provides WU-blast compatibility.
      }
      elsif ($state == $need_gi) {
         print $fh_raw "\$\n" if !$::stream_out;
         $state = $need_query;
      }
   }

   if ($state == $need_gi) {
      print $f_err "run ended while expecting more secondary scores\n";
      print STDERR "run ended while expecting more secondary scores\n";
      print $fh_raw "\$\n" if !$::stream_out;
   }
}




sub read_tab {
   my $file = shift;
   my $tab = shift;
   open (U_TAB, "<$file") || "die cannot open $file\n";
   while (<U_TAB>) {
      if (/^\s*#/) {
         next;
      }
      else {
         if (/^(\d+)\s+(.*)/) {
            $tab->{$2} = $1;
         }
         else {
            print STDERR "___ cannot parse line: $_";
         }
      }
   }
}


sub getscore {
   my ($e, $b, $hl) = @_;
   my $s = 0;

   if ($::mode_score eq 'e') {
      $e = "1$e" if $e =~ /^e/;
      $s = $e > 0 ? -log($e)/log(10) : 200;
      if ($s > 200) {
         $s = 200;
      }
      $s = $s > $::ecut ? $s : 0;
   }
   elsif ($::mode_score eq 'b') {
      $s = $b > $::bcut ? $b : 0;
   }
   elsif ($::mode_score eq 'r' && $hl) {
      $s = $b / $hl;
      $s = $s > $::rcut ? $s : 0;
   }
   return $s;
}


sub getid {
   my ($gi, $is_a_query) = @_;
   my $id = -1;

   if ($user_tabfile) {
      if (defined($::TAB_user->{$gi})) {
         $id = $::TAB_user->{$gi};
      }
      else {
         print STDERR "___ no user tab entry for label <$gi>\n";
         return -1;
      }
   }
   else {
      if (!exists($tagTocc->{$gi})) {
# warn "$is_a_query $gi <-> $::ID\n";
         $tagTocc->{$gi} = $::ID++;
      }
      $id = $tagTocc->{$gi};
   }

   $::seenrgt->{$id}++;
   if ($is_a_query) {
      $::seenlft->{$id}++;
   }
   return $id;
}


sub help {
   print <<_help_;
Usage: mcxdeblast <options> file-name
where file-name is in BLAST NCBI format.
mcxdeblast will create
   base.hdr    [to be read by mcxassemble]
   base.raw    [to be read by mcxassemble]
   base.map    [to be read by mcxassemble]
   base.tab    [to be read by clmformat]
   base.err    [error log]
where base is derived from or equal to file-name
Options:
   --m9              Expect column (-m 9) input.
   --abc             Expect simple ID1 ID2 SCORE input.
   --score=<b|e|r>   Use bit scores, E-values,
                        or bit scores normalized by hsp-length
   --sort=<a|o>      Use alphabetic sorting (default) or occurrence.
   --tab=<fname>     Use user-supplied tab file.
   --xi-dat=<suf>    Strip <suf> from file-name to create output base name.
   --xo-dat=<suf>    Add <suf> to base name.
   --bcut=<val>      Ignore hits below bit score <val>
   --ecut=<val>      Ignore hits below E-value <val>
_help_
}
