#!/usr/bin/perl
# Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# version 2 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# filter_Text:
#  Filter text strings.
#  The output contains only defined functions, one token per line.
#  This removes anything that looks like binary or a program.
#  Text within programs/binaries is listed as different functions.

###############################################################
###############################################################
###############################################################

###############################################################
# SAM2bSAM(): Convert a string-based function to binary.
#
#  Parameters:
#    $FunctionName - name stored in the "function name" record.
#    $Function     - text of the function; tokens are separated
#                    by whitespace.
#  Returns one binary string holding:
#    - tag 0x0101, a 16-bit length (name length + 1), the name,
#      a NUL byte, and a 0xFF pad byte when that length is odd;
#    - tag 0x0108, a 16-bit byte count, and one 16-bit value per
#      non-empty token.
#  All 16-bit values are packed with "n" (big-endian).
#  Text longer than 32766 characters is cut into 32766-character
#  pieces named "<name>.0", "<name>.1", ...; the records of all
#  pieces are returned concatenated.
###############################################################
sub SAM2bSAM
{
  my ($FunctionName, $Function) = @_;
  my $ChunkSize = 32766; # longest text handled as a single function

  # Really long function?  Arbitrarily split it and encode each piece.
  if (length($Function) > $ChunkSize)
    {
    my $Joined = "";
    my $Piece = 0;
    while ($Piece * $ChunkSize < length($Function))
      {
      my $Text = substr($Function, $Piece * $ChunkSize, $ChunkSize);
      $Joined .= SAM2bSAM("$FunctionName.$Piece", $Text);
      $Piece++;
      }
    return $Joined;
    }

  # Reduce every non-empty token to one 16-bit value.
  my $TokenData = "";   # packed token values
  my $TokenBytes = 0;   # number of bytes in $TokenData
  foreach my $Word (split(/[[:space:]]/, $Function))
    {
    next if ($Word eq "");

    # Read the token as big-endian character pairs (a trailing
    # single character counts as the high half) and add them up.
    my @Codes = map { ord($_) } split(//, $Word);
    my $Checksum = 0;
    while (@Codes)
      {
      my $High = shift(@Codes);
      my $Low = @Codes ? shift(@Codes) : 0;
      $Checksum += $High * 256 + $Low;
      }

    $TokenData .= pack("n", $Checksum);
    $TokenBytes += 2;
    }

  # Name record: the stored length counts the name and its NUL
  # terminator but not the pad byte.
  my $NameLength = length($FunctionName) + 1;
  my @Parts = (pack("n", 0x0101), # tag function name
               pack("n", $NameLength),
               $FunctionName,
               chr(0));
  # pack function names to 2-byte boundary
  push(@Parts, chr(255)) if ($NameLength % 2 == 1);

  # Token record.
  push(@Parts, pack("n", 0x0108), # tag function tokens
               pack("n", $TokenBytes),
               $TokenData);

  return(join("", @Parts));
} # SAM2bSAM()

###############################################################
###############################################################
# main()
#  For each file named on the command line:
#    1. read the whole file,
#    2. lowercase it, normalize special characters and year
#       templates, and turn every run of non-text characters
#       into a "~" separator,
#    3. drop short strings, space out punctuation and collapse
#       years into the marker "Year",
#    4. print a file-type header followed by one SAM2bSAM()
#       record per remaining section ("Section_1", ...).
#  Processing stops at the first empty or missing argument.
#  Dies if a file cannot be opened.
###############################################################

use strict;
use warnings;

# The records printed below are binary; keep the output layer from
# translating any bytes.
binmode(STDOUT);

while(defined($ARGV[0]) && ($ARGV[0] ne ""))
{
  # Step 1: Load source
  my $FileName = shift(@ARGV);
  my $Fin;
  if ($FileName eq "-")
    {
    # "-" keeps meaning standard input, as it did when the name was
    # handed to a two-argument open.
    $Fin = \*STDIN;
    }
  else
    {
    # Three-argument open: the name is used exactly as given.
    open($Fin, "<", $FileName) or die "Unable to open file $FileName: $!\n";
    }
  binmode($Fin); # the character fixes below work on raw bytes

  # Slurp the file; $/ is only undefined inside the do block.
  my $Content = do { local $/; <$Fin> };
  my $WholeFile = " ";
  if (defined($Content)) { $WholeFile .= $Content; }
  $WholeFile .= " ";
  if ($FileName ne "-") { close($Fin); }

  # filter out all non-text characters.
  # Everything is lowercased first, so the mixed-case markers inserted
  # below (Year, Copyright, TM, SS, ...) cannot be confused with words
  # from the file itself.
  $WholeFile =~ tr/A-Z/a-z/; # convert to lowercase
  $WholeFile =~ s@[[:space:]]+@ @g; # convert spaces

  # character fixes (ISO8859)
  $WholeFile =~ s@\x91@'@g; # quote in: `
  $WholeFile =~ s@\x92@'@g; # quote out: '
  $WholeFile =~ s@\x93@"@g; # quote: ``
  $WholeFile =~ s@\x94@"@g; # quote: ''
  $WholeFile =~ s@''@"@g; # poor-man's double quote
  $WholeFile =~ s@\x99@TM@g; # Trademark
  $WholeFile =~ s@\xA9@Copyright@g; # copyright symbol
  $WholeFile =~ s@\xAE@RegisteredTrademark@g; # registered trademark symbol
  $WholeFile =~ s@\xD5@'@g; # single quote
  $WholeFile =~ s@\xA7@SS@g; # subsection (legal notation)
  $WholeFile =~ s@\xC2@SS@g; # subsection (legal notation)
  $WholeFile =~ s@\xE9@e@g; # e with an accent
  $WholeFile =~ s@<year>@Year@g; # year template
  $WholeFile =~ s@\{year\}@Year@g; # year template
  $WholeFile =~ s@\[year\]@Year@g; # year template
  $WholeFile =~ s@&copy;@Copyright@g; # copyright symbol
  $WholeFile =~ s@\(c\)@Copyright@g; # copyright symbol
  $WholeFile =~ s@<[^>]*>@@g; # remove all other tags

  # reduction
  # Every character outside the range space..z becomes the section
  # separator "~".
  $WholeFile =~ s@[^ -z]@~@g; # remove non-text characters
  $WholeFile =~ s@~+@~@g; # remove empty functions
  # remove short strings (1 to 7 characters between two separators);
  # repeat until a pass finds nothing, because removing one string
  # can leave its neighbour newly enclosed.
  1 while ($WholeFile =~ s@~.{1,7}~@~@g);

  # string reductions for clarification
  $WholeFile =~ s@(.)\1{3,}@SiMpLiFy@g; # more than 3 of same character? reduce!
  $WholeFile =~ s@([^a-zA-Z0-9])@ $1 @g; # space between chars
  $WholeFile =~ s@  *@ @g; # remove empty spaces
  $WholeFile =~ s@ Copyright copyright @ copyright Copyright @g; # copyright ordering

  # reduce years
  # NOTE: the pattern consumes the space on both sides, so a year that
  # directly follows another replaced year (separated by one space
  # only) is left as digits.  Kept as is so existing output does not
  # change.
  $WholeFile =~ s@ [0-9][0-9][0-9][0-9] @ Year @g; # year
  # Collapse year ranges and lists until no pass changes anything.
  my $Reduced = 1;
  while($Reduced)
    {
    $Reduced = 0;
    if ($WholeFile =~ s@Year - Year@Year@g) { $Reduced = 1; } # year range
    if ($WholeFile =~ s@Year , Year@Year@g) { $Reduced = 1; } # year range
    }

  # Start the data file
  print pack("n",0x0004); # file type
  print pack("n",length("Text")+1);
  print "Text" . chr(0);
  # pack function names to 2-byte boundary
  if ((length("Text")+1) % 2 == 1) { print chr(255); }

  # One record per non-empty section between "~" separators.
  my $FunctionCount=0;
  foreach my $Section ( split(/~/,$WholeFile) )
    {
    if ($Section ne "")
      {
      $FunctionCount++;
      print SAM2bSAM("Section_$FunctionCount",$Section);
      }
    }

  # next file!
} # while files to process

