#!/usr/bin/perl
# Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# version 2 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# filter_Java:
#  Filter Java source code.
#  Based on filter_C.
#  The only outputs contains defined functions, one token per line.
#  This removes comments, globals, prototypes, etc.

###############################################################
###############################################################
###############################################################

##################################
# RemoveSpaces(): Given a string, remove
# all whitespace
# This is used for fixing quoted strings.
#################################
sub RemoveSpaces
{
  my $Line=shift;
  $Line =~ s/[[:space:]]//g;
  return($Line);
} # RemoveSpaces()

##################################
# ReplaceParameters(): Given a string that
# represents a function call, replace every
# parameter.
#################################
sub ReplaceParameters
{
  my $Line=shift;
  my $NewLine="";
  my $L;

  # remove all voids

  $Line =~ s/[[:space:]]+/ /g;
  $Line =~ s/^ +//;
  $Line =~ s/ +$//;
  if ($Line eq "") { return ""; }

  foreach $L ( split(/,/,$Line) )
    {
    if (($L ne "void") && ($L ne ""))
      {
      if ($NewLine ne "") { $NewLine .= ","; }
      $NewLine .= " >Parm< ";
      }
    }
  return($NewLine);
} # ReplaceParameters()

##################################
# IsKeyword(): check if a word is a keyword
# check for standard Java strings
# Source: http://www.jchq.net/certkey/0404certkey.htm
##################################
sub IsKeyword
{
  my $Word=shift;
  if ($Word =~ /^(abstract|boolean|break|byte|case|catch|char|class|const|continue|default|do|double|else|extends|final|finally|float|for|goto|if|implements|import|instanceof|int|interface|long|native|new|package|private|protected|public|return|short|static|strictfp|super|switch|synchronized|this|throw|throws|transient|try|void|volatile|while)$/)
  	{
	return(1);
	}
  return(0);
} # IsKeyword()

##################################
# NormalizeVariables(): Given a string, remove
# all variable names.
# Strings are only found within functions.
# At this point, functions are separated by "\n".
# This is used for reducing differences due
# to naming conventions
#################################
sub NormalizeVariables
{
  my $OldSource=shift;
  my $NewSource="";
  my $Function;
  my @LineList=();
  my $NewLine;
  my $i;

  foreach $Function ( split(/\n/,$OldSource) )
    {
    $Function =~ s/^ +//;
    @LineList=();
    @LineList = split(/ +/,$Function);
    # $LineList[0] == function name.  Don't change it!
    $NewSource .= "$LineList[0] ";
    for($i=1; $i <= $#LineList; $i++)
      {
      # check if it's a string
      if ($LineList[$i] =~ /^".*"$/)
	{
	$NewSource .= "String ";
	}
      elsif ($LineList[$i] =~ /^'.*'$/)
	{
	$NewSource .= "Char ";
	}

      # check it it's a parameter tag
      elsif ($LineList[$i] eq ">Parm<")
	{
	$NewSource .= "Parm ";
	}

      # check if it's a symbol
      elsif ($LineList[$i] !~ /^([\w.]+(->[\w.]*)*)$/)
	{
	# not a variable.  Keep it.
	$NewSource .= "$LineList[$i] ";
	}

      # check if it's a number
      elsif ($LineList[$i] =~ /^[0-9]+$/)
	{
	# not a variable.  Keep it.
	# $NewSource .= "$LineList[$i] ";
	$NewSource .= "Num ";
	}
      elsif ($LineList[$i] =~ /^0x[0-9a-fA-F]+$/)
	{
	# not a variable.  Keep it.
	# $NewSource .= "$LineList[$i] ";
	$NewSource .= "Num ";
	}

      elsif (IsKeyword($LineList[$i]) eq 1)
	{
	$NewSource .= "$LineList[$i] ";
	}

      # otherwise, assume it is a variable!
      else
	{
	$NewSource .= "Var ";
	}
      }
    $NewSource .= "\n";
    }

  # replace any "Var Var" with "Var"
  # these come from variable declarations.
  $NewSource =~ s/Var( +Var)* /Var /g;

  # Some programmers split strings like ("hi" "there")
  $NewSource =~ s/String( +String)* /String /g;

  return($NewSource);
} # NormalizeVariables()

###############################################################
# SAM2bSAM(): Convert a string-based function to binary.
###############################################################
sub SAM2bSAM
{
  my $Function=shift;
  my $bFunction=""; # entire encrypted binary function
  my $bData="";	# binary data
  my $bDataCount=0; # number of items in bData
  my $i;
  my $State="GETNAME";  # FSM: GETNAME -> READDATA

  foreach $i (split(/\n/,$Function))
    {
    if ($State eq "GETNAME")
      {
      $bFunction .= pack("n",0x0101); # tag function name
      $bFunction .= pack("n",length($i)+1);
      $bFunction .= $i;
      $bFunction .= chr(0);
      # pack function names to 2-byte boundary
      if ((length($i)+1) % 2 == 1) { $bFunction .= chr(255); }
      $State="READDATA";
      }
    elsif (($State eq "READDATA") && ($i ne ""))
      {
      my $j;
      my $Sum=0;
      for($j; $j < length($i); $j+=2)
        {
	$Sum += ord(substr($i,$j,1)) * 256;
	if ($j+1 < length($i)) { $Sum += ord(substr($i,$j+1,1)); }
	}
      $bData .= pack("n",$Sum);
      $bDataCount+=2;
      }
    elsif (($State eq "READDATA") && ($i eq ""))
      {
      $bFunction .= pack("n",$bDataCount) . $bData;
      $bData="";
      $bDataCount="";
      $State="GETNAME";
      }
    }
  if ($State eq "READDATA")
      {
      $bFunction .= pack("n",0x0108); # tag function name
      $bFunction .= pack("n",$bDataCount) . $bData;
      }
  return($bFunction);
} # SAM2bSAM()

###############################################################
# ProcessFunction(): Process a function.
# Returns the processed function to stdout.
###############################################################
sub ProcessFunction
{
  my $Function=shift;
  my $Word;

  $Word = $Function;
  $Word =~ s/(\w+) .*/\1/;
  if (IsKeyword($Word) eq 1)	{ return($Function); }

  # there is a start! Idenify the any function name
  $Function .= ' ';
#  $Function =~ s@^.* ([\w]* \([0-9]+ )@ \1@;
  $Function =~ s@^[[:space:]]*@@;
  # remove calling parameters
  $Function =~ s@\([0-9]+ (.*?) \)[0-9]+@ "( " . ReplaceParameters($1) . " )"@e;

  # Undo scope notations
  $Function =~ s@([\{\}\(\)\[\]])[0-9]+ @\1 @g;

  # But, we've put in too many newlines.
  # Undo quotes (this is complex)
  # The problem: must identify all spaces between "< and >".
  $Function =~ s/  */ /g; # remove duplicate spaces
  $Function =~ s@("<([^"]|\\")*>")@ RemoveSpaces($1) @ge;
  $Function =~ s@('<([^']|\\')*>')@ RemoveSpaces($1) @ge;

  $Function = NormalizeVariables($Function);

  $Function =~ s/  +/ /g; # remove duplicate spaces (for ease of debugging)

  # remove all variable definitions
  ## Look between "{ ... ;" or "; ... ;"
  ## Remove anything not an assignment, comparison, function, or scope.
  ## We include + and - due to ++ and -- assignments.
  $Function =~ s@({|;) [^=><+\-(){}]* ;@\1@g;

  # Now for the fun part...
  # Everything line that does not start with a doublequote gets the
  # spaces turned to newline! (Bwa ha ha ha)
  $Function =~ s/  */ /g; # remove duplicate spaces
  $Function =~ s/^ //g; # remove begin space
  $Function =~ s/ $//g; # remove any end space
  $Function =~ s/\n /\n/g; # remove any newline space
  $Function =~ s@ @\n@g; # PUT EVERY TOKEN ON A NEW LINE!

  print SAM2bSAM("$Function\n");
  return "";
} # ProcessFunction()

###############################################################
###############################################################
# main()
###############################################################

my $c;	# character read
my $Source;
my @SourceList;	# for complex splitting/joining of $Source
my $s;	# source segment (for looping)
my $Scope;	# counter number of {...}.  "{" = +1, "}" = -1
my $Junk;

while($ARGV[0] ne "")
{
  # Step 1: Load source and strip out single-line items.
  open(FIN,"< $ARGV[0]") || die "Unable to open file $ARGV[0]\n";
  shift @ARGV;

  # initialize
  $Source="";
  $Scope=0;
  $c="";

  # Start the data file
  print pack("n",0x0004); # file type
  print pack("n",length("Java")+1);
  print "Java" . pack("b",0x00);
  # pack function names to 2-byte boundary
  if ((length("Java")+1) % 2 == 1) { print chr(255); }

  # load the source file
  #  - strip out all line breaks
  #  - strip out all single-line elements (#include, etc.)
  # NOTE: we either load $c from FIN or it's already loaded.
  while(($c ne "") || (($c = getc(FIN)) ne ""))
    {
    # print "Loaded -$c-\n";
    $c =~ s/[[:space:]]/ /;	# just permit spaces

    if ($c eq "\\")
	{
	$c = getc(FIN);
	# ignore line concatinations
	if ($c ne "\n") { $Source .= "\\$c"; }
	else { $Source .= " "; }
	$c="";
	} # process single character quoting

    elsif ($c =~ /[{\[(]/ )	# if { or [ or (
	{
	$Source .= " $c$Scope ";
	$Scope++;
	$c="";
	} # process scope
    elsif ($c =~ /[}\])]/ )	# if ) or ] or }
	{
	$Scope--;
	$Source .= " $c$Scope ";
	$c="";
	} # process scope

    elsif ($c eq "#")
	{
	# Hash is only found in quoted strings, or on a compiler line.
	# Since we handle strings separately...  Blow away the line!
	$Junk = <FIN>;
	chomp $Junk;
	while($Junk =~ /\\$/)
	  {
	  $Junk = <FIN>;
	  chomp $Junk;
	  }
	$c="";
	} # process hashes

    elsif ($c eq "\'")
	{
	# single quote
	# Technically, only a single character can be in single-quotes.
	# But, a single character has many representations:
	#  A \A 0x65 \x65 etc...
	# So just treat it like a string.
	$Source .= "$c<";	# save start quote
	while( (($c = getc(FIN)) ne "") && ($c ne "\'"))
		{
		if ($c eq "\\")
			{
			$c .= getc(FIN);
			}
		$Source .= $c;
		}
	$Source .= ">'";	# save the unquote
	$c="";
	}

    elsif ($c eq "\"")
	{
	# display quoted string, but encase in begins ("<) and ends (>")
	$Source .= "$c<";
	while( (($c = getc(FIN)) ne "") && ($c ne "\""))
		{
		# Single quotes can cause confusion.  Remove them.
		if ($c eq "\'")  { $c = "_"; }
		if ($c eq "\\")
			{
			$c .= getc(FIN);
			}
		$Source .= $c;
		}
	$Source .= ">\"";
	$c="";
	} # process quotes

    elsif ($c eq "/")
	{
	$c=getc(FIN);
	if ($c eq "/")
		{
		# it's a "//" so blow away the line.
		$Junk = <FIN>;
		$c="";
		}
	elsif ($c eq "*")
		{
		my $Lastc='@';
		# it's a comment (/*) , blow away to end of comment (*/)
		$c='@';
		while((($c = getc(FIN)) ne "") && ("$Lastc$c" ne "*/"))
			{
			$Lastc=$c;
			}
		$c="";
		}
	else
		{
		# it's just a slash, keep it.
		$Source .= "/";
		# NOTE: $c is already loaded!
		}
	} # process comments

    else
	{
	$Source .= $c;
	$c="";
	}
    }
  close(FIN);

  # add a space at the end, just so we can be sure to match tokens.
  $Source .= " ";

  # postprocess loaded file
  # The "$Source" looks like one long line of C code. (But it compiles!)
  # Now we want to strip out everything except function names and function
  # contents.
  $Source =~ s/  */ /g; # remove duplicate spaces

  # space every known token
  $Source =~ s@(\w+)@ \1 @g; # space around words/numbers
  $Source =~ s@;@ ; @g; # space around semicolons
  $Source =~ s@,@ , @g; # space around commas
  $Source =~ s@([^\+])(\+)@\1 \2@g; # space around positive numbers
  $Source =~ s@([^\-])(\-)@\1 \2@g; # space around negative numbers
  $Source =~ s@"<@ "<@g; # space around quotes
  $Source =~ s@>"@>" @g; # space around quotes
  $Source =~ s@'<@ '<@g; # space around quotes
  $Source =~ s@>'@>' @g; # space around quotes

  # Undo spaces around scope notations
  $Source =~ s@( [{}()\[\]]) +([0-9]+) @ \1\2 @g;

  # Undo things that should not have spaces
  $Source =~ s@( +\-> +)@\->@g;
  ## merge around periods, unless the variable begins with a period
  $Source =~ s@( [^ {}()\[\]]\w*) +\.@ \1\.@g;
  $Source =~ s@\. +(\w)@\.\1@g;  # period followed by word is ok
  $Source =~ s@ \.([^ {}\[\]()])@ \1@g;  # now remove beginning periods

  $Source =~ s/  */ /g; # remove duplicate spaces

  # LET'S IDENTIFY FUNCTIONS!
  {
  my $MaxParen,$i;
  for($MaxParen=0; $Source =~/{$MaxParen/; $MaxParen++)
  	{ ; }
  
  # Look for a function.
  # All functions look like: \w+ ( [^)]* ) \w+ { .* }
  # Examples:
  #   Name (paramaters) throw up { stuff }
  #   Name (paramaters) { stuff }
  for($i = $MaxParen; $i ge 0; $i--)
    {
    $Source =~ s@(\w+ \([0-9]+ [^)]+ \)[0-9]+( \w+)* {$i .*? }$i)@ ProcessFunction($1) @ge;
    }
  }

  # next file!
} # while files to process

