(*
 * Copyright (c) 2001 Stefan Kral
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *)


open List
open Util
open Expr
open Variable
open Number
open VSimdBasics
open VSimdUnparsing
open VScheduler
open VAnnotatedScheduler
open P4Basics
open P4RegisterAllocationBasics
open P4RegisterReallocation
open P4Unparsing
open P4Translate
open P4FlatInstructionScheduling
open P4InstructionSchedulingBasics
open Printf
open AssignmentsToVfpinstrs
open CodeletMisc


let vect_schedule vect_optimized =
  let _ = info "vectorized scheduling..." in
  let vect_scheduled = schedule vect_optimized in
  let _ = info "vectorized annotating..." in
  let vect_annotated = annotate vect_scheduled in
  let _ = info "vectorized linearizing..." in
    annotatedscheduleToVsimdinstrs vect_annotated

let vect_optimize varinfo n expr =
  let _ = info "simplifying..." in
  let code0 = Exprdag.simplify_to_alist expr in
  let _ = info "mapping assignments to vfpinstrs..." in
  let code1 = AssignmentsToVfpinstrs.assignmentsToVfpinstrs varinfo code0 in
  let _ = info "vectorizing..." in
  let (operandsize,code2) = P4Vectorization.vectorize varinfo n code1 in
  let _ = info "optimizing..." in
  let code3 = VP4Optimization.apply_optrules code2 in
  let code4 = vect_schedule code3 in
  let _ = info "improving..." in
     (operandsize, VImproveSchedule.improve_schedule code4)


(****************************************************************************)

let get2ndhalfcode base one dst ld_n = 
  if ld_n < 0 then
    [P4V_IntCpyUnaryOp(P4_ICopy, base, dst)]
    (* issue a warning or sth similar *)
    (* failwith "get2ndhalfcode: ld_n < 0 not supported!" *)
  else if ld_n <= 3 then
    [P4V_IntLoadEA(P4V_RISID(base,one,1 lsl ld_n,0), dst)]
  else
    [P4V_IntCpyUnaryOp(P4_ICopy, one, dst);
     P4V_IntUnaryOp(P4_IShlImm ld_n, dst);
     P4V_IntBinOp(P4_IAdd, base, dst)]


let loadfnargs xs = map (fun (src,dst) -> (dst, [P4V_IntLoadMem(src,dst)])) xs

(****************************************************************************)

let emit_instr1 instr = 
  p4rinstrAdaptStackPointerAdjustment instr;
  print_string (P4Unparsing.p4rinstrToString instr);
  print_newline ()


(* Warning: this function produces side-effects. *)
let emit_instr (_,_,instr) = 
  p4rinstrAdaptStackPointerAdjustment instr;
  print_string (P4Unparsing.p4rinstrToString instr);
  print_newline ()

let emit_instr' (t,cplen,i) = 
  p4rinstrAdaptStackPointerAdjustment i;
  Printf.printf "\t/*  t=%d, cp=%d  */\t" t cplen;
  print_string (P4Unparsing.p4rinstrToString i);
  print_newline ()

let emit_instr0' i = 
  p4rinstrAdaptStackPointerAdjustment i;
  Printf.printf "\t/*  t=?, cp=?  */\t";
  print_string (P4Unparsing.p4rinstrToString i);
  print_newline ()

(****************************************************************************)

(* determine stackframe size in bytes *)
let getStackFrameSize instrs =
  let processInstr ((operand_size_bytes,max_bytes) as s0) = function
    | P4R_SimdSpill(_,idx) -> 
	(operand_size_bytes, max max_bytes ((idx+1) * operand_size_bytes))
    | P4R_SimdPromiseCellSize operand_size ->
	(p4operandsizeToInteger operand_size, max_bytes)
    | _ -> s0 in

  snd (fold_left processInstr (p4operandsizeToInteger P4_QWord,min_int) instrs)
			
(****************************************************************************)

type simdconstant = 
  | SC_Number1 of Number.number
  | SC_Number2 of Number.number * Number.number
  | SC_SimdPos of vsimdpos

let eq_simdconstant a b = match (a,b) with
  | (SC_SimdPos a, SC_SimdPos b) ->
      a=b
  | (SC_Number1 n, SC_Number1 m) -> 
      eq_number n m
  | (SC_Number2(n,m), SC_Number2(n',m')) -> 
      eq_number n n' && eq_number m m'
  | _ -> 
      false

let p4rinstrsToConstants = 
  let rec rinstrsToConsts bag = function
    | [] -> List.rev bag
    | x::xs ->
	(match x with
	  | P4R_SimdUnaryOp(P4_FPChs p,_)
	    when not (exists (eq_simdconstant (SC_SimdPos p)) bag) ->
	      rinstrsToConsts ((SC_SimdPos p)::bag) xs 
	  | P4R_SimdUnaryOp(P4_FPMulC2(n,m),_)
	    when not (exists (eq_simdconstant (SC_Number2 (n,m))) bag) ->
	      rinstrsToConsts ((SC_Number2 (n,m))::bag) xs 
	  | P4R_SimdUnaryOp(P4_FPMulC1 n,_)
	    when not (exists (eq_simdconstant (SC_Number1 n)) bag) ->
	      rinstrsToConsts ((SC_Number1 n)::bag) xs 
	  | _ -> rinstrsToConsts bag xs)
  in rinstrsToConsts []

let myconstIsSimd2 = function
  | SC_SimdPos _ -> true
  | SC_Number2 _ -> true
  | SC_Number1 _ -> false

let myconst_to_decl = function
  | SC_SimdPos V_Lo -> 
      (vsimdposToChsconstnamestring V_Lo)   ^ ": .double -1.0, +1.0"
  | SC_SimdPos V_Hi -> 
      (vsimdposToChsconstnamestring V_Hi)   ^ ": .double +1.0, -1.0"
  | SC_SimdPos V_LoHi -> 
      (vsimdposToChsconstnamestring V_LoHi) ^ ": .double -1.0, -1.0"
  | SC_Number1 n ->
      sprintf "%s: .double %s"
	      (Number.unparse n)
	      (Number.to_string n)
  | SC_Number2(n,m) ->
      sprintf "%s%s: .double %s, %s"
	      (Number.unparse n)
	      (Number.unparse m)
	      (Number.to_string n)
	      (Number.to_string m)

(****************************************************************************)

let instr_wants_down_pass1 = function
  | P4R_SimdPromiseCellSize _ -> false
  | P4R_SimdLoadStoreBarrier -> true
  | P4R_IntLoadMem _ -> true
  | P4R_IntStoreMem _ -> false
  | P4R_IntLoadEA _ -> true
  | P4R_IntUnaryOp _ -> true
  | P4R_IntUnaryOpMem _ -> true
  | P4R_IntCpyUnaryOp _ -> true
  | P4R_IntBinOp _ -> true
  | P4R_IntBinOpMem _ -> true
  | P4R_SimdLoad _ -> true
  | P4R_SimdLoad1 _ -> true
  | P4R_SimdStore _ -> false
  | P4R_SimdStore1 _ -> false
  | P4R_SimdSpill _ -> false
  | P4R_SimdUnaryOp(P4_FPMulC1 _,_) -> false
  | P4R_SimdUnaryOp(P4_FPMulC2 _,_) -> false
  | P4R_SimdUnaryOp _ -> true
  | P4R_SimdCpyUnaryOp _ -> true
  | P4R_SimdCpyUnaryOpMem _ -> true
  | P4R_SimdBinOp _ -> false
  | P4R_SimdBinOpMem _ -> false
  | P4R_Label _ -> false
  | P4R_Jump _ -> false
  | P4R_CondBranch _ -> false
  | P4R_Ret -> false

let instr_wants_down_pass2 = function
  | P4R_SimdCpyUnaryOpMem _ -> false
  | instr -> instr_wants_down_pass1 instr

let rec move_down_instrs instr_wants_down = 
  let rec moveonedown i = function
    | [] -> [i]
    | x::xs as xxs -> 
	if p4rinstrCannotRollOverP4rinstr (get3of3 i) (get3of3 x) then 
	  i::xxs 
	else 
	  x::(moveonedown i xs)
  in function
    | [] -> []
    | x::xs -> 
	let xs' = move_down_instrs instr_wants_down xs in
	  if instr_wants_down (get3of3 x) then moveonedown x xs' else x::xs'

(****************************************************************************)

let p4rinstrIsAGIMovable = function
  | P4R_SimdLoad _ -> true
  | P4R_SimdStore _ -> true
  | P4R_SimdCpyUnaryOpMem _ -> true
  | P4R_SimdBinOpMem _ -> true
  | P4R_SimdSpill _ -> true
  | _ -> false

let p4rinstrIsSimdLoad = function 
  | P4R_SimdLoad _  -> true
  | P4R_SimdLoad1 _ -> true
  | _ -> false

let p4rinstrIsSimdStore = function
  | P4R_SimdStore _  -> true
  | P4R_SimdStore1 _ -> true
  | _ -> false

let p4rinstrCanMoveOverAGI a b = match (a,b) with
  | (i1, i2) 
    when p4rinstrIsSimdLoad i1 && p4rinstrIsSimdStore i2 -> true
  | (i1, P4R_SimdCpyUnaryOpMem _) when p4rinstrIsSimdLoad i1 -> true
  | (i1, P4R_SimdSpill _) when p4rinstrIsSimdLoad i1 -> true
  | (i1, P4R_SimdBinOpMem _) when p4rinstrIsSimdLoad i1 -> true
  | (i1, P4R_SimdUnaryOp _) when p4rinstrIsSimdLoad i1 -> true
  | (P4R_SimdCpyUnaryOpMem _, P4R_SimdSpill _) -> true
  | (P4R_SimdUnaryOp _, P4R_SimdSpill _) -> true
  | (i2, P4R_SimdBinOpMem _) when p4rinstrIsSimdStore i2 -> true
  | (i2, P4R_SimdCpyUnaryOpMem _) when p4rinstrIsSimdStore i2 -> true
  | (i2, P4R_SimdSpill _) when p4rinstrIsSimdStore i2 -> true
  | (i2, P4R_SimdUnaryOp _) when p4rinstrIsSimdStore i2 -> true
  | _ -> false

let avoid_address_generation_interlock instrs = 
  let stop (_,_,x) (_,_,y) = 
    p4rinstrCannotRollOverP4rinstr x y || 
    (not (p4rinstrCanMoveOverAGI x y)) in
  let rec loop = function
    | [] -> []
    | ((_,_,instr) as x)::xs ->
	if p4rinstrIsAGIMovable instr then
	  insertList stop x (loop xs)
	else
	  x::(loop xs)
  in loop instrs

(****************************************************************************)

let p4rinstrsToPromiseEarly xs =
  let flag = ref false in
  let rec p4rinstrsToPromiseEarly__int = function
    | [] -> []
    | (P4R_Label _ as x1)::(P4R_SimdPromiseCellSize _ as x2)::xs ->
	x1::x2::(p4rinstrsToPromiseEarly__int xs)
    | x1::(P4R_SimdPromiseCellSize _ as x2)::xs ->
	flag := true;
	x2::(p4rinstrsToPromiseEarly__int (x1::xs))
    | x::xs ->
	x::(p4rinstrsToPromiseEarly__int xs)
  in (!flag,p4rinstrsToPromiseEarly__int xs)


let rec optimize ref_t0 instrs0 =
  let _ = info (sprintf "optimizing: len %d, ref_t %d." 
			(length instrs0) ref_t0) in
  let instrs1 = p4rinstrsToRegisterReallocated
		  (move_down_instrs 
		 	instr_wants_down_pass2
                        (move_down_instrs instr_wants_down_pass1 instrs0)) in
  let instrs = p4rinstrsToInstructionscheduled instrs1 in
  let (ref_t,_,_) = list_last instrs in
    if ref_t < ref_t0 then
      optimize ref_t instrs
    else
      instrs0

(****************************************************************************)

let rec addIndexToListElems i0 = function
  | [] -> []
  | x::xs -> (i0,x)::(addIndexToListElems (succ i0) xs)

let procedure_proepilog fn_name stackframe_size_bytes code = 
  let regs_to_save =
	addIndexToListElems 1
		(filter (fun r -> exists (p4rinstrUsesP4rintreg r) code) 
			p4rintregs_calleesaved) in
  let total_stackframe_size = if regs_to_save = [] then 0 else 16 in
  (if total_stackframe_size > 0 then 
     [P4R_IntUnaryOp(P4_ISubImm total_stackframe_size, p4rintreg_stackpointer)]
   else
     []) @
  (map (fun (i,r) -> P4R_IntStoreMem(r,P4_MFunArg (-i))) regs_to_save) @
  code @
  (map (fun (i,r) -> P4R_IntLoadMem(P4_MFunArg (-i), r)) regs_to_save) @
  (if total_stackframe_size > 0 then 
     [P4R_IntUnaryOp(P4_IAddImm total_stackframe_size, p4rintreg_stackpointer)]
   else
     []) @
  [P4R_Ret]

(****************************************************************************)

let compileToAsm_with_fixedstride (n, dir, ctype, initcode, p4vinstrs) 
				  (istride,ostride) =
  let _ = info "compileToAsm..." in
  let fn_name = codeletinfoToFnname n (ctype,dir) ^ 
		"_i" ^ (string_of_int istride) ^
		"_o" ^ (string_of_int ostride) in
  let realcode = P4RegisterAllocator.regalloc initcode p4vinstrs in
  let stackframe_size_bytes = getStackFrameSize realcode in
  let realcode = procedure_proepilog fn_name stackframe_size_bytes realcode in
  let _ = info "scheduling instructions..." in
  let realcode = fixpoint p4rinstrsToPromiseEarly realcode in
  let realcode' =
      avoid_address_generation_interlock 
	  (optimize 1000000 (p4rinstrsToInstructionscheduled realcode)) in

  let realcode'' = map get3of3 realcode' in
  let _ = info "before unparseToAsm" in
  let all_consts = p4rinstrsToConstants realcode'' in
  let (consts2, consts1) = List.partition myconstIsSimd2 all_consts in
  let datasection = 
	if all_consts = [] then 
	  ""
	else
	  ".section .rodata" ^ "\n" ^
          "\t" ^ ".balign 64" ^ "\n" ^
          "\t" ^ (listToString myconst_to_decl "\n\t" consts2) ^ "\n\n" ^
          "\t" ^ (listToString myconst_to_decl "\n\t" consts1) ^ "\n\n" in
  begin
    print_string datasection;
    print_string (".text\n\t.balign 64\n\t.globl\t" ^ fn_name ^ "\n");
    print_string ("\t.type\t" ^ fn_name ^ ", @function\n");
    print_string (fn_name ^ ":\n");
    p4rinstrInitStackPointerAdjustment 0;
    iter emit_instr1 realcode''
  end


let compileToAsm (n, dir, ctype, initcode, p4vinstrs) =
  let _ = info "compileToAsm..." in
  let fn_name = codeletinfoToFnname n (ctype,dir) in
  let realcode = P4RegisterAllocator.regalloc initcode p4vinstrs in
  let stackframe_size_bytes = getStackFrameSize realcode in
  let realcode = procedure_proepilog fn_name stackframe_size_bytes realcode in
  let _ = info "scheduling instructions..." in
  let realcode = fixpoint p4rinstrsToPromiseEarly realcode in
  let realcode' =
      avoid_address_generation_interlock 
	  (optimize 1000000 (p4rinstrsToInstructionscheduled realcode)) in

  let realcode'' = map get3of3 realcode' in
  let _ = info "before unparseToAsm" in
  let all_consts = p4rinstrsToConstants realcode'' in
  let (consts2, consts1) = List.partition myconstIsSimd2 all_consts in
  let datasection = 
	if all_consts = [] then 
	  ""
	else
	  ".section .rodata" ^ "\n" ^
          "\t" ^ ".balign 64" ^ "\n" ^
          "\t" ^ (listToString myconst_to_decl "\n\t" consts2) ^ "\n\n" ^
          "\t" ^ (listToString myconst_to_decl "\n\t" consts1) ^ "\n\n" in
  begin
    print_string datasection;
    print_string (".text\n\t.balign 64\n\t.globl\t" ^ fn_name ^ "\n");
    print_string ("\t.type\t" ^ fn_name ^ ", @function\n");
    print_string (fn_name ^ ":\n");
    p4rinstrInitStackPointerAdjustment 0;
    iter emit_instr1 realcode'';
    print_newline ();
    print_newline ();
    print_string (codelet_description n dir ctype);
  end
