;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;                                                                       ;;
;;;                Centre for Speech Technology Research                  ;;
;;;                     University of Edinburgh, UK                       ;;
;;;                       Copyright (c) 1996,1997                         ;;
;;;                        All Rights Reserved.                           ;;
;;;                                                                       ;;
;;;  Permission to use, copy, modify, distribute this software and its    ;;
;;;  documentation for research, educational and individual use only, is  ;;
;;;  hereby granted without fee, subject to the following conditions:     ;;
;;;   1. The code must retain the above copyright notice, this list of    ;;
;;;      conditions and the following disclaimer.                         ;;
;;;   2. Any modifications must be clearly marked as such.                ;;
;;;   3. Original authors' names are not deleted.                         ;;
;;;  This software may not be used for commercial purposes without        ;;
;;;  specific prior written permission from the authors.                  ;;
;;;                                                                       ;;
;;;  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        ;;
;;;  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ;;
;;;  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ;;
;;;  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     ;;
;;;  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ;;
;;;  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ;;
;;;  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ;;
;;;  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ;;
;;;  THIS SOFTWARE.                                                       ;;
;;;                                                                       ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;;   Phrase boundary prediction.
;;;   
;;;   Two methods are supported: if POS is enabled we use ngrams,
;;;   otherwise we use a CART tree.
;;;
;;;   Models trained from the IBM/Lancaster Spoken English Corpus and 
;;;   Boston University's FM Radio Corpus.

;;;
;;;  Here's a very simple CART tree for predicting phrase breaks
;;;  based on punctuation only
;;;
(set! simple_phrase_cart_tree
      ;; Every question here looks only at Token.punc, except the last
      ;; one which tests the name of the following item.
      ;; NOTE(review): "!" is not in the first punctuation list -- confirm
      ;; whether that is intentional.
      '((Token.punc in ("?" "." ":"))
        ((BB))                                 ;; these marks give BB
        ((Token.punc in ("'" "\"" "," ";"))
         ((B))                                 ;; these marks give B
         ((n.name is 0)                        ;; end of utterance
          ((BB))
          ((NB))))))

;;;  This is a simple CART tree used after boundaries are predicted
;;;  by the probabilistic method to get two levels of break
(set! english_phrase_type_tree
      '((pbreak is NB)
        ;; pbreak is NB: give B when Token.EMPH is 1 on this item or on
        ;; the next one (n.), otherwise keep NB
        ((Token.EMPH is 1)
         ((B))
         ((n.Token.EMPH is 1)
          ((B))
          ((NB))))
        ;; pbreak is not NB: BB stays BB, anything else is decided by name
        ((pbreak is BB)
         ((BB))
         ((name in ("." "!" "?"))  ;; only (potentially) change Bs to BBs
          ((BB))
          ((B))))))

;;;  CART tree with a single question on gpos.  Each leaf lists weighted
;;;  entries for the values 1, 3 and 4, followed by B or NB.
;;;  NOTE(review): the 1/3/4 values look like break levels -- confirm.
(set! f2b_phrase_cart_tree
      '((gpos is punc)
        (((1 0.00238095) (3 0) (4 0.997619) B))      ;; gpos is punc
        (((4 0.00238095) (3 0) (1 0.997619) NB))))   ;; anything else

;;;  For more detailed prediction of phrase breaks we use POS and
;;;  probability distribution of breaks
;;;  These models were trained using data from the Lancaster/IBM
;;;  Spoken English Corpus

;; The default value is the last entry of load-path.
(defvar pbreak_ngram_dir
  (car (reverse load-path))
  "pbreak_ngram_dir
  The directory containing the ngram models for predicting phrase
  breaks.  By default this is the standard library directory.")
;;(set! pbreak_ngram_dir "/home/awb/data/marsec/ngrams")

;; Load the ngram file found in pbreak_ngram_dir under the name
;; english_break_pos_ngram.
(let ((pos_ngram_file (path-append pbreak_ngram_dir "sec.ts20.quad.ngrambin")))
  (ngram.load 'english_break_pos_ngram pos_ngram_file))
;; Unset (nil) by default.
(defvar break_pos_ngram_name
  nil
  "break_pos_ngram_name
  The name of the loaded ngram containing the a priori ngram model for 
  predicting phrase breaks in the Phrasify module.  This model should 
  predict probability distributions for B and NB given some context of 
  part of  speech tags.")
;; Default weighting is 0.59.
(defvar break_gram_scale_s
  0.59
  "break_gram_scale_s
  A weighting factor for breaks in the break/non-break ngram.")

;; Load the ngram file found in pbreak_ngram_dir under the name
;; english_break_ngram.
(let ((break_ngram_file (path-append pbreak_ngram_dir "sec.B.hept.ngrambin")))
  (ngram.load 'english_break_ngram break_ngram_file))
;; Unset (nil) by default.  The docstring below previously read "This
;; module should predict"; the subject is the ngram model, matching the
;; wording used for break_pos_ngram_name.
(defvar break_ngram_name nil
  "break_ngram_name
  The name of the loaded ngram  containing the a posteriori ngram model
  for predicting phrase breaks in the Phrasify module.  This model should
  predict probability distributions for B and NB given previous B and
  NBs.  If this variable is set to 'use_dist an internal method is
  used which counts the distance from the previous break (doesn't work
  well though).")

;; Unset (nil) by default, in which case pbreak values are left as
;; predicted (see the docstring).
(defvar phrase_type_tree
  nil
  "phrase_type_tree
  When Phrase_Method is prob_models, this tree, if set is used to 
  potentially predict phrase type.  At least some prob_models only
  predict B or NB, this tree may be used to change some Bs into
  BBs.  If it is nil, the pbreak value predicted by prob_models
  remains the same.")

;; Default tag list is (B NB).
(defvar break_tags
  (quote (B NB))
  "break_tags
  A list of tags used in identifying breaks.  Typically B and NB (and
  BB).  This should be the alphabet of the ngram identified in
  break_ngram_name")

;(set! break_pos_ngram_name 
;      (path-append pbreak_ngram_dir "sec.tsBB.quad.ngram"))
;;(set! break_prob_tree sec_pos_break_tree)
;(set! break_ngram_name
;      (path-append pbreak_ngram_dir "sec.B.hept.gram"))

;; Documents the Word.pbreak feature.  BB is mentioned because the
;; phrase_type_tree docstring in this file says that tree may change
;; some Bs into BBs.
(def_feature_docstring 
  'Word.pbreak
  "Word.pbreak
  Result from statistical phrasing module, may be B or NB denoting
  phrase break or non-phrase break after the word.  If a
  phrase_type_tree is set, some Bs may be changed to BBs.")

;; Documents the Word.pbreak_score feature.
(def_feature_docstring (quote Word.pbreak_score)
  "Word.pbreak_score
  Log likelihood score from statistical phrasing module, for pbreak
  value.")

;; Documents the Word.blevel feature.
(def_feature_docstring (quote Word.blevel)
  "Word.blevel
  A crude translation of phrase break into ToBI like phrase level.
  Values may be 0,1,2,3,4.")

;; Announce this file under the name phrase.
(provide (quote phrase))
