#!/usr/bin/env python
#
#---
# $Id: trans2arabic,v 1.4 2003/11/01 07:15:44 elzubeir Exp $
#
# ------------
# Description:
# ------------
# This script will convert a transliterated file to its native Arabic
# in UTF-8 or CP1256 encoding.
# The output is TAB delimited (or comma delimited on request) for reading
# into Excel or any other application that can read a CSV type of file.
# 
# This is specifically written to handle the datasets bundled with
# the Buckwalter Morphological Analyzer
#
# (C) Copyright 2003, Arabeyes, Mohammed Elzubeir
# -----------------
# Revision Details:    (Updated by Revision Control System)
# -----------------
#  $Date: 2003/11/01 07:15:44 $
#  $Author: elzubeir $
#  $Revision: 1.4 $
#  $Source: /home/arabeyes/cvs/projects/duali/pyduali/trans2arabic,v $
#
#  This program is written under the BSD License.
#---

import sys, getopt, os, string
from pyduali.trans_table import *

# Name this script was started under: last path component of argv[0]
# with its extension (if any) removed.  Used in the usage/about banners.
scriptname = os.path.splitext(os.path.split(sys.argv[0])[1])[0]
# Revision string reported by the -V/--version option
scriptversion = '$Id: trans2arabic,v 1.4 2003/11/01 07:15:44 elzubeir Exp $'


def about():
  "Print the two ';'-prefixed banner lines crediting this script and Duali"
  credit = "; This output is generated by the '%s' script which is a part of"
  project = "; Duali (http://www.arabeyes.org/project.php?proj=duali)"
  for banner_line in (credit % scriptname, project):
    print(banner_line)
  

def help():
  "Print the long description of this utility, followed by the usage summary"
  description = """
This utility will convert the Buckwalter transliterated lexicon data set
to UTF-8 or CP1256 encoded comma separated file (csv). The default encoding
is set to the UTF-8. It should be noted that the Windows CP-1256 encoding
is sometimes not sufficient to convert some of the transliterations.
It is therefore advised to only use UTF-8 encoding.

The CSV file outputted can be opened by Excel or any other spreadsheet
program (and should be importable by most if not all database programs
with little to no modification).

"""
  print(description)
  usage()
  
def usage():
  """
  Display usage options

  Writes the copyright line, the command synopsis (built from the module
  level scriptname) and one line per supported option to standard output.
  Returns nothing.
  """
  print("(C) Copyright 2003, Arabeyes, Mohammed Elzubeir\n")
  print("Usage: %s -f filename [OPTIONS]" % scriptname)
  print("\t[-h | --help            ]\toutputs a help message")
  print("\t[-V | --version         ]\tprogram version")
  print("\t[-f | --file= filename  ]\tinput file containing word list")
  print("\t[-c | --charset ENCODING]\tencoding (cp1256 or utf-8)")
  # -l and -d both require a value, so show a placeholder for it
  print("\t[-l | --lines N         ]\tnumber of lines per file")
  print("\t[-d | --delimiter DELIM ]\tdelimiter (tab or comma)")
  # This line used to start with "\rn" (a mistyped "\n"), which printed a
  # carriage return followed by a stray 'n' in front of the sentence.
  print("\nThis program is licensed under the BSD License.\n")

def grabargs():
  """
  Parse the command line (sys.argv) into the settings used by main()

  Returns the tuple (fname, charset, nl, delim):
    fname   - input file name from -f/--file (required)
    charset - output encoding from -c/--charset, default 'utf-8'
    nl      - lines per output file from -l/--lines, default 0
              (main() treats 0 as "one single output file")
    delim   - 1 for TAB (the default), 0 for comma, from -d/--delimiter;
              any other value leaves the current setting untouched

  Terminates through sys.exit() instead of returning when:
    - no arguments, -h/--help or -V/--version are given (status 0)
    - an option is unknown or lacks its value, -l is not an integer, or
      no input file was named (usage is printed, status 2)
  """
  if not sys.argv[1:]:
    usage()
    sys.exit(0)

  fname = None
  charset = 'utf-8'
  nl = 0
  delim = 1

  try:
    # A trailing '=' marks a long option that takes a value.  Without it
    # getopt refuses "--charset X" / "--delimiter X", and the old "line"
    # entry never matched the "--lines" spelling tested below.
    opts, args = getopt.getopt(sys.argv[1:], "hVc:f:l:d:",
                               ["help", "version", "charset=", "file=",
                                "lines=", "delimiter="])
  except getopt.GetoptError:
    usage()
    sys.exit(2)

  for o, val in opts:
    if o in ("-h", "--help"):
      help()
      sys.exit(0)
    elif o in ("-V", "--version"):
      print(scriptversion)
      sys.exit(0)
    elif o in ("-f", "--file"):
      fname = val
    elif o in ("-c", "--charset"):
      charset = val
    elif o in ("-l", "--lines"):
      try:
        nl = int(val)
      except ValueError:
        # Report a non-numeric line count as a usage error, not a traceback
        usage()
        sys.exit(2)
    elif o in ("-d", "--delimiter"):
      if (val == 'tab'):
        delim = 1
      elif (val == 'comma'):
        delim = 0

  # The input file is mandatory; without this check the return statement
  # below failed with an unbound-variable error when -f was left out.
  if fname is None:
    usage()
    sys.exit(2)

  return (fname, charset, nl, delim)


def getGlossPOS(s):
  """
  Break up the glosspos into a tuple, first the pos then the gloss
  If there is no <pos> element then simply return an empty string and the
  gloss (s itself, unchanged)

  s is the gloss text, optionally followed by a <pos>...</pos> element.
  When the element is present both returned parts have their surrounding
  whitespace stripped.  A <pos> element without its closing tag is taken
  to run to the end of s.
  """
  open_tag = '<pos>'
  start = s.find(open_tag)
  # find() signals "not found" with -1; 0 is a genuine match at the very
  # start of s, so a truthiness test on the result gets both cases wrong.
  if start == -1:
    return ('', s)
  end = s.find('</pos>', start)
  if end == -1:
    end = len(s)
  # end already points at the closing tag, so the pos text is exactly the
  # slice between the two tags and the gloss is everything before <pos>.
  return (s[start + len(open_tag):end].strip(), s[:start].strip())

def readchop(str,c, delim):
  """
  Transliteration->CHARSET conversion of one lexicon record

  str is one TAB separated record made of exactly four fields:
    w_vanilla  - word without any of the diacritics
    w_full     - word with full diacritic marks
    w_cat      - word category
    w_glosspos - word glossary, optionally followed by <pos>...</pos>
  c is the character set name handed to t2charset() and delim selects the
  output separator (0 for comma, anything else for TAB).

  Returns one newline terminated output row with five columns: converted
  w_vanilla, converted w_full, w_cat, w_pos and w_gloss.  If the two words
  cannot be converted to the requested character set the row is returned
  with them left in their transliterated form.

  Raises ValueError when str does not split into exactly four fields.
  """
  w_vanilla, w_full, w_cat, w_glosspos = str.split('\t')
  w_pos, w_gloss = getGlossPOS(w_glosspos)

  if (delim == 0):
    sep = ','
  else:
    sep = '\t'
  row = sep.join(['%s'] * 5) + '\n'

  try:
    # Both branches must call t2charset(); the comma branch used to call a
    # misspelled name, so comma output silently fell back to raw text.
    return row % (t2charset(w_vanilla, c), t2charset(w_full, c),
                  w_cat, w_pos, w_gloss)
  except (UnicodeError, LookupError):
    # UnicodeError: the text is not representable in / mixable with c;
    # LookupError: c is not a known codec.  Keep the transliteration.
    return row % (w_vanilla, w_full, w_cat, w_pos, w_gloss)

def t2charset(s, charset):
  """
  Transliteration to character-set conversion of a string

  Each character of s is looked up in t2a_table; characters without an
  entry are kept as they are.  The mapped text is returned encoded in
  charset.
  """
  pieces = []
  for ch in s:
    pieces.append("%s" % (t2a_table.get(ch, ch),))
  return "".join(pieces).encode(charset)
  
def main():
  """
  Convert the input file named on the command line

  Every line of the input that is not blank and does not start with ';'
  is passed through readchop() and appended to an output file:
    - "<fname>.txt" when no line limit was requested (nl == 0)
    - "<fname>0.txt", "<fname>1.txt", ... holding nl converted lines each
      otherwise
  Output files are opened in append mode, so existing content is kept.
  Always finishes through sys.exit(0).
  """
  fname, charset, nl, delim = grabargs()

  infile = open(fname, 'r')
  try:
    lines = infile.readlines()
  finally:
    infile.close()

  out = None          # currently open output file, if any
  linecounter = 0     # converted lines written to the current output file
  filecounter = 0     # numeric suffix of the current output file

  try:
    for line in lines:
      # Drop only the line terminator; slicing off the last character
      # ate real data when the final line had no trailing newline.
      line = line.rstrip('\r\n')
      # Skip comments and blank lines (a blank line has no fields for
      # readchop() to unpack).
      if line.startswith(';') or not line.strip():
        continue

      if (nl == 0):
        if out is None:
          out = open("%s.txt" % fname, 'a')
      elif out is None or linecounter >= nl:
        # Start the next chunk.  The counter restarts here and is bumped
        # by the write below, so every chunk holds exactly nl lines
        # (chunks after the first used to receive nl + 1).
        if out is not None:
          out.close()
          filecounter += 1
        out = open("%s%d.txt" % (fname, filecounter), 'a')
        linecounter = 0

      out.write(readchop(line, charset, delim))
      linecounter += 1
  finally:
    if out is not None:
      out.close()

  sys.exit(0)

if __name__ == "__main__":
  main()
