#!/usr/bin/python2.3

#from __future__ import generators

import os, glob, sys, unicodedata, locale, gzip, re, traceback, string

from optparse import OptionParser

#import unihan

# Program version string.
VERSION='0.4.6'

# Adopt the locale configured in the environment; the I/O character set
# default is later derived from it.
try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error:
    # The environment names a locale that is not installed.  Keep running
    # in the default locale instead of dying with a traceback at start-up.
    sys.stderr.write("Warning: unsupported locale setting, using the default locale\n")

# Terminal control sequences keyed by a readable name.
# maybe_colours() looks names up here when coloured output is enabled.
colours = {
    # no-op and text attributes
    'none': "",
    'default': "\033[0m",
    'bold': "\033[1m",
    'underline': "\033[4m",
    'blink': "\033[5m",
    'reverse': "\033[7m",
    'concealed': "\033[8m",

    # foreground colours
    'black': "\033[30m",
    'red': "\033[31m",
    'green': "\033[32m",
    'yellow': "\033[33m",
    'blue': "\033[34m",
    'magenta': "\033[35m",
    'cyan': "\033[36m",
    'white': "\033[37m",

    # background colours
    'on_black': "\033[40m",
    'on_red': "\033[41m",
    'on_green': "\033[42m",
    'on_yellow': "\033[43m",
    'on_blue': "\033[44m",
    'on_magenta': "\033[45m",
    'on_cyan': "\033[46m",
    'on_white': "\033[47m",

    # terminal bell
    'beep': "\007",
}


# Human-readable descriptions of the two-letter general category codes
# returned by unicodedata.category().
general_category = {
    # letters
    'Lu': 'Letter, Uppercase',
    'Ll': 'Letter, Lowercase',
    'Lt': 'Letter, Titlecase',
    'Lm': 'Letter, Modifier',
    'Lo': 'Letter, Other',
    # marks
    'Mn': 'Mark, Non-Spacing',
    'Mc': 'Mark, Spacing Combining',
    'Me': 'Mark, Enclosing',
    # numbers
    'Nd': 'Number, Decimal Digit',
    'Nl': 'Number, Letter',
    'No': 'Number, Other',
    # punctuation
    'Pc': 'Punctuation, Connector',
    'Pd': 'Punctuation, Dash',
    'Ps': 'Punctuation, Open',
    'Pe': 'Punctuation, Close',
    'Pi': 'Punctuation, Initial quote',
    'Pf': 'Punctuation, Final quote',
    'Po': 'Punctuation, Other',
    # symbols
    'Sm': 'Symbol, Math',
    'Sc': 'Symbol, Currency',
    'Sk': 'Symbol, Modifier',
    'So': 'Symbol, Other',
    # separators
    'Zs': 'Separator, Space',
    'Zl': 'Separator, Line',
    'Zp': 'Separator, Paragraph',
    # other
    'Cc': 'Other, Control',
    'Cf': 'Other, Format',
    'Cs': 'Other, Surrogate',
    'Co': 'Other, Private Use',
    'Cn': 'Other, Not Assigned',
}

# Human-readable descriptions of the bidirectional class codes returned by
# unicodedata.bidirectional().
bidi_category = {
     'L'   : 'Left-to-Right',
     'LRE' : 'Left-to-Right Embedding',
     'LRO' : 'Left-to-Right Override',
     'R'   : 'Right-to-Left',
     'AL'  : 'Right-to-Left Arabic',
     'RLE' : 'Right-to-Left Embedding',
     'RLO' : 'Right-to-Left Override',
     'PDF' : 'Pop Directional Format',
     'EN'  : 'European Number',
     'ES'  : 'European Number Separator',
     'ET'  : 'European Number Terminator',
     'AN'  : 'Arabic Number',
     'CS'  : 'Common Number Separator',
     'NSM' : 'Non-Spacing Mark',
     'BN'  : 'Boundary Neutral',
     'B'   : 'Paragraph Separator',
     'S'   : 'Segment Separator',
     'WS'  : 'Whitespace',
     'ON'  : 'Other Neutrals',
     # Isolate classes, present in newer Unicode character databases.
     'LRI' : 'Left-to-Right Isolate',
     'RLI' : 'Right-to-Left Isolate',
     'FSI' : 'First Strong Isolate',
     'PDI' : 'Pop Directional Isolate',
}

# Descriptions of the canonical combining class numbers returned by
# unicodedata.combining().  Numbers missing from this table are displayed
# as '?' by print_characters().
comb_classes = {
    0: 'Spacing, split, enclosing, reordrant, and Tibetan subjoined',
    1: 'Overlays and interior',
    7: 'Nuktas',
    8: 'Hiragana/Katakana voicing marks',
    9: 'Viramas',
    10: 'Start of fixed position classes',
    199: 'End of fixed position classes',
    200: 'Below left attached',
    202: 'Below attached',
    204: 'Below right attached',
    208: 'Left attached (reordrant around single base character)',
    210: 'Right attached',
    212: 'Above left attached',
    214: 'Above attached',
    216: 'Above right attached',
    218: 'Below left',
    220: 'Below',
    222: 'Below right',
    224: 'Left (reordrant around single base character)',
    226: 'Right',
    228: 'Above left',
    230: 'Above',
    232: 'Above right',
    233: 'Double below',
    234: 'Double above',
    240: 'Below (iota subscript)',
}


# Per-user data directory and the candidate locations of UnicodeData.txt,
# in the order GrepInNames() tries them.
HomeDir = os.path.expanduser('~/.unicode')
HomeUnicodeData = os.path.join(HomeDir, "UnicodeData.txt")
UnicodeDataFileNames = ['/usr/share/unicode/UnicodeData.txt', HomeUnicodeData, './UnicodeData.txt']

# Best-effort convenience: create the per-user directory and, when perl ships
# a copy of UnicodeData.txt, symlink it there.  A failure here (read-only
# home, lost race, ...) must not stop the program: GrepInNames() still has
# the other locations and a slow fallback.
try:
    if not os.path.exists(HomeDir):
        print("Making directory %s" % (HomeDir))
        os.mkdir(HomeDir)
    if not os.path.exists(HomeUnicodeData):
        UnicodeData = glob.glob('/usr/share/perl/*/unicore/UnicodeData.txt')
        if UnicodeData:
            # exists() is false but islink() is true: the symlink is dangling,
            # probably because perl was updated
            if os.path.islink(HomeUnicodeData):
                print("Removing old symlink")
                os.remove(HomeUnicodeData)
            print("Found %s, symlinking to %s" % (UnicodeData[0], HomeUnicodeData))
            os.symlink(UnicodeData[0], HomeUnicodeData)
except OSError:
    sys.stderr.write("Warning: cannot set up %s: %s\n" % (HomeDir, sys.exc_info()[1]))
    


def error(txt):
    """Report a fatal error and terminate the program.

    txt -- the message to show to the user.

    The message is written to standard error and the process exits with
    status 1, so that shells and pipelines can tell a failure from a
    successful run.  Raises SystemExit; never returns.
    """
    sys.stderr.write("%s\n" % (txt,))
    sys.exit(1)

def GrepInNames(pattern):
    """Generate the characters whose UnicodeData record matches pattern.

    pattern -- regular expression source; it is compiled case-insensitively.

    Each path in UnicodeDataFileNames is tried in order, first as a plain
    file and then with a ".gz" suffix.  When a file is found, the pattern is
    searched in every complete line of it (not only in the name field) and
    the character given by the first ';'-separated field of each matching
    line is yielded.

    When no file can be opened, a notice is printed and every code point up
    to sys.maxunicode is checked against the name known to the unicodedata
    module instead.
    """
    p = re.compile(pattern, re.I)
    f = None
    for name in UnicodeDataFileNames:
        try:
            f = open(name)
            break
        except IOError:
            try:
                f = gzip.GzipFile(name+".gz")
                break
            except IOError:
                continue
    if f is None:
        print("""
Cannot find UnicodeData.txt, please place it into 
/usr/share/unicode/UnicodeData.txt or current working directory 
(optionally you can gzip it).
Without the file, searching will be much slower.

""")
        # sys.maxunicode itself is a valid code point, hence the +1
        for i in xrange(sys.maxunicode + 1):
            try:
                name = unicodedata.name(unichr(i))
            except ValueError:
                # the code point has no name in the database
                continue
            if p.search(name):
                yield myunichr(i)
    else:
        while 1:
            l = f.readline()
            if not l:
                break
            if p.search(l):
                try:
                    code = int(l.split(';')[0], 16)
                except ValueError:
                    # not a data record (e.g. a blank or comment line)
                    continue
                yield myunichr(code)
        f.close()


def myunichr(n):
    """Return the unicode character with code point n.

    n -- integer code point.

    If unichr() rejects n the program is terminated through error(): with
    an "outside the Unicode range" message when n is not in 0..0x10FFFF,
    otherwise with a traceback and the hint that the interpreter was
    presumably built without wide unicode characters.
    """
    try:
        return unichr(n)
    except (ValueError, OverflowError):
        if n < 0 or n > 0x10FFFF:
            # no interpreter build can represent this; the recompile hint
            # below would be misleading
            error("Code point %d is outside the Unicode range U+0000..U+10FFFF" % n)
        traceback.print_exc()
        error("Consider recompiling your python interpreter with wide unicode characters")

        

def guesstype(arg):
    """Guess how a command line argument should be interpreted.

    arg -- the argument string.

    Returns a tuple (type, value) where type is 'hexadecimal', 'regexp' or
    'string' and value is the text to process further:

    - "U+XXXX" / "u+XXXX", or "U"/"u" followed by at least four more
      characters, is a hexadecimal code point; value holds the digits
      without the prefix, so it can be passed straight to int(value, 16);
    - any other argument of four or more characters is a hexadecimal code
      point if it parses as one (so words such as "cafe" count as numbers);
    - an argument that looks like one of the above but does not parse, or
      whose value is negative or above sys.maxunicode, is a regular
      expression;
    - anything shorter than four characters, including the empty string,
      is a literal string of characters.
    """
    if arg[:2] in ('U+', 'u+'):
        digits = arg[2:]
    elif arg[:1] in ('U', 'u') and len(arg) > 4:
        digits = arg[1:]
    elif len(arg) >= 4:
        digits = arg
    else:
        return 'string', arg

    try:
        val = int(digits, 16)
    except ValueError:
        return 'regexp', arg
    if val < 0 or val > sys.maxunicode:
        return 'regexp', arg
    return 'hexadecimal', digits
    

def process(arg, t):
    """Generate the unicode characters denoted by arg.

    arg -- the argument string from the command line.
    t   -- how to interpret arg: 'hexadecimal', 'decimal', 'regexp',
           'string', or None to let guesstype() decide.

    A number yields the single character with that code point, a regular
    expression yields everything GrepInNames() finds, and a string is
    decoded from options.iocharset and yielded character by character.
    Input that cannot be interpreted terminates the program via error().
    """
    if t is None:
        t, arg = guesstype(arg)
    if t in ('hexadecimal', 'decimal'):
        base = {'hexadecimal': 16, 'decimal': 10}[t]
        try:
            val = int(arg, base)
        except ValueError:
            error("%s is not a valid %s number." % (repr(arg), t))
        yield myunichr(val)
    elif t=='regexp':
        for r in GrepInNames(arg):
            yield r
    elif t=='string':
        try:
            unirepr = unicode(arg, options.iocharset)
        except UnicodeDecodeError:
            error ("Sequence %s is not valid in charset '%s'." % (repr(arg),  options.iocharset))
        except LookupError:
            # the codec named by the --io option does not exist
            error("Unknown character set '%s'." % options.iocharset)
        for r in unirepr:
            yield r
    
def maybe_colours(colour):
    """Return the control sequence for colour, or "" when colours are off.

    colour -- a key of the colours table.
    """
    if not use_colour:
        return ""
    return colours[colour]
    
# format key and value
def printkv(*l):
    for i in range(0, len(l), 2):
        if i<len(l)-2:
            sep = "  "
        else:
            sep = "\n"
        k, v = l[i], l[i+1]
        print maybe_colours('green')+k+":"+maybe_colours('default'), unicode(v).encode(options.iocharset,'replace')+sep,


def print_characters(list, maxcount):
    counter = 0
    for c in list:
        if maxcount:
            counter += 1
        if counter > options.maxcount:
            print 
            print "Too many characters to display, more than %s, use --max option to change it" % options.maxcount
            return
        try:
            name = unicodedata.name(c)
        except ValueError:
            name = None
        print maybe_colours('bold')+'U+%04X'% ord(c), 
        if name:
            print unicodedata.name(c),
        else:
            print maybe_colours('default'), "- No such unicode character name in database",
        print maybe_colours('default')

        ar = ["UTF-8", string.join([("%02x" % ord(x)) for x in c.encode('utf-8')]) ,
              "UTF-16BE", string.join([("%02x" % ord(x)) for x in c.encode('utf-16be')], ''),
              "Decimal", "&#%s;" % ord(c) ]
        if options.addcharset:
            try:
                rep = string.join([("%02x" % ord(x)) for x in c.encode(options.addcharset)] )
            except UnicodeError:
                rep = "NONE"
            ar.extend( [options.addcharset, rep] )
        printkv(*ar)

        category = unicodedata.category(c)

        if unicodedata.combining(c):
            pc = " "+c
        else:
            pc = c
        print pc.encode(options.iocharset,'replace'),
        if category=='Ll':
            print "(%s)" % pc.upper().encode(options.iocharset,'replace')
            printkv( "Uppercase", 'U+%04X'% ord(c.upper()) )
        elif category=='Lu':
            print "(%s)" % pc.lower().encode(options.iocharset,'replace')
            printkv( "Lowercase", 'U+%04X'% ord(c.lower()) )
        else:
            print
        printkv( 'Category', category+ " (%s)" % general_category[category] )
        
        try:
            num = unicodedata.numeric(c)
            printkv( 'Numeric value', num)
        except ValueError:
            pass
        
        try:
            dig = unicodedata.digit(c)
            printkv( 'Digit value', dig )
        except ValueError:
            pass
        
        bidi = unicodedata.bidirectional(c)
        if bidi:
            printkv( 'Bidi', bidi+ " (%s)" % bidi_category[bidi] )
        mirrored = unicodedata.mirrored(c)
        if mirrored:
            print 'Character is mirrored'
        comb = unicodedata.combining(c)
        if comb:
            printkv( 'Combining', str(comb)+ " (%s)" % (comb_classes.get(comb, '?')) )
        decomp = unicodedata.decomposition(c)
        if decomp:
            printkv( 'Decomposition', decomp.encode(options.iocharset, 'replace') )
        """
        if options.verbosity>0:
            han = unihan.get_properties(c)
            if han:
                for i, val in han.iteritems():
                    printkv( i, val )
            else:
                print "No Unihan data"
        """
        print



# Prefix written before every row label by print_block().  The LEFT-TO-RIGHT
# MARK (u'\u200e') used to be assigned here and was immediately overridden,
# so only the effective, empty value is kept.
LTR = ''

def print_block(block):
    #header
    print " "*9,
    for i in range(16):
        print ".%X" % i,
    print
    #body
    for i in range(block*16, block*16+16):
        hexi = "%X" % i
        if len(hexi)>3:
            hexi = "%07X" % i
            hexi = hexi[:4]+" "+hexi[4:]
        else:
            hexi = "     %03X" % i
        print LTR+hexi+". ",
        for j in range(16):
            c = unichr(i*16+j)
            if unicodedata.combining(c):
                c = " "+c
            print (c.encode(options.iocharset, 'replace'))+" ",
        print
    print

def print_blocks(blocks):
    """Print the table of every block number in blocks, in order."""
    for number in blocks:
        print_block(number)
    

def is_range(s, typ):
    """Interpret s as a range "low..high" of characters.

    s   -- the command line argument.
    typ -- argument type passed on to process() for both bounds.

    Returns False when s is not made of exactly two parts separated by
    "..", when both parts are empty, or when a bound does not resolve to
    exactly one character.  Otherwise returns the list of 256-character
    block numbers from the block containing low to the block containing
    high.  A missing bound defaults to the other one.  If low lies in a
    later block than high the list is empty, which the caller's truth test
    handles like False.
    """
    sp = s.split('..')
    if len(sp) != 2:
        return False
    if not sp[1]:
        sp[1] = sp[0]
    elif not sp[0]:
        sp[0] = sp[1]
    if not sp[0]:
        return False
    low = list(process(sp[0], typ))
    high = list(process(sp[1], typ))
    if len(low) != 1 or len(high) != 1:
        return False
    low = ord(low[0])
    high = ord(high[0])
    # convert code points to block numbers; the upper bound is exclusive
    low = low // 256
    high = high // 256 + 1
    return range(low, high)

# Default for the --io option: the character set of the current locale.
try:
    iocharsetguess = locale.nl_langinfo(locale.CODESET) or "ascii"
except AttributeError:
    # nl_langinfo()/CODESET are not available on every platform
    iocharsetguess = locale.getpreferredencoding() or "ascii"



# Command line definition.  The -x/-d/-r/-s/-a options all store into
# "type"; when none is given it stays None, which means "guess".
parser = OptionParser(usage="usage: %prog [options] arg",
                      version="%prog " + VERSION)
parser.add_option("-x", "--hexadecimal",
      action="store_const", const='hexadecimal', dest="type", 
      help="Assume arg to be hexadecimal number")
parser.add_option("-d", "--decimal",
      action="store_const", const='decimal', dest="type",
      help="Assume arg to be decimal number")
parser.add_option("-r", "--regexp",
      action="store_const", const='regexp', dest="type",
      help="Assume arg to be regular expression")
parser.add_option("-s", "--string",
      action="store_const", const='string', dest="type",
      help="Assume arg to be a sequence of characters")
parser.add_option("-a", "--auto",
      action="store_const", const=None, dest="type",
      help="Try to guess arg type (default)")
parser.add_option("-m", "--max",
      action="store", default=10, dest="maxcount", type="int",
      help="Maximal number of codepoints to display, default: 10; 0=unlimited")
parser.add_option("-i", "--io",
      action="store", default=iocharsetguess, dest="iocharset", type="string",
      help="I/O character set, I am guessing %s" % iocharsetguess)
parser.add_option("-c", "--charset-add",
      action="store", dest="addcharset", type="string",
      help="Show hexadecimal representation in this additional charset")
parser.add_option("-C", "--colour",
      action="store", dest="use_colour", type="string",
      default="auto",
      help="Use colours, on, off or auto")
parser.add_option("-v", "--verbose",
      action="count", dest="verbosity",
      default=0,
      help="Increase verbosity")

# Parses sys.argv at import time; options and args are module globals read
# by the functions above.
(options, args) = parser.parse_args()


# Nothing to look up: show the usage text and stop.
if not args:
     parser.print_help()
     sys.exit()

# Resolve the --colour option; any value that is neither an "on" nor an
# "off" spelling means "auto": colours only when stdout is a terminal.
colour_choice = options.use_colour.lower()
if colour_choice in ("on", "1", "true", "yes"):
    use_colour = True
elif colour_choice in ("off", "0", "false", "no"):
    use_colour = False
else:
    use_colour = sys.stdout.isatty()    

# An argument that is a character range is shown as block tables, any
# other argument as a list of described characters.
for arg in args:
    blocks = is_range(arg, options.type)
    if blocks:
        print_blocks(blocks)
    else:
        print_characters(process(arg, options.type), options.maxcount)
    
