#!/usr/bin/env python
"""
PluckerDocs.py   $Id: PluckerDocs.py,v 1.27 2001/10/17 20:44:06 janssen Exp $

(Corresponds to disass.py  Version 0.16)


Contains a set of classes to hold information about Plucker documents.

Called as a script it is a utility to
o disassemble files generated by the plucker awk parser (or any other
  parser) or any other binary form of a plucker record,
o re-assemble these into possibly a different format
o collect statistics about the disassembled documents

Call this as
  ``python <script name>'' and it will give you a list of valid options.

If you get a traceback with an assertion error, something is wrong in
the data file, i.e. some data not conforming to the specification was
found.  (Parser writers: if you disagree somewhere or find a useful
check that is missing, please tell me.)


Copyright 1999,2000 by Holger Duerer <holly@starship.python.net>

Distributable under the GNU General Public License Version 2 or newer.

"""

if __name__ == '__main__':
    ## The following section tries to get the PyPlucker directory onto the
    ## system path if called as a script and if it is not yet there:
    try:
        import PyPlucker
    except ImportError:
        import os, sys
        file = sys.argv[0]
        # Follow symbolic links to the real script.  os.readlink() may
        # return a path relative to the directory holding the link, so each
        # hop is resolved against that directory (os.path.join() keeps an
        # absolute link target unchanged).
        while os.path.islink (file):
            file = os.path.join (os.path.dirname (file), os.readlink (file))
        # Make the path absolute: for a bare script name the dirname is
        # empty and the *current* directory would end up on the path
        # instead of the parent of the script's directory.
        file = os.path.abspath (file)
        sys.path = [os.path.split (os.path.dirname (file))[0]] + sys.path
        try:
            import PyPlucker
        except ImportError:
            print ("Cannot find where module PyPlucker is located!")
            sys.exit (1)

        # and forget the temp names...
        # (only 'file' and 'os' are removed; 'sys' stays bound)
        del file, os
    del PyPlucker
    ##
    ## Now PyPlucker things should generally be importable
    ##



import string
import struct
import types
import urlparse
import urllib
try:
    import zlib
except ImportError:
    zlib = None
    
import StringIO
from rfc822 import Message

# Locate an implementation of Doc compression.  The three names bound here
# (doc_compress_function, doc_uncompress_function and
# doc_block_compress_function) are None when nothing could be imported.
try:
    from  Pyrite import doc_compress
    doc_compress_function = doc_compress.compress
    doc_uncompress_function = doc_compress.uncompress
except ImportError:
    # Pyrite not installed?
    # try other dirs to import the file directly
    import sys, os
    # NOTE: both appends change sys.path for the rest of the process.
    sys.path.append(".")
    sys.path.append(os.path.dirname (sys.argv[0]))
    try:
        from PyPlucker.helper import doc_compress
        doc_compress_function = doc_compress.compress
        doc_uncompress_function = doc_compress.uncompress
    except ImportError:
        doc_compress_function = None
        doc_uncompress_function = None
# For compressing a single block Pyrite's _Doc.compress is preferred; if it
# is not importable, whatever compress function was found above is used
# (which may be None).
try:
    from Pyrite import _Doc
    doc_block_compress_function = _Doc.compress
except ImportError:
    doc_block_compress_function = doc_compress_function


## These constants are only valid for the new format!!!
## Apart from DOCTYPE_HTML (0) each value is a small type number shifted
## into the upper byte of a 16 bit quantity.
## NOTE(review): presumably this is the type field of a record header --
## confirm against the code that builds the records.
DOCTYPE_HTML = 0
DOCTYPE_HTML_COMPRESSED = (1 << 8)
DOCTYPE_IMAGE = (2 << 8)
DOCTYPE_IMAGE_COMPRESSED = (3 << 8)
DOCTYPE_MAILTO = (4 << 8)
DOCTYPE_LINK_INDEX = (5 << 8)
DOCTYPE_LINKS = (6 << 8)
DOCTYPE_LINKS_COMPRESSED = (7 << 8)
DOCTYPE_BOOKMARKS = (8 << 8)
DOCTYPE_CATEGORY = (9 << 8)
DOCTYPE_METADATA = (10 << 8)

# DB type constants
# NOTE(review): presumably these tell Doc from ZLib compressed databases;
# they are not referenced in this part of the file -- confirm.
DBTYPE_DOC = 1
DBTYPE_ZLIB = 2

## The maximum number of bytes a text document should be before being
## split into spill documents or truncated:
Max_Document_Size = 30000

## The text to add when the document gets truncated:
Document_Truncated_Text = "This document exceeded the maximum allowable size and was truncated at this point"
## The text to add as link to next spill document
Document_Next_Part_Text = "Click here for the next part"
## The text to add as link to previous spill document
Document_Previous_Part_Text = "Click here for the previous part"



## Size in bytes of a document header.
## NOTE(review): the code writing that header is not in this part of the
## file -- confirm the value there.
_DOC_HEADER_SIZE = 8
## Size in bytes of a paragraph header; PluckerTextParagraph.dump_record()
## packs it as two unsigned 16 bit values (">HH").
_PARA_HEADER_SIZE = 4


def DocCompressData (data):
    """Do Doc compression of data, in 4KB blocks.

    'data' is cut into consecutive chunks of at most 4096 bytes.  Each
    chunk is compressed on its own with doc_block_compress_function and
    the results are concatenated in order.

    Returns the compressed string ("" for empty 'data').
    Raises RuntimeError if no Doc compression function is available."""

    # Test the function that is really called below.  It differs from
    # doc_compress_function when only Pyrite's _Doc module was importable.
    if not doc_block_compress_function:
        raise RuntimeError ("No doc compression function available!")

    res = ""
    while data:
        block = data[:4096]
        data = data[4096:]
        res = res + doc_block_compress_function (block)
    return res
    
def DocUncompressData (data):
    """Undo the Doc compression of 'data' (as produced in 4KB blocks).

    The whole string is handed to doc_uncompress_function in one call.
    Raises RuntimeError if no such function is available."""
    uncompress = doc_uncompress_function
    if uncompress:
        return uncompress (data)
    raise RuntimeError ("No doc uncompression function available!")

def ZLibCompressData (data):
    """Return 'data' compressed with zlib.compress().

    Raises RuntimeError if the zlib module could not be imported."""
    if zlib is not None:
        return zlib.compress (data)
    raise RuntimeError ("No ZLib support in your Python installation!")

def ZLibUncompressData (data):
    """Return 'data' uncompressed with zlib.decompress().

    Raises RuntimeError if the zlib module could not be imported."""
    if zlib is not None:
        return zlib.decompress (data)
    raise RuntimeError ("No ZLib support in your Python installation!")

# The pair of functions currently used for (un)compression.  Doc
# compression is the default; UseDocCompression() and UseZLibCompression()
# rebind both names.
CompressFunction = DocCompressData
UncompressFunction = DocUncompressData

def UseDocCompression ():
    """Select Doc compression: rebind the module level CompressFunction
    and UncompressFunction to DocCompressData and DocUncompressData."""
    global CompressFunction, UncompressFunction
    CompressFunction, UncompressFunction = DocCompressData, DocUncompressData

def UseZLibCompression ():
    """Select ZLib compression: rebind the module level CompressFunction
    and UncompressFunction to ZLibCompressData and ZLibUncompressData."""
    global CompressFunction, UncompressFunction
    CompressFunction, UncompressFunction = ZLibCompressData, ZLibUncompressData


class PluckerTextDocumentStatistics:
    """A class to collect statistics about a Plucker text document.

    All counters start at zero.  They are increased with the add_*
    methods and can be merged with those of another instance by
    combine_with()."""

    # Names of the attributes that hold the counters.
    _COUNTERS = ('_num_paragraphs',
                 '_num_images',
                 '_num_databytes',
                 '_num_compressed_databytes',
                 '_num_paddingbytes',
                 '_num_graphicsbytes',
                 '_num_compressed_graphicsbytes',
                 '_num_headerbytes')

    def __init__ (self):
        for name in self._COUNTERS:
            setattr (self, name, 0)

    def _increase (self, name, n):
        """Private function: add 'n' to the counter attribute 'name'"""
        setattr (self, name, getattr (self, name) + n)

    def combine_with (self, other_stat):
        """Add each counter of 'other_stat' to the same counter of this
        instance.  'other_stat' is left unchanged."""
        for name in self._COUNTERS:
            self._increase (name, getattr (other_stat, name))

    def get_whole_data_length (self):
        """Return the sum of the data, padding and header byte counters
        (graphics bytes are not part of it)."""
        total = self._num_databytes
        total = total + self._num_paddingbytes
        total = total + self._num_headerbytes
        return total

    def add_paragraphs (self, n):
        self._increase ('_num_paragraphs', n)

    def add_images (self, n):
        self._increase ('_num_images', n)

    def add_databytes (self, n):
        self._increase ('_num_databytes', n)

    def add_compressed_databytes (self, n):
        self._increase ('_num_compressed_databytes', n)

    def add_paddingbytes (self, n):
        self._increase ('_num_paddingbytes', n)

    def add_graphicsbytes (self, n):
        self._increase ('_num_graphicsbytes', n)

    def add_compressed_graphicsbytes (self, n):
        self._increase ('_num_compressed_graphicsbytes', n)

    def add_headerbytes (self, n):
        self._increase ('_num_headerbytes', n)

    def pretty_print (self, prefix_string=""):
        """Print a summary of the counters to standard output.

        Every line starts with 'prefix_string'.  The text part is only
        reported if the whole data length is non-zero, the image part
        only if images were counted.  A compression ratio is only shown
        when both the compressed and the uncompressed size are non-zero."""
        total = self.get_whole_data_length ()
        if total:
            # count and percentage (of the whole length) for data,
            # padding and headers, in the order the format string needs
            shares = [prefix_string]
            for count in (self._num_databytes,
                          self._num_paddingbytes,
                          self._num_headerbytes):
                shares.append (count)
                shares.append ("%d" % (100 * count / total))
            print ("%s%d bytes form %d paragraphs" % (prefix_string, total, self._num_paragraphs))
            print ("%s%d bytes (=%s%%) of document data, %d bytes (= %s%%) of padding, %d bytes (= %s%%) for headers" %
                   tuple (shares))
            if self._num_compressed_databytes and self._num_databytes:
                ratio = "%4.1f" % (100.0 * self._num_compressed_databytes / self._num_databytes)
                print ("%sWith Doc compression of data: %d bytes of (compressed) data makes %s percent of original size" %
                       (prefix_string, self._num_compressed_databytes, ratio))
        if self._num_images:
            print ("%s%d bytes in %d images" % (prefix_string, self._num_graphicsbytes, self._num_images))
            if self._num_compressed_graphicsbytes and self._num_graphicsbytes:
                ratio = "%4.1f" % (100.0 * self._num_compressed_graphicsbytes / self._num_graphicsbytes)
                print ("%sWith Doc compression of image data: %d bytes of (compressed) data makes %s percent of original size" %
                       (prefix_string, self._num_compressed_graphicsbytes, ratio))


class PluckerDocument:
    """A base class for all types of Plucker documents.

    It only remembers the url it was created with.  All is_*_document()
    tests answer 0 here; PluckerTextDocument overrides
    is_text_document()."""

    def __init__ (self, url):
        self._url = url

    def get_url (self):
        """Return the url given to the constructor."""
        return self._url

    def get_urls (self):
        """Return a list of all urls that are associated with this
        document.  The first entry should be the same as the result of
        get_url()."""
        urls = []
        urls.append (self._url)
        return urls

    def is_text_document (self):
        """Return 0: the base class is not a text document."""
        return 0

    def is_image_document (self):
        """Return 0: the base class is not an image document."""
        return 0

    def is_mailto_document (self):
        """Return 0: the base class is not a mailto document."""
        return 0

    def is_special_document (self):
        """Return 0: the base class is not a special document."""
        return 0
            

# Tags for the (tag, value) items a PluckerTextParagraph keeps in its
# item list; the add_* methods of that class create them and
# _dump_record_body() turns them into their binary form.
CMD_TEXT = 1
CMD_IMAGE = 2
CMD_ANCHOR_START = 3
CMD_ANCHOR_END = 4
CMD_ITALICS_START = 5
CMD_ITALICS_END = 6
CMD_NEWLINE = 7
CMD_SET_MARGIN = 8
CMD_SET_ALIGNMENT = 9
CMD_HR = 10
CMD_SET_STYLE = 11
CMD_ULINE_START = 12
CMD_ULINE_END = 13
CMD_STRIKE_START = 14
CMD_STRIKE_END = 15


class PluckerTextParagraph:
    """A class to contain information about one paragraph of a Plucker
    Text document."""

    
    
    def __init__ (self, extra_space=0):
        """Create an empty paragraph.

        'extra_space' is stored unchecked; use set_extra_spacing() for
        a range checked assignment."""
        self._extra_space = extra_space
        # raw binary text as handed to undump_record()
        self._data = ""
        # (tag, value) tuples in document order, see the CMD_* constants
        self._items = []
        # names added with add_name()
        self._names = []
        # (href, attributes) and (src, attributes) of anchors and images
        self._document_refs = []
        self._image_refs = []
        # so far only the paragraph header is accounted for
        self._estimated_length = _PARA_HEADER_SIZE


    def set_extra_spacing (self, extra_space):
        """Set the extra spacing of this paragraph.

        The value is converted with int() and must then be in the range
        0 to 7; otherwise the assertion fails and nothing is changed."""
        spacing = int (extra_space)
        assert spacing >= 0 and spacing <= 7, \
               "Extra space for a paragraph needs to be between 0 and 7 but is %s" % repr (spacing)
        self._extra_space = spacing


    def get_extra_spacing (self):
        """Return the extra spacing currently stored for this paragraph"""
        spacing = self._extra_space
        return spacing


    def get_estimated_length (self):
        """Return the running size estimate kept by the add_* methods
        (it starts with the size of the paragraph header)"""
        estimate = self._estimated_length
        return estimate


    def get_names (self):
        """Return the list of names of this paragraph (i.e. the named
        anchors in it).  This is the internal list itself, not a copy."""
        names = self._names
        return names


    def get_external_references (self):
        """Return the tuple (document references, image references).

        Both are the internal lists of (url, attribute dictionary)
        pairs filled by add_anchor_start() and add_image_reference()."""
        documents = self._document_refs
        images = self._image_refs
        return (documents, images)

    
    def padding_needed (self):
        """Calculate how much padding is needed to bring the raw data of
        this paragraph to a multiple of 4 bytes.
        (Old format of plucker docs)"""
        remainder = len (self._data) % 4
        if remainder == 0:
            return 0
        return 4 - remainder


    def _add (self, something):
        """Privete function: Add some entry to internal list of
        parsed things"""
        self._items.append (something)


    def add_name (self, name):
        """Remember 'name' as a name of (i.e. named anchor in) this
        paragraph"""
        names = self._names
        names.append (name)

        
    def add_text (self, text):
        """Append a run of plain text; its length is added to the size
        estimate"""
        item = (CMD_TEXT, text)
        self._add (item)
        grown = self._estimated_length + len (text)
        self._estimated_length = grown


    def add_anchor_start (self, dict_of_items):
        """Append the start of an anchor described by 'dict_of_items'.

        If the dictionary has an 'href' entry, (href, dictionary) is
        also recorded as a reference to another document.  4 bytes are
        added to the size estimate."""
        self._add ((CMD_ANCHOR_START, dict_of_items))
        try:
            href = dict_of_items['href']
        except KeyError:
            pass
        else:
            self._document_refs.append ((href, dict_of_items))
        self._estimated_length = self._estimated_length + 4


    def add_anchor_end (self):
        """Append the end of an anchor (2 bytes in the size estimate)"""
        item = (CMD_ANCHOR_END, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2


    def add_italics_start (self):
        """Append the start of italics (2 bytes in the size estimate)"""
        item = (CMD_ITALICS_START, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2


    def add_italics_end (self):
        """Append the end of italics (2 bytes in the size estimate)"""
        item = (CMD_ITALICS_END, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2


    def add_underline_start (self):
        """Append the start of underlining (2 bytes in the size estimate)"""
        item = (CMD_ULINE_START, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2


    def add_underline_end (self):
        """Append the end of underlining (2 bytes in the size estimate)"""
        item = (CMD_ULINE_END, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2


    def add_strike_start (self):
        """Append the start of strikethrough (2 bytes in the size estimate)"""
        item = (CMD_STRIKE_START, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2


    def add_strike_end (self):
        """Append the end of strikethrough (2 bytes in the size estimate)"""
        item = (CMD_STRIKE_END, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2


    def add_image_reference (self, dict_of_items):
        """Append an inline image described by 'dict_of_items'.

        If the dictionary has a 'src' entry, (src, dictionary) is also
        recorded as a reference to an image.  4 bytes are added to the
        size estimate."""
        self._add ((CMD_IMAGE, dict_of_items))
        try:
            src = dict_of_items['src']
        except KeyError:
            pass
        else:
            self._image_refs.append ((src, dict_of_items))
        self._estimated_length = self._estimated_length + 4
       
    def add_newline (self):
        """Append a line break (2 bytes in the size estimate)"""
        item = (CMD_NEWLINE, None)
        self._add (item)
        self._estimated_length = self._estimated_length + 2

    def add_set_margin (self, left, right):
        """Append a change of the left and right margin.

        The size estimate grows by 4 bytes, which is what
        _dump_record_body() writes for this item (">BBBB": the function
        marker, the function code and the two margin values)."""
        self._add ((CMD_SET_MARGIN, (left, right)))
        self._estimated_length = self._estimated_length + 4

    def add_set_alignment (self, value):
        """Append a change of the text alignment.

        'value' must be in the range 0 to 3 (the assertion fails
        otherwise).  3 bytes are added to the size estimate."""
        assert 0 <= value <= 3, "Alignment must be >=0 and <=3 but is %s" % repr(value)
        item = (CMD_SET_ALIGNMENT, value)
        self._add (item)
        self._estimated_length = self._estimated_length + 3


    def add_hr (self, height=0, width=0, width_percent=0):
        """Append a horizontal rule with the given height, width and
        width in percent.

        The size estimate grows by 5 bytes, which is what
        _dump_record_body() writes for this item (">BBBBB": the function
        marker, the function code and the three values)."""
        self._add ((CMD_HR, (height, width, width_percent)))
        self._estimated_length = self._estimated_length + 5


    def add_style_change (self, new_stilenum):
        """Append a switch to the style with number 'new_stilenum'
        (3 bytes in the size estimate)"""
        item = (CMD_SET_STYLE, new_stilenum)
        self._add (item)
        self._estimated_length = self._estimated_length + 3


    def cleanup (self):
        """Clean up the paragraph.  Should only be called when nothing
        more is to be changed in this paragraph

        The item list is walked from the front, looking at each item
        ('this') together with its successor ('next', None at the end):
          - style, alignment, margin and italics/underline/strike
            start or end commands that are the last item are deleted;
          - of two equal style/alignment/margin/start commands in a
            row only the later one is kept;
          - a start directly followed by its end, or an end directly
            followed by its start, is deleted together with it (the
            same for an anchor start directly followed by an anchor
            end);
          - a style change is moved in front of a preceding italics,
            underline, strike or anchor start and behind a following
            anchor end; an anchor start is also moved behind a
            following italics start;
          - text that is the first item loses its leading whitespace,
            and empty text items are deleted.
        After every change the index is either left where it is (the
        next item has moved to it) or moved one step back, so that the
        preceding item is examined again with its new successor.

        The size estimate (get_estimated_length) is not adjusted."""
        idx = 0
        items = self._items
        while 1:
            idx = max (0, idx) # make legal in case we set it too far back
            if idx >= len (items):
                # done!
                break
            this = items[idx][0]
            if idx+1 == len (items):
                next = None
            else:
                next = items[idx+1][0]
            if this == CMD_SET_STYLE:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_ANCHOR_END:
                    # move the style change behind the anchor end
                    items[idx], items[idx+1] = items[idx+1], items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_SET_STYLE:
                    # overridden at once by the following style change
                    del items[idx]
                    continue

            elif this == CMD_SET_ALIGNMENT:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_SET_ALIGNMENT:
                    # overridden at once by the following alignment
                    del items[idx]
                    continue

            elif this == CMD_SET_MARGIN:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_SET_MARGIN:
                    # overridden at once by the following margin setting
                    del items[idx]
                    continue

            elif this == CMD_ITALICS_START:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_ITALICS_END:
                    # start and end with nothing in between: drop both
                    del items[idx+1]
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_ITALICS_START:
                    # doubled start: keep only the later one
                    del items[idx]
                    continue
                elif next == CMD_SET_STYLE:
                    # move the style change in front of the italics start
                    items[idx], items[idx+1] = items[idx+1], items[idx]
                    idx = idx - 1
                    continue
            elif this == CMD_ITALICS_END:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_ITALICS_START:
                    # end directly followed by start: drop both
                    del items[idx+1]
                    del items[idx]
                    idx = idx - 1
                    continue

            # underline: same rules as for italics
            elif this == CMD_ULINE_START:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_ULINE_END:
                    del items[idx+1]
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_ULINE_START:
                    del items[idx]
                    continue
                elif next == CMD_SET_STYLE:
                    items[idx], items[idx+1] = items[idx+1], items[idx]
                    idx = idx - 1
                    continue
            elif this == CMD_ULINE_END:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_ULINE_START:
                    del items[idx+1]
                    del items[idx]
                    idx = idx - 1
                    continue

            # strikethrough: same rules as for italics
            elif this == CMD_STRIKE_START:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_STRIKE_END:
                    del items[idx+1]
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_STRIKE_START:
                    del items[idx]
                    continue
                elif next == CMD_SET_STYLE:
                    items[idx], items[idx+1] = items[idx+1], items[idx]
                    idx = idx - 1
                    continue
            elif this == CMD_STRIKE_END:
                if next is None:
                    # at end
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_STRIKE_START:
                    del items[idx+1]
                    del items[idx]
                    idx = idx - 1
                    continue

            elif this == CMD_ANCHOR_START:
                if next == CMD_ANCHOR_END:
                    # anchor without any content: drop start and end
                    del items[idx+1]
                    del items[idx]
                    idx = idx - 1
                    continue
                elif next == CMD_SET_STYLE or next == CMD_ITALICS_START:
                    # move the anchor start behind the style change or
                    # italics start
                    items[idx], items[idx+1] = items[idx+1], items[idx]
                    idx = idx - 1
                    continue
            elif this == CMD_ANCHOR_END:
                pass
            elif this == CMD_IMAGE:
                pass
            elif this == CMD_TEXT:
                if idx == 0:
                    # only text that is the very first item is stripped
                    text = string.lstrip (items[idx][1])
                    items[idx] = (CMD_TEXT, text)
                text = items[idx][1]
                if not text:
                    del items[idx]
                    continue

            # now proceed
            idx = idx + 1
        # end of loop
        # ('items' is the same list object as self._items, it was
        # changed in place)
        self._items = items

        

    def resolve_ids (self, resolver_function):
        """Resolve document ids for anchor starts and image references.

        'resolver_function' is called as resolver_function (dict, 0)
        for each anchor start and as resolver_function (dict, 1) for
        each image, with the dictionary of that item.  Its result is
        stored in the dictionary as 'pluckerid'.

        For an image a second lookup is done for the image's 'src' with
        "_BIG" appended; the result is stored as 'big_id', with 0
        standing for "none" (the lookup gave None, or the image has no
        'src' at all)."""
        for (tag, value) in self._items:
            if tag == CMD_ANCHOR_START:
                value['pluckerid'] = resolver_function (value, 0)
            elif tag == CMD_IMAGE:
                value['pluckerid'] = resolver_function (value, 1)
                # add_image_reference() accepts images without 'src',
                # so there may be nothing to derive the "_BIG" url from
                try:
                    big_url = value['src'] + "_BIG"
                except KeyError:
                    big_id = None
                else:
                    big_id = resolver_function ({'src': big_url}, 1)
                if big_id is None:
                    big_id = 0
                value['big_id'] = big_id


    def _dump_record_body (self, allow_fragments, allow_newstuff):
        """(Re-)Assemble the binary representation of just the body of this paragraph.
        (returns tupel (data, padding)"""

        # continous appending t oa string is inefficient (uses O(n)
        # time), so we collect in all in an array and concat all in
        # the end
        data = [] 
        
        for (tag, value) in self._items:
            if tag == CMD_TEXT:
                data.append (value)
            elif tag == CMD_ANCHOR_START:
                assert value.has_key ('pluckerid'), "Anchor start information lacks document id"
                id = value['pluckerid']
                if type (id) == types.TupleType:
                    assert len (id) == 2, "PluckerId must be number or 2-tuple of numbers but is %s" % repr (id)
                    if allow_fragments:
                        data.append (struct.pack (">BBHH", 0, 014, id[0], id[1]))
                    else:
                        data.append (struct.pack (">BBH", 0, 012, id[0]))
                elif id is not None:
                    data.append (struct.pack (">BBH", 0, 012, id))
            elif tag == CMD_ANCHOR_END:
                data.append (struct.pack (">BB", 0, 010))
            elif tag == CMD_SET_STYLE:
                data.append (struct.pack (">BBB", 0, 021, value))
            elif tag == CMD_IMAGE:
                assert value.has_key ('pluckerid'), "Image reference information lacks document id"
                id = value['pluckerid']
                if id is None:
                    if value.has_key ('alt'):
                        data.append (value['alt'])
                    else:
                        data.append ('[img]')
                else:
                    big_id = value['big_id']
                    if big_id != 0:
                        data.append (struct.pack (">BBHH", 0, 0134, big_id, id,))
                    else:
                        data.append (struct.pack (">BBH", 0, 032, id))
            elif tag == CMD_SET_MARGIN:
                if allow_newstuff:
                    data.append (struct.pack (">BBBB", 0, 042, value[0], value[1]))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_SET_ALIGNMENT:
                if allow_newstuff:
                    data.append (struct.pack (">BBB", 0, 051, value))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_HR:
                if allow_newstuff:
                    (height, width, perc_width) = value
                    data.append (struct.pack (">BBBBB", 0, 063, height, width, perc_width))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_NEWLINE:
                if not allow_newstuff:
                    raise RuntimeError, "Text paragraph contains a newline but undump may not use it"
                data.append (struct.pack (">BB", 0, 070))
            elif tag == CMD_ITALICS_START:
                if allow_newstuff:
                    data.append (struct.pack (">BB", 0, 0100))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_ITALICS_END:
                if allow_newstuff:
                    data.append (struct.pack (">BB", 0, 0110))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_ULINE_START:
                if allow_newstuff:
                    data.append (struct.pack (">BB", 0, 0140))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_ULINE_END:
                if allow_newstuff:
                    data.append (struct.pack (">BB", 0, 0150))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_STRIKE_START:
                if allow_newstuff:
                    data.append (struct.pack (">BB", 0, 0160))
                else:
                    # ignored :-(
                    pass
            elif tag == CMD_STRIKE_END:
                if allow_newstuff:
                    data.append (struct.pack (">BB", 0, 0170))
                else:
                    # ignored :-(
                    pass
        data = string.join (data, "")

        padding = 4 - (len (data) % 4)
        if padding == 4:
            padding = 0
        return (data, "X"*padding)


    def dump_record (self):
        """(Re-)Assemble the binary representation of this paragraph.

        Returns a tuple of (header, body) data.  The header holds the
        length of the body and the lowest three bits of the extra
        spacing, each as an unsigned 16 bit value in big-endian order."""

        # the padding string that comes with the body is not used here
        body = self._dump_record_body (allow_fragments=1, allow_newstuff=1)[0]
        header = struct.pack (">HH", len (body), self._extra_space & 0x07)
        return (header, body)

        
    def undump_record (self, text, verbose=0):
        """Dissassemble just one paragraph"""
        self._data = text
        while 1:
            res = string.split(text, "\000", 1)
            if res[0]:
                # some text before a function or before the end
                if verbose:
                    print "  Text %s" % repr(res[0])
                self.add_text (res[0])
            if len(res) == 1:
                # just text and no function was found, i.e. we are now done
                break
            else:
                # function found
                rest_text = res[1]
                assert len (rest_text)>= 1, "No function data found after function marker"
                function_code = rest_text[0]
                if function_code == "\012":
                    # anchor
                    (id,) = struct.unpack(">H", rest_text[1:3])
                    text = rest_text[3:]
                    if verbose:
                        print "  Anchor start for document #%d" % id
                    self.add_anchor_start ({'pluckerid': id})
                elif function_code == "\014":
                    # anchor with fragment part
                    (id, fragmentid) = struct.unpack(">HH", rest_text[1:5])
                    text = rest_text[5:]
                    if verbose:
                        print "  Anchor start for document #(%d, %d)" % (id, fragmentid)
                    self.add_anchor_start ({'pluckerid': (id, fragmentid)})
                elif function_code == "\010":
                    # anchor end
                    text = rest_text[1:]
                    if verbose:
                        print "  End anchor"
                    self.add_anchor_end ()
                elif function_code == "\021":
                    # set style
                    (id,) = struct.unpack(">B", rest_text[1:2])
                    text = rest_text[2:]
                    assert id>=0 and id<=7, "Illegal style code %d found" % id
                    if verbose:
                        if id==0:
                            idcode = "normal"
                        elif id==7:
                            idcode = "bold"
                        else:
                            idcode = "header %d" % id
                        print "  Style code %d (%s)" % (id, idcode)
                    self.add_style_change (id)
                elif function_code == "\032":
                    # image
                    (id,) = struct.unpack(">H", rest_text[1:3])
                    text = rest_text[3:]
                    if verbose:
                        print "  Reference to image #%d" % id
                    self.add_image_reference ({'pluckerid': id})
                elif function_code == "\042":
                    # set margin
                    (left, right) = struct.unpack(">BB", rest_text[1:3])
                    text = rest_text[3:]
                    self.add_set_margin (left, right)
                    if verbose:
                        print "  Set Margins  %d, %d" % (left, right)
                elif function_code == "\051":
                    # alignment
                    (code,) = struct.unpack(">B", rest_text[1:2])
                    text = rest_text[2:]
                    self.add_set_alignment (code)
                    if verbose:
                        if code == 0:
                            al = "left"
                        elif code == 1:
                            al = "right"
                        elif code == 2:
                            al = "center"
                        else:
                            al = "???"
                        print "  Alignment %s" % al
                elif function_code == "\063":
                    # alignment
                    (height, width, perc_width) = struct.unpack(">BBB", rest_text[1:4])
                    text = rest_text[4:]
                    self.add_hr (height, width, perc_width)
                    if verbose:
                        print "  Horizontal rule: height: %d, width: %d, %%-width: %d" % (height, width, perc_width)
                elif function_code == "\070":
                    # newline
                    self.add_newline ()
                    text = rest_text[1:]
                    if verbose:
                        print "  NewLine"
                elif function_code == "\100":
                    # italics start
                    self.add_italics_start ()
                    text = rest_text[1:]
                    if verbose:
                        print "  Italics start"
                elif function_code == "\110":
                    # italics end
                    self.add_italics_end ()
                    text = rest_text[1:]
                    if verbose:
                        print "  Italics end"
                elif function_code == "\140":
                    # underline start
                    self.add_underline_start ()
                    text = rest_text[1:]
                    if verbose:
                        print "  Underline start"
                elif function_code == "\150":
                    # underline end
                    self.add_underline_end ()
                    text = rest_text[1:]
                    if verbose:
                        print "  Underline end"
                elif function_code == "\160":
                    # strikethrough start
                    self.add_strike_start ()
                    text = rest_text[1:]
                    if verbose:
                        print "  Strikethrough start"
                elif function_code == "\170":
                    # strikethrough end
                    self.add_strike_end ()
                    text = rest_text[1:]
                    if verbose:
                        print "  Strikethrough end"
                else:
                    raise AssertionError, ("Unknown function code %s (%d) found" % (repr(function_code), ord(function_code)))




class PluckerTextDocument (PluckerDocument):
    """A class to contain information about one Plucker Text document."""

    def __init__ (self, url):
        PluckerDocument.__init__(self, url)
        self._documents = [[]]
        self._last_document_length = 0
        self._stats = PluckerTextDocumentStatistics ()
	self._charset = None

    ########################
    ## overridden methods...
    ########################
        
    def get_urls (self):
        result = []
        for i in range (len (self._documents)):
            result.append (self._get_part_url (i))
        return result

            
    def is_text_document (self):
        return 1
    

    ########################
    ## new methods...
    ########################
        
    def set_charset (self, charset):
	self._charset = charset

    def get_charset (self):
	# if the document URL is via HTTP or HTTPS, and there is no charset,
	# we apply either the user-specified default charset, or if there is
	# none, we use ISO-8859-1, as specified in the HTTP spec.
	return self._charset

    def _get_part_url (self, part_no):
        """Return the url for the part 'part_no' of this document"""
        if part_no == 0:
            return self.get_url ()
        else:
            return "plucker:/~parts~/" + \
                   urllib.quote(self.get_url (), "") + \
                   ("/%d" % part_no)


    def resolve_ids (self, resolver_function):
        """Resolve references to external documents"""
        # First we resolve urls for the various sub parts
        for part_no in range (len (self._documents)):
            if part_no != 0:
                url = self._get_part_url (part_no)
                resolver_function ({'href': url}, as_image=0)
        # and now we resolve the ids in the paragraphs of the parts
        for document in self._documents:
            for paragraph in document:
                paragraph.resolve_ids (resolver_function)

            

    def _build_document (self, id, para_count, headers, bodies):
        """Build a (new format) dump of a document from headers and bodies."""

        assert _DOC_HEADER_SIZE==8

        headers = string.join (headers, "")
        bodies = string.join (bodies, "")

        compressed_bodies = CompressFunction (bodies)
        if len (compressed_bodies) < len (bodies):
            shipped_bodies = compressed_bodies
            content_type = DOCTYPE_HTML_COMPRESSED
        else:
            shipped_bodies = bodies
            content_type = DOCTYPE_HTML

        header = struct.pack (">HHHH",
                              id,		# uid
                              para_count, 	# number of paragraphs
                              len (bodies),	# size
                              content_type)	# content type

        return header + headers + shipped_bodies
        

    def dump_record (self, id):
        """(Re-)Assemble the binary representation of this text document"""

        headers = []
        bodies = []
        size_sum = 0
        para_count = 0
        for para in self._documents[0]:
            para.cleanup ()
            (header, body) = para.dump_record()
            para_count = para_count + 1
            assert len (header) == _PARA_HEADER_SIZE, "Length of header (%d) is not %d" % (len (header), _PARA_HEADER_SIZE)
            if size_sum + len (header) + len (body) > Max_Document_Size:
                para = PluckerTextParagraph (5)
                para.add_text (Document_Truncated_Text)
                (header, body) = para.dump_record()
                headers.append (header)
                bodies.append (body)
                break
                    
            headers.append (header)
            bodies.append (body)
            size_sum = size_sum + len (header) + len (body)

        return self._build_document (id, para_count, headers, bodies)

    
    def dump_record_with_splits (self, resolver_function):
        """(Re-)Assemble the binary representation of this text document"""

        results = []

        for part_no in range (len (self._documents)):
            document = self._documents[part_no]
            this_url = self._get_part_url (part_no)
            this_id = resolver_function ({'href': this_url}, as_image=0)
            
            headers = []
            bodies = []
            para_count = 0
            for para in document:
                para.cleanup ()
                (header, body) = para.dump_record()
                para_count = para_count + 1
                assert len (header) == _PARA_HEADER_SIZE, "Length of header (%d) is not %d" % (len (header), _PARA_HEADER_SIZE)
                headers.append (header)
                bodies.append (body)

            results.append ((this_url, this_id, self._build_document (this_id, para_count, headers, bodies)))

        return results

    
    def undump_record (self, data, verbose=0):
        """Given the contents of a cache file (as e.g. generated by the awk parser),
        disassemble the contents and show it to the user"""

        assert len (data) > _DOC_HEADER_SIZE, \
               "Length of text document should be >%d but is %d" % (_DOC_HEADER_SIZE, len (data))
        header_data = data[:_DOC_HEADER_SIZE]

        if _DOC_HEADER_SIZE == 8:
            (uid, paragraphs, data_size, content_type) = struct.unpack (">HHHH", header_data)
        if content_type == DOCTYPE_HTML:
            typetext = "uncompressed text"
        elif content_type == DOCTYPE_HTML_COMPRESSED:
            typetext = "compressed text"
        else:
            typetext = "?? illegal content type for text document"
        if verbose:
            print "Text Document header:\n" \
                  "\tnumber of paragraphs: %d" \
                  "\tbody size: %d" \
                  "\ttype: %d (%s)" % \
                  (paragraphs, data_size, content_type, typetext)
        ## Do sanity check on header values
        assert content_type==DOCTYPE_HTML or content_type==DOCTYPE_HTML_COMPRESSED, \
               "Content type of text document is %d which is not a legal value" % content_type

        self._stats.add_headerbytes (_DOC_HEADER_SIZE)

        header_size = paragraphs * _PARA_HEADER_SIZE
        rest_data = data[_DOC_HEADER_SIZE:]
        headers = rest_data[:header_size]
        if content_type == DOCTYPE_HTML_COMPRESSED:
            # compressed HTML
            compressed_body = rest_data[header_size:]
            body = UncompressFunction (compressed_body)
        else:
            # uncompressed HTML
            body = rest_data[header_size:]
            if doc_compress_function:
                compressed_body = CompressFunction (body)
            else:
                compressed_body = ""
        assert data_size == len (body), \
               "Text document's header says data size is %d but it is %d" % (data_size, len (body))
        
        offset = 0
        for i in range (paragraphs):
            para_header = headers[:_PARA_HEADER_SIZE]
            headers = headers[_PARA_HEADER_SIZE:]
            (length, attr) = struct.unpack (">HH", para_header)

            ## check all the header values for sanity:
            # only lower 3 bits are used in attribute:
            assert 0 <= attr <= 15, \
                   "Text paragraph's attribute is %d -- must be between 0 and 15" % attr
            
            contents = body[offset:offset+length]
            offset = offset + length

            if not attr:
                attr_text = "none"
            else:
                space = attr & 3
                attr_text = "%d units of space above " % space
                if attr & 4:
                    attr_text = attr_text + ", extra spacing"
            if verbose:
                print "Paragraph: length %d\n\t   attributes: %d (%s)\n" \
                      "\t   offset: %d (calculated)" % \
                      (length, attr, attr_text, offset)

            paragraph = PluckerTextParagraph (extra_space=((attr&4) == 4))
            paragraph.undump_record (contents, verbose=verbose)

            self._documents[0].append (paragraph)

            self._stats.add_paragraphs (1)
            self._stats.add_headerbytes (_PARA_HEADER_SIZE)
            self._stats.add_databytes (len (contents))

        self._stats.add_compressed_databytes (len (compressed_body))

        # All header data should be used up:
        assert len (headers) == 0, \
               "After parsing all text document paragraph's headers, '%s' header info remains" % \
               repr (headers)
        # At the end, all data should be used up:
        assert offset == len (body), \
               "Adding all text document's paragraph's lengths (%d) is not the length of the data (%d)" % \
               (offset, len (body))


    def add_paragraph (self, par):
        """Add a new paragraph to this document.
        It gets appended.  This should be used to incrementally
        contruct a document"""
        par_length = par.get_estimated_length ()
        if par_length + self._last_document_length > Max_Document_Size:
            # a split into a new sub document is needed
            this_url = self._get_part_url (len (self._documents) - 1)
            next_url = self._get_part_url (len (self._documents))
            tmp_para = PluckerTextParagraph (5)
            tmp_para.add_anchor_start ({'href': next_url})
            tmp_para.add_text (Document_Next_Part_Text)
            tmp_para.add_anchor_end ()
            self._documents[-1].append (tmp_para)
            # start new document:
            self._documents.append ([])
            tmp_para = PluckerTextParagraph (0)
            tmp_para.add_anchor_start ({'href': this_url})
            tmp_para.add_text (Document_Previous_Part_Text)
            tmp_para.add_anchor_end ()
            self._last_document_length = tmp_para.get_estimated_length ()
            self._documents[-1].append (tmp_para)

        self._documents[-1].append (par)
        self._last_document_length = self._last_document_length + par_length


    def get_name_map (self):
        """ Returns the map that states what paragraph numer a named anchor
        is located in.
        Should be useful to build a resolver"""
        result = {}

        part_no = 0
        for doc in self._documents:
            url = self._get_part_url (part_no)
            part_no = part_no + 1
            for par_id in range (len (doc)):
                par = doc[par_id]
                for name in par.get_names ():
                    result[name] = (url, par_id)
        return result
    

    def get_external_references (self):
        hrefs = []
        images = []
        for doc in self._documents:
            for par in doc:
                (h, i) = par.get_external_references ()
                hrefs = hrefs + h
                images = images + i

        ## the urls should already be absolute!
        # Now clean up the urls, to make them absolut
        #for idx in range (len (hrefs)):
        #    (origurl, dict) = hrefs[idx]
        #    url = urlparse.urljoin (self._url, origurl)
        #    assert url[:17] != "plucker:/~parts~/", "Joing from %s and %s =>  %s" % (self._url, repr (origurl), repr (url))
        #    dict['href'] = url
        #    hrefs[idx] = (url, dict)
        #for idx in range (len (images)):
        #    (url, dict) = images[idx]
        #    url = urlparse.urljoin (self._url, url)
        #    dict['src'] = url
        #    images[idx] = (url, dict)
        return (hrefs, images)


    def get_stats (self):
        return self._stats



class PluckerImageDocument (PluckerDocument):
    """A class to contain information about one Plucker Image document (BMP image)."""

    def __init__ (self, url, config=None):
        PluckerDocument.__init__ (self, url)
        self._data = ""
        self._stats = PluckerTextDocumentStatistics ()
        self._config = config


    def is_image_document (self):
        return 1


    def set_data(self, data):
        """Set the BMP data directly.
        Useful for building a new document"""
        self._data=data


    def print_summary (self, prefix):
        """Give some information about this bitmap."""
        assert len (self._data) > 16, "Image data is not a bitmap!"
        bitmap_header = self._data[:16]
        bitmap_data = self._data[16:]
        (width, height, row_bytes, flags, pixel_size, version, \
         next_depth_offset, reserved1, reserved2) = \
                struct.unpack (">HHHHBBHHH", bitmap_header)
        flags_text="%d = 0x%x" % (flags, flags)
        if flags:
            flags_text = flags_text + " ("
            if flags & 1:
                flags_text = flags_text = "compressed  "
            if flags & 2:
                flags_text = flags_text = "has_color_table"
            flags_text = flags_text + ")"
            
        print "%sBitmap: %d x %d" % (prefix, width, height)
        print "%s        flags: %s" % (prefix, flags_text)
        print "%s        version %d; %d bits per pixel,  %d bytes per row" % \
              (prefix, version, pixel_size, row_bytes)
        print "%s        next depth offset: %d" % (prefix, next_depth_offset)
        print "%s        reserved values: %d, %d" % (prefix, reserved1, reserved2)
        if not (flags & 1):
            should_length = height * row_bytes
            is_length = len (bitmap_data)
            if is_length == should_length:
                print "%s        %d bytes of uncompresses image data" % (prefix, is_length)
            else:
                print "%s        %d bytes of uncompresses image data expected but has %d bytes!!!" % \
                      (prefix, should_length, is_length)
                

        

    def dump_record (self, id):
        """(Re-)Assemble the binary representation of this image document"""
        assert _DOC_HEADER_SIZE==8

        if len (self._data) > self._config.get_int ('image_compression_limit', 0):
            compressed_data = CompressFunction (self._data)
            if len (compressed_data) < len (self._data):
                data = compressed_data
                type = DOCTYPE_IMAGE_COMPRESSED
            else:
                data = self._data
                type = DOCTYPE_IMAGE
        else:
            data = self._data
            type = DOCTYPE_IMAGE

        header = struct.pack (">HHHH",
                              id,		# uid
                              0,		# header size
                              len (self._data),	# size
                              type)		# content type

        return header + data



    def undump_record (self, data, verbose=0):
        """Given the contents of an image cache file (as
        e.g. generated by the awk parser), dissasseble the contents
        and show it to the user"""

        assert len (data) > _DOC_HEADER_SIZE
        header_data = data[:_DOC_HEADER_SIZE]
        (uid, paragraphs, data_size, content_type) = struct.unpack (">HHHH", header_data)
        if verbose:
            print "Documente header:\n" \
                  "\tdoc id: %d\n" \
                  "\tnumber of paragraphs: %d\n" \
                  "\tdata size: %d" \
                  "\ttype: %d" % \
                  (uid, paragraphs, data_size, content_type)
        
        if content_type == DOCTYPE_IMAGE_COMPRESSED:
            compressed_data = data[_DOC_HEADER_SIZE:]
            rest_data = UncompressFunction (compressed_data)
        else:
            rest_data = data[_DOC_HEADER_SIZE:]
            compressed_data = rest_data

        ## Do sanity check on header values
        assert paragraphs == 0, \
               "Number of paragraphs for image documents must be %d but is %d" % (0, header_size)
        assert data_size == len (rest_data),\
               "Image document's header says length is %d but really it is %d" % \
               (data_size, len (rest_data))
        assert content_type==DOCTYPE_IMAGE or content_type == DOCTYPE_IMAGE_COMPRESSED, \
               "Content type must be %d or %d but is %d " % (DOCTYPE_IMAGE, DOCTYPE_IMAGE_COMPRESSED, content_type)

        # self._stats.add_headerbytes (_DOC_HEADER_SIZE)

        self._data = rest_data
        self._stats.add_graphicsbytes (len (rest_data))
        self._stats.add_compressed_graphicsbytes (len (compressed_data))
        self._stats.add_images (1)

        if verbose:
            self.print_summary (prefix="\t")


    def get_stats (self):
        return self._stats




class PluckerMailtoDocument (PluckerDocument):
    """A class to contain information about one Plucker Mailto document."""

    def __init__ (self, url):
        PluckerDocument.__init__ (self, url)
        self._data = ""
        self._stats = PluckerTextDocumentStatistics ()


    def is_mailto_document (self):
        return 1

    def parse (self, url):
        to_offset = cc_offset = subject_offset = body_offset = 0
        total = 8
        result = []

        data = str (url)
        data = string.replace (data, "\r", "")
        data = string.replace (data, "\n", "")
        data = string.replace (data, "?", "\n")
        data = string.replace (data, "&amp", "\n")
        data = string.replace (data, "&", "\n")
        data = string.replace (data, "=", ":")
        data = data + "\n"

        file = StringIO.StringIO (data)

        m = Message(file)
        to = m.getrawheader("mailto")
        if (len (to) < 3):
            to = m.getrawheader("to")
        cc = m.getrawheader("cc")
        subject = m.getrawheader("subject")
        body = m.getrawheader("body")
        file.close()

        if to is not None:
            to_offset = 8
            total = total + len (to)
        if cc is not None:
            cc_offset = total
            total = total + len (cc)
        if subject is not None:
            subject_offset = total
            total = total + len (subject)
        if body is not None:
            body_offset = total
            total = total + len (body)


        result.append (struct.pack (">HHHH",
                        to_offset, cc_offset, subject_offset, body_offset))
        if to is not None:
            result.append (to)
        if cc is not None:
            result.append (cc)
        if subject is not None:
            result.append (subject)
        if body is not None:
            result.append (body)

        return result, total


    def dump_record (self, id):
        """(Re-)Assemble the binary representation of this mailto document"""
        (data, len) = self.parse (self._url)
        type = DOCTYPE_MAILTO
        header = struct.pack (">HHHH",
                              id,	# uid
                              0,	# number of paragraphs
                              0,	# size -- unused / no needed?
                              type)	# content type

        data = string.join (data, "")
        data = string.replace(data, "\n", "\000")
        return header + data


    def undump_record (self, data, verbose=0):
        """Given the contents of an mailto cache file (as
        e.g. generated by the awk parser), dissasseble the contents
        and show it to the user"""

        assert len (data) > _DOC_HEADER_SIZE
        header_data = data[:_DOC_HEADER_SIZE]
        (uid, paragraphs, data_size, content_type) = struct.unpack (">HHHH", header_data)
        if verbose:
            print "Documente header:\n" \
                  "\tdoc id: %d\n" \
                  "\tNumber of paragraphs: %d\n" \
                  "\tdata size: %d" \
                  "\ttype: %d" % \
                  (uid, paragraphs, data_size, content_type)
        
        rest_data = data[_DOC_HEADER_SIZE:]

        ## Do sanity check on header values
        assert paragraphs == 0, \
               "Number of paragraphs for mailto documents must be %d but is %d" % (0, header_size)
        assert data_size == len (rest_data),\
               "Mailto document's header says length is %d but really it is %d" % \
               (data_size, len (rest_data))
        off_data = rest_data[:8]
        (to_offset, cc_offset, subject_offset, body_offset) = struct.unpack (">HHHH", off_data)
        print "Mailto offsets: \n" \
            "\tto      %d\n" \
            "\tcc      %d\n" \
            "\tsubject %d\n" \
            "\tbody    %d" % \
            (to_offset, cc_offset, subject_offset, body_offset)
        print "Data: \n" \
            "\tto      %s\n" \
            "\tcc      %s\n" \
            "\tsubject %s\n" \
            "\tbody    %s\n" % \
            (rest_data[to_offset:cc_offset], rest_data[cc_offset:subject_offset], \
             rest_data[subject_offset:body_offset], rest_data[body_offset:])




class PluckerSpecialDocument(PluckerDocument):
    """Common base for the remaining ("special") kinds of Plucker
    documents, e.g. the index, category, metadata and link documents
    defined below."""

    def __init__(self, url):
        # nothing of our own to set up, just hand the url on
        PluckerDocument.__init__(self, url)

    def is_special_document(self):
        """Return 1 to mark this as a special document"""
        return 1


class PluckerIndexDocument (PluckerSpecialDocument):
    """A class to contain information about the DB (record 0)"""

    # Names used in the table of reserved records
    RSVD_REC_NAME_HOME = 0
    RSVD_REC_NAME_EXT_BOOKMARKS = 1
    RSVD_REC_NAME_URLS = 2
    RSVD_REC_NAME_CATEGORIES = 3
    RSVD_REC_NAME_METADATA = 4

    def __init__ (self, url, config, has_metadata):
        PluckerSpecialDocument.__init__ (self, url)
        self._config = config
        self._has_metadata = has_metadata

    def dump_record (self, record_id = 1):
        """(Re-)Assemble the binary representation of this document.

        The result is the uid, the compression type of the database
        and the number of reserved records, followed by one
        (name, record number) pair per reserved record."""
        config = self._config

        if config.get_bool ('zlib_compression', 0):
            compression = DBTYPE_ZLIB
        else:
            compression = DBTYPE_DOC

        # There always is a home document and it is record 2; the
        # other reserved records are optional but have fixed numbers
        entries = [(self.RSVD_REC_NAME_HOME, 2)]
        if not config.get_bool ('no_url_info', 0):
            entries.append ((self.RSVD_REC_NAME_URLS, 3))
        if config.get_string ('category'):
            entries.append ((self.RSVD_REC_NAME_CATEGORIES, 4))
        if self._has_metadata:
            entries.append ((self.RSVD_REC_NAME_METADATA, 5))

        packed_entries = map (lambda entry: struct.pack (">HH", entry[0], entry[1]),
                              entries)
        head = struct.pack (">HHH", record_id, compression, len (entries))

        return head + string.join (packed_entries, "")


class PluckerCategoryDocument (PluckerSpecialDocument):
    """A class to contain information about the default category"""

    def __init__ (self, url, config):
        PluckerSpecialDocument.__init__ (self, url)
        self._config = config

    def dump_record (self, record_id = 4):
        """(Re-)Assemble the binary representation of this document.

        The ';' separated 'category' config string is shipped as a
        sequence of NUL terminated names."""
        names = string.split (self._config.get_string ('category'), ";")
        # NUL between the names plus one more to terminate the last
        payload = string.join (names, "\0") + '\0'

        # uid, number of paragraphs (none), size, content type
        header = struct.pack (">HHHH", record_id, 0, len (payload), DOCTYPE_CATEGORY)

        return header + payload



class PluckerMetadataDocument (PluckerSpecialDocument):
    """A class to contain information about extra metadata like charsets"""
    def __init__ (self, url, info):
        PluckerSpecialDocument.__init__(self, url)
        self._info = info

    TYPECODE_CHARSET = 1
    TYPECODE_EXCEPTIONAL_CHARSETS = 2

    def dump_record (self, record_id = 5):
        """(Re-)Assemble the binary representation of this document"""

	subrecords = []
	for key in self._info.keys():
	    if key == 'CharSet':
		subrecords.append(struct.pack(">HHH",
					      self.TYPECODE_CHARSET,
					      1,
					      self._info[key]))
	    elif key == 'ExceptionalCharSets':
		subrecords.append(reduce(lambda x,y: x + y,
					 map(lambda v: struct.pack('>HH', v[0], v[1]),
					     self._info[key]),
					 struct.pack('>HH',
						     self.TYPECODE_EXCEPTIONAL_CHARSETS,
						     2 * len(self._info[key]))))
				  
	    else:
		raise ValueError, "Unknown metadata key " + key

        header = struct.pack (">HHHHH", 
                              record_id,	    # uid
                              0,		    # number of paragraphs
			      reduce(lambda x, y: x + len(y), subrecords, 2),	# size in bytes
                              DOCTYPE_METADATA,     # record type
                              len(subrecords))      # number of subrecords		

	return reduce(lambda x, y: x + y, subrecords, header)


class PluckerLinkIndexDocument (PluckerSpecialDocument):
    """A class to contain information about the link documents (record 2)"""

    def __init__ (self, url, links = None, max_idx = 0):
        PluckerSpecialDocument.__init__ (self, url)
        # no links given means an empty payload
        if links is None:
            links = ""
        self._links = links
        self._max_idx = max_idx


    def dump_record (self, id = 3):
        """(Re-)Assemble the binary representation of this document:
        the document header followed by the links data as is."""
        payload = self._links
        # uid, number of paragraphs (none), size, content type
        header = struct.pack (">HHHH", id, 0, len (payload), DOCTYPE_LINK_INDEX)

        return header + payload



class PluckerLinksDocument (PluckerSpecialDocument):
    """A class to contain information about the links"""

    def __init__ (self, url, links = None, max_idx = 0):
        PluckerSpecialDocument.__init__ (self, url)
        # no links given means an empty payload
        if links is None:
            links = ""
        self._links = links
        self._max_idx = max_idx


    def dump_record (self, id):
        """(Re-)Assemble the binary representation of this document.

        The links are shipped compressed unless that would make them
        bigger; the size field of the header always holds the
        uncompressed length."""
        raw = self._links
        packed = CompressFunction (raw)

        if len (packed) <= len (raw):
            payload = packed
            content_type = DOCTYPE_LINKS_COMPRESSED
        else:
            payload = raw
            content_type = DOCTYPE_LINKS

        # uid, number of paragraphs (none), size, content type
        header = struct.pack (">HHHH", id, 0, len (raw), content_type)

        return header + payload



def Undump_PluckerDocument (url, data, verbose=0):
    """Given 'data' try to find out what sort of document that is and
    unparse it as such.  Returns a PluckerDocument."""

    pluckerdoc = None
    
    if len (data) > _DOC_HEADER_SIZE:
        header_data = data[:_DOC_HEADER_SIZE]
        (uid, paragraphs, size, content_type) = struct.unpack (">HHHH", header_data)
        # We assign some dummy values to pass the test below
        vert_offset=0
        first_visible=_DOC_HEADER_SIZE
        first_paraY=0
        last_visible=0
        last_paraY=0 
        height=0
        if content_type==DOCTYPE_HTML or content_type==DOCTYPE_HTML_COMPRESSED:
            # aha, a text document
            pluckerdoc = PluckerTextDocument (url)
            pluckerdoc.undump_record (data, verbose)
        elif content_type==DOCTYPE_IMAGE or content_type==DOCTYPE_IMAGE_COMPRESSED:
            # aha, an image document
            pluckerdoc = PluckerImageDocument (url)
            pluckerdoc.undump_record (data, verbose)
        elif content_type==DOCTYPE_MAILTO:
            # aha, a mailto document
            pluckerdoc = PluckerMailtoDocument (url)
            pluckerdoc.undump_record (data, verbose)

    if not pluckerdoc:
        # nothing worked, i.e. unknown data
        raise ValueError, "Unknown Plucker data %s" % repr(data)
    return pluckerdoc





        
if __name__ == '__main__':
    # This gets executed if this file is called as a script
    import PyPlucker
    import getopt
    import os
    import sys
    
    def usage(reason=None):
        if reason is not None:
            print reason
        print "Usage: %s [-h] [-v] [-s] [-S] [-d] [-z] <filename> ..." % sys.argv[0]
        print "  Parses the plucker cache file(s) and verifies the contents"
        print "   -h : display usage information and exit"
        print "   -v : output version information and exit"
        print "   -s : show statics of each document"
        print "   -S : show summary statistics of all documents"
        print "   -d : show disassembly information of each document"
        print "   -z : use ZLib to uncompress the documents"

        if reason is not None:
            sys.exit (1)
	else:
            sys.exit (0)

    # the option dictionary will be used to hold flags if that option is set
    # it gets initialized to all false
    option = {}
    for letter in string.lowercase + string.uppercase:
        option[letter] = None
        
    (optlist, args) = getopt.getopt(sys.argv[1:], "hvsSdz")
    for (k,v) in optlist:
        # k is of the form e.g. '-s', so we pick the second char
        if v == "":
            v = 1
        option[k[1]] = v

    _DOC_HEADER_SIZE = 8
    _PARA_HEADER_SIZE = 4

    if option['h']:
        usage ()

    if option['v']:
        print "$Revision: 1.27 $"
        sys.exit(0)

    if not args:
        usage ("Please specify cache file(s) to parse")

    if option['z']:
        UseZLibCompression ()

    all_stats = PluckerTextDocumentStatistics ()
    num_files = 0
    for filename in args:
        if option['d'] or option['s']:
            print "\nProcessing %s..." % filename
        try:
            file = open(filename, "rb")
            text = string.join (file.readlines (), "")
            file.close()
        except IOError, text:
            print "Error: %s" % text
            text = ""

        if text:
            # non-empty file
            try:
                plucker = Undump_PluckerDocument ("file:/"+filename,
                                                  text,
                                                  verbose=option['d'])

                if not option['d'] and option['s'] and plucker.is_special_document ():
                        print "Not a document file"
                if plucker.is_text_document () or plucker.is_image_document ():
                    stats = plucker.get_stats ()
                    num_files = num_files + 1
                    all_stats.combine_with (stats)
                    if option['s']:
                        stats.pretty_print (prefix_string = "")
                    
            except AssertionError, text:
                print "!!! Parsing %s failed with an assertion error: %s" % (filename, text)
                # usually text is empty, so we write the traceback
                import traceback
                traceback.print_exc ()
            except ValueError, text:
                print "!!! Parsing %s failed. %s" % (filename, text)
                import traceback
                traceback.print_exc ()
                
    if option['S']:
        print "\n\n%d files processed:" % num_files
        all_stats.pretty_print (prefix_string="")
