#!/usr/bin/env python

"""
Writer.py   $Id: Writer.py,v 1.22 2001/10/18 00:54:03 janssen Exp $

Write a collection of documents into various formats.

Currently implemented: the traditional Plucker cache format (CacheWriter),
a ready-to-sync PDB file (PDBWriter) and an in-memory dictionary (DictWriter).


Copyright 1999,2000 by Holger Duerer <holly@starship.python.net>

Distributable under the GNU General Public License Version 2 or newer.
"""


import os, struct, string, time, helper.PQAAppInfo, sys
import PyPlucker
from PyPlucker import Url, PluckerDocs, PluckerLinks
from PyPlucker.helper import prc, dict
from PyPlucker.helper.CharsetMapping import charset_mibenum_to_name


class SimpleMapping:
    """Bookkeeping of which url has been assigned which numeric record id.

    All state lives in one backing dictionary ('a_dict' or a fresh one):
    the key 'next' holds the next id to hand out, and the url -> id
    entries are reached through dict.DictCompartment (backing_dict, "url")
    (presumably a view on a "url" sub-namespace of the backing dictionary;
    see PyPlucker.helper.dict to confirm).

    A fresh mapping starts handing out ids at 11, and 'plucker:/home.html'
    is always registered with id 2.
    """

    def __init__ (self, fragment_finder, a_dict=None, alias_list=None):
        """Set up the mapping.

        fragment_finder -- callable (url, fragment) used to look up
                           documents and named anchors; see get_image_id()
                           and get_or_add() for how its result is used.
        a_dict          -- optional existing backing dictionary to continue
                           from; a new empty one is used when None.
        alias_list      -- optional object whose get (url) returns the
                           canonical url for 'url'; may be None.
        """
        if a_dict is not None:
            self._dict = a_dict
        else:
            self._dict = {}
        self._alias_list = alias_list
        self._mapping = dict.DictCompartment (self._dict, "url")
        if not self._dict.has_key ('next'):
            # Fresh backing dictionary: the first automatically assigned
            # id will be 11.
            self._set_max_id (10)

        self._fragment_finder = fragment_finder

        if not self.has_key ('plucker:/home.html'):
            self.add ('plucker:/home.html', 2)


    def copy_to_dict (self, dest_dict):
        """Copy the backing dictionary into 'dest_dict' (via dict.copy_dict)."""
        dict.copy_dict (self._dict, dest_dict)


    def get_max_id (self):
        """Return the highest id accounted for so far."""
        return self._dict['next'] - 1


    def _set_max_id (self, val):
        """Record 'val' as the highest id in use."""
        self._dict['next'] = val + 1


    def get_mapping (self):
        """Return the underlying url -> id mapping object."""
        return self._mapping


    def del_mapping (self, url):
        """Forget the id assigned to 'url' (no alias or fragment handling)."""
        del self._mapping[url]


    def get_urls (self):
        """Return all urls that currently have an id."""
        return self._mapping.keys ()


    def has_key (self, url):
        """Tell whether 'url' (after alias lookup, fragment ignored) has an id."""
        if self._alias_list:
            url = self._alias_list.get (url)
        # strip the fragment.
        # FIXME: David says this is not necessary.  Check later...
        url = Url.URL (url)
        url.remove_fragment ()
        url = str (url)
        return self._mapping.has_key (url)


    def get (self, url):
        """Return the id of 'url' (after alias lookup).

        Unlike has_key() the fragment is not stripped here.  An unknown
        url raises whatever the underlying mapping raises on a failed
        lookup.
        """
        if self._alias_list:
            url = self._alias_list.get (url)
        return self._mapping[url]


    def equals (self, other):
        """Check if this mapping equals the other.

        Returns 1 when both mappings know exactly the same urls with the
        same ids, 0 otherwise.
        """
        for url in self.get_urls():
            if not other.has_key (url) or other.get (url) != self.get (url):
                return 0
        for url in other.get_urls():
            if not self.has_key (url) or other.get (url) != self.get (url):
                return 0
        return 1


    def _add (self, url, id=None):
        """Assign 'id' (or the next free id when None) to 'url' verbatim.

        Raises AssertionError if 'url' already has a different id; note
        that the id counter has already been advanced at that point.
        Returns the id.
        """
        if id is None:
            id = self.get_max_id () + 1
            self._set_max_id (id)
        else:
            if id > self.get_max_id():
                self._set_max_id (id)
        assert not self._mapping.has_key (url) or self._mapping[url] == id, \
               "Mapping::_add(): %s is already assigned %s but is now to be assigned %s" % \
               (url, self._mapping[url], id)
        self._mapping[url] = id
        return id


    def add (self, orig_url, id=None):
        """Assign an id to 'orig_url' and return it.

        The fragment is stripped first, then the alias list (if any) is
        consulted.  See _add() for the meaning of 'id' and the possible
        AssertionError.
        """
        url = Url.URL (orig_url)
        url.remove_fragment ()
        url = str (url)

        if self._alias_list:
            url = self._alias_list.get (url)

        return self._add (url, id)


    def get_image_id (self, url):
        """Get the id of the image document 'url' or None if the url
        is not to be found by the callback function (i.e. not in the
        set of documents collected)."""

        if self._alias_list:
            url = self._alias_list.get (url)

        theurl = Url.URL (url)
        theurl.remove_fragment ()
        theurl = str (theurl)
        if self._mapping.has_key (theurl):
            return self._mapping[theurl]

        # Not mapped yet: only hand out an id if the document exists.
        document = self._fragment_finder (theurl, fragment=None)
        if document is None:
            return None
        return self.get_or_add (url)


    def get_or_add (self, url):
        """Return the id of this url (generates a new one if not yet in the map).

        If 'url' contains a fragment that the fragment finder can
        resolve, a tuple (id, paragraph_number) is returned instead,
        where id belongs to the document that actually holds the
        fragment.
        """

        theurl = Url.URL (url)
        fragment = theurl.get_fragment ()

        if self._alias_list:
            theurl = self._alias_list.get (url)
        else:
            theurl = theurl.as_string (with_fragment=0)

        if self._mapping.has_key (theurl):
            id = self._mapping[theurl]
        else:
            try:
                id = self._add (theurl)
            except AssertionError:
                print ("Assertion error:")
                print ("Url was: %s, now is %s, but failed" % (repr (url), repr (theurl)))
                raise

        if fragment and self._fragment_finder:
            (fragments_url, fragment_id) = self._fragment_finder (theurl, fragment)
            if fragment_id:
                # Fragments may actually be located in a different
                # plucker document.  Therefore we need to
                # re-calculate the id for the actual url.
                # Here we rely upon the fact that fragments_url does
                # not contain a fragment; otherwise we might end up in
                # an infinite loop.
                id = (self.get_or_add (fragments_url), fragment_id)
        return id


    def resolver_function (self, dict, as_image):
        """Callback that turns a tag's attribute dictionary into an id.

        dict     -- attribute dictionary; 'src' is read for images,
                    'href' for documents.
        as_image -- true if the reference is an inline image.

        Returns what get_image_id() / get_or_add() return, or None if
        the relevant attribute is missing.
        """
        if as_image:
            if dict.has_key ('src'):
                return self.get_image_id (dict['src'])
            # No src given
            return None

        # a document, not an inline image
        if dict.has_key ('href'):
            return self.get_or_add (dict['href'])
        return None


    def make_binary_representation (self):
        """Serialize the mapping into a (version 2) binary string.

        Layout: 2-byte big-endian version number, followed by the
        PluckerDocs.DocCompressData()-compressed payload.  The payload is
        a 2-byte big-endian id counter followed by one entry per url:
        2-byte id, 2-byte url length, url.
        """
        res = []
        res.append (struct.pack (">H", self.get_max_id()+1))
        for url in self._mapping.keys ():
            res.append (struct.pack (">HH", self._mapping[url], len (url)))
            res.append (url)
        res = string.join (res, "")
        compressed = PluckerDocs.DocCompressData (res)
        version = struct.pack (">H", 2)
        return version + compressed


    def _unpack_entries (self, next_id, data):
        """Merge the entries of an already decoded payload into this mapping.

        next_id -- the id counter read from the payload header.
        data    -- the remaining payload: a sequence of
                   (2-byte id, 2-byte length, url) entries.
        """
        # The stored counter is taken over as the new maximum id, exactly
        # as the original code did.
        if next_id > self.get_max_id():
            self._set_max_id (next_id)
        # The entries are loaded regardless of how the stored counter
        # compares to ours.
        while data:
            (id, length) = struct.unpack (">HH", data[:4])
            url = data[4:4+length]
            data = data[4+length:]
            self.add (url, id)


    def unpack_binary_representation (self, bin):
        """Load the entries of a serialized mapping into this one.

        Understands version 1 (uncompressed payload) and version 2
        (payload compressed, see make_binary_representation()).

        Raises RuntimeError ("Version mismatch") for any other version,
        and AssertionError (from add()) if an entry conflicts with an id
        already assigned here.
        """
        (version, ) = struct.unpack (">H", bin[:2])
        if version == 1:
            payload = bin[2:]
        elif version == 2:
            payload = PluckerDocs.DocUncompressData (bin[2:])
        else:
            raise RuntimeError ("Version mismatch")

        (next_id, ) = struct.unpack (">H", payload[:2])
        self._unpack_entries (next_id, payload[2:])




def document_resolver (theurl, fragment, collection):
    """Look up 'theurl' (and optionally a named 'fragment') in 'collection'.

    With fragment None this is a pure existence check: the document
    stored under 'theurl' is returned, or None if there is none.

    With a fragment, the entry stored for it in the name map of the
    (text) document is returned; if the url is unknown, the document is
    not a text document, or the fragment is not in its name map, the
    result is (None, None).
    """
    existence_only = fragment is None

    if collection.has_key (theurl):
        doc = collection[theurl]
        if existence_only:
            return doc
        if doc.is_text_document ():
            names = doc.get_name_map ()
            if names.has_key (fragment):
                return names[fragment]

    # Nothing found: the shape of the failure value mirrors the request.
    if existence_only:
        return None
    return (None, None)


class _Resolver:
    """Callable wrapper that binds a collection to document_resolver()."""

    def __init__ (self, collection):
        """Keep a reference to the 'collection' to search later."""
        self._collection = collection


    def __call__ (self, url, fragment):
        """Resolve 'url' / 'fragment' against the bound collection."""
        bound_collection = self._collection
        return document_resolver (url, fragment, bound_collection)

    

def make_document_resolver (collection):
    """Return a callable (url, fragment) that resolves against 'collection'."""
    return _Resolver (collection)




class Writer:
    """Abstract base class from which to derive the various writers
    for documents.

    write() converts every document of the collection into binary
    records, adds the special records (index, link lists, category,
    metadata) and hands each record to save_data(), which derived
    classes must implement.
    """

    def __init__ (self, collection, config):
        """Remember the 'collection' (indexed by url, must offer keys())
        and the 'config' object that is consulted while writing."""
        self._collection = collection
        self._config = config


    def _fragment_resolver (self, theurl, fragment):
        """Fragment finder passed to SimpleMapping: resolves 'theurl' and
        'fragment' against this writer's collection."""
        return document_resolver (theurl, fragment, self._collection)


    def save_data (self, data, url, id, verbose):
        """This needs to be implemented in the derived class to
        actually output the 'data' (human readably denoted as
        'url') as something with id 'id'."""
        raise NotImplementedError ("PyPlucker.Writer.Writer.save_data()")


    def _shorten_url (self, url):
        """Return str (url), abbreviated in the middle if longer than 60
        characters, for progress messages."""
        urltext = str (url)
        if len (urltext) > 60:
            urltext = urltext[:40] + "....." + urltext[-15:]
        return urltext


    def _write_doc (self, out_dict, pluckerdoc, url, id, mapping, verbose):
        """Convert 'pluckerdoc' to its binary record(s) and store them in
        'out_dict' as id -> (data, url, id, verbose).

        A text document may produce several records; a record reported
        with id 0 is the original one and gets 'id'.
        """
        if pluckerdoc.is_text_document ():
            dumps = pluckerdoc.dump_record_with_splits (mapping.resolver_function)
            for (the_url, the_id, dump) in dumps:
                if the_id == 0:
                    the_id = id # original

                out_dict [the_id] = (dump, the_url, the_id, verbose)

                # Text records are reported at any verbosity > 0 ...
                if verbose:
                    print ("Converted %s" % self._shorten_url (the_url))
            return

        out_dict [id] = (pluckerdoc.dump_record (id), url, id, verbose)

        # ... all other records only at verbosity > 1.
        if verbose > 1:
            print ("Converted %s" % self._shorten_url (url))


    def _charset_metadata (self, charsets, verbose):
        """Build the metadata dictionary from 'charsets', a dictionary
        mapping a charset MIBenum (possibly None) to the list of record
        ids of the text documents using it.

        The charset used by the most documents becomes 'CharSet' (unless
        it is None); every document using another charset is listed as
        (id, mibenum or 0) in 'ExceptionalCharSets'.
        """
        def most_pages_first (x, y):
            return cmp (len (y[1]), len (x[1]))

        metadata = {}
        odd_charsets = []
        items = charsets.items ()
        if len (items) > 0:      # have to allow for image-only document
            items.sort (most_pages_first)
            mibenum = items[0][0]
            for (charset, ids) in items[1:]:
                for doc_id in ids:
                    odd_charsets.append ((doc_id, charset or 0,))
        else:
            mibenum = None

        if mibenum is not None:
            metadata['CharSet'] = mibenum
            if verbose > 1:
                charset_name = charset_mibenum_to_name (mibenum)
                print ('Default charset is MIBenum ' + str(mibenum) + ((charset_name and " (" + charset_name + ")") or ""))
        else:
            if verbose > 1:
                print ('No default charset')

        if len (odd_charsets) > 0:
            metadata['ExceptionalCharSets'] = odd_charsets
            if verbose > 1:
                sys.stderr.write("ExceptionalCharSets is " + str(odd_charsets) + "\n")

        return metadata


    def write (self, verbose, mapping=None, alias_list=None):
        """Write out the collection.  Returns the mapping that was
        used to generate the ids.

        verbose    -- verbosity level (0 is silent).
        mapping    -- mapping to take ids from; a new SimpleMapping is
                      created when None.
        alias_list -- optional object whose get (url) gives the
                      canonical url.

        Record ids used for the special records: 1 index, 3 link index,
        4 category, 5 metadata; link lists follow the highest mapped id.

        Raises RuntimeError if no record with id 2 (the 'home' document)
        was produced.
        """

        if mapping is None:
            # Generate our mapping
            mapping = SimpleMapping (self._fragment_resolver, alias_list=alias_list)

        # figure default charset
        # NOTE(review): this configured value is never used; the charset
        # is always derived from the documents (see _charset_metadata).
        # Kept so the configuration lookup still happens as before.
        mibenum = self._config.get_int('default_charset', 0) or None
        charsets = {}

        # Now add ids for all documents in the _collection, so that
        # they get the low numbers
        all_urls = []
        for url in self._collection.keys ():
            all_urls.extend (self._collection[url].get_urls ())

        # the sorting is just for aesthetics...
        all_urls.sort ()
        for url in all_urls:
            if alias_list:
                url = alias_list.get (url)
            mapping.get_or_add (url)

        done_urls = {}
        urls = self._collection.keys ()
        urls.sort ()

        out_dict = {}
        for orig_url in urls:
            if alias_list:
                url = alias_list.get (orig_url)
            else:
                url = orig_url
            if done_urls.has_key (url):
                # Already converted under its canonical url; converting
                # it again would count it twice in the charset tally.
                print ("Ignoring double url %s" % orig_url)
                continue
            done_urls[url] = 1

            id = mapping.get_or_add (url)

            pluckerdoc = self._collection[url]
            if pluckerdoc.is_text_document ():
                pluckerdoc.resolve_ids (mapping.resolver_function)
                doc_mibenum = pluckerdoc.get_charset()
                if verbose > 2:
                    charset_name = charset_mibenum_to_name(doc_mibenum)
                    sys.stderr.write(pluckerdoc.get_url() + ' has charset ' + str(doc_mibenum) + ((charset_name and " (" + charset_name + ")") or "") + "\n")
                if charsets.has_key(doc_mibenum):
                    charsets[doc_mibenum].append(id)
                else:
                    charsets[doc_mibenum] = [id]

            self._write_doc (out_dict, pluckerdoc, url, id, mapping, verbose)

        ## Do some error checking
        if not out_dict.has_key (2):
            raise RuntimeError ("The collection process failed to generate a 'home' document")

        ## set up the metadata mapping, if any
        metadata = self._charset_metadata (charsets, verbose)

        ## write the index record
        tmp_url = "plucker:/~special~/index"
        index_doc = PluckerDocs.PluckerIndexDocument (tmp_url, self._config, metadata)
        self._write_doc (out_dict, index_doc, tmp_url, 1, mapping, verbose)

        ## write the URL information, if desired
        if not self._config.get_bool ('no_url_info', 0):
            links = PluckerLinks.Links (mapping)
            count = links.build_links ()
            tmp_url = "plucker:/~special~/pluckerlinks"
            link_index_doc = PluckerDocs.PluckerLinkIndexDocument (tmp_url, links.return_index())
            self._write_doc (out_dict, link_index_doc, tmp_url, 3, mapping, verbose)
            for i in range (count):
                link_list = links.return_list (i)
                tmp_url = "plucker:/~special~/links%d" % (i * 200 + 1)
                links_doc = PluckerDocs.PluckerLinksDocument (tmp_url, link_list)
                # link list records are numbered after all mapped ids
                links_id = 1 + i + mapping.get_max_id()
                self._write_doc (out_dict, links_doc, tmp_url, links_id, mapping, verbose)

        ## write the category information, if present
        if self._config.get_string ('category') is not None:
            tmp_url = "plucker:/~special~/category"
            category_doc = PluckerDocs.PluckerCategoryDocument (tmp_url, self._config)
            self._write_doc (out_dict, category_doc, tmp_url, 4, mapping, verbose)

        ## write the metadata record, if any
        if metadata:
            tmp_url = "plucker:/~special~/metadata"
            metadata_doc = PluckerDocs.PluckerMetadataDocument (tmp_url, metadata)
            self._write_doc (out_dict, metadata_doc, tmp_url, 5, mapping, verbose)

        ## now write everything else
        the_ids = out_dict.keys ()
        the_ids.sort ()  # they are numeric, so sort does the right thing
        for record_id in the_ids:
            dump, the_url, the_id, rec_verbose = out_dict[record_id]
            self.save_data (dump, the_url, the_id, rec_verbose)
            if rec_verbose:
                print ("Wrote %d <= %s" % (the_id, self._shorten_url (the_url)))

        return mapping



class CacheWriter (Writer):
    """A Writer that writes the traditional format of separate files
    in a cache directory: one file per record, named after the record
    id."""

    def __init__ (self, collection, config, cachedir):
        """'cachedir' is the directory to write into; it may contain
        environment variables and a leading '~'."""
        Writer.__init__ (self, collection, config)
        self._cachedir = cachedir


    def _expanded_cachedir (self):
        """Return the cache directory with environment variables and
        '~' expanded."""
        return os.path.expanduser (os.path.expandvars (self._cachedir))


    def write (self, verbose, alias_list, mapping=None):
        """Remove all plain files from the cache directory and write the
        collection into it.

        Returns the mapping used (see Writer.write), or None -- after
        printing a message -- if the cache directory does not exist or
        is not a directory.
        """
        cachedir = self._expanded_cachedir ()
        if not os.path.exists (cachedir):
            print ("%s does not exists!" % cachedir)
            return
        if not os.path.isdir (cachedir):
            print ("%s is not a directory" % cachedir)
            return

        # clear the cache directory
        for name in os.listdir (cachedir):
            fname = os.path.join (cachedir, name)
            if os.path.isfile (fname):
                os.unlink (fname)

        # Now call the super class to do the actual work
        return Writer.write (self, verbose, mapping=mapping, alias_list=alias_list)


    def save_data (self, data, url, id, verbose):
        """Write 'data' to the file named after 'id' in the cache
        directory."""
        # Use the same expanded directory that write() checked and
        # cleared, not the raw configured string.
        filename = os.path.join (self._expanded_cachedir (), "%d" % id)
        file = open (filename, "wb")
        try:
            file.write (data)
        finally:
            file.close ()


class PDBWriter (Writer):
    """A Writer that writes the items into a ready-to-synch PDB
    file."""

    def __init__ (self, collection, config, name, version, filename):
        Writer.__init__ (self, collection, config)
        self._filename = filename
        self._dbname = name
        self._dbversion = version
        self._pdb_file = None
        self._flag_copy_prevention = config.get_bool ('copyprevention_bit')
        self._flag_launchableData = config.get_bool ('launchable_bit')
        self._flag_backup = config.get_bool ('backup_bit')
        self._icon = config.get_bool ('icon') or config.get_bool('launchable_bit')
        self._big_icon = config.get_string ('big_icon','')
        self._small_icon = config.get_string ('small_icon','')
        self._config = config


    def write (self, verbose, alias_list, mapping=None):
        if os.path.exists (self._filename):
            os.unlink (self._filename)
	if self._filename == '<stdout>':
	    if sys.platform == "win32":
		import msvcrt
		msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
	    self._pdb_file = prc.File (sys.stdout, read=0, write=1)
	else:
	    self._pdb_file = prc.File (self._filename, read=0, write=1)
        info = self._pdb_file.getDBInfo ()
        info['name'] = self._dbname
        info['version'] = self._dbversion
        info['creator'] = 'Plkr'
        info['type'] = 'Data'
        info['createDate'] = int (time.time())
        info['modifyDate'] = info['createDate']
        info['backupDate'] = -2082844800L
        info['flagCopyPrevention'] = self._flag_copy_prevention
        info['flagLaunchableData'] = self._flag_launchableData
        info['flagBackup'] = self._flag_backup
        if self._icon:
            self._pdb_file.setAppBlock( \
                helper.PQAAppInfo.pqa_app_info_block(self._config, \
                                                     self._dbname, \
                                                     self._dbversion, \
                                                     self._big_icon, \
                                                     self._small_icon))
        self._pdb_file.setDBInfo (info)

        # Now call the super class to do the actual work
        result = Writer.write (self, verbose, mapping=mapping, alias_list=alias_list)

        self._pdb_file.close ()
        return result
        

    def save_data (self, data, url, id, verbose):
        assert self._pdb_file is not None, "write_doc called with unintialized pdb file"

        self._pdb_file.setRecord (attr=0, id=id, cat=0, data=data)




class DictWriter (Writer):
    """A Writer that stores every record in a caller-supplied
    dictionary, keyed by the record number."""

    def __init__ (self, collection, config, dict):
        """'dict' is the dictionary that will receive the records."""
        self._dict = dict
        Writer.__init__ (self, collection, config)


    def save_data (self, data, url, id, verbose):
        """Store 'data' under the key 'id'; 'url' and 'verbose' are unused."""
        records = self._dict
        records[id] = data
