#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# urlwatch is a minimalistic URL watcher written in Python
#
# Copyright (c) 2008-2010 Thomas Perl <thp@thpinfo.com>
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

"""Watch web pages and arbitrary URLs for changes"""

# Package name; reused for the User-Agent string, the logger name and
# the per-user data directory.
pkgname = 'urlwatch'

# Module metadata
__version__ = '1.11'
__license__ = 'BSD'
__author__ = 'Thomas Perl <thp@thpinfo.com>'
__copyright__ = 'Copyright 2008-2010 Thomas Perl'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'

# Value sent as the "User-agent" header with every request
user_agent = (
    '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)'
    % (pkgname, __version__)
)

# Configuration section

# Width (in characters) of the separator lines in the report
line_length = 75

# Whether retrieval errors are included in the report; can be switched
# on from the command line with -e/--display-errors
display_errors = False


# File and folder paths
import sys
import os.path

# Per-user data directory (~/.urlwatch) and the files/folders inside it
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Split the directory containing the running script into its parent
# ("prefix") and its own name ("bindir") to detect how we were started
prefix, bindir = os.path.split(
    os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Not started from a "bin" folder: assume we run from the source
    # tree, where examples/ and lib/ sit next to the script
    examples_dir = os.path.join(prefix, bindir, 'examples')
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
else:
    # Started from <prefix>/bin: assume a system-wide installation
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')

# Example files that get copied for the user when urls.txt is missing
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')

# Code section

import shutil
import os
import stat
import urllib2
import httplib
import email.Utils
import time
import socket
import difflib
import datetime
import optparse
import logging
import imp

from urlwatch import handler

class NullHandler(logging.Handler):
    """Logging handler that silently drops every record."""

    def emit(self, record):
        """Ignore the given log record."""
        return None


# Abort any network request that takes longer than one minute
# (=60 seconds), so a single unresponsive server cannot hang the run
socket.setdefaulttimeout(60)

# Package logger; it accepts all levels, but only the do-nothing handler
# is attached here, so nothing is emitted until another handler is added
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)
log.addHandler(NullHandler())

def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the report lines for one message

    type    -- kind of message (e.g. 'changed'); shown in upper case
    url     -- the item the message is about (converted with str())
    content -- optional, possibly multi-line, body of the message
    summary -- optional list; when given, a one-line description of
               this message is appended to it (including the size of
               the content, if there is any)
    c, n    -- character and width used for the separator lines

    Returns a list of strings, one item per output line (the content,
    if present, is a single item even when it spans several lines).
    """
    rule = c * n
    headline = ': '.join((type.upper(), str(url)))
    has_content = content is not None

    if summary is not None:
        entry = headline
        if has_content:
            entry = '%s (%d bytes)' % (headline, len(str(content)))
        summary.append(entry)

    lines = [rule, headline]
    if has_content:
        lines.extend([rule, str(content)])
    lines.extend([rule, '', ''])

    return lines


if __name__ == '__main__':
    # Script entry point: parse the command line, make sure the per-user
    # files exist, run every job listed in urls.txt, compare the result
    # with the cached copy and print a report of everything that changed.
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    # parse_args() reads sys.argv[1:] by default; passing sys.argv itself
    # would turn the program name into a positional argument.
    (options, args) = parser.parse_args()

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    # --urls and --hooks override the default locations, but only if
    # they point to an existing file
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # The example files are placed next to where the real files are
        # expected, keeping their ".example" file names
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print 'Error: You need to create a urls.txt file first.'
        print ''
        print 'Place it in %s' % (urls_txt)
        print 'An example is available in %s' % (urls_txt_fn)
        print ''
        if not options.hooks:
            print 'You can also create %s' % (hooks_py)
            print 'An example is available in %s' % (hooks_py_fn)
            print ''
        # Never overwrite an example file that is already there
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    headers = {
            'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    # Load the optional user-supplied filter hook; without one, the
    # retrieved data is passed through unmodified
    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
            filter = lambda x, y: y
    else:
        log.info('not using hooks.py (file not found)')
        filter = lambda x, y: y

    for job in handler.parse_urls_txt(urls_txt):
        log.info('processing job: %s' % job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        try:
            # The modification time of the cached copy (if any) is handed
            # to the job together with the filter and the request headers
            if os.path.exists(filename):
                st = os.stat(filename)
                timestamp = st[stat.ST_MTIME]
            else:
                timestamp = None

            # Retrieve the data
            data = job.retrieve(timestamp, filter, headers)

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                # Close the cache file explicitly instead of relying on
                # garbage collection to release the handle
                cache_file = open(filename)
                try:
                    old_data = cache_file.read()
                finally:
                    cache_file.close()
                timestamp_old = email.Utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.Utils.formatdate(time.time(), localtime=1)
                diff = ''.join(difflib.unified_diff(
                        old_data.splitlines(1),
                        data.splitlines(1),
                        '@',
                        '@',
                        timestamp_old,
                        timestamp_new))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)
            log.info('writing current content of %s to %s' % (job, filename))
            cache_file = open(filename, 'w')
            try:
                cache_file.write(data)
            finally:
                cache_file.close()
        except urllib2.HTTPError, error:
            if error.code == 304:
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError, error:
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout, error:
            # Must be checked before IOError: since Python 2.6,
            # socket.error (the base class of socket.timeout) derives
            # from IOError, which would otherwise shadow this handler
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError, error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error, error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        # Every job is counted, whether or not it could be retrieved
        count += 1

    end = datetime.datetime.now()
    # timedelta.seconds alone leaves out whole days, so add them back
    elapsed = end - start
    elapsed_seconds = elapsed.days * 86400 + elapsed.seconds

    # Output everything
    # The summary is only printed when it has more than one entry
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')
    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, elapsed_seconds)
    else:
        log.info('no details collected - not printing')

