# Sitescooper::Main.pm -- main sitescooper logic.

package Sitescooper::Main;

require Exporter;

use Carp;
use File::Find;
use File::Path;
use File::Copy;
use File::Basename;
use Cwd;
use FindBin;

use strict;
use vars qw{
	@ISA @EXPORT $VERSION
	$match_url_dd $match_url_mm $match_url_yyyy $match_url_Mstr
	$match_url_yy
	$MY_OS $COLON $SLASH $use_hashes_for_cache_filenames
	$OUT_TEXT $OUT_DOC $OUT_HTML $OUT_IMAGES
	@MONTHNAMES $MAC_ARGS $DEFAULT_OUTPUT_TEMPLATE

	$__loaded_statics
	$__implemented_add_url_to_cache_method

	$got_intr_behaviour $got_intr_flag
};

#---------------------------------------------------------------------------

# This package inherits from Exporter but exports no symbols by default.
@ISA = qw(Exporter);
@EXPORT= qw();
$VERSION = "3.1.2";
# Return the module's version string.
sub Version { $VERSION; }

#---------------------------------------------------------------------------

# Output style identifiers; one of these is stored in
# $self->{cf}->{output_style}.
$OUT_TEXT = 0;
$OUT_DOC = 1;
$OUT_HTML = 2;
$OUT_IMAGES = 3;

# Abbreviated month names. Index 0 holds a dummy entry ("x") so that
# "Jan" sits at index 1.
@MONTHNAMES = qw(x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

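# Default output template. Each <name>...</name> section below holds the
# markup for one kind of output, and the __UPPER_CASE__ tokens are
# placeholders. Note the iSilo bookmark placeholders
# (__ISILO_BOOKMARK_LINKS__ and __ISILO_HEADLINE_ANCHOR__) in the HTML sections.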
$DEFAULT_OUTPUT_TEMPLATE = q{

<indexpage><html>
<head><title>Sitescooper Scoops</title>
</head>
<body bgcolor=white><h1>Sitescooper Scoops</h1>

<table>
<tr>
<td width=50% valign=top>
__SCOOPS_COL_1__
</td>
<td width=50% valign=top>
__SCOOPS_COL_2__
</td>
</tr>
</table>

<p><hr><font size=1><em>
(Downloaded and converted by sitescooper; see __SITESCOOPER_HOME_URL__ )
</em></font>
</body></html>
</indexpage>

<htmlmainpage><html>
<head><title>__SITE_TITLE__</title>
__ISILO_BOOKMARK_LINKS__
</head>
<body bgcolor=white><h1>__SITE_TITLE__</h1>

__MAIN_BODY__

<p><hr><font size=1><i>
(End of snarf - copyright retained by original providers.
__SITE_RIGHTS__
Downloaded and converted by sitescooper; see __SITESCOOPER_HOME_URL__ )
</i></font>
</body></html>
</htmlmainpage>

<htmlsubpage><html>
<head><title>__SITE_TITLE__</title></head>
<body bgcolor=white>

__SUB_BODY__

</body></html>
</htmlsubpage>

<htmlstory>
<hr><font size=1><i>__SITE_NAME__:
<a HREF_EXTERNAL="__STORY_URL__">__SHORT_URL__</a></i></font><br>

__STORY_ANCHOR__ __ISILO_HEADLINE_ANCHOR__
    [<a href="__STORY_PREV_LINK__">&lt;&lt;</a>]
    [<a href="__STORY_UP_LINK__">^^</a>]
    [<a href="__STORY_NEXT_LINK__">&gt;&gt;</a>]<br>

__STORY_TEXT__
</htmlstory>
<textmainpage>__SITE_TITLE__


__MAIN_BODY__


(End of snarf - copyright retained by original providers.
__SITE_RIGHTS__
Downloaded and converted by sitescooper; see __SITESCOOPER_HOME_URL__ )
</textmainpage>

<textsubpage>__SUB_BODY__</textsubpage>

<textstory>------------
__SITE_NAME__: __STORY_URL__

__BOOKMARK_CHAR__ __HEADLINE__
__STORY_TEXT__
</textstory>

};

#---------------------------------------------------------------------------

# Constructor.  Takes the UI object that is used for usage() reporting
# and returns a fully-initialised Sitescooper::Main object.  May be called
# on either a class name or an existing instance.
sub new {
  my ($proto, $ui) = @_;
  my $class = ref($proto) || $proto;

  my $self = {
    'httpclient_queue'          => { },
    'httpclient_queue_new_idx'  => 0,
  };
  $self->{ui} = $ui;

  # per-object lists, emptied in place
  foreach my $listname ('robots') {
    @{$self->{$listname}} = ();
  }

  $self->{cwd} = getcwd;
  $self->{home_url} = "http://sitescooper.org";

  # the built-in template is always available; a user-supplied one
  # (if any) is stored in output_template later.
  $self->{default_output_template} = $DEFAULT_OUTPUT_TEMPLATE;
  $self->{output_template} = undef;

  foreach my $listname ('profiles', 'profiles_cmdline') {
    @{$self->{$listname}} = ( );
  }
  $self->{userid} = $<;

  # configuration hash; the directory of the running script is the
  # initial value for sitescooperdir.
  $self->{cf} = {
    'sitescooperdir'    => $FindBin::Bin,
    'config'            => undef,
  };

  $self->{scfs} = { };

  bless ($self, $class);

  # the order of these three calls is significant: SetupDefaultConfig
  # reads settings that Init stores.
  $self->LoadStatics();
  $self->Init();
  $self->SetupDefaultConfig();

  return $self;
}

# Set up the package-wide, OS-dependent statics ($COLON, $SLASH and
# $use_hashes_for_cache_filenames) and print the version banner.  The
# work is done only on the first call; later calls return immediately.
sub LoadStatics {
  my $self = shift;

  return if (defined $__loaded_statics);
  $__loaded_statics = 1;         # only once

# Andrew Fletcher <fletch@computer.org>:
# A relative path on Mac seems to need a ":" before it. I've called
# this $COLON. (jm note: currently unused but we may need it again)

  # Columns: $COLON, $SLASH, $use_hashes_for_cache_filenames.
  #
  # Because of the Mac's 32-char filename limitation, we need to include
  # a hash of the URL in cache filenames to avoid clashes. This may be
  # handy for other OSes too, but leave it Mac-only for now.
  my %statics_for_os = (
    'UNIX'      => [ '',  '/',  0 ],
    'Win32'     => [ '',  '\\', 0 ],
    'Mac'       => [ ':', ':',  1 ],
  );

  my $os = MyOS();
  if (defined $os && exists $statics_for_os{$os}) {
    ($COLON, $SLASH, $use_hashes_for_cache_filenames) =
                                @{$statics_for_os{$os}};
  }

  my $banner = "Sitescooper version ".$VERSION
          .", Copyright (c) 1999-2000 Justin Mason\n"
          ."Sitescooper comes with ABSOLUTELY NO WARRANTY; for details\n"
          ."see http://sitescooper.org/doc/gpl.html .\n";
  $self->verbose ($banner);

}

# Reset the run-time settings in $self->{cf} (and a few flags stored
# directly on $self) to their built-in defaults.  The output-format
# defaults depend on the platform reported by MyOS().
sub Init {
  my $self = shift;

  # Settings that are the same on every platform.
  my %settings = (
    'refresh'                     => 0,
    'full_refresh'                => 0,
    'debug'                       => 0,
    'debugdiffs'                  => 0,     # set to 1 to break after diffing

    'testsupportfile'             => undef, # set when functional testing

    'cached_front_page_lifetime'  => 60 / (24*60),
    'dump_output'                 => 0,
    'dumppdb'                     => 0,

    'verbose'                     => 1,
    'nowrite'                     => 0,
    'badcache'                    => 0,
    'grepmode'                    => 0,
    'pdbdir'                      => undef,
    'pilotinstdir'                => undef,
    'pilotinstapp'                => undef,

    'include_isilo_bookmarks'     => 0,
    'turn_big_imgs_to_hrefs'      => 0,
    'maxcolors'                   => 0,

    'converter_args'              => '',

    'fileperpage'                 => 0,
    'nolinkrewrite'               => 0,
    'defaultfilesizelimit'        => 500,   # in Kb
    'filesizelimit'               => undef,
    'storylimit'                  => 0,
    'linkslimit'                  => 0,

    'writeheader'                 => 1,
    'writefooter'                 => 1,
    'allowimgs'                   => 1,
    'keep_ext_links'              => 0,
    'target_has_color'            => 0,
    'gen_index'                   => 0,

    'use_only_cache'              => 0,
    'admincmd'                    => undef,

    'cmdline_urls'                => [ ],
    'keep_tmps'                   => 0,
    'disconnect_after_scoop'      => 0,
    'preload_method'              => 'lwp',

    'argv_settings'               => [ ],

    'filename_template'           => "YYYY_MM_DD_Site",
    'pdb_title'                   => "YYYY-Mon-DD: Site",
  );

  # Platform-dependent output defaults; these override the common
  # values above where the keys overlap.
  my %platform;
  if (MyOS() eq 'Mac') {
    # Text is the default on MacOS.
    %platform = (
      'output_style'              => $OUT_TEXT,
      'outputfilter'              => '__cat__',
      'outputextn'                => 'txt',
      'use_convert_tool'          => 0,
      'print_full_links'          => 0,
    );
  } else {
    # everywhere else, HTML converted with iSilo.
    %platform = (
      'output_style'              => $OUT_HTML,
      'outputfilter'              => 'isilo',
      'outputextn'                => 'pdb',
      'include_isilo_bookmarks'   => 1,
      'turn_big_imgs_to_hrefs'    => 0,
      'use_convert_tool'          => 1,
      'print_full_links'          => 1,
    );
  }
  %settings = (%settings, %platform);

  @{$self->{cf}}{keys %settings} = values %settings;

  # list-valued settings are emptied in place.
  foreach my $listname (qw(sites_grep site_files_to_read
                        site_choices layout_site_files))
  {
    @{$self->{cf}->{$listname}} = ();
  }

  $self->{add_closing_tags} = 1;        # close tags when cleaning HTML
  $self->{strip_empty_tag_sets} = 0;    # strip empty tag sets (broken atm)

  $self->{failed_to_cvt} = 0;

  # chars. Palm uses these:
  #
  #  20 = secure wireless PQA link
  #  21 = wireless PQA link
  # 128 = euro
  # 133 = ellipsis
  # 134 = cross
  # 135 = double-cross
  # 136 = small, high caret
  # 141 = diamonds (used for bookmark char here)
  # 142 = clubs
  # 143 = hearts
  # 144 = spades
  # 145 = left single quote
  # 146 = right single quote
  # 147 = left double quote
  # 148 = right double quote
  # 149 = small block in center of char
  # 151 = long dash
  # 153 = tm
  #
  # and these are standard latin1:
  #
  # 164 = star with diamond in the center
  # 174 = reg
  # 176 = degree symbol
  # 215 = small x in center of char

  $self->{cf}->{bookmark_char} = "\x8D";
  $self->{cf}->{ext_link_char} = "\xB0";

# This is the placeholder for development debug flags.
# Add debugging stuff here, tagged with J M D (without the spaces ;).

}

# --------------------------------------------------------------------------

# Private helper for parse_commandline: copy the given key/value pairs
# into the $self->{cf} settings hash.
sub _parse_commandline_set_cf {
  my ($self, %settings) = @_;
  @{$self->{cf}}{keys %settings} = values %settings;
}

# Parse a list of command-line arguments, storing the resulting settings
# in $self->{cf} (and a few directly on $self).
#
# Arguments: the argument strings, in order.
#
# Switches are processed until the first argument that is neither a
# switch, a ".scp" config file nor a ".site" file; that argument and
# everything after it are stored as cmdline_urls.  Unknown switches
# cause $self->{ui}->usage() to be called.  Most switch patterns are
# deliberately unanchored at the end, so where one switch name is a
# prefix of another the longer one must be tested first.
sub parse_commandline {
  my ($self, @argv) = @_;
  local ($_);
  my @sites_grep = ();

  while (@argv) {
    $_ = shift @argv;

    if (/^-debug$/) {
      $self->{cf}->{debug}++;

    } elsif (/^-quiet$/) {
      $self->{cf}->{verbose} = 0;

    } elsif (/^-refresh/) {
      $self->{cf}->{refresh} = 1;

    } elsif (/^-fullrefresh/) {
      $self->{cf}->{cached_front_page_lifetime} = 0;
      $self->{cf}->{full_refresh} = 1;
      $self->{cf}->{refresh} = 1;       # implies -refresh as well

    } elsif (/^-dumpprc/) {
      # tested before -dump: /^-dump/ would otherwise swallow this switch
      $self->{cf}->{dumppdb} = 1;
      $self->{cf}->{use_convert_tool} = 1;

    } elsif (/^-dump/) {
      $self->{cf}->{dump_output} = 1;
      $self->{cf}->{use_convert_tool} = 0;

    } elsif (/^-bw/) {
      $self->{cf}->{target_has_color} = 0;

    } elsif (/^-color/) {
      $self->{cf}->{target_has_color} = 1;

    } elsif (/^-maxcolors/) {
      $self->{cf}->{maxcolors} = shift (@argv)+0;
      if ($self->{cf}->{maxcolors} == 0) {
        $self->{ui}->usage();
      }

    } elsif (/^-keeplinks/) {
      $self->{cf}->{keep_ext_links} = 1;

    } elsif (/^-fixlinks/) {
      $self->{cf}->{keep_ext_links} = 0;

    } elsif (/^-doc/) {
      _parse_commandline_set_cf ($self,
        output_style => $OUT_DOC,       fileperpage => 0,
        outputfilter => 'makedoc',      include_isilo_bookmarks => 0,
        outputextn => 'pdb',            use_convert_tool => 1,
        allowimgs => 0);

    } elsif (/^-(m?)plucker/) {
      # -plucker: one output file; -mplucker: one file per page
      _parse_commandline_set_cf ($self,
        output_style => $OUT_HTML,      fileperpage => ($1 eq 'm' ? 1 : 0),
        outputfilter => 'plucker',      include_isilo_bookmarks => 0,
        turn_big_imgs_to_hrefs => 1,    outputextn => 'pdb',
        use_convert_tool => 1,          allowimgs => 1,
        keep_ext_links => 1,            ext_link_char => "\x15");

    } elsif (/^-(m?)isilo/) {
      # -isilo: one output file; -misilo: one file per page
      _parse_commandline_set_cf ($self,
        output_style => $OUT_HTML,      fileperpage => ($1 eq 'm' ? 1 : 0),
        outputfilter => 'isilo',        include_isilo_bookmarks => 1,
        outputextn => 'pdb',            use_convert_tool => 1,
        allowimgs => 1,                 ext_link_char => "\x15");

    } elsif (/^-richreader/) {
      _parse_commandline_set_cf ($self,
        output_style => $OUT_HTML,      fileperpage => 0,
        outputfilter => 'richreader',   include_isilo_bookmarks => 0,
        outputextn => 'pdb',            use_convert_tool => 1,
        allowimgs => 1);                # TODO - not sure

    } elsif (/^-imgv/) {
      _parse_commandline_set_cf ($self,
        output_style => $OUT_IMAGES,    fileperpage => 1,
        outputfilter => 'imgv',         include_isilo_bookmarks => 0,
        outputextn => 'pdb',            use_convert_tool => 1,
        allowimgs => 1);

    } elsif (/^-text/) {
      _parse_commandline_set_cf ($self,
        output_style => $OUT_TEXT,      fileperpage => 0,
        outputfilter => '__path__',     include_isilo_bookmarks => 0,
        outputextn => 'txt',            use_convert_tool => 1,
        allowimgs => 0);

    } elsif (/^-(m?)html/) {
      # -html: one output file; -mhtml: one file per page
      _parse_commandline_set_cf ($self,
        output_style => $OUT_HTML,      fileperpage => ($1 eq 'm' ? 1 : 0),
        outputfilter => '__path__',     include_isilo_bookmarks => 0,
        gen_index => 1,                 outputextn => 'html',
        use_convert_tool => 1,          allowimgs => 1);

    } elsif (/^-images/) {
      _parse_commandline_set_cf ($self,
        output_style => $OUT_IMAGES,    fileperpage => 1,
        outputfilter => '__path__',     include_isilo_bookmarks => 0,
        outputextn => 'list',           use_convert_tool => 1,
        allowimgs => 1);

    } elsif (/^-pipe/) {
      my $fmt = shift @argv;
      my $cmd = shift @argv;

      if ($fmt eq 'text') {
        $self->{cf}->{output_style} = $OUT_TEXT;
        $self->{cf}->{fileperpage} = 0;

      } elsif ($fmt eq 'html') {
        $self->{cf}->{output_style} = $OUT_HTML;
        $self->{cf}->{fileperpage} = 0;

      } elsif ($fmt eq 'mhtml') {
        $self->{cf}->{output_style} = $OUT_HTML;
        $self->{cf}->{fileperpage} = 1;
      } else {
        $self->{ui}->usage();
      }
      $self->{cf}->{outputfilter} = 'cmd: '.$cmd;
      $self->{cf}->{outputextn} = 'pdb';
      $self->{cf}->{use_convert_tool} = 1;
      $self->{cf}->{allowimgs} = 1;             # probably ;)

    } elsif (/^-cvtargs/) {
      $self->{cf}->{converter_args} = shift @argv;

    } elsif (/^-admin$/) {
      # some admin commands take an argument of their own, others just
      # switch on a setting.
      my $admincmd = shift @argv;
      $self->{cf}->{admincmd} = $admincmd;
      if ($admincmd eq 'import-cookies')
                        { $self->{cf}->{importcookies} = shift @argv; }
      if ($admincmd eq 'func-test')
                        { $self->{cf}->{testsupportfile} = shift @argv; }
      if ($admincmd eq 'include-isilo-bookmarks')
                        { $self->{cf}->{include_isilo_bookmarks} = 1; }
      if ($admincmd eq 'turn-big-imgs-to-hrefs')
                        { $self->{cf}->{turn_big_imgs_to_hrefs} = 1; }

    } elsif (/^-nolinkrewrite/) {
      $self->{cf}->{nolinkrewrite} = 1;
    } elsif (/^-fromcache/) {
      $self->{cf}->{use_only_cache} = 1;
    } elsif (/^-preload/) {
      $self->{cf}->{preload_method} = shift(@argv);
    } elsif (/^-limit/) {
      # accept "200", "200k" or "200kb"; zero or less means "no limit"
      my $limit = shift(@argv); $limit =~ s/\s*kb?\s*//i; $limit += 0;
      if ($limit <= 0) { $limit = 9999999; }
      $self->{cf}->{filesizelimit} = $limit;
    } elsif (/^-maxlinks/) {
      $self->{cf}->{linkslimit} = shift(@argv)+0;
    } elsif (/^-maxstories/) {
      $self->{cf}->{storylimit} = shift(@argv)+0;
    } elsif (/^-nodates/) {
      $self->{cf}->{filename_template} = 'Site';
      $self->{cf}->{pdb_title} = 'Site';
    } elsif (/^-nowrite/) {
      $self->{cf}->{nowrite} = 1;
    } elsif (/^-badcache/) {
      $self->{cf}->{badcache} = 1;
    } elsif (/^-grep/) {
      $self->{cf}->{grepmode} = 1;
    } elsif (/^-install/) {
      $self->{cf}->{pilotinstdir} = shift @argv;
    } elsif (/^-instapp/) {
      $self->{cf}->{pilotinstapp} = shift @argv;
    } elsif (/^-site$/) {
      push (@sites_grep, shift @argv);
    } elsif (/^-sites/) {
      # all remaining arguments are site files
      push (@sites_grep, @argv); @argv = (); last;
    } elsif (/^-profile$/) {
      push (@{$self->{profiles_cmdline}}, shift @argv);
    } elsif (/^-profiles/) {
      # all remaining arguments are profile files
      push (@{$self->{profiles_cmdline}}, @argv); @argv = (); last;
    } elsif (/^-name/) {
      $self->{argv_name} = shift (@argv);
    } elsif (/^-levels/) {
      $self->add_cmdline_site_param ("Levels: ". shift @argv);
    } elsif (/^-storyurl/) {
      $self->add_cmdline_site_param ("StoryURL: ". shift @argv);
    } elsif (/^-set/) {
      $self->add_cmdline_site_param (shift (@argv) .": ". shift @argv);
    } elsif (/^-keep-tmps/) {
      $self->{cf}->{keep_tmps} = 1;
    } elsif (/^-disc/) {
      $self->{cf}->{disconnect_after_scoop} = 1;

    } elsif (/^-noheaders/) {
      $self->{cf}->{writeheader} = 0;
    } elsif (/^-nofooters/) {
      $self->{cf}->{writefooter} = 0;
    } elsif (/^-outputtemplate/) {
      my $tmplfile = shift @argv;
      open (my $in, '<', $tmplfile)
                or die "cannot open output template: $tmplfile\n";
      # the lines read still carry their newlines, so join on ''
      $self->{output_template} = join ('', <$in>);
      close $in;

    } elsif (/^-filename/) {
      $self->{cf}->{filename_template} = shift @argv;

    } elsif (/^-prctitle/) {
      $self->{cf}->{pdb_title} = shift @argv;

    } elsif (/^-config$/) {
      $self->{cf}->{config} = shift @argv;

    } elsif (/^-stdout-to/) {
      my $logfile = shift @argv; close (STDOUT);
      open (STDOUT, '>>', $logfile)
                or die "failed to redirect STDOUT to $logfile\n";

    } elsif (/\.scp$/i) {
      $self->{cf}->{config} = $_;

    } elsif (/\.site$/i) {
      push (@{$self->{cf}->{site_files_to_read}}, $_);
      push (@{$self->{cf}->{sites_grep}}, $_);

    } elsif (/^-/) {
      $self->{ui}->usage();

    } else {
      # first non-switch argument: put it back and stop
      unshift @argv, $_; last;
    }
  }

  # now the non-switch arguments
  @{$self->{cf}->{cmdline_urls}} = @argv;

  foreach my $key (@sites_grep) {
    if (!-r $key) {
      warn "Failed to read -site argument \"$key\".\n";
      next;
    }

    # it's a site file.
    push (@{$self->{cf}->{site_files_to_read}}, $key);
    push (@{$self->{cf}->{sites_grep}}, $key);
    # TODO: add support for -sitesmatching pattern or similar...
  }
}

# Queue one "Key: value" site-file line given on the command line; the
# line is appended to $self->{cf}->{argv_settings}.
sub add_cmdline_site_param {
  my ($self, $param) = @_;
  push (@{$self->{cf}->{argv_settings}}, $param);
}

# --------------------------------------------------------------------------

# Set the temporary directory (cf->{tmpdir}) to the first of the given
# candidate paths that exists, or to the last candidate if none does.
sub set_tmp_dir {
  my $self = shift;
  my $chosen = $self->get_first_existing_path_or_default (@_);
  $self->{cf}->{tmpdir} = $chosen;
}

# Set the user's own sites directory (cf->{mysitesdir}) to the first of
# the given candidate paths that exists, or to the last candidate if
# none does.
sub set_my_sites_dir {
  my $self = shift;
  my $chosen = $self->get_first_existing_path_or_default (@_);
  $self->{cf}->{mysitesdir} = $chosen;
}

# Set the site-choices file (cf->{choicefile}) to the first of the given
# candidate paths that exists; if none exists, default to the last one
# listed.
sub set_site_choices_file {
  my $self = shift;
  my $chosen = $self->get_first_existing_path_or_default (@_);
  $self->{cf}->{choicefile} = $chosen;
}

# Set the shared sites directory (cf->{sharedsitesdir}) to the first of
# the given candidate paths that exists, or to the last candidate if
# none does.
sub set_shared_sites_dir {
  my $self = shift;
  my $chosen = $self->get_first_existing_path_or_default (@_);
  $self->{cf}->{sharedsitesdir} = $chosen;
}

# Expand each candidate path with sed_path(), in order, and return the
# first expanded path that exists on disk.  Returns undef if none of
# them exists.
sub get_first_existing_path {
  my ($self, @candidates) = @_;

  foreach my $candidate (@candidates) {
    my $expanded = $self->sed_path ($candidate);
    return $expanded if (-e $expanded);
  }

  return undef;
}

# Like get_first_existing_path(), but if none of the candidate paths
# exists, fall back to the (sed_path-expanded) last candidate instead of
# returning undef.
sub get_first_existing_path_or_default {
  my ($self, @paths) = @_;

  my $found = $self->get_first_existing_path (@paths);
  return $found if (defined $found);

  # nothing exists: the last candidate listed is the default
  return $self->sed_path (pop (@paths));
}

# Expand the substitution tokens in a path and return the result:
#
#   ~   the HOME environment variable
#   %S  cf->{sitescooperdir}
#   %t  the TMPDIR environment variable or, failing that, TEMP
#   %T  cf->{tmpdir}
#
# Finally every "/" is replaced with the platform separator, $SLASH.
sub sed_path {
  my ($self, $path) = @_;

  $path =~ s{\~}{$ENV{'HOME'}}g;
  $path =~ s{%S}{$self->{cf}->{sitescooperdir}}g;
  $path =~ s{%t}{ $ENV{'TMPDIR'} || $ENV{'TEMP'} }ge;
  $path =~ s{%T}{$self->{cf}->{tmpdir}}g;
  $path =~ s{/}{$SLASH}g;

  return $path;
}

# --------------------------------------------------------------------------

# Parse configuration text that came from a file.
#
# Arguments: the filename (used only to label positions) and the lines
# of the file.  A "filename:lineno" label is built for every line, plus
# one extra, and both lists are handed to parse_conf_with_lines().
sub read_config {
  my ($self, $filename, @conf) = @_;

  # line numbers are 1-based; one label more than there are lines
  my @conflines = map { $filename.":".$_ } (1 .. scalar(@conf) + 1);

  $self->parse_conf_with_lines (\@conf, \@conflines);
}

# --------------------------------------------------------------------------

# Install the defaults for everything that the configuration file can
# override: output and cache settings, the site-file tables in
# $self->{scfs}, the external helper commands (which depend on the
# platform), the signal handlers and the proxy settings.
sub SetupDefaultConfig {
  my $self = shift;

  my $os = MyOS();
  my $on_win32 = ($os eq 'Win32');
  my $on_mac = ($os eq 'Mac');

  $self->{cf}->{outdir} = '';
  $self->{cf}->{expiry_days} = 7.0;
  $self->{cf}->{sharedcache} = undef;

  $self->{scfs}->{sites} = { };

  # layouts and exceptions each have an ordered list, a lookup table
  # and a "have any" flag.
  foreach my $kind ('layout', 'exception') {
    @{$self->{scfs}->{$kind.'_order'}} = ();
    %{$self->{scfs}->{$kind.'s'}} = ();
    $self->{scfs}->{'have_'.$kind.'s'} = 0;
  }

  # external diff command; empty on the Mac (use Algorithm::Diff)
  $self->{cf}->{diff} = $on_win32 ? "diff.exe" : ($on_mac ? "" : 'diff');
  $self->{cf}->{checked_for_diff} = 0;

  $self->{cf}->{makedoc} = $on_win32 ? "makedocw.exe" : 'makedoc';

  $self->{cf}->{plucker} = 'plucker-build';

  $self->{cf}->{imgvconvert} = 'convert "__SCOOPFILE__" pnm:- | '.
        'ppmimgvquant | pgmtoimgv -t "__TITLE__" __ARGS__ > "__SYNCFILE__"';

  # iSilo arguments: the image options are added only if images are
  # allowed, and the -Ic -Id pair only when the target has no colour.
  my $isiloargs = '-y -U';
  if ($self->{cf}->{allowimgs}) {
    $isiloargs .= ' -Is__IMAGE_MAX_WIDTH__';
    $isiloargs .= ' -Ic -Id' unless ($self->{cf}->{target_has_color});
  }
  $self->{cf}->{isilo} = $on_win32 ? "iSiloC32.exe" : 'iSilo386';
  $self->{cf}->{isiloargs} = $isiloargs;
  $self->{cf}->{isilomultipageargs} = '-d9';

  # Note that currently there is no HTML2Doc for UNIX platforms; it's
  # supported here anyway for future-proofing.
  $self->{cf}->{richreader} = $on_win32 ? "HTML2Doc.exe" : 'HTML2Doc';
  $self->{cf}->{richargs} = '';

  # by default an interrupt or termination signal ends the run
  set_got_intr_behaviour ('exit');
  foreach my $signame ('INT', 'TERM') {
    $SIG{$signame} = \&got_intr;
  }

  $self->{cf}->{proxyhost} = undef;
  $self->{cf}->{proxyport} = 80;
  $self->{cf}->{proxyuser} = undef;
  $self->{cf}->{proxypass} = undef;
}

# ---------------------------------------------------------------------------

# Sanity-check the basic directory settings once the configuration has
# been read, warning about any that are still unset, then work out the
# Palm install details.
sub interpret_basic_config {
  my $self = shift;

  unless (defined $self->{cf}->{tmpdir}) {
    warn "Warning: cannot work out TmpDir, please set it manually\n".
          "in the configuration file.\n";
  }

  unless (defined $self->{cf}->{sitescooperdir}) {
    warn "Warning: cannot work out SitescooperDir.\n";
  }

  $self->parse_palm_install_details();
}

# ---------------------------------------------------------------------------

# Work out how converted files get installed for the Palm, unless an
# install directory (cf->{pilotinstdir}) has already been set.
#
# On Win32 with no install application set, and on UNIX with none set,
# the install module is used ("***USE_MODULE***").  On UNIX, an install
# application name that matches one of the built-in desktops is
# translated into the matching install directory or command.
sub parse_palm_install_details {
  my $self = shift;

  return if (defined $self->{cf}->{pilotinstdir});

  my $os = MyOS();
  my $have_app = defined $self->{cf}->{pilotinstapp};

  if ($os eq 'Win32' && !$have_app) {
    $self->{cf}->{pilotinstapp} = "***USE_MODULE***";

  } elsif ($os eq 'UNIX') {
    my $home = $ENV{'HOME'};
    my $pilot_mgr_dir = $home."/.pilotmgr/Installer";
    my $jpilot_file = $home."/.jpilot/jpilot_to_install";

    if (!$have_app) {
      $self->{cf}->{pilotinstapp} = "***USE_MODULE***";         # use the module

    } else {
      # see if one of the built-in support for UNIX pilot desktops is
      # being used.  The setting is cleared first, so an unrecognised
      # name leaves it undefined.
      my $requested = $self->{cf}->{pilotinstapp};
      undef $self->{cf}->{pilotinstapp};

      if ($requested =~ /pilot.*manager/i) {
        $self->{cf}->{pilotinstdir} = $pilot_mgr_dir;
      } elsif ($requested =~ /gnome.*pilot/i) {
        $self->{cf}->{pilotinstapp} = "gpilot-install-file --later";
      } elsif ($requested =~ /jpilot/i) {
        $self->{cf}->{pilotinstapp} = "***ADD_TO_MANIFEST*** ".$jpilot_file;
      }
    }
  }
}

# ---------------------------------------------------------------------------

# Load, at run time, every module the scooping code needs, then build
# the helper objects that are stored on $self: the user agent (with
# proxy settings applied), the cookie jar, the HTTP client chosen by
# cf->{preload_method} ("lwp", or "fork" with an optional number of
# processes, defaulting to 4) and the cache factory.
#
# Dies if the HTTP client cannot be constructed; an unknown preload
# method is reported through $self->{ui}->usage().
#
# Constructors are called as Class->new(...) rather than with the
# indirect-object form "new Class (...)": this package defines its own
# new(), which makes the indirect form fragile to parse.
sub load_all_modules {
  my $self = shift;

  require URI::URL;
  require HTTP::Date;
  require HTTP::Cookies;
  require HTTP::Request::Common;
  require HTML::Entities;
  require HTML::Parser;
  require HTML::Filter;

  require Sitescooper::Util;
  require Sitescooper::Robot;
  require Sitescooper::SCF;
  require Sitescooper::StripTablesFilter;
  require Sitescooper::UserAgent;
  require Sitescooper::ConsolePasswordAsker;
  require Sitescooper::LWPHTTPClient;
  require Sitescooper::ForkHTTPClient;
  require Sitescooper::PreloadURLProcessor;
  require Sitescooper::NewsHound;

  require Sitescooper::CacheFactory;
  require Sitescooper::DirCacheFactory;

  require PDA::PilotInstall;

# ---------------------------------------------------------------------------

  $self->{useragent} = Sitescooper::UserAgent->new ($self);
  $self->{useragent}->env_proxy();
  $self->{useragent}->env_proxy_auth();

  # prepend our own identification to the agent string already set
  $self->{useragent}->agent ("sitescooper/$VERSION ($self->{home_url}) ".
                  $self->{useragent}->agent);
  $self->{useragent}->max_size (1024*1024*2);   # 2-meg file limit

  # explicit proxy settings from the configuration
  if (defined $self->{cf}->{proxyhost} && $self->{cf}->{proxyhost} ne '') {
    $self->{useragent}->proxy
                (['http', 'ftp'], "http://".
                $self->{cf}->{proxyhost}.":".$self->{cf}->{proxyport}."/");
  }
  if (defined $self->{cf}->{proxyuser} && $self->{cf}->{proxyuser} ne '') {
    $self->{useragent}->set_proxy_auth($self->{cf}->{proxyuser},
                $self->{cf}->{proxypass});
  }

  $self->{cookie_jar} = HTTP::Cookies::Netscape->new();

  if ($self->{cf}->{preload_method} eq 'lwp') {
    eval {
      $self->{httpclient} = Sitescooper::LWPHTTPClient->new ($self);
      1;
    } or die "eval for preload class failed: $@";

  } elsif ($self->{cf}->{preload_method} =~ /^fork(\d*)/) {
    # "forkN" asks for N processes; plain "fork" means 4
    my $num = $1; ($num eq '') and $num = 4;
    eval {
      $self->{httpclient} = Sitescooper::ForkHTTPClient->new ($self, $num);
      1;
    } or die "eval for preload class failed: $@";

  } else {
    warn "Unknown preload method '".$self->{cf}->{preload_method}."'.\n";
    $self->{ui}->usage();
  }

  $self->{httpclient}->init();

  $self->{cachefactory} = Sitescooper::DirCacheFactory->new ($self);
}

# ---------------------------------------------------------------------------

# Choose what got_intr() does when a signal arrives ('exit' ends the
# run; anything else skips the current site) and clear the "interrupt
# seen" flag.
sub set_got_intr_behaviour {
  my ($behaviour) = @_;

  # sadly, these have to be global -- ah well.
  $got_intr_behaviour = $behaviour;
  $got_intr_flag = 0;
}

# Signal handler, installed for SIGINT and SIGTERM.
#
# Argument: the signal name, as passed by perl to %SIG handlers.
#
# On UNIX, terminal echo is switched back on first.  The handler then
# always dies: with an "exiting" message if the behaviour chosen with
# set_got_intr_behaviour() is 'exit', otherwise with a "skipping site"
# message after setting $got_intr_flag.
sub got_intr {
  my $signame = shift;
  (MyOS() eq 'UNIX') and system ("stty echo");

  if ($got_intr_behaviour eq 'exit') {
    die "got signal SIG$signame, exiting.\n";
  }

  # the flag must be set before die(): nothing after die() runs
  $got_intr_flag = 1;
  die "got signal SIG$signame, skipping site...\n";
}

# ---------------------------------------------------------------------------

# Turn the URLs (or readable local files) given on the command line into
# site definitions.  If there are none, nothing happens; otherwise any
# site files already read are discarded and each URL is parsed as a
# small generated site file: URL, Name, "StoryLifetime: 0" and whatever
# site parameters were given with command-line switches.
#
# The entries of cf->{cmdline_urls} are normalised in place (local files
# become file:// URLs; a bare "scheme://host" gets a trailing slash).
sub read_commandline_scoops {
  my $self = shift;
  if (MyOS() eq 'Mac') {
    if (defined ${$self->{cf}->{cmdline_urls}}[0]
        && ${$self->{cf}->{cmdline_urls}}[0] eq $self->{cf}->{sitescooperdir})
    {
      # REVISIT -- I do not know why this happens :(
      shift (@{$self->{cf}->{cmdline_urls}});
    }
  }

  return if ($#{$self->{cf}->{cmdline_urls}} < 0);

  # zero out any site files we might have read
  $self->{scfs}->{sites} = { };

  foreach my $url (@{$self->{cf}->{cmdline_urls}})
  {
    # each URL gets its own generated config, so earlier URLs are not
    # parsed again on later iterations.
    my @conf = ();
    my @conflines = ();

    # if it's a local file URL, switch around the slashes (for windows)
    if (MyOS() eq 'Win32' && $url =~ m,file:///,i) {
      $url =~ s/\\/\//g;
    }

    if (-r $url) {
      if ($url =~ m,^/,) {
        $url = 'file://'.$url;
      } else {
        $url = "file://".$self->{cwd}."/".$url;
      }
    }

    if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }
    push (@conf, "URL: ".$url);

    # the site name: the -name argument if there was one, otherwise
    # derived from the URL.
    my $name;
    if (defined $self->{argv_name}) {
      $name = $self->{argv_name};
    } elsif ($url =~ m,/([^/]+)$,) {
      $name = $1;
      if (length ($name) > 40) {
        # trim out spare stuff to keep it short.
        $name =~ s,^([^:]+://[^/]+)/.*/([^/]+$),$1/.../$2,i; #/
      }
    } else {
      $name = $url;
    }
    push (@conf, "Name: ".$name);

    push (@conf, "StoryLifetime: 0");   # never used cached stuff
    push (@conf, @{$self->{cf}->{argv_settings}});

    # one position label per line, plus one extra
    foreach my $i (0 .. scalar(@conf)) { push (@conflines, $url.":0"); }
    $self->parse_conf_with_lines (\@conf, \@conflines);
  }
}

# ---------------------------------------------------------------------------

# Load one NewsHound profile file and, if that worked, append the
# profile to $self->{profiles}.  A failed load produces a warning.
sub read_a_profile {
  my ($self, $file) = @_;

  $self->verbose ("Reading profile from file \"$file\"...");

  my $loaded = Sitescooper::NewsHound::LoadProfile($file);
  if ($loaded != 0) {
    push (@{$self->{profiles}}, $loaded);
  } else {
    warn "Can't load profile '$file': $!\n";
  }
}

# Read every NewsHound profile: first the *.nhp files in the profiles
# directory ($self->{profilesdir}), if one is set, then the profile
# files named on the command line.  Command-line profiles that cannot be
# read are warned about and skipped.
sub read_newshound_profiles {
  my $self = shift;

  my $profdir = $self->{profilesdir};
  if (defined $profdir) {
    foreach my $found (glob ("$profdir/*.nhp")) {
      # skip directories
      $self->read_a_profile ($found) unless (-d $found);
    }
  }

  foreach my $named (@{$self->{profiles_cmdline}}) {
    if (-r $named) {
      $self->read_a_profile ($named);
    } else {
      warn "Failed to read -profile argument \"$named\".\n";
    }
  }
}

# ---------------------------------------------------------------------------

# Carry out the administrative command given with -admin, if any:
#
#   dump-sites      print one tab-separated line per robot (site file,
#                   URL, title, output base name) and exit
#   journal         open the JOURNAL filehandle on journal.txt in the
#                   temporary directory
#   import-cookies  load a Netscape-format cookie jar, then clear the
#                   sites, command-line URLs and robots
#
# func-test, include-isilo-bookmarks and turn-big-imgs-to-hrefs need no
# action here.  Any other command calls $self->{ui}->usage().
sub handle_admin_cmds {
  my $self = shift;
  if (defined $self->{cf}->{admincmd}) {
    if ($self->{cf}->{admincmd} eq 'dump-sites') {

      foreach my $robot (@{$self->{robots}}) {
        my $url = $robot->{url};
        my $title = $robot->{sitename};
        $title =~ s,\t, ,g; $title =~ s,^\d+-\S+-\d+: ,,g;

        # $SLASH is quoted with \Q..\E in the patterns below: on Win32
        # it is a backslash, which would otherwise escape the "(" that
        # follows it and make the pattern invalid.
        my $base = $robot->{outtmp};
        $base =~ s,^.*\Q${SLASH}\E(\S+?)\.tmp$,$1,;

        my $scf = $self->{scfs}->{sites}->{$url};
        my $site = $scf->{site_defined_at};

        # strip the ":lineno" suffix and the leading directories
        $site =~ s/:\d+$//; $site =~ s/^.*\Q${SLASH}\E(\S+?)$/$1/;

        # foobar.site   http://www.foobar.com/  Foo Bar 1999_01_01_Foo_Bar
        print "$site\t$url\t$title\t$base\n";
      }
      exit;

    } elsif ($self->{cf}->{admincmd} eq 'journal') {
      my $journal = $self->{cf}->{tmpdir}.$SLASH."journal.txt";
      open (JOURNAL, '>', $journal)
          or die "cannot write to \"$journal\"!\n";

    } elsif ($self->{cf}->{admincmd} eq 'import-cookies') {
      if (!defined $self->{cf}->{importcookies}) { $self->{ui}->usage(); }
      if (!-r $self->{cf}->{importcookies}) {
        die "cannot open \"$self->{cf}->{importcookies}\"!\n";
      }
      warn "Importing Netscape-format cookie jar from ".
                "\"$self->{cf}->{importcookies}\"...\n";
      $self->{cookie_jar}->load ($self->{cf}->{importcookies});
      warn "Cookie jar now looks like:\n".$self->{cookie_jar}->as_string;

      $self->{scfs}->{sites} = { };
      $self->{cf}->{cmdline_urls} = [ ];
      $self->{robots} = [ ];
      # and carry on to exit.

    } elsif ($self->{cf}->{admincmd} eq 'func-test') {
      # do nothing right now, it's handled later
    } elsif ($self->{cf}->{admincmd} eq 'include-isilo-bookmarks') {
      # do nothing here
    } elsif ($self->{cf}->{admincmd} eq 'turn-big-imgs-to-hrefs') {
      # do nothing here

    } else { $self->{ui}->usage(); }
  }
}

# ---------------------------------------------------------------------------

# Main entry point: scoop all sites, converting each one as it is
# scooped, then write the index page if one was asked for
# (cf->{gen_index}).
sub run {
  my $self = shift;

  # Alternative, to do all the conversions at the end:
  #$self->get_all_sites;
  #foreach $robot (@{$self->{robots}}) {
  #  $self->convert_output ($robot);
  #}
  #(TODO: this hasn't been modified to use the scf object.)

  # to do them as each site is scooped:
  $self->get_all_sites (1);

  $self->generate_output_index_page() if ($self->{cf}->{gen_index});
}

# ---------------------------------------------------------------------------

# End-of-run housekeeping: drop the HTTP client, save state, logins and
# cookies, write the installer's configuration file if the install
# module is in use, and disconnect if that was requested.
sub finish {
  my $self = shift;

  delete $self->{httpclient};
  $self->write_state;
  $self->{useragent}->save_logins();

  my $cookiefile = $self->{cf}->{tmpdir}.$SLASH."cookies";
  $self->{cookie_jar}->save ($cookiefile);

  my $instapp = $self->{cf}->{pilotinstapp};
  if (defined $instapp && $instapp eq '***USE_MODULE***')
  {
    my $instfile = $self->{cf}->{tmpdir}.$SLASH."inst.txt";
    $self->{cf}->{installer}->write_config_file ($instfile);
  }

  $self->disconnect if ($self->{cf}->{disconnect_after_scoop});

  $self->verbose ("Finished!");
}

# ---------------------------------------------------------------------------

# Parse a list of configuration lines: general settings plus site-file
# (URL / LayoutURL / ExceptionURL) definitions.
#
# Arguments (after $self):
#   $conf      - ref to an array of config lines.  The elements are edited
#                in place (comments stripped, whitespace trimmed, environment
#                references expanded) because the loop aliases $_ to them.
#   $conflines - ref to a parallel array, one entry per element of @$conf.
#                Each entry is passed to the warning routines and to the SCF
#                object to identify the line.  The array is consumed (shifted)
#                as parsing proceeds.
#
# General settings are stored in $self->{cf} (ProfilesDir goes straight into
# $self->{profilesdir}).  A URL, LayoutURL or ExceptionURL line selects the
# "current" Sitescooper::SCF object, registered under $self->{scfs}; any line
# not recognised here is offered to that object's ParseConfigLine method.
#
# NOTE(review): the (\@\@) prototype has no effect when this sub is invoked
# as a method, and it does not match the ($self, $conf, $conflines) argument
# list -- confirm how callers invoke this.
#
sub parse_conf_with_lines (\@\@) {
  my ($self, $conf, $conflines) = @_;
  local ($_);

  # NOTE(review): $cf is never used below; settings go through $self->{cf}.
  my $cf = undef;
  # the SCF object that site-specific lines currently apply to
  my $scf = undef;
  my $confline;

  foreach $_ (@$conf) {
    # keep the line-identifier array in step with the config lines
    $confline = shift @$conflines;

    # strip comments and surrounding whitespace; skip lines left empty
    s/#.*$//; s/^\s+//; s/\s+$//g; next if (/^$/);
    if (!defined $confline) {
      $self->dbg ("oops! confline not set for $_");
    }

    # process environment variable references: ${ENVVARNAME}
    # (unset variables expand to the empty string)
    s/\$\{(\S+?)\}/
	  defined($ENV{$1}) ? $ENV{$1} : "";
    /ge;
    s/\$HOME/$ENV{'HOME'}/ge;		# always supported

    # while inside a scoped statement, give the current SCF first refusal
    # on the line; fall through to the normal handling if it declines.
    if (defined $scf && $scf->is_in_scoped_statement()) {
      $scf->ParseConfigScopedLine ($self, $_, $confline) and next;
    }

    # normalise "Key:   value" to "Key: value" so the patterns below can
    # assume exactly one space after the colon.
    s/^(\S+:)\s+/$1 /;		# easier to read this way ;)
    study;

    # now handle the general config parameters
    if (/^ProxyHost: (.*)$/) { $self->{cf}->{proxyhost} = $1; next; }
    if (/^ProxyPort: (.*)$/) { $self->{cf}->{proxyport} = $1+0; next; }
    if (/^ProxyUsername: (.*)$/) { $self->{cf}->{proxyuser} = $1; next; }
    if (/^ProxyPassword: (.*)$/) { $self->{cf}->{proxypass} = $1; next; }

    /^TmpDir: (.*)$/ and ($self->{cf}->{tmpdir} = $1), next;
    if (/^SitescooperDir: (.*)$/) {
      $self->{cf}->{sitescooperdir} = $1;
      # a missing directory only produces a warning; the value is kept
      if (!-d $self->{cf}->{sitescooperdir}) {
	$self->sitewarn_file_line ($confline, "SitescooperDir directory ".
		"does not exist: ".$self->{cf}->{sitescooperdir}."\n");
      }
      next;
    }
    if (/^SitesDir: (.*)$/) { $self->{cf}->{mysitesdir} = $1; next; }
    # note: stored on $self directly, not in $self->{cf}
    if (/^ProfilesDir: (.*)$/) { $self->{profilesdir} = $1; next; }
    if (/^SiteChoiceFile: (.*)$/) { $self->{cf}->{choicefile} = $1; next; }

    # provide support for reading command line flags from config file
    # for MacOS (has no inherent command line concept)
    if (/^CommandLine: (.*)$/) {
      $self->parse_commandline(split (' ',$1)); next;
    }

    # external tool locations and miscellaneous simple settings
    /^MakeDoc: (.*)$/ and ($self->{cf}->{makedoc} = $1), next;
    /^iSilo: (.*)$/ and ($self->{cf}->{isilo} = $1), next;
    /^Plucker: (.*)$/ and ($self->{cf}->{plucker} = $1), next;
    /^ImageViewer: (.*)$/ and ($self->{cf}->{imgvconvert} = $1), next;
    /^HTML2Doc: (.*)$/ and ($self->{cf}->{richreader} = $1), next;
    /^Diff: (.*)$/ and ($self->{cf}->{diff} = $1), next;
    /^TextSaveDir: (.*)$/ and ($self->{cf}->{outdir} = $1), next;
    /^PilotInstallDir: (.*)$/ and ($self->{cf}->{pilotinstdir} = $1), next;
    /^PilotInstallApp: (.*)$/ and ($self->{cf}->{pilotinstapp} = $1), next;
    /^SharedCacheDir: (.*)$/ and ($self->{cf}->{sharedcache} = $1), next;
    /^ExpireCacheAfter: (.*)$/ and ($self->{cf}->{expiry_days} = $1+0.0), next;
    /^TargetMaxColors: (.*)$/ and ($self->{cf}->{maxcolors} = $1+0), next;
    # these two values are evaluated as double-quoted Perl strings, so
    # backslash escapes in the config value are interpreted.
    # NOTE(review): string eval of config text -- assumes the config file
    # is trusted.
    /^BookmarkChar: (.*)$/ and
    		($self->{cf}->{bookmark_char} = eval(qq!"$1"!)), next;
    /^ExtLinkChar: (.*)$/ and
    		($self->{cf}->{ext_link_char} = eval(qq!"$1"!)), next;

    # the value is divided by 24*60 before being stored (presumably
    # minutes -> days; TODO confirm).  Ignored when {cf}->{refresh} is set.
    if (/^CachedPageLifetime: (.*)$/) {
      (!$self->{cf}->{refresh}) and
    		($self->{cf}->{cached_front_page_lifetime} = ($1+0) / (24*60));
      next;
    }

    # next the site file config params
    if (/^URL: (.*)$/) {
      my $url = $self->expand_url_magic ($1);
      # NOTE(review): an undefined $url is only warned about; processing
      # continues below with the undefined value.
      if (!defined $url) {
	$self->sitewarn_file_line ($confline, "Bad URL in site file: $_\n");
      }

      # default the scheme to http:// and make sure a bare host name ends
      # with a slash
      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      # a URL line always starts a fresh SCF, replacing any earlier one
      # registered for the same URL
      $scf = new Sitescooper::SCF ($self);
      $scf->set_defaults ($url, $confline);
      $self->{scfs}->{sites}->{$url} = $scf;
      next;
    }

    # LayoutURL is similar to URL, but defines a layout for a specific
    # pattern. If an URL falls within this pattern, and parameters are
    # defined for this layout but not defined by the site file, the
    # layout parameters will be used.
    #
    if (/^LayoutURL: (.*)$/) {
      my $url = $self->expand_url_magic ($1);

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      # allow extra parameters to be added to an existing layout
      if (defined ${$self->{scfs}->{layouts}}{$url}) {
	$scf = ${$self->{scfs}->{layouts}}{$url};
      } else {
	$scf = new Sitescooper::SCF ($self);
	$scf->set_defaults ($url, $confline);
	${$self->{scfs}->{layouts}}{$url} = $scf;
      }

      next;
    }

    # ExceptionURL is like LayoutURL, but it takes priority over
    # both LayoutURL and the normal site file rules. This way you
    # can define bits of a site that uses different layouts, caching
    # rules etc. by matching pages' URLs against the ExceptionURL
    # regular expression.
    #
    if (/^ExceptionURL: (.*)$/) {
      my $url = $self->expand_url_magic ($1);

      if ($url !~ m,^(http|file)://,i) { $url = 'http://'.$url; }
      if ($url =~ m,(http|file)://[^/]+$,i) { $url .= '/'; }

      # allow extra parameters to be added to an existing exception
      if (defined ${$self->{scfs}->{exceptions}}{$url}) {
	$scf = ${$self->{scfs}->{exceptions}}{$url};
      } else {
	$scf = new Sitescooper::SCF ($self);
	$scf->set_defaults ($url, $confline);
	${$self->{scfs}->{exceptions}}{$url} = $scf;
      }

      next;
    }

    # anything else must be a site-specific parameter, which needs a
    # current SCF to apply to
    if (!defined $scf) {
      $self->sitewarn_file_line ($confline,
	  "Configuration line invalid (needs URL line first?):\n  $_\n");
      next;
    }
    $scf->ParseConfigLine ($self, $_, $confline) and next;

    # nobody claimed the line
    $self->sitewarn_file_line ($confline, "Unrecognised:\n  $_\n");
  }

  # ran out of lines while a scoped statement was still open; $confline
  # holds the identifier shifted for the last line processed.
  if (defined $scf && $scf->is_in_scoped_statement()) {
    $self->sitewarn_file_line ($confline,
	  "Fell off end of ".$scf->scoped_statement_type().
	  " statement!\n");
  }
}

# ---------------------------------------------------------------------------

# Prepare for a scooping run after the configuration has been parsed.
# In order: index the layouts and exceptions, decide whether to use an
# external diff command or the Algorithm::Diff module, parse the output
# template, optionally read the test-support file, create the working
# directories (which changes the current directory), expand the filename
# templates, build the output filenames for every site, handle admin
# commands, create the PDB/install directories if a conversion tool is in
# use, load logins and state, and set up the installer object if the
# '***USE_MODULE***' install method is selected.
#
sub get_ready_for_run {
  my $self = shift;

  $self->index_layouts();

  # CRG: Added $self->{cf}->{checked_for_diff} to prevent diff check that follows
  # which would fail.
  # ('MODULE' is mapped to the empty string, which selects Algorithm::Diff
  # further down.)
  if ($self->{cf}->{diff} eq 'MODULE') { $self->{cf}->{diff} = ''; $self->{cf}->{checked_for_diff} = 1; }

  # On Win32, check once that the configured diff command can actually be
  # run, by diffing an empty temporary file against itself.
  # NOTE(review): assumes checked_for_diff has been initialised to 0
  # elsewhere; an undefined value also compares equal to 0 here.
  if (MyOS() eq 'Win32' && $self->{cf}->{checked_for_diff} == 0) {
    my $file = $self->{cf}->{tmpdir}.$SLASH."tstdiff.txt";

    # create the (empty) test file; the result of the open is not checked
    open (OUT, "> $file"); close OUT;
    warn "Checking for availability of the \"diff.exe\" command...\n";

    my $difffailed = 0;
    if (open (IN, $self->{cf}->{diff}." \"$file\" \"$file\" |")) {
      # drain the command's output, then pick up its exit status from close
      1 while (<IN>);
      close IN;
      $difffailed = ($? >> 8);

    } else {
      $difffailed = 1;
    }

    if ($difffailed) {
      warn "\n".
	"$self->{cf}->{diff} could not be found and run. Using perl module\n".
	"Algorithm::Diff instead.\n";

      $self->{cf}->{diff} = "";		# use module instead
    }
    unlink $file; $self->{cf}->{checked_for_diff} = 1;
  }

  # an empty diff setting means "use the module"; load it at run time so
  # that it is only required when actually needed.
  if ($self->{cf}->{diff} eq '') {
    $self->dbg ("using Algorithm::Diff module to diff pages.");
    eval q{ use Algorithm::Diff qw(diff); 1; }
		  or die "Cannot use built-in diff support, perl module\n".
			  "Algorithm::Diff not found: $@\n";
  }

  $self->parse_output_template();

  if (defined $self->{cf}->{testsupportfile}) {
    $self->read_test_support ($self->{cf}->{testsupportfile});
  }

  $self->make_dirs();		# Note: this will chdir
  $self->parse_filename_templates();
  $self->generate_output_filenames (keys %{$self->{scfs}->{sites}});
  $self->handle_admin_cmds();

  # the conversion output directories are only needed when a conversion
  # tool will be run
  if ($self->{cf}->{use_convert_tool}) {
    if (defined $self->{cf}->{pdbdir} && !-d $self->{cf}->{pdbdir}) {
      mkdir ($self->{cf}->{pdbdir}, 0755) 
      		or die "failed to mkdir '".$self->{cf}->{pdbdir}."'\n";
    }
    if (defined $self->{cf}->{pilotinstdir} &&
		 !-d $self->{cf}->{pilotinstdir})
    {
      mkdir ($self->{cf}->{pilotinstdir}, 0755) 
      		or die "failed to mkdir '".$self->{cf}->{pilotinstdir}."'\n";
    }
  }

  $self->{useragent}->load_logins();
  $self->read_state();

  # create the installer object and let it read its config file from the
  # tmp dir
  if (defined $self->{cf}->{pilotinstapp}
  		&& $self->{cf}->{pilotinstapp} eq '***USE_MODULE***')
  {
    $self->{cf}->{installer} = new PDA::PilotInstall();
    $self->{cf}->{installer}->read_config_file
    			($self->{cf}->{tmpdir}.$SLASH."inst.txt");
  }

  if (defined $self->{cf}->{pilotinstdir}) {
    # just write PRCs direct to the install dir, save a copy.
    $self->{cf}->{pdbdir} = $self->{cf}->{pilotinstdir};
  }
}

# Expand the date/time placeholders (YYYY, MM, Mon, DD, Day, hh, mm) in
# the filename_template and pdb_title settings, in place.
sub parse_filename_templates {
  my ($self) = @_;
  my ($mday, $mon, $year, $monstr) = $self->get_date();
  my ($min, $hr, $wdaystr) = $self->get_extra_date();

  # [ placeholder, sprintf format, value ] -- applied in this order to
  # each template.
  my @fields = (
    [ 'YYYY', "%04d", $year ],
    [ 'MM',   "%02d", $mon ],
    [ 'Mon',  "%3s",  $monstr ],
    [ 'DD',   "%02d", $mday ],
    [ 'Day',  "%3s",  $wdaystr ],
    [ 'hh',   "%02d", $hr ],
    [ 'mm',   "%02d", $min ],
  );

  foreach my $setting ('filename_template', 'pdb_title') {
    foreach my $field (@fields) {
      my ($placeholder, $format, $value) = @$field;
      $self->{cf}->{$setting} =~ s/$placeholder/ sprintf ($format, $value); /eg;
    }
  }
}

# Rebuild the ordered pattern lists for layouts and exceptions.  The
# patterns of each kind are sorted by length (shortest first), every SCF
# is marked inactive, and a "have_..." flag is set if any pattern exists.
sub index_layouts {
  my ($self) = @_;

  # [ pattern table, ordered list, flag, debug label ]
  my @kinds = (
    [ 'layouts',    'layout_order',    'have_layouts',
    			"site layouts defined: " ],
    [ 'exceptions', 'exception_order', 'have_exceptions',
    			"site exceptions defined: " ],
  );

  foreach my $kind (@kinds) {
    my ($table, $order, $flag, $label) = @$kind;

    @{$self->{scfs}->{$order}} = ();

    my @shortest_first = sort { length($a) <=> length($b) }
    				keys %{$self->{scfs}->{$table}};
    foreach my $pat (@shortest_first) {
      # these are not treated like sites, so they are never "active"
      $self->{scfs}->{$table}->{$pat}->{active} = 0;
      push (@{$self->{scfs}->{$order}}, $pat);
      $self->{scfs}->{$flag} = 1;
    }

    $self->dbg ($label, join(' ', @{$self->{scfs}->{$order}}));
  }
}

# ---------------------------------------------------------------------------

# Write "index.html" into the output directory, linking to every scoop
# found there.  A scoop is any non-hidden entry NAME of the output
# directory for which NAME/NAME.html is a plain file.  The links are split
# into two columns and substituted into the $self->{index_page} template.
#
# Dies if the output directory cannot be read; only warns (and returns)
# if the index file cannot be written.
sub generate_output_index_page {
  my $self = shift;

  my $dir = $self->{cf}->{outdir};

  opendir(my $dirh, $dir) || die "can't opendir $dir: $!";
  my @idxes = grep {
    !/^\./ && (-f "$dir/$_/$_.html")
  } readdir($dirh);
  closedir $dirh;

  my @pages = ();
  foreach my $file (sort @idxes) {
    my $linktitle = $file;
    $linktitle =~ s/_/ /g; $linktitle =~ s/\/.*$//g;
    push (@pages, "<li><a href=\"$file/$file.html\">$linktitle</a></li>");
  }

  # first column gets the first half (rounded up), second column the rest
  my $cutoff = int ($#pages / 2);
  my $col1 = '';
  my $col2 = '';
  if ($#pages >= 0) {
    $col1 = join ("\n", @pages[0 .. $cutoff]);
    $col2 = join ("\n", @pages[($cutoff+1) .. $#pages]);
  }

  my $html = $self->{index_page};
  $html =~ s/__SCOOPS_COL_1__/${col1}/gs;
  $html =~ s/__SCOOPS_COL_2__/${col2}/gs;
  $html =~ s/__SITESCOOPER_HOME_URL__/$self->{home_url}/gs;

  # Stop after a failed open: printing to and closing an unopened handle
  # would only produce a second, misleading warning.
  my $out;
  if (!open ($out, '>', "${dir}/index.html")) {
    warn "cannot write to ${dir}/index.html\n";
    return;
  }
  print $out $html;
  close $out or warn "cannot write to ${dir}/index.html\n";
}

# ---------------------------------------------------------------------------

# Read the site-choices file named by $self->{cf}->{choicefile}.
#
# The file consists of entries that start with a tick-box line ("[x]" or
# "[ ]") followed by detail lines.  For every ticked entry, the first
# following "Filename:" line is taken; "[samples]" in it is replaced by
# $self->{cf}->{sharedsitesdir}.  Names containing "layouts.site" are
# appended to $self->{cf}->{layout_site_files}, all others to
# $self->{cf}->{site_choices}.
#
# If the file does not exist, EditSiteChoices is called first so that it
# can be created.  Returns quietly if the file cannot be opened.
sub read_site_choices {
  my $self = shift;

  my $choicefile = $self->{cf}->{choicefile};
  if (!defined $choicefile) {
    warn "choicefile not defined"; return;
  }

  if (!-e $choicefile) {
    $self->EditSiteChoices ($choicefile);	# or create it in this case
  }

  $self->verbose ("Using site choices from \"$choicefile\".");

  # lexical handle and line variable: neither the shared IN handle nor
  # the caller's $_ is disturbed.
  open (my $in, '<', $choicefile) or return;

  my $samplesdir = $self->{cf}->{sharedsitesdir};
  $samplesdir = '' unless (defined $samplesdir);

  # true while we are inside a ticked entry and still looking for its
  # "Filename:" line
  my $in_ticked_entry = 0;

  while (defined (my $line = <$in>)) {
    if (!$in_ticked_entry) {
      $in_ticked_entry = 1 if ($line =~ /^\s*\[\s*x\s*\]/i);
      next;
    }

    next unless ($line =~ /^\s*Filename:\s*(\S+)\s*$/);

    my $sitefile = $1;
    $sitefile =~ s/\[samples\]/${samplesdir}/g;
    if ($sitefile =~ /layouts\.site/) {
      push (@{$self->{cf}->{layout_site_files}}, $sitefile);
    } else {
      # $self->dbg ("site choice: $sitefile");
      push (@{$self->{cf}->{site_choices}}, $sitefile);
    }
    $in_ticked_entry = 0;
  }
  close $in;
}

# ---------------------------------------------------------------------------

# File::Find callback: remember the full name ($File::Find::name) of every
# plain file whose name ends in ".sit" or ".site" (case-insensitive) in
# @TmpGlobal::site_files_found.
sub find_sites {
  my $is_site_file = (/\.site?$/i && -f $_);
  return unless ($is_site_file);

  push (@TmpGlobal::site_files_found, $File::Find::name);
}

# Build the list of site files to read, $self->{cf}->{site_files_to_read}.
#
#  - Readable layout files from {cf}->{layout_site_files} are always added.
#  - If the -site argument was used ({cf}->{sites_grep} is non-empty), no
#    site-choices files are added and only "layouts.site" files are taken
#    from the sites directory.
#  - If URLs were given on the command line ({cf}->{cmdline_urls}), nothing
#    further is added at all.
#  - Otherwise the readable files from {cf}->{site_choices} are added.
#  - Finally, site files found under {cf}->{mysitesdir} are added.
sub find_sites_in_sites_dir {
  my ($self) = @_;

  # both lists are iterated below; make sure they exist
  $self->{cf}->{sites_grep} ||= [];
  $self->{cf}->{layout_site_files} ||= [];

  foreach my $layout (@{$self->{cf}->{layout_site_files}}) {
    $self->dbg ("Using layout: $layout");
    if (-r $layout) { push (@{$self->{cf}->{site_files_to_read}}, $layout); }
  }

  if (defined ${$self->{cf}->{sites_grep}}[0])
  {
    $self->verbose ("Restricting to sites: ".join (' ', @{$self->{cf}->{sites_grep}}));
  }
  elsif ($#{$self->{cf}->{cmdline_urls}} >= 0 && defined
  	${$self->{cf}->{cmdline_urls}}[0])
  {
    # we're only snarfing the command-line URLs, skip the predefined sites
    return;
  }
  elsif (defined ${$self->{cf}->{site_choices}}[0])
  {
    # only scoop sites from the site_choices list if the -site argument
    # was not used.
    foreach my $choice (@{$self->{cf}->{site_choices}}) {
      $self->dbg ("Adding site-choices site: ".$choice);
      if (-r $choice) {
	push (@{$self->{cf}->{site_files_to_read}}, $choice);
      } else {
	warn "Failed to read site-choices site file: $choice\n";
      }
    }
  }

  if (defined $self->{cf}->{mysitesdir} && -d $self->{cf}->{mysitesdir})
  {
    @TmpGlobal::site_files_found = ();
    find(\&find_sites, $self->{cf}->{mysitesdir});
    my @files = @TmpGlobal::site_files_found;
    undef @TmpGlobal::site_files_found;

    foreach my $file (@files) {
      # skip backups, core dumps, etc.  Test the file name only: matching
      # against the whole path would skip any file that merely has "core"
      # somewhere in its path (e.g. "scores.site"), and "^#" could never
      # match a path with a directory part.
      my $leaf = basename ($file);
      next if ($leaf =~ /(\.swp$|^core$|\.bak$|\~$|^#)/);
      next if (-d $file);		# skip directories

      # if the -site argument was used, skip it unless it's the layouts
      # file.
      if ($#{$self->{cf}->{sites_grep}} >= 0) {
	next unless ($file =~ /layouts\.site/i);
      }

      push (@{$self->{cf}->{site_files_to_read}}, $file);
    }
  }
}

# Read every file listed in $self->{cf}->{site_files_to_read} (each file
# once, even if it is listed several times) and pass its lines to
# read_config.  Does nothing if URLs were given on the command line.
# Unreadable files are reported through sitewarn_file_line.
sub read_site_files {
  my ($self) = @_;

  # don't read the real site files if command lines args were used
  return if ($#{$self->{cf}->{cmdline_urls}} >= 0);

  if ($#{$self->{cf}->{site_files_to_read}} < 0) {
    warn "\n".
      "No sites were read -- the site_choices.txt file is empty, or the\n".
      "\"sites\" directory could not be found.\n\n";
  }

  my %read_sites = ();
  foreach my $file (@{$self->{cf}->{site_files_to_read}}) {
    next if (defined $read_sites{$file});
    $read_sites{$file} = 1;		# don't read the same file twice

    # Three-argument open takes the file name literally (two-argument
    # open strips surrounding whitespace from it); the lexical handle
    # cannot collide with a handle used by read_config.
    my $in;
    if (!open ($in, '<', $file)) {
      $self->sitewarn_file_line ("$file:0", "Cannot read $file\n");
      next;
    }

    $self->dbg ("Scooping site from file \"$file\".");

    my @conf = (<$in>);
    close $in;
    $self->read_config ($file, @conf);
  }
}

# ---------------------------------------------------------------------------

# (Re)write the site-choices file $choicefile and, where possible, open it
# in an editor so the user can tick the sites to scoop.
#
# The file gets one entry per site file found under
# $self->{cf}->{sharedsitesdir}: a "[x]"/"[ ]" tick-box line with the site
# name, then URL, Filename and (optionally) description lines.  File names
# are written with the samples directory prefix replaced by "[samples]".
#
# Nothing is written if the samples directory does not exist or if the
# '-site' argument was used.  Dies if the file cannot be closed after
# writing or if the editor command fails.
#
sub EditSiteChoices {
  my ($self, $choicefile) = @_;

  my $samplesdir = $self->{cf}->{sharedsitesdir};

  if (!-d $samplesdir) {
    warn "Cannot find 'site_samples' directory, not creating site_choices file.\n".
    	"(looked for '$samplesdir')\n";
    return;
  }

  if ($#{$self->{cf}->{sites_grep}} >= 0) {
    warn "'-site' argument used, not creating site_choices file.\n";
    return;
  }

  warn "Creating/editing \"site_choices.txt\" file...\n";

  # remember which sites are currently chosen, so their boxes can be
  # pre-ticked in the new file
  my %chosen_sites = ();
  foreach $_ (@{$self->{cf}->{site_choices}}) { $chosen_sites{$_} = 1; }

  if (!open (CHOICE, ">".$choicefile)) {
    warn "Cannot create \"site_choices.txt\" file $choicefile\n";
    return;
  }

  print CHOICE <<EOHDR;
Please pick the site files you wish to use here.  Put an X in the box
beside the sites you wish to scoop.

If you want to use the traditional 'sites' directory, or you have your own
site files not in this list, then do not put an X in any of the boxes.
Sitescooper will supplement what you have ticked here with the contents
of your 'sites' directory, if it exists.

EOHDR

  # collect all site files below the samples directory
  @TmpGlobal::site_files_found = (); find(\&find_sites, $samplesdir);

  # regexp-escape the samples directory name so it can be matched
  # literally as a prefix below
  my $samplespat = $samplesdir;
  $samplespat =~ s/([^-_:A-Za-z0-9])/\\$1/g;
  my $file;
  
  foreach $file (@TmpGlobal::site_files_found) {
    # $pretty is the name as shown in the file: "[samples]/..."
    my $pretty = $file; $pretty =~ s,^${samplespat},\[samples\],g;

    # the layouts file is always written pre-ticked
    if ($file =~ /layouts\.site/) {
      print CHOICE "    [x] (Site layouts for common sites)\n".
	      "\tFilename: $pretty\n\n";
      next;
    }

    # sites without a URL line are left out; sites without a name are
    # listed under their URL
    my ($url, $name, $desc) = $self->ReadSiteForChoices($file);
    if (!defined $url) { next; }
    if (!defined $name) { $name = $url; }

    if (defined $desc) { $desc = "\t($desc)\n"; }
    else { $desc = ''; }

    # NOTE(review): the lookup key is the "[samples]" form of the name;
    # confirm that {cf}->{site_choices} holds names in that same form,
    # otherwise previously chosen sites will not be re-ticked.
    my $chosen = ' ';
    if (defined $chosen_sites{$pretty}) { $chosen = 'x'; }

    print CHOICE "    [$chosen] $name\n\tURL: $url\n".
    		"\tFilename: $pretty\n$desc\n";
  }
  undef @TmpGlobal::site_files_found;

  close CHOICE or die "failed to write to site_choices file";

  # pick an editor: $VISUAL, then $EDITOR, then a per-OS default.
  # NOTE(review): $edit stays undefined if MyOS() returns something other
  # than 'UNIX', 'Win32' or 'Mac' and neither variable is set.
  my $edit;
  if (defined $ENV{'VISUAL'}) {
    $edit = $ENV{'VISUAL'};
  } elsif (defined $ENV{'EDITOR'}) {
    $edit = $ENV{'EDITOR'};
  } elsif (MyOS() eq 'UNIX' && -x '/usr/bin/editor') {
    $edit = 'editor';		# Debian std, thanks to michael d. ivey
  } elsif (MyOS() eq 'UNIX') {
    $edit = 'vi';
  } elsif (MyOS() eq 'Win32') {
    $edit = 'notepad.exe';
  } elsif (MyOS() eq 'Mac') {
    # REVISIT -- don't know what to do here ;)
    warn "\nIf you wish to choose which sites to scoop from a list\n".
    	"of pre-defined sites, stop this script now, edit the file\n".
	"$choicefile\n".
	"and re-run it.\n\n";
    return;
  }

  # on UNIX, only start an editor when STDOUT is a terminal
  if (MyOS() eq 'UNIX' && !-t STDOUT) {
    warn "\nNot running editor for site_choices.txt file, as sitescooper is\n".
      "not running interactively. Please edit this file at your\n".
      "convenience: $choicefile\n\n";

  } else {
    warn "Running editor for site_choices.txt file using command $edit...\n";
    # list form of system: the editor is run without going through a shell
    system ($edit, $choicefile);

    if (($? >> 8) != 0) {
      die "The command failed. Please edit $choicefile\n".
	  "by hand and re-run sitescooper.\n\n";
    }
  }
}

# ---------------------------------------------------------------------------

# Extract the URL, name and description from a site file, for display in
# the site-choices file.
#
#   $file - path of the site file to read
#
# Returns the list ($url, $sitename, $desc); each element is undef if the
# file has no corresponding "URL:", "Name:" or "Description:" line (the
# last such line wins if there are several).  Returns an empty list if the
# file looks like a backup/core file, is a directory, or cannot be opened.
sub ReadSiteForChoices {
  my ($self, $file) = @_;

  # Test the file name only: matching "core" anywhere in the whole path
  # would reject e.g. "scores.site" or anything below a "core" directory.
  my $leaf = basename ($file);
  return if ($leaf =~ /(\.swp$|^core$|\.bak$|\~$|^#)/);
  return if (-d $file);

  # "return", not "next": there is no enclosing loop in this sub.
  open (my $in, '<', $file) or return;

  my ($url, $sitename, $desc);
  while (defined (my $line = <$in>)) {
    $line =~ s/[\r\n]+$//;	# strip the line ending, LF or CRLF
    $line =~ s/#.*$//;		# strip comments

    if ($line =~ /^\s*Name:\s*(.*)$/) { $sitename = $1; next; }
    if ($line =~ /^\s*Description:\s*(.*)$/) { $desc = $1; next; }
    if ($line =~ /^\s*URL:\s*(.*)$/) { $url = $1; next; }
  }
  close $in;

  return ($url, $sitename, $desc);
}

# ---------------------------------------------------------------------------

# Add the pattern $pat to the alternation $regexp and return the result.
# If $regexp is undefined or does not end in ")", a new set "($pat)" is
# started; otherwise "|$pat" is inserted before the closing parenthesis.
sub AddRegexpToSet {
  my ($self, $regexp, $pat) = @_;

  my $is_existing_set = (defined ($regexp) && $regexp =~ /\)$/);
  return "($pat)" unless ($is_existing_set);

  $regexp =~ s/\)$/|${pat})/g;
  return $regexp;
}

# ---------------------------------------------------------------------------

# Create the temporary directory $self->{cf}->{tmpdir} if it does not
# exist yet; dies if it cannot be created.
sub make_basic_dirs {
  my ($self) = @_;

  my $tmpdir = $self->{cf}->{tmpdir};
  unless (-d $tmpdir) {
    mkdir ($tmpdir, 0777) or die "failed to mkdir '".$tmpdir."'\n";
  }
}

# Set up the working directories for a run:
#  - make sure the tmp dir exists and chdir into it,
#  - load the cookie jar from the tmp dir's "cookies" file, if present,
#  - when debugging, open LOGFILE on "log.txt" in the tmp dir (unbuffered),
#  - default and create the output directory {cf}->{outdir},
#  - point {cf}->{pdbdir} at the "prc" directory below the tmp dir.
# Dies if the tmp dir cannot be entered or the output dir cannot be made.
sub make_dirs {
  my $self = shift;

  $self->make_basic_dirs();	# in case it hasn't been done already
  chdir ($self->{cf}->{tmpdir}) or die "cannot cd to ".$self->{cf}->{tmpdir}."\n";

  # Build the name with $SLASH, like every other path below the tmp dir,
  # so that the file read here is the one the cookie jar is saved to.
  my $cookiefile = $self->{cf}->{tmpdir}.$SLASH."cookies";
  if (-f $cookiefile) {
    $self->{cookie_jar}->load ($cookiefile);
  }

  if ($self->{cf}->{debug}) {
    my $logfile = $self->{cf}->{tmpdir}.$SLASH."log.txt";
    if (open (LOGFILE, '>', $logfile)) {
      select LOGFILE; $| = 1; select STDOUT;
    } else {
      warn "cannot write to debug log $logfile: $!\n";
    }
  }

  if ($self->{cf}->{outdir} eq '') {
    $self->{cf}->{outdir} = $self->{cf}->{tmpdir}.$SLASH."txt";
  }
  if (!-d $self->{cf}->{outdir}) {
    mkdir ($self->{cf}->{outdir}, 0777) || die "failed to mkdir '$self->{cf}->{outdir}'\n";
  }

  $self->{cf}->{pdbdir} = $self->{cf}->{tmpdir}.$SLASH."prc";

  # check for spaces on Win32 -- MakeDocW can't handle them!
  # Thx to wgoosey /at/ servtech.com for spotting this one.
  if ($self->{cf}->{outputfilter} eq 'makedoc') {
    if (MyOS() eq 'Win32') {
      if ($self->{cf}->{outdir} =~ / /) {
	warn "

Warning: Sitescooper is installed in a directory containing spaces in the
filename. The MakeDocW conversion tool does not support this, so you may
need to move Sitescooper to another directory, e.g. C:\\Sitescooper, for
this conversion to work!  (This is a bug in MakeDOCW.exe.)

";
      }
    }
  }
}

# ---------------------------------------------------------------------------

# Open the cache through the cache factory object.
sub read_state {
  my ($self) = @_;

  my $factory = $self->{cachefactory};
  $factory->open_cache();
}

# ---------------------------------------------------------------------------

# Create a Sitescooper::Robot for every site URL in @sites that is to be
# scooped, and append it to $self->{robots}.
#
# For each URL the site's SCF object is looked up in
# $self->{scfs}->{sites} and its site file is loaded.  A site is
# deactivated (and skipped) if it requires a cookie that is not in the
# cookie jar, or an environment variable that is not set.  Sites without
# a name get one derived from the URL, prefixed with "Untitled_".  Only
# the first site with a given name gets a robot.
#
# Undefined or unknown URLs are reported with carp and skipped.
sub generate_output_filenames {
  my ($self, @sites) = @_;
  my %already_done = ();

  foreach my $url (@sites) {
    # validate before use: the checks have to come before the first
    # comparison / method call, or they can never fire.
    if (!defined ($url)) {
      carp "url not defined in generate_output_filenames";
      next;
    }
    next if ($url eq '');

    my $scf = $self->{scfs}->{sites}->{$url};
    if (!defined ($scf)) {
      carp "scf not defined in generate_output_filenames";
      next;
    }
    $scf->load_site_file();

    if (defined $scf->{req_cookie}) {
      # req_cookie is "host key"; scan the cookie jar for a match.  The
      # callback communicates through package globals.
      ($TmpGlobal::req_cookie_host, $TmpGlobal::req_cookie_key)
      				= split (' ', $scf->{req_cookie});
      $TmpGlobal::gotit = 0;

      sub chk_for_reqd_cookie {
	if ($_[4] eq $TmpGlobal::req_cookie_host &&
	  	$_[1] eq $TmpGlobal::req_cookie_key)
	{
	  $TmpGlobal::gotit = 1;
	}
      }
      $self->{cookie_jar}->scan (\&chk_for_reqd_cookie);

      if (!$TmpGlobal::gotit) {
	# reduce "file:line" to just the quoted file name for the message
	my $line = $scf->{site_defined_at};
	$line =~ s/^(.*):(.*?)$/"$1"/g;
	$self->verbose ("Cookie from $TmpGlobal::req_cookie_host is not imported, not scooping $line.");
	$scf->{active} = 0;
      }

      undef $TmpGlobal::req_cookie_host;
      undef $TmpGlobal::req_cookie_key;
    }

    if (defined $scf->{req_env}) {
      my $envname = $scf->{req_env};
      if (!defined $ENV{$envname}) {
	my $line = $scf->{site_defined_at};
	$line =~ s/^(.*):(.*?)$/"$1"/g;
	$self->verbose ("Env variable $envname is not set, not scooping $line.");
	$scf->{active} = 0;
      }
    }

    next unless ($scf->{active} == 1);

    my $sitename = $scf->{name};
    if (!defined $sitename || $sitename eq $url) {
      # no usable name: derive one from the URL
      $sitename = $url;
      $sitename =~ s/[\?\#].*$//;	# strip CGI params, or #anchors tho'
      $sitename =~ s/http:\/\///;
      $sitename =~ s/www\.//g;
      $sitename =~ s/\.(com|net|org)//g;
      $sitename =~ s/\.htm.*$//g;
      $sitename =~ s/\/$//g;

      $sitename = 'Untitled_'.$sitename;
      $scf->{name} = $sitename;
    }

    next if (defined $already_done{$sitename});
    $already_done{$sitename} = 1;

    my $scooper = Sitescooper::Robot->new ($self, $url, $scf);

    $scooper->{cf}->{output_style} = $self->{cf}->{output_style};
    $scooper->{site} = $url;

    $scooper->set_title ($sitename);
    if ($sitename =~ /^Untitled_/) { $scooper->{need_title} = 1; }

    push (@{$self->{robots}}, $scooper);
  }
}

# ---------------------------------------------------------------------------

# Return the preloaded response stored for $url and remove it from
# $self->{preloaded_responses}; returns undef if there is none.
sub get_preloaded_page {
  my ($self, $url) = @_;

  my $cached = $self->{preloaded_responses}->{$url};
  return undef unless (defined $cached);

  # each preloaded response is handed out only once
  delete $self->{preloaded_responses}->{$url};
  return $cached;
}

# ---------------------------------------------------------------------------

# Scoop every robot in $self->{robots}, in order.
#
#   $convert_now - stored in each robot's {convert_now} field
#                  (defaults to 0)
#
# If the HTTP client can preload and {cf}->{use_only_cache} is not set,
# the front pages of the next few robots are preloaded while the current
# one is being scooped.
sub get_all_sites {
  my ($self, $convert_now) = @_;

  $convert_now ||= 0;

  # optimise by pre-getting front pages using parallel
  # requests, if possible.
  my $ampreloading = ($self->{httpclient}->can_preload ()
  				&& !$self->{cf}->{use_only_cache});
  $self->dbg ("preloading front pages") if ($ampreloading);

  # Keep a sliding window of robots that have been admitted (flagged and,
  # if possible, preloaded) but not yet scooped.  Several are preloaded in
  # advance, even though this will saturate the load queue, because the
  # less URLs we load from one site at a time the better.
  my $PRELOAD_AHEAD = 3;
  my @waiting = @{$self->{robots}};
  my @window = ();

  # move the next waiting robot (or undef, once we have run out) into the
  # window
  my $admit_next = sub {
    my $robot = shift @waiting;
    if (defined $robot) {
      $robot->{convert_now} = $convert_now;
      if ($ampreloading) {
	$robot->preload_front_page ($robot->{url});
      }
    }
    push (@window, $robot);
  };

  foreach my $slot (1 .. $PRELOAD_AHEAD) {
    $admit_next->();
  }

  while (1) {
    my $current = shift @window;
    $admit_next->();

    last unless (defined $current);	# out of sites
    $current->scoop_site();
  }
}

# ---------------------------------------------------------------------------

# Return the keys (indexes) of all entries in the HTTP client queue.
sub get_httpclient_queue_keys {
  my $self = shift;
  return keys %{$self->{httpclient_queue}};
}

# Add $handler to the HTTP client queue under a freshly allocated index.
sub add_httpclient {
  my ($self, $handler) = @_;

  my $new_idx = ++$self->{httpclient_queue_new_idx};
  $self->{httpclient_queue}->{$new_idx} = $handler;
}

# Remove every queue entry that is $handler (compared with "eq").
sub remove_httpclient {
  my ($self, $handler) = @_;

  for my $slot (keys %{$self->{httpclient_queue}}) {
    my $candidate = $self->{httpclient_queue}->{$slot};
    if (defined $candidate && $candidate eq $handler) {
      delete $self->{httpclient_queue}->{$slot};
    }
  }
}

# Return a one-line description of the HTTP client queue: a fixed prefix
# followed by the to_string() of every entry, separated by spaces.
sub httpclient_queue_to_string {
  my ($self) = @_;

  my @pieces = map {
    $self->{httpclient_queue}->{$_}->to_string().' '
  } keys %{$self->{httpclient_queue}};

  my $ret = join ('', 'URL-handler queue: ', @pieces);
  $ret =~ s/ $//;		# drop the one trailing space
  return $ret;
}

# ---------------------------------------------------------------------------

# Convert the output of one robot with the configured conversion tool.
#
#   $robot - the robot whose output is to be converted; its {outdir},
#            {outidxfile}, {pdbtitle} and {images_snarfed} fields are used
#   $scf   - the site's SCF object, passed on to convert_an_output_item
#
# Does nothing unless {cf}->{use_convert_tool} is set.  Normally the
# robot's index file is converted.  For the "images" output style the
# index file is deleted instead, and every snarfed image is renamed after
# the index file ("_NN.tmp" replaced by "_<number>.<extension>") and
# converted on its own.
sub convert_output {
  my ($self, $robot, $scf) = @_;

  carp "scf not defd" unless defined ($scf);
  carp "robot not defd" unless defined ($robot);

  return unless ($self->{cf}->{use_convert_tool});

  my $outdir = $robot->{outdir};
  my $outidxfile = $robot->{outidxfile};
  my $pdbtitle = $robot->{pdbtitle};

  my $idx = $outdir.$SLASH.$outidxfile;

  if ($self->{cf}->{output_style} != $OUT_IMAGES) {
    $self->convert_an_output_item ($robot, $scf, $idx, $pdbtitle);
    return;
  }

  # we don't need the HTML itself, just the images. Dump the HTML and
  # output the path to the images' directory.
  unlink $idx;

  my $num = 0;
  foreach my $imgfile (@{$robot->{images_snarfed}}) {
    # take the extension from this file name only; never reuse a capture
    # left over from an earlier match
    my $extension = ($imgfile =~ /\.([^.]+)$/) ? $1 : '';

    # give every image its own number, so that images sharing an
    # extension do not overwrite each other
    my $fname = $idx;
    $fname =~ s/_NN\.tmp$/_${num}.${extension}/g;
    $num++;

    rename ($outdir.$SLASH.$imgfile, $fname) or
	warn "failed to rename ".$outdir.$SLASH.$imgfile." to $fname\n";

    $self->convert_an_output_item ($robot, $scf, $fname, $pdbtitle);
  }
}

# Run the configured output filter on one output file, then deliver the
# resulting file.
#
#   $robot    - the robot that produced the file; {outdir} and {syncfile}
#               are used
#   $scf      - the site's SCF object; {image_max_width} is substituted
#               into the command line
#   $idx      - path of the file to convert
#   $pdbtitle - title passed to the conversion tool
#
# Does nothing unless {cf}->{use_convert_tool} is set.  The pseudo-filters
# '__cat__' and '__path__' just print the file (or its path) to STDOUT.
# Otherwise a command line is built for the chosen filter and run (on
# MacOS it is only printed).  Afterwards the sync file is, depending on
# the configuration, dumped to STDOUT, moved to the install directory,
# handed to the install application, or simply reported.  Unless temporary
# files are to be kept, the robot's output directory is removed at the end.
#
# Dies if the output filter is not recognised.
#
sub convert_an_output_item {
  my ($self, $robot, $scf, $idx, $pdbtitle) = @_;

  return unless ($self->{cf}->{use_convert_tool});

  # pseudo-filter: copy the file to STDOUT and delete it
  if ($self->{cf}->{outputfilter} eq '__cat__') {
    open (IN, "< ".$idx);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $idx;
    return;
  }
  # pseudo-filter: just report where the file is
  if ($self->{cf}->{outputfilter} eq '__path__') {
    print STDOUT "$idx\n";
    return;
  }

  my $outdir = $robot->{outdir};
  my $syncfile = $robot->{syncfile};
  return unless defined $syncfile;
  # remove any sync file left over from earlier
  unlink $syncfile;

  # special hack for imgv, just use the "cmd: " syntax because it's
  # a tricky commandline.
  # NOTE(review): this rewrites the shared {cf}->{outputfilter} setting,
  # so later calls see the "cmd: ..." form.
  if ($self->{cf}->{outputfilter} eq 'imgv') {
    $self->{cf}->{outputfilter} = 'cmd: '.$self->{cf}->{imgvconvert};
  }

  # build the command line for the selected filter
  my $cmd;
  if ($self->{cf}->{outputfilter} eq 'makedoc') {
    $cmd = "$self->{cf}->{makedoc} $self->{cf}->{converter_args} ".	
    		"\"$idx\" \"".$syncfile."\" '".$pdbtitle."'";

  } elsif ($self->{cf}->{outputfilter} eq 'plucker') {
    my $dir = $self->setup_pluckerdir ($idx, $outdir, $pdbtitle);
 
    $cmd = $self->{cf}->{plucker}.
	" ".$self->{cf}->{converter_args}.
    	" -p\"$dir\" -s scoop ";

    # for this filter the sync file name is recomputed: the base name of
    # $idx (without directory and extension) plus ".pdb", in the directory
    # returned by setup_pluckerdir.
    $syncfile = $idx;
    $syncfile =~ s,^.*[\/\\\:]([^\/\\\:]+)\.[^\.]+$,$1,;
    $syncfile = "$dir/$syncfile.pdb";

  } elsif ($self->{cf}->{outputfilter} eq 'isilo') {
    # NOTE(review): ' -u' is appended to the shared {cf}->{isiloargs} on
    # every call, so it accumulates when several items are converted.
    if (MyOS() eq 'Win32' && $self->{cf}->{isilo} =~ /isilow32/i) {
      $self->{cf}->{isiloargs} .= ' -u';		# doesn't support the other args AFAIK.
    }

    # the two forms differ only in the extra {cf}->{isilomultipageargs}
    if ($self->{cf}->{fileperpage}) {
      $cmd = "$self->{cf}->{isilo} $self->{cf}->{isiloargs} ".
		"-i\"".$pdbtitle."\" ".
		$self->{cf}->{isilomultipageargs}." ".
		$self->{cf}->{converter_args}." ".
		"\"".$idx."\" ";
    } else {
      $cmd = "$self->{cf}->{isilo} $self->{cf}->{isiloargs} ".
		"-i\"".$pdbtitle."\" ".
		$self->{cf}->{converter_args}." ".
		"\"".$idx."\" ";
  }

    # UNIX iSilo utils take the output filename as well; Win32
    # doesn't need it as it installs as it goes along.
    # Redirect output to /dev/null if -quiet was given.
    #
    if (MyOS() eq 'UNIX') {
      $cmd .= " \"".$syncfile."\"";
      if ($self->{cf}->{verbose} == 0) {
	$cmd .= " > /dev/null";
      }
    }

    # Win32 iSilo only takes the -u arg for the GUI version, not the
    # command line one. Strip the arg for the command-line converter.
    # Also add the output filename.
    #
    if (MyOS() eq 'Win32' && $cmd =~ /isiloc32/i) {
      $cmd =~ s/ -u / /g;
      $cmd .= " \"".$syncfile."\"";
    }

  } elsif ($self->{cf}->{outputfilter} eq 'richreader') {
    $cmd = "$self->{cf}->{richreader} $self->{cf}->{richargs} $self->{cf}->{converter_args} \"".$idx."\"";

  } elsif ($self->{cf}->{outputfilter} =~ /^cmd: (.*)$/) {
    # user-supplied command template with __PLACEHOLDER__ substitution
    $cmd = $1;
    my $args = $self->{cf}->{converter_args};
    $cmd =~ s/__SCOOPFILE__/${idx}/g;
    $cmd =~ s/__SYNCFILE__/${syncfile}/g;
    $cmd =~ s/__TITLE__/${pdbtitle}/g;
    $cmd =~ s/__ARGS__/${args}/g;

  } else {
    die "bad output filter $self->{cf}->{outputfilter}\n";
  }

  # substitute in the parameters taken from the site file
  # these should be set in the SCF::set_defaults method to avoid undefs.
  $cmd =~ s/__IMAGE_MAX_WIDTH__/ $scf->{image_max_width} /ge;

  # temporary files are kept when debugging, when asked for, and (below)
  # whenever the conversion did not run successfully
  my $keep_tmps = ($self->{cf}->{debug} || $self->{cf}->{keep_tmps});

  if (MyOS() ne 'Mac') {
    $self->add_cmd_dir_to_path ($cmd);
    $self->verbose ("Running: $cmd");

    # cd to conversion dir for command
    # NOTE(review): $cmd is a single string, so system may run it through
    # a shell; file names and titles are interpolated as they are.
    my $realwd = getcwd; chdir $outdir;
    system $cmd;
    chdir $realwd;			# back again

    # output a newline, MakeDoc won't do it itself.
    if ($self->{cf}->{outputfilter} eq 'makedoc' && MyOS() eq 'UNIX') {
      $self->verbose ("\n");
    }

    # NOTE(review): $? is read after the chdir and verbose calls above;
    # this assumes neither of them changes it -- confirm for verbose.
    if (($? >> 8) != 0)
    {
      # work around a bug in iSilo converter on Win32 -- it
      # reports failure even when the conversion went fine.
      # (TODO: check if still the case)
      if (MyOS() ne 'Win32' ||
	    $self->{cf}->{outputfilter} ne 'isilo')
      {
	warn "command failed: $cmd\n";
	$self->{failed_to_cvt} = 1;
	$keep_tmps = 1;
      }
    }

  } else
  {
    # system is broken on MacOS, so print the required command 
    #so it can be run easily from MPW shell
    if (!defined $self->{macos_system_warning_written}) {
      warn "[Warning: not using the broken MacPerl system() call. ".
	    "You will need to\ncut and paste the command ".
	    "lines yourself!]\n\n";
      $self->{macos_system_warning_written} = 1;
    }
    print $cmd, "\n";
    $keep_tmps = 1;
  }

  # deliver the converted file; the first applicable method wins
  if ($self->{cf}->{dumppdb})
  {
    # If we're dumping, read in the generated file and write it to
    # STDOUT.
    open (IN, "< ".$syncfile);
    while (<IN>) { print STDOUT; }
    close IN;
    unlink $syncfile;

  } elsif (defined $self->{cf}->{pilotinstdir})
  {
    # if installing to a dir, install it and output the filename.
    #
    my $filedesc = basename ($syncfile);
    my $instfile = $self->{cf}->{pilotinstdir}.$SLASH.$filedesc;
    if ($syncfile ne $instfile) {
      move ($syncfile, $instfile) or
      	warn "failed to move $syncfile to $instfile\n";
    }
    print "Created: ".$instfile."\n";

  } elsif (defined $self->{cf}->{pilotinstapp})
  {
    # If installing using an app, run it as appropriate, or for an
    # ***ADD_TO_MANIFEST*** install method, write the filename to
    # that manifest file.
    #
    if ($self->{cf}->{pilotinstapp} =~ /^\*\*\*ADD_TO_MANIFEST\*\*\* (.*)$/)
    {
      if (!open (OUT, ">> $1")) {
	warn "cannot write to $1\n";
      } else {
	print OUT $syncfile."\n"; close OUT;
      }

    } elsif ($self->{cf}->{pilotinstapp} eq '***USE_MODULE***') {
      # the sync file is only deleted if install_file returned 1
      my $instret = $self->{cf}->{installer}->install_file ($syncfile);
      $self->dbg ("PDA::PilotInstall install_file returned ".
      		(defined $instret ? $instret : "undef"));
      if (defined $instret && $instret == 1) {
	print "Installed using PDA::PilotInstall module: ".$syncfile."\n";
	unlink $syncfile;
      }

    } else {
      # any other value is taken to be an install command, which gets the
      # sync file name appended
      $cmd = "$self->{cf}->{pilotinstapp} $syncfile";
      $self->add_cmd_dir_to_path ($cmd);
      $self->verbose ("Running: $cmd");
      system $cmd;

      if (($? >> 8) != 0) {
	warn "command failed: $cmd\n";
      } else {
	unlink $syncfile;
      }
    }
  } else
  {
    # just output the name of the file, in the sitescooper tmp directory,
    # for other apps that may want to collect these files and store
    # them somewhere.
    #
    print "Created: ".$syncfile."\n";
  }

  if (!$keep_tmps) {
    File::Path::rmtree ($outdir);		# don't keep .txt files around
  }
}

# Make sure the directory holding the program named in $cmd is on the
# PATH.  Only does anything when MyOS() reports 'Win32', since Perl on
# some Win32 platforms seems to require that the binary be in the PATH.
#
#   $cmd - a command line: the program, optionally double-quoted,
#          optionally followed by arguments.
#
# Nothing is changed if the program is named without a directory part or
# if that directory is already one of the PATH entries.
sub add_cmd_dir_to_path {
  my ($self, $cmd) = @_;

  return unless (MyOS() eq 'Win32');

  # isolate the program from its arguments
  my $prog;
  if ($cmd =~ /^\"([^\"]+)\"/) { $prog = $1; }	# "C:\Program Files\foo.exe" arg ...
  elsif ($cmd =~ /^(\S+)\s/) { $prog = $1; }	# C:\windows\foo.exe arg ...
  else { $prog = $cmd; }			# C:\windows\foo.exe

  # Look for a directory separator in the program only, not in its
  # arguments: "foo /arg" has no directory to add.
  return unless ($prog =~ /[\\\/]/);

  my $dir = $prog;
  $dir =~ s,[\\/][^\\/]+\s*$,,;		# trim the filename
  return if ($dir eq '');

  # use the same value for the membership test and for the append
  my $path = $ENV{'PATH'} || $ENV{'Path'} || $ENV{'path'};
  $path = '' unless defined $path;

  # The directory may be the first entry (no leading ';') and may be
  # listed with a trailing slash.
  my $dirpat = quotemeta ($dir);
  if ($path !~ /(?:^|;)${dirpat}[\\\/]?(?:;|$)/) {
    $self->dbg ("Adding directory to command path: $dir");
    $ENV{'PATH'} = ($path eq '') ? $dir : "$path;$dir";
  }
}

# ---------------------------------------------------------------------------

# Prepare the directory containing $idx for a Plucker run: write an
# exclusion list ("exclude.txt") and a Plucker config file (".pluckerrc"
# when MyOS() is 'UNIX', "plucker.ini" otherwise) next to the index page.
#
#   $idx      - path of the converted index page, e.g. /path/to/foo.html
#   $outdir   - output directory; files below it are allowed by the
#               exclusion list
#   $pdbtitle - written as db_name in the [scoop] section
#
# Settings found in the [DEFAULT] section of the user's own Plucker config
# file (when it can be read) override the built-in defaults set here;
# exclusion_lists is always forced to our own file.
#
# Dies if the exclusion list cannot be created, warns (and still returns)
# if the config file cannot be written.  Returns the directory part of
# $idx.
sub setup_pluckerdir {
  my ($self, $idx, $outdir, $pdbtitle) = @_;

  my $dir = $idx;			# /path/to/foo.html
  my $convfilename;
  if ($dir =~ s,[\/\\\:]([^\/\\\:]+)$,,) {	# /path/to
    $convfilename = $1;			# foo.html
  } else {
    # no directory part at all: never pick up a stale $1 from some
    # earlier, unrelated match
    $convfilename = $dir;
  }

  my $sitename = $convfilename;
  $sitename =~ s,\.[^\.]+$,,;		# foo

  # first, create a file listing patterns that Plucker should not follow
  my $excludefile = $dir.$SLASH."exclude.txt";
  open (OUT, ">". $excludefile) or die "cannot write to $excludefile\n";
  print OUT   "1:+:file:${outdir}/.*\\..*\n",
	      "1:+:plucker:/.*\\..*\n",
	      "0:-:.*\n";
  close OUT or warn "cannot write to $excludefile\n";

  my $bpp;
  if ($self->{cf}->{maxcolors} <= 2) {
    $bpp = 1;
  } elsif ($self->{cf}->{maxcolors} == 4) {
    $bpp = 2;
  } elsif ($self->{cf}->{maxcolors} == 16) {
    $bpp = 4;
  } elsif ($self->{cf}->{maxcolors} == 256) {
    $bpp = 8;
  } else {
    warn "TargetMaxColors should be one of 2, 4, 16, or 256 for Plucker.\n"
      ."Using 2, ignoring invalid value \"$self->{cf}->{maxcolors}\".";
    $bpp = 1;
  }

  my $pluckerrc;
  my $origpluckerrc;
  if (MyOS() eq 'UNIX') {
    $pluckerrc = $dir.$SLASH.".pluckerrc";
    $origpluckerrc = $ENV{'HOME'}.$SLASH.".pluckerrc";

  } else {
    $pluckerrc = $dir.$SLASH."plucker.ini";
    my $pluckerhome = $ENV{'PLUCKERHOME'};
    if (defined $pluckerhome) {
      $origpluckerrc = "$pluckerhome\\plucker.ini";
    } else {
      $origpluckerrc = "C:\\Program Files\\Plucker\\plucker.ini";
    }
  }

  # read default values for the [DEFAULT] section from the main
  # plucker config file
  my %configs = ();

  # some sitescooper defaults, if they're unset in the config
  $configs{maxwidth} = 150;
  $configs{maxheight} = 250;
  $configs{alt_maxwidth} = 570;
  $configs{alt_maxheight} = 570;
  $configs{bpp} = $bpp;

  if (open (IN, "<$origpluckerrc")) {
    local ($_);			# don't trample the caller's $_

    # skip ahead to the [DEFAULT] section header...
    while (<IN>) {
      /^\s*\[DEFAULT\]\s*$/i and last;
    }
    # ...then collect key=value lines until the next section starts
    while (<IN>) {
      s/;.*$//; s/^\s+//; s/\s+$//;
      /^\[/i and last;
      /^(.+?)\s*=\s*(.+?)$/ and $configs{$1} = $2;
    }
    close IN;
  }

  # Sitescooper setting for this always takes priority
  $configs{exclusion_lists} = $excludefile;

  # now create our config file...
  if (!open (OUT, ">".$pluckerrc)) {
    warn "cannot write to $pluckerrc\n";
    return $dir;
  }
  print OUT "[DEFAULT]\n";
  foreach my $key (sort keys %configs) {	# sorted: reproducible output
    print OUT "$key=".$configs{$key}."\n";
  }
  print OUT qq{

[scoop]
db_name=$pdbtitle
db_file=$sitename
home_url=plucker:/$convfilename
home_maxdepth=9

[WINDOWS]
close_on_exit=1

  };

  close OUT or warn "cannot write to $pluckerrc\n";

  $dir;
}

# ---------------------------------------------------------------------------

# Flush persistent state: asks the cache factory held in
# $self->{cachefactory} to close its cache, and returns whatever that
# call returns.
sub write_state {
  my ($self) = @_;

  my $factory = $self->{cachefactory};
  $factory->close_cache();
}

# ---------------------------------------------------------------------------

# Try to hang up the dial-up connection.  What happens depends on MyOS():
#   Win32 - not implemented, only warns.
#   UNIX  - warns, then runs "killall pppd".
#   Mac   - runs the AppleScript "ppp disconnect" through MacPerl; dies if
#           the MacPerl code cannot be compiled or run.
# Any other OS value is silently ignored.
sub disconnect {
  my $self = shift;

  if (MyOS() eq 'Win32') {
    warn "disconnect on Win32 not implemented yet -- sorry.\n";

  } elsif (MyOS() eq 'UNIX') {
    warn "disconnect: trying to kill pppd.\n";
    system ("killall pppd");	# probably won't work.

  } elsif (MyOS() eq 'Mac') {
    # The trailing "1;" makes the eval true whenever the code ran to
    # completion, so an empty or false result from DoAppleScript is not
    # mistaken for a failure to load MacPerl.
    eval q{
      use MacPerl;
      MacPerl::DoAppleScript("ppp disconnect");
      1;
    } or die "Cannot disconnect, MacPerl module not found: $@\n";
  }
}

# ---------------------------------------------------------------------------

# Rebuild the output template sections stored on $self (index_page,
# html_main_page, html_sub_page, html_story, text_main_page, text_sub_page
# and text_story).
#
# The sections are layered: first the defaults from
# $self->{default_output_template}, then the trimming requested by the
# writeheader/writefooter settings (-noheaders/-nofooters), and finally
# anything supplied by the user template in $self->{output_template}.
sub parse_output_template {
  my ($self) = @_;

  # start from a clean slate: every section goes back to undef
  @{$self}{qw(index_page
              html_main_page html_sub_page html_story
              text_main_page text_sub_page text_story)} = ();

  $self->parse_a_template ($self->{default_output_template});

  my $cf = $self->{cf};

  # -noheaders: strip out our tags, but be sure to leave in the
  # compulsory ones (__STORY_ANCHOR__, <html><head>, etc.)
  if ($cf->{writeheader} == 0)
  {
    $self->{html_main_page} =~ s/<h1>__SITE_TITLE__<\/h1>//is;
    $self->{text_main_page} =~ s/__SITE_TITLE__\n\n//is;

    for ($self->{html_story}) {
      s/<hr>.*?<br>//is;
      s/\[<a href=.*?<\/a>\]<br>//is;
      # the <a name=...> tag is deliberately left in
    }

    $self->{text_story} =~ s/__SITE_NAME__: __STORY_URL__\n\n//is;
  }

  # -nofooters: drop the trailer from both main pages
  if ($cf->{writefooter} == 0)
  {
    $self->{html_main_page} =~ s/<p><hr><font size=1>.*?<\/i><\/font>//is;
    $self->{text_main_page} =~ s/\(End of.*?__SITESCOOPER_HOME_URL__ \)\n//is;
  }

  # user templates come last, so they win over everything above
  if (defined $self->{output_template}) {
    $self->parse_a_template ($self->{output_template});
  }
}

# Pull the recognised <tag>...</tag> sections out of one template string
# and store each section's contents on $self.  Tags are matched
# case-insensitively and may span lines; a section missing from the
# template leaves the corresponding key on $self untouched.
#
#   <indexpage>     -> index_page
#   <htmlmainpage>  -> html_main_page    <textmainpage>  -> text_main_page
#   <htmlsubpage>   -> html_sub_page     <textsubpage>   -> text_sub_page
#   <htmlstory>     -> html_story        <textstory>     -> text_story
sub parse_a_template {
  my ($self, $template) = @_;

  my @sections = (
    [ 'indexpage',    'index_page'     ],
    [ 'htmlmainpage', 'html_main_page' ],
    [ 'htmlsubpage',  'html_sub_page'  ],
    [ 'htmlstory',    'html_story'     ],
    [ 'textmainpage', 'text_main_page' ],
    [ 'textsubpage',  'text_sub_page'  ],
    [ 'textstory',    'text_story'     ],
  );

  my $outcome;
  foreach my $section (@sections) {
    my ($tag, $slot) = @{$section};
    # each section is cut out of the working copy once it has been read
    $outcome = ($template =~ s/<$tag>(.*?)<\/$tag>//is
				and $self->{$slot} = $1);
  }

  # as before, the result of the last (textstory) step is what comes back
  $outcome;
}

# ---------------------------------------------------------------------------

# Report a warning: the joined message (minus one trailing newline) goes
# to the log prefixed with "Warning: ", and the original arguments are
# handed to the UI object's scoop_warn.
sub warn_log {
  my ($self, @text) = @_;

  my $msg = join ('', @text);
  chomp $msg;

  $self->log ("Warning: ", $msg);
  $self->{ui}->scoop_warn (@text);
}

# Report a fatal error: the joined message (minus one trailing newline)
# goes to the log prefixed with "Fatal: ", and the original arguments are
# handed to the UI object's scoop_die.
sub die_log {
  my ($self, @text) = @_;

  my $msg = join ('', @text);
  chomp $msg;

  $self->log ("Fatal: ", $msg);
  $self->{ui}->scoop_die (@text);
}

# Write the arguments, followed by a newline, to the LOGFILE handle.
# Does nothing when LOGFILE is not open.
sub log {
  my ($self, @text) = @_;

  if (defined fileno (LOGFILE)) {
    print LOGFILE @text, "\n";
  }
}

# Write an entry to the JOURNAL handle, if it is open.  The first
# argument is a tag; the rest are joined together and every line of the
# result is prefixed with "<tag>:" and a tab before being printed with a
# trailing newline.
sub journal {
  my ($self, $tag, @text) = @_;

  if (defined fileno (JOURNAL)) {
    (my $entry = join ("", @text)) =~ s/^/$tag:\t/gm;
    print JOURNAL $entry, "\n";
  }
}

# Debug output, active whenever the debug setting is non-zero: the joined
# message is logged with a "debug: " prefix (one trailing newline
# removed) and the original arguments go to the UI object's dbg.
sub dbg {
  my ($self, @text) = @_;

  if ($self->{cf}->{debug} != 0) {
    my $line = join ('', "debug: ", @text);
    chomp $line;
    $self->log ($line);
    $self->{ui}->dbg (@text);
  }
}

# More verbose debug output, active only when the debug setting is
# greater than 1: the joined message is logged with a "debug: " prefix
# (one trailing newline removed) and the original arguments go to the UI
# object's dbg.
sub dbg2 {
  my ($self, @text) = @_;

  if ($self->{cf}->{debug} > 1) {
    my $line = join ('', "debug: ", @text);
    chomp $line;
    $self->log ($line);
    $self->{ui}->dbg (@text);
  }
}

# Progress output, active when the verbose setting is true: the joined
# message (one trailing newline removed) is logged and the original
# arguments go to the UI object's verbose.
sub verbose {
  my ($self, @text) = @_;

  if ($self->{cf}->{verbose}) {
    my $line = join ('', @text);
    chomp $line;
    $self->log ($line);
    $self->{ui}->verbose (@text);
  }
}

# ---------------------------------------------------------------------------

# Issue a site-file warning through the UI object's sitewarn.
#
#   $fname - a location of the form "<path>:<line>"; when it contains a
#            directory part it is shortened to '"<file>" line <line>',
#            otherwise it is passed through unchanged.
#   rest   - passed on to sitewarn after the location.
sub sitewarn_file_line {
  my ($self, $fname, @rest) = @_;

  $fname =~ s,^.*[\/\\:]([^\/\\:]+?):(\d+)?$,\"$1\" line $2,;
  $self->{ui}->sitewarn ($fname, @rest);
}

# ---------------------------------------------------------------------------

# Delegate to the UI object's cleanexit, passing along every argument
# after $self.
sub cleanexit {
  my ($self, @args) = @_;

  $self->{ui}->cleanexit (@args);
}

# Load a functional-test support file.
#
# Every "${NAME}" in a line is replaced by the value of the environment
# variable NAME (or by nothing if it is unset).  Lines of the form
#   Redirect: <from> <to>
# are then recorded in $self->{func_test_redirect}->{<from>} = <to>;
# all other lines are ignored.  Dies if the file cannot be opened.
sub read_test_support {
  my ($self, $file) = @_;
  local ($_);		# the read loop below must not clobber the caller's $_

  $self->dbg ("reading functional test support file $file");
  open (IN, "<$file") or die "no test file $file";
  while (<IN>) {
    s/\$\{(\S+?)\}/
	  defined($ENV{$1}) ? $ENV{$1} : "";
    /ge;
    if (/^Redirect: (\S+) (\S+)/) {
      $self->{func_test_redirect}->{$1} = $2;
    }
  }
  close IN;
}

# A simpler form of AbsoluteURL, used for StoryURL lines.  It exists
# because the real thing escapes metacharacters, which screws up regexp
# patterns.
#
#   $fromurl  - URL of the page the pattern came from
#   $relative - the URL or pattern; surrounding double quotes are trimmed
#               and [[MM]]-style keywords are expanded
#
# A fully-qualified URL is returned as it is.  One starting with "/" gets
# the scheme and host of $fromurl prepended (when $fromurl has a path
# part to take them from).  Anything else is returned without a host.
# Complains via carp when either argument is undefined.
sub AddHostToURL {
  my ($self, $fromurl, $relative) = @_;

  carp ("fromurl not defined in AddHostToURL") unless defined $fromurl;
  carp ("url not defined in AddHostToURL") unless defined $relative;

  my $url = $relative;
  $url =~ s/^"//;			# trim quotes if necessary
  $url =~ s/"$//;
  $url = $self->expand_url_magic ($url);	# allow [[MM]] etc. keywords

  # already fully-qualified: nothing to add
  return $url if ($url =~ m,^[^/]+://,);

  if ($url =~ m,^/, && $fromurl =~ m,^([^/]+://[^/]+)/,) {
    $url = $1.$url;			# borrow scheme://host from $fromurl
  }
  return $url;
}

# ---------------------------------------------------------------------------

# Map a month number to its entry in @MONTHNAMES.  Takes the number as
# its only argument (this is a plain function, not a method).
sub mm_to_monthname {
  my ($mm) = @_;
  return $MONTHNAMES[$mm];
}

# Return (minutes, hours, three-letter weekday name) in local time for
# $time, or for the current time when $time is undefined.
sub get_extra_date {
  my ($self, $time) = @_;

  $time = time unless defined $time;
  my ($min, $hr, $wday) = (localtime ($time))[1, 2, 6];

  return ($min, $hr, (qw(Sun Mon Tue Wed Thu Fri Sat))[$wday]);
}

# Return (day of month, month number 1-12, four-digit year, month name)
# in local time for $time, or for the current time when $time is
# undefined.  The month name comes from mm_to_monthname.
sub get_date {
  my ($self, $time) = @_;

  $time = time unless defined $time;
  my ($mday, $mon, $year) = (localtime ($time))[3, 4, 5];

  # localtime counts months from 0 and years from 1900
  $mon  += 1;
  $year += 1900;

  return ($mday, $mon, $year, mm_to_monthname ($mon));
}

# Format a timestamp as "<Mon> <day> <year>", e.g. "Mar 7 2001".
#
#   $time - seconds since the epoch; get_date falls back to the current
#           time when it is undefined.
sub time2datestr {
  my ($self, $time) = @_;

  # get_date unpacks its arguments as ($self, $time), so both have to be
  # supplied; with only $time, the timestamp would end up in the $self
  # slot and the current date would be used instead.
  my ($dd, $mm, $year, $mon) = get_date ($self, $time);
  "$mon $dd $year";
}

# ---------------------------------------------------------------------------

# Test $url against the URL pattern $pat.
#
#   $cache - hash ref mapping patterns to their prepared form; filled in
#            on demand through add_url_to_cache
#   $url   - the URL to test
#   $pat   - the pattern, used as a regular expression
#
# Returns the result of the match.  If $url or $pat is undefined, warns
# (naming the caller) and returns undef.  The first call makes sure
# add_url_to_cache has been generated.
sub match_url ($$$) {
  my ($self, $cache, $url, $pat) = @_;

  unless (defined $url && defined $pat) {
    warn ("undef in match_url from: ".join(' ', caller()));
    return undef;
  }

  $self->implement_add_url_to_cache_method()
	unless defined $__implemented_add_url_to_cache_method;

  my $re = $cache->{$pat};
  $re = add_url_to_cache ($cache, $pat) unless defined $re;

  return $url =~ /$re/;
}

# since Perl 5.004 and below don't support qr{}, we have to provide
# an alternative implementation that uses strings (slower, but at least
# it works).  Create the method on the fly using eval().

sub implement_add_url_to_cache_method {
  my ($self) = @_;

  # Generates add_url_to_cache($cache, $pat) at run time and records that
  # this has been done.  The generated function stores an anchored form of
  # $pat -- it must match from the start of the URL and be followed by
  # "#" or the end -- in $cache->{$pat}.
  $__implemented_add_url_to_cache_method = 1;

  # pick the statement that prepares the pattern: a compiled qr{} where
  # this perl is new enough, a plain string otherwise
  my $assignment;
  if ($] > 5.005) {
    $assignment = q{

      		$cache->{$pat} = qr{^${pat}(?:\#|$)};

    };
  } else {
    $assignment = q{

      		$cache->{$pat} = '^'.${pat}.'(?:\#|$)';

    };
  }

  # the trailing "1;" gives the eval a true value once the sub is defined
  my $func = '
	      sub add_url_to_cache {
		my ($cache, $pat) = @_;
  ' . $assignment . '} 1;';

  eval $func or die "eval failed: $func: $@";
}

# ---------------------------------------------------------------------------

# Expand the [[...]] date keywords in a URL or URL pattern:
#
#   [[YYYY]] [[YY]]       four-digit / two-digit year
#   [[MM]]   [[M]]        month, two-digit / without leading zero
#   [[Mon]] [[mon]] [[MON]]  month name as is / lower case / upper case
#   [[DD]]   [[D]]        day of month, two-digit / without leading zero
#
# The month keywords accept a signed offset, e.g. [[MM+1]] or [[Mon-2]].
# The date is looked up once, on first use, and kept in the
# $match_url_* globals from then on.  Strings without "[[" are returned
# untouched.
sub expand_url_magic {
  my ($self, $url) = @_;
  return $url unless ($url =~ /\[\[/);

  unless (defined $match_url_yyyy) {
    ($match_url_dd, $match_url_mm,
     $match_url_yyyy, $match_url_Mstr) = get_date (undef);

    ($match_url_yy = $match_url_yyyy) =~ s/^\d\d//; # trim century

    # zero-pad single-digit month and day
    foreach my $part ($match_url_mm, $match_url_dd) {
      $part = "0$part" unless ($part =~ /^..$/);
    }
  }

  $url =~ s/\[\[YYYY\]\]/$match_url_yyyy/g;
  $url =~ s/\[\[YY\]\]/$match_url_yy/g;

  $url =~ s{\[\[MM([\+\-]\d+|)\]\]}{
    $self->offset_month ($match_url_mm, $1)
  }ge;

  # adding 0 drops the leading zero: single-digit if poss
  $url =~ s{\[\[M([\+\-]\d+|)\]\]}{
    $self->offset_month ($match_url_mm, $1) + 0
  }ge;

  $url =~ s{\[\[Mon([\+\-]\d+|)\]\]}{
    mm_to_monthname ($self->offset_month ($match_url_mm, $1))
  }ge;
  $url =~ s{\[\[mon([\+\-]\d+|)\]\]}{
    lc (mm_to_monthname ($self->offset_month ($match_url_mm, $1)))
  }ge;
  $url =~ s{\[\[MON([\+\-]\d+|)\]\]}{
    uc (mm_to_monthname ($self->offset_month ($match_url_mm, $1)))
  }ge;

  $url =~ s/\[\[DD\]\]/$match_url_dd/g;
  $url =~ s{\[\[D\]\]}{ $match_url_dd + 0 }ge;	# single-digit if poss

  return $url;
}

# Shift a month number by an offset and return it as a two-character
# string.
#
#   $mm     - month number
#   $offset - signed offset such as "+1" or "-3", or '' for none
#
# Results outside 1..12 wrap around, so month 12 with "+1" gives "01".
sub offset_month {
  my ($self, $mm, $offset) = @_;

  $mm += $offset unless ($offset eq '');

  unless ($mm >= 1 && $mm <= 12) {
    $mm = ((($mm-1)+12) % 12)+1;	# wrap back into 1..12
  }

  return ($mm =~ /^..$/) ? $mm : "0$mm";
}

# how a link should be marked up when we did not retrieve it, and
# it points to a page inside this site (but which did not match
# any of the retrieval URLs).
#
sub delink_unscooped_internal_link {
  my ($self, $url, $text) = @_;

  # Returns the replacement markup for the link: just the underlined link
  # text, unless the keep_ext_links setting is on, in which case the link
  # to $url is kept.
  unless ($self->{cf}->{keep_ext_links}) {
    # "<u><!--internal-->".$text."</u>";
    return "<u>".$text."</u>";
  }

  return "<a href=\"$url\">".$text."</a>";
}

# how a link should be marked up when we did not retrieve it, and
# it points to a page outside this site.
#
sub delink_unscooped_external_link {
  my ($self, $url, $text) = @_;

  # Returns the replacement markup for the link.  The link is kept only
  # when the keep_ext_links setting is on and a URL is available; the
  # ext_link_char setting is then appended to the link text.  Otherwise
  # only the underlined text remains.
  my $keep_link = ($self->{cf}->{keep_ext_links} && defined $url);

  unless ($keep_link) {
    # "<u><!--external-->".$text."</u>";
    return "<u>".$text."</u>";
  }

  return "<a href=\"$url\">".$text.$self->{cf}->{ext_link_char}."</a>";
}

# simple wrapper around Config and $^O to provide win/mac/unix
# differentiation, without worrying about which variant of UNIX
# it is.
#
sub MyOS() {
  # Returns one of 'Win32', 'VMS', 'Mac', 'OS2' or 'UNIX'.  The answer is
  # worked out once and cached in $MY_OS.
  if (defined ($MY_OS)) { return $MY_OS; }

  # FIGURE OUT THE OS WE'RE RUNNING UNDER
  # Some systems support the $^O variable.  If not available then require()
  # the Config library.  [nicked from CGI.pm -- jmason]

  my $os = $^O;
  unless ($os) {
    require Config;
    $os = $Config::Config{'osname'};
  }

  # "darwin" (Mac OS X) contains "win" but is a UNIX, so it must be kept
  # out of the Win32 branch.
  if ($os=~/win/i && $os !~ /^darwin/i) {
    $os = 'Win32';
  } elsif ($os=~/vms/i) {
    $os = 'VMS';
  } elsif ($os=~/mac/i) {
    $os = 'Mac';
  } elsif ($os=~/os2/i) {
    $os = 'OS2';
  } else {
    $os = 'UNIX';
  }
  $MY_OS = $os;
}

1;

#===========================================================================

# TODO:
#
# URLs at end like [1] this
#
#---------------------------------------------------------------------------
# vim:sw=2:tw=74:
