#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
  if 0; # not running under some shell

# ===========================================================================
# --- NMSN Australian TV grabber by Michael 'Immir' Smith...
# --- $Id: tv_grab_au,v 2.14 2006/07/20 22:29:32 michael Exp michael $
#
# A current version of this script should usually be available here:
#
#   <http://immir.com/tv_grab_au>
#
# pod documentation perhaps coming later... in brief:
#
#             --configure           configure the grabber
#             --config-file=<file>  use specified config file
#             --list-channels       show subscribed channels and xmltvids
#             --show-config         show configuration details
#             --slow                download a details page for every show
#             --fast                opposite of --slow (to override config)
#             --days=<n>            days to grab
#             --warper              use webwarper.net anonymizer
#             --output=<file>       xml output file
#             --ignore-cache        ignore cached information
#
# The following config-file excerpt demonstrates which options can be
# set by default in the config-file:
#
#   $conf = {
#     'TZ' => 'Australia/Canberra',
#     'services' => [ ... ],
#     'days' => 5,
#     'output' => '/tmp/xmltv.xml',
#     'output_dir' => '/var/local/tv_grab_au', # for cache file & debugging
#     'slow' => 1,
#     'warper' => 1,
#   };
#
# When run with mythfilldatabase, the config file may end up as
# ~/.mythtv/tv_grab_au.xmltv and this is where xmltvids for channels can
# be modified.
#
# Listing information can be duplicated for more than one channel id
# using a construction like this (be careful of syntax) in the config file:
#
#    $duplicate = {
#       'act.abc.gov.au' => { 'ABC HD'       => 'hd.abc.gov.au',
#                             'ABC CHAN 22'  => '22.abc.gov.au' },
#       'eastern.sbs.com.au' => { 'SBS HD' => 'hd.eastern.sbs.com.au' }
#    };
#
# Downloading of details pages can be controlled by the --slow option
# (possibly set by default during configuration) and regular expressions
# in additional config files (see end of script for more information).
#
# A dump of the data from failed details pages will be placed into the
# debug subdirectory of the output_dir if it exists. In most cases this
# will either be /var/local/tv_grab_au/debug or ~/.tv_grab_au/debug.

# ---------------------------------------------------------------------------
# Additional debug options:
#
#             --static              write to /dev/null (for testing)
#             --store=<file>        store page data to file for replay use
#             --replay=<file>       replay page data from file
#
# e.g.  tv_grab_au --static --ignore-cache --store=/tmp/PAGES
#       tv_grab_au --static --ignore-cache --replay=/tmp/PAGES
#
# The first of these will do a full update, ignoring the cache, and
# storing every (decoded) page in the file /tmp/PAGES. The second one
# repeats the update, but simply replays the pages from /tmp/PAGES instead
# of fetching them again.
#
# additional config option example:  'store' => '/tmp/PAGES'


# ---------------------------------------------------------------------------
# Recent changes:
#
# 1.27: TZ change added as suggested by Greg Boundy
# 1.31: use parsed details url to make it more resilient to site changes
# 1.34: manual configuration of duplication for channels
# 1.36: user agent fix for proxy (thanks Carl Lewis)
# 1.37: fixed various issues with start/stop times around midnight/6am
#       (thanks to Darryl Young for suggestions of the cause)
# 1.39: fixed dupes from last-then-first listed programs across 6am
# 1.42: ridiculous encrypted data (standard java functions + Caesar slide
#       by 1!); javascript modification of closeup url (Michael Cowell)
# 1.43: move deobfuscate to get_content_base to cope with --config mode
# 1.44: make channel names matched on web page regular expressions/substrings
#       to alleviate problems when NMSN changes column names
# 1.45: fix problem with regexp match of channel name on webpage matching
#       shorter names (eg tv12 matched with tv1 instead of tv2) (Dave Oxley)
# 1.47: show version number when running (requested by William Kenworthy)
# 1.48: if timezone empty, don't append timezone info
# 1.49: use index instead of regexp match for channels (stupid me)
# 1.50: make index match case insensitive
# 1.51: include channel id in key for cached data
# 1.52: user agent set to look like IE
# 1.53: fix problems with nbsp and stop times...
# 1.54: incorportate JavaScript module to cope with obfuscation.
# 1.56: JavaScript tested and closeup parsing fixed (for the moment) plus
#       fixed memory leak due to TreeBuilder tree in get_details not being
#       deleted. Fixed problems with some closeup pages empty producing
#       empty title fields causing XMLTV to barf! Bad karma.
# 1.59: cleanup shows using a recursive routine. Fix stripping of non-
#       printables (use bytes).
# 1.61: fix for deobfuscate (Ian Dall).
# 1.62: working on cleanup of data...
# 1.63: deobfuscate adapted to new random function obfuscation...
# 1.65: fiddling with cleanup...

# 2.0:  some major improvements from a patch by Ian Dall, including:
#       proper date/time/timezone processing; webwaper.net anonymizer
#       option with gzipped data support; smarter caching logic;
#       optional data dump for debugging.
# 2.1:  fix for programs straddling 6am (bug introduced last version),
#       plus store/replay options for debugging assistance
# 2.3:  js init only once; some js debugging additions
# 2.4:  fix for changes to guide page urls from Paul Andreassen. Nice one!
# 2.5:  better cleanup code... more fixes/additions from PA.
# 2.6:  major mods to guide page fetching to solve problem introduced in
#       recent patch. Leaner debug code.
# 2.7:  minor bug fixes (mainly debug variable and gui init moved)
# 2.8:  couple of fixes from Andrew C. (conf output_dir and regionid fix)
# 2.12: various fixes supplied by various people... working again
# 2.13: refetch guide pages for changed pids, and pause before details...
#
# ----- patches from others (I'm no longer in the game...)
#
# 2.14: Ian Dall

use strict;
use Getopt::Long;
use Compress::Zlib;
use LWP::UserAgent;
use Date::Parse;
use Date::Manip;
use DateTime::TimeZone;
use DateTime::Format::DateManip;
use File::Path;
use File::Basename;
use Data::Dumper;
use HTML::TreeBuilder;

use JavaScript;

use XMLTV;
use XMLTV::Ask;
use XMLTV::ProgressBar;
use XMLTV::Config_file;

# ---------------------------------------------------------------------------
# --- version checking

{ last if $JavaScript::VERSION ge 0.55;
  warn "JavaScript version $JavaScript::VERSION too old. " .
    "Please upgrade to at least 0.55.\n" }

# ---------------------------------------------------------------------------
# --- global parameters/constants (read in from config)
my $conf = {};      # ref to hash of configuration parameters
my $channels = {};  # ref to hash of subscribed channel names to xmltvids
my $duplicate = {}; # hashref for channel duplication
my %stats = ( total => 0, comp => 0);

my $lang = "en";
my $spoofMSID = 1;  # spoof random MSIDs to avoid redirects? big speed up!

my ($output_dir) = grep { -d && -w _ } split ' ',qq{ /var/local/tv_grab_au
                                                $ENV{HOME}/.tv_grab_au };

my $want_details_file = "$ENV{HOME}/.tv_grab_au-detailed";
my $skip_details_file = "$ENV{HOME}/.tv_grab_au-undetailed";
my (@want_details, @skip_details);

# --- some global counters for reporting
my %count = map { $_,0 } qw{ guide detail bad adjust cached };

# --- NMSN site URLs
my $NMSN       = "http://tvguide.ninemsn.com.au";
my $TV         = "http://tvguide.ninemsn.com.au/todaytv/default.asp";
my $WW         = "http://webwarper.net/ww/";

my ($Revision)   = '$Revision: 2.14 $' =~ /Revision:\s*(\S+)/;
my ($debug, $debugjs);

$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Indent   = 1;
$| = 1;

# ---------------------------------------------------------------------------
# --- Command-line options
my ($opt_configfile, $opt_configure, $opt_list_channels);
my ($opt_ignore_cache, $opt_static, $opt_show_config);
my ($opt_slow, $opt_fast, $opt_max_tries);
my ($opt_output, $opt_days, $opt_warper, $opt_nowarper);
my ($opt_replay, $opt_store);
my $opt_gui = 1;

GetOptions( 'days=i'          => \$opt_days,
            'list-channels'   => \$opt_list_channels,
            'output=s'        => \$opt_output,
            'config-file=s'   => \$opt_configfile,
            'configure'       => \$opt_configure,
            'gui=s'           => \$opt_gui,
            'slow'            => \$opt_slow,
            'fast'            => \$opt_fast,
            'ignore-cache'    => \$opt_ignore_cache,
            'static'          => \$opt_static,
            'warper'          => \$opt_warper,
            'nowarper'        => \$opt_nowarper,
            'show-config'     => \$opt_show_config,
            'max-tries=i'     => \$opt_max_tries,
            'debug+'          => \$debug,
            'debugjs'         => \$debugjs,
            'store=s'         => \$opt_store,
            'replay=s'        => \$opt_replay);

my $config_file = XMLTV::Config_file::filename
  ($opt_configfile, 'tv_grab_au', not $debug );

print "CONFIG_FILE = $config_file\n";

$debug ||= 0;

# --- read and parse configuration --- must do this before configuring
# in case the user has chosen non-default xmltvids for some channels...
if (-r $config_file) {
  local (@ARGV, $/) = ($config_file);
  no warnings 'all';
  eval <>;
  die "error in conf file: $@" if $@ and not $opt_configure;
  print "unable to read configuration file... configuring anyway..." if $@;
} elsif (not $opt_configure) {
  print qq{\nThis version of tv_grab_au is not configured!\n
    If you are using mythfilldatabase, you'll need to configure from
    within mythtv-setup (or with an appropriate --config-file option),
    otherwise simply run with the --configure option.\n\n};
  exit(1);
}

# --- output dir in conf file?
$output_dir = $conf->{output_dir} if $conf->{output_dir};
my $debug_dir = "$output_dir/debug" if $output_dir
                                      and -d "$output_dir/debug" && -w _;
my $cache_file = "$output_dir/cached.pl";

# --- extract sorted subscribed channel list from config-file hash;
# also compute canonicalised lowercased channel to xmltvid hash
my %chanid = map { lc $_, $channels->{$_} } keys %$channels;

# --- are we configuring?
if ($opt_configure) { configure($config_file); exit 0 }

# --- are we just listing channels?
if ($opt_list_channels) {
  print "    $_ -> $channels->{$_}\n" for sort keys %$channels; exit 0;
}

# --- we must be grabbing program information...
my $runtime = time();

# --- explicitly Set Timezone
my $TZ = $conf->{TZ} || 'local';
my $tz = DateTime::TimeZone->new( name => $TZ );

# --- flag for whether we get individual program details
die "cannot do slow and fast\n" if $opt_slow and $opt_fast;
my $slow = $opt_fast ? 0 : ($opt_slow || $conf->{slow});

# --- override slow for some shows (read list of regexps from file)
if (-f $want_details_file) {
  local (@ARGV) = ($want_details_file);
  chomp(@want_details = <>);
}
if (-f $skip_details_file) {
  local (@ARGV) = ($skip_details_file);
  chomp(@skip_details = <>);
}

# --- other configuration options
$opt_max_tries ||= 5;
$opt_days   ||= $conf->{days}   || 5;
$opt_output ||= $conf->{output} || "$output_dir/guide.xml";
die "cannot do webwarper and notwebwarper\n" if $opt_warper and $opt_nowarper;
$opt_warper ||= !$opt_nowarper && $conf->{warper} || 0;
$opt_store  ||= $conf->{store};

if ($debug or $opt_show_config) {
  print "\ntv_grab_au revision $Revision\n";
  print "configuration file: $config_file\n";
  print "output directory:   $output_dir\n";
  print "debug directory:    $debug_dir\n" if $debug_dir;
  print "** using webwarper.net anonymizer **\n" if $opt_warper;
  print "TZ = $TZ  slow = $conf->{slow}\n";
  print "output: $conf->{output}\n" if $conf->{output};

  print "services chosen:\n";
  print "  $_->{name}: " .
    " region=$_->{region} id=$_->{regionid}\n" for @{$conf->{services}};

  print "channel list:\n";
  print "  $_ -> $channels->{$_}\n" for keys %$channels;

  print "duplication list:\n";
  while (my ($a,$b) = each %$duplicate) {
    while (my ($c,$d) = each %$b) { print "  $a -> $d ($c)\n" }
  }
  print "\n";
  exit 0 if $opt_show_config;
} else {
  print "tv_grab_au $Revision: ";
}

print fixplural("grabbing $opt_days days into $opt_output\n");

# --- load replay hash if replaying last capture
my $replay;
if ($opt_replay) {
  local (@ARGV, $/) = ($opt_replay);
  no warnings 'all'; eval <>; die "$@" if $@;
}

# --- first get cached list of shows from last time (if any)
my ($cached, $newcache);
if (-r $cache_file and not $opt_ignore_cache) {
  local (@ARGV, $/) = ($cache_file);
  no warnings 'all'; eval <>; die "$@" if $@;
}

# --- now, on with the shows...

# showlists is a hash of refs to arrays holding shows for each channel
# which we use to infer endtimes from starttimes since the rowspan
# information is generally unreliable (in fast mode) --- actually, they
# seem to get completely off-cut sometimes; everything shifted forward
# or back by 5 minutes or more --- must try and do something about that.
my %showlists;

for my $service (@{$conf->{services}}) {

  # --- first, get a list of the pages of guide available for this
  # service; we set the region cookie with set_service, then parse
  # the returned page for days of guide to grab
  my @pages;
  { my $page = set_service($service);
    my $tree = HTML::TreeBuilder->new_from_content($page);
    my $sel  = $tree->look_down(qw{ _tag select name day id both });
    @pages = map { [ $_->as_text(),
                     $TV . "?day=" . $_->attr('value')
                     . "&channel=" . $service->{name}   
                     . "&region=" . $service->{regionid} ] }
      $sel->look_down('_tag' => 'option');
    $tree->delete();
  }

  # --- now loop over the pages to fetch...
  for my $x (@pages[0..$opt_days-1]) {

    my ($date_str, $guide_url) = @$x or next;

    # --- the date parsed from the link text for the guide page
    my $date = DateTime->from_epoch( epoch => str2time($date_str),
                                     time_zone => $tz);
    my $date6am = $date->clone->set(hour=>6);
    my $tree;
    my (%done, %failed);

  GUIDE: 

    my $guidedata = get_page($guide_url) or next;
    ++$count{guide};

    $tree->delete if $tree;
    $tree = HTML::TreeBuilder->new_from_content($guidedata);

    for ($tree->look_down('_tag' => 'table', 'class' => 'tv')) {

      # extract channel names from the first row of the table 
      # (this row has align=middle and the channels are in bold)
      my @hdr = map { $_->as_text }
        $_->look_down('_tag' => 'tr', 'align' => 'middle')
          ->look_down('_tag' => 'b');

      my @span = (0) x @hdr;    # rowspans to infer columns
      my $row = 0;              # row number (to compute start times)

      for ($_->look_down('_tag' => 'tr', 'valign' => 'top')) {
        my @idx = grep { $span[$_] == 0 } 0..$#hdr; # columns for this row
        for ($_->look_down('_tag' => 'td', 'class' => 'tv')) {

          my $col = shift @idx;
          my $rowspan = $_->attr("rowspan") || 1;
          $span[$col] = $rowspan;

          my $channel_hdr = $hdr[$col];
          my $channel = find_channel($channel_hdr) or next;
          my $chanid = $chanid{lc $channel};

          my ($e) = $_->content_list;
          next unless ref($e) eq 'HTML::Element';
          next unless $e->tag eq 'a';

          my $html=  $e->as_HTML;
          my ($pid) = $html =~ /pid=(\w+)/;
          next unless $pid; # sometimes null programs at bottom of table

          my ($title, $subtitle) = 
            $e->as_text() =~ /(.*\S)\s*(?:\s*-\s*(.*))?/;
          $title    ||= '';
          $subtitle ||= '';

          # NMSN changed their details urls; let's do it this way to
          # cope with future changes
          my @link = @{ $e->extract_links() };
          die "too many links:\n" . $html if @link > 1;
          my $url = $link[0]->[0];
          if ($opt_warper) {
            # Unmangle the webwarper URL.
            $url =~  s#^$WW#http://#;
            $url =~ s#^$NMSN##;
          }
          $url = $NMSN . $url;


          # --- NMSN javascript pop function translates closeup to cu
          $url =~ s/closeup/cu/;

          # --- check (pid, row, rowspan, title) against old cached data
          my $xmldate = dtxmltv($date);
          my $cache_id = "$xmldate:$chanid:$row:$rowspan:$title";
          next if $done{$cache_id};

          my $show;

          if (my $cached_show = $newcache->{$cache_id} || $cached->{$cache_id}) {
            
            $show = $newcache->{$cache_id} = $cached_show;
            ++$count{cached} if $cached->{$cache_id};

          } else {

            # --- compute start and stop times based on row of table and
            # rowspan --- although this appears problematic for some days
            my $start = $date6am->clone->add(minutes => $row*5);
            my $stop  = $start->clone->add( minutes => $rowspan*5 );

            if ($title =~ s/\s*\[\s* (\d+):(\d+) \s* (am|pm) \s*\]\s* //x) {
              my ($hr, $min, $ampm) = ($1, $2, lc($3));
              my $hrdt = $hr + (($ampm eq "am")? 0: 12);
              $start = $date6am->clone;
              $start->set(hour => $hrdt, minute=>$min);
              $start->add( days => 1 ) if $hr < 6 and $ampm eq "am";
            }

            $show = { 'title'     => [[$title, $lang]],
                      'start'     => dtxmltv($start),
                      'stop'      => dtxmltv($stop),
                      'channel'   => $chanid };

            $show->{subtitle} = [[$subtitle, $lang]] if $subtitle;

            # --- fill in more details? ---
            if (want_details($show)) {
              sleep 5+rand(3);
              if (my $details = (get_details_page($url) or
                                 get_details_page($url.'_CloseUp'))) {
                if ($details =~ /unable to retrieve your request/i) {
                  ++$count{bad};
                  if ($failed{$cache_id}++ < $opt_max_tries) {
                    if ($failed{$cache_id} > 2) {
                      print "pausing..." if $debug;
                      sleep 30;
                    }
                    goto GUIDE;
                  }
                } else {
                  $newcache->{$cache_id} = $show if
                    parse_closeup_details($date6am,$show,$details,
                                          $row,$pid,$url);
                }
              }
            }
          }

          # --- that's it!
          cleanup($show); # cleans cached version too...
          push @{ $showlists{$chanid} }, $show;
          $done{$cache_id} = 1;
          abbr_dump($show, $cached->{$cache_id}) if $debug==1;
          print Dumper($show) if $debug>1;
        }
        ++ $row;
        @span = map { $_ - 1} @span; # update rowspan counts
      }
    }
    $tree->delete();
  }
}


# --- check for static (i.e., don't write xml or cache)
if ($opt_static) {
  print "(static) writing xml and cache to /dev/null\n";
  $opt_output = "/dev/null";
  $cache_file = "/dev/null";
}

# --- save to cache before massaging dates
open(CACHE, "> $cache_file") or die "cannot open $cache_file: $!";
print CACHE Data::Dumper->Dump([$newcache], ["cached"]);
close CACHE;

# --- store replay pages?
if ($opt_store and not $opt_replay) {
  open (STORE, "> $opt_store") or die "cannot open $opt_store: $!";
  print STORE Data::Dumper->Dump([$replay], ["replay"]);
  close STORE;
}

# --- check start and stop times and mark duplicates
for my $channel (keys %showlists) {
  my @shows = @{ $showlists{$channel} };
  for my $i (0 .. @shows-2) {
    # make stop time consistent with following start time

    $shows[$i]->{stop} = $shows[$i+1]->{start}
      if not defined $shows[$i]->{stop};

    if (Date_Cmp($shows[$i+1]->{start}, $shows[$i]->{stop}) < 0) {
      if ($shows[$i+1]->{start}  eq  $shows[$i]->{start} and
          $shows[$i+1]->{stop}   eq  $shows[$i]->{stop} and
          title($shows[$i+1])    eq  title($shows[$i])) {
        $shows[$i]->{dupe} = 1; # duplicate show
      } else {
        # --- just adjust previous stop time
        ++$count{adjust};
        $shows[$i]->{stop} = $shows[$i+1]->{start};
      }
    }
  }
}
  
# --- append timezone info and strip fields that XMLTV doesn't need/want
for my $channel (keys %showlists) {
  for my $show (@{ $showlists{$channel} }) {
    next if exists $show->{dupe};
    $show->{title}->[0]->[0]  ||= 'nil';
  }
}

# --- now write to xml
my %writer_args = ( encoding => 'ISO-8859-1' );
if ($opt_output) {
  my $fh = new IO::File(">$opt_output")  or die "can't open $opt_output: $!";
  $writer_args{OUTPUT} = $fh;
}

my $writer = new XMLTV::Writer(%writer_args);

$writer->start
  ( { 'source-info-url'    => $NMSN,
      'source-info-name'   => "NMSN TV Guide",
      'generator-info-name' => "XMLTV - tv_grab_au NMSN v$Revision"} );

for my $channel (sort keys %$channels) {
  my $chanid = $chanid{lc $channel};
  $writer->write_channel( { 'display-name' => [[$channel, $lang]],
          'id' => $chanid } );
  # --- write duplicated channel definitions
  while (my ($name, $otherid) = each %{ $duplicate->{$chanid} }) {
    $writer->write_channel( { 'display-name' => [[$name, $lang]],
            'id' => $otherid } );
  }
}

for my $chanid (keys %showlists) {
  for my $show (@{ $showlists{$chanid} }) {
    next if exists $show->{dupe};
    $writer->write_programme($show);
    for my $otherid (values %{ $duplicate->{$chanid} }) { 
      $writer->write_programme({ %$show, 'channel' => $otherid });
    }
  }
}

$writer->end();
system "cp -f $opt_output $output_dir/guide.xml"
  if $opt_output and $opt_output ne "/dev/null"
  and $opt_output ne "$output_dir/guide.xml";

# --- report statistics and runtime
printf qq{tv_grab_au: downloads = %d guide pages, %d detail pages
  %d failed detail pages, %d pages found in cache, %d stop times adjusted
}, @count{qw{guide detail bad cached adjust}};
printf "tv_grab_au: %.2f KB downloaded", $stats{total}/1024;
printf " (%.2f KB compressed)", $stats{comp}/1024 if $stats{comp};
printf "\ntv_grab_au: finished in %d seconds\n", time() - $runtime;

exit 0; # Game over, man!


# ===========================================================================
# --- subroutines

sub abbr_dump {
  my $show = shift;
  my $cached = shift;
  print "SHOW: $show->{title}->[0]->[0]",
    ($show->{'sub-title'}?" - $show->{'sub-title'}->[0]->[0]":""), "\n";
  print "      $show->{start} -- $show->{stop}  $show->{channel}",
    ($cached?"  (cached)":""), "\n";
  if ($show->{'desc'}) {
    my $d = $show->{'desc'}->[0]->[0];
    printf "      '%s'\n", length($d)>72 ? substr($d,0,69)."..." : $d;
  }
  print "\n";
}

# --- find a channel that matches the channel heading

sub find_channel {
    my $chan_heading = shift;
    for (sort { length($b) <=> length($a) } keys %chanid) {
      return $_ if index(lc $chan_heading, lc $_) == 0;
    }
    for (grep { length($_) >= 3 } sort { length($b) <=> length($a) } keys %chanid) {
      return $_ if index(lc $chan_heading, lc $_) >= 0;
    }
    undef;
}

# --- determine whether to get details
sub want_details {
  my $show = shift;
  my $title = title($show);
  my $channel = $show->{channel};
  my $string = "$channel:$title";
  my $want = $slow;
  for my $expr (@skip_details) { $want = 0 if $string =~ /$expr/ }
  for my $expr (@want_details) { $want = 1 if $string =~ /$expr/ }
  return $want;
}

# --- get details from the closeup page for given show
sub parse_closeup_details {

  my ($date6am, $show, $details, $row, $pid, $url) = @_;
  my $title = title($show);

  # --- use HTML::TreeBuilder to parse the details from the page...
  my $tree = HTML::TreeBuilder->new_from_content($details);

  # --- the details are in a two row table: first row is the header
  # ('Time', 'Program'), second row is the body of the table containing
  # the information we want.

  my $tb = $tree->look_down('_tag' => 'table', 'borderColor' => '#003366');
  unless ($tb) {
    print "Unable to find table for details url=$url\n";
    print "details: ", $tree->as_text, "\n\n" if $debug;
    goto FAIL;
  }

  my ($hd,$bd) = $tb->content_list();
  unless ($hd and $bd) {
    print "Unable to extract 2 rows of table details url=$url\n";
    print "details: ", $tree->as_text, "\n\n" if $debug;
    goto FAIL;
  }

  # --- sanity check the header row
  unless ($hd->as_text =~ /Time .* Program/x) {
    print "Table header missing for details url=$url\n";
    print "details: ", $tree->as_text, "\n\n" if $debug;
    goto FAIL;
  }

  $_->replace_with_content for $bd->look_down('_tag' => 'b');
  $_->replace_with_content for $bd->look_down('_tag' => 'font');

  # --- OK, this is a pain in the neck... I'd like to parse the HTML cleanly
  # using TreeBuilder, but the data is formatted in an irritating manner
  # with multiple fields in the same table cells and things separated by
  # <br> tags etc. So we'll work with the HTML strings of the <td> contents.
  my @td = map { $_->as_HTML } $bd->look_down('_tag' => 'td');

  # --- split cells that contain multiple values delimited by <br>
  @td = map { split '<br>', $_ } @td;

  # --- strip remaining markup and clean things up a little
  for (@td) {
    s/<.*?>//g;
    s/(^\s+|\s+$)//g;
  }

  my ($start0, $channel_hdr, $genre, $desc) = @td[0,1,5,7];
  my ($title1, $title2, $duration, $rating) =
    $td[4] =~ /(.*?)(?: - (.*))?\(([^(]*?mins)(,.*)?\)/;

  # --- ensure all fields are defined
  $$_ ||= '' for \($title1, $title2, $rating, $duration, $genre, $desc);

  $genre =~ s/Genre: *//;
  $rating =~ s/, *//;
  $title2 =~ s/\s+$//;

  # --- extract (text) content lists of the 2 row cells
  unless ($channel_hdr) {
    print "Warning: parsed data incomplete for details url=$url\n";
    print map { "td[$_] = <$td[$_]>\n" } 0..$#td;
    print "body: q{$bd->as_text}\n";
    goto FAIL;
  }

  cleanup(\$channel_hdr); 
  my $channel = find_channel($channel_hdr);

  # --- is this a channel we know about? is it consistent with the guide?
  $channel = $chanid{lc $channel} if $channel; # -- xmltv channel id
  unless ($channel eq $show->{channel}) {
    print "channel mismatch for '$title' (details url = $url)\n";
    goto FAIL;
  }

  # --- now clean up a few things
  $title2 =~ s/^\s*-\s*//;
  $rating =~ s/Rated:\s*//;
  $duration =~ s/mins//;

  # --- compute start and stop times
  my ($start, $stop);
  if ($start0) {
    Date_Init("TZ=+0000"); # So parse doesn't assume a timezone.
    my $dt = eval {DateTime::Format::DateManip->parse_datetime("$start0")};
    if ($dt) {
      $dt->set_time_zone('floating');
      $start= $date6am->clone->set(hour=>$dt->hour, minute=>$dt->minute);
      # --- first, check for shows starting past midnight...
      $start->add(days=>1) if $row > 0 and $dt->hour < 6;
      # --- then for shows starting the day before... (11pm say)
      $start->subtract(days=>1) if $row == 0 and $dt->hour >= 9;
      my $ddt = eval{DateTime::Format::DateManip
                      ->parse_duration($duration . ' minutes')};
      $stop = $start + $ddt if $ddt;
    }
  }

  $show->{title}       = [[$title1, $lang]]   if $title1;
  $show->{start}       = dtxmltv($start)      if $start;
  $show->{stop}        = dtxmltv($stop)       if $stop;
  $show->{'sub-title'} = [[$title2, $lang]]   if $title2;
  $show->{desc}        = [[$desc,   $lang]]   if $desc;
  $show->{category}    = [[$genre,  $lang]]   if $genre;
  $show->{rating}      = [[$rating, "CTVA", undef]] if $rating;

  ++$count{detail};
  $tree->delete;
  return 1;

 FAIL: # --- CLOSEUP DETAILS FAILED ---

  ++$count{bad};
  $tree->delete;
  print join("", "===DEBUG===\ndetails url=$url\n",
             "---PAGE--------\n$details\n",
             "---ENDPAGE-----\n===ENDDEBUG===\n") if ($debug);

  # --- Details file didn't parse: save a copy for an autopsy of there is
  # an appropriate debug directory...
  if ($debug_dir) {
    my $dbgfile = "$debug_dir/$pid";
    if (open(CT, '>', $dbgfile)) { print CT $details; close CT }
    else { print "can't write $dbgfile\n" }
  }

  return undef;
}


# --- configure: query for region, services, and channels and write config

sub configure {

  die "Sorry, configuration is not currently supported.";

  my $config_file = shift;
  my $firstpage;

  XMLTV::Ask::init($opt_gui);
  XMLTV::Config_file::check_no_overwrite($config_file);

  # --- extract user's ids for channels (if there were any in
  # the config file), add the defaults then clear the channels hash
  $chanid{lc $_} = $channels->{$_} for keys %$channels;
  for (channel_mappings()) {
    my ($name, $id) = / \s* (.+?) \s* : \s* (\S+) /x or next;
    $chanid{lc $name} = $id unless $chanid{lc $name}; # use user's if defined
  }
  $channels = {};
  $conf = {};

  # --- get timezone
  my @zones = DateTime::TimeZone->all_names();
  my $zone = 
    ask(join "\n",
        'Timezone may be Olson DB time zone name ("Australia/Sydney"),',
        'an offset string ("+1000"), or "local" which is the timezone of',
        'your operating system.',
        'Type ? to get select from valid Olson DB timezones,',
        'or enter your timezone.',
        '',
        '  (default "local", "?" for list) :');
  $zone =~ s/^\s*(.*)\s*$/$1/;

  if ( $zone eq "") {
      $zone = 'local';
  } elsif ($zone ne 'local' &&
     $zone !~ /^\+\d\d\d\d$/x &&
     !grep { $zone eq $_ } @zones) {
      my @regions = DateTime::TimeZone->categories;
      my $region = ask_choice("Select your timezone region",
            $regions[5], @regions);
      if ($region) {
    my @region_zones = grep (/^$region/, @zones);
#       my @region_zones = DateTime::TimeZone->names_in_category($region);
    $zone = ask_choice("Select your timezone",
           $region_zones[0], @region_zones);
      }
  }
  $conf->{TZ} = $zone;
  my $date = DateTime->today(time_zone => $zone )->dmy('');

  my @channellist;

  # --- now find list of services - note that this appears to be invariant,
  # so perhaps we should always offer the same list and skip the get?
  {
    my %servicenames = ( free => 1);
    $firstpage = with_progress("getting list of services",
                               sub { get_page("$TV?channel=free") });

    ++$servicenames{$1} while $firstpage =~ /channel=(\w+)/g;
    my @choices = sort keys %servicenames;
    my @flag = ask_many_boolean
      (0, map { "Grab listings for $_ channels" } @choices);

    for (0..$#choices) {
      next unless $flag[$_];
      push @{$conf->{services}}, { name => $choices[$_] };
    }

    # --- now loop over services; find region/regionid and list of
    # channels
    for my $service (@{$conf->{services}}) {
      my ($page, $base);

      if ($service->{name} eq 'free') {
        # --- get list of regions
        my %region;
        my $tree = HTML::TreeBuilder->new_from_content($firstpage);
        for ($tree->look_down('_tag' => 'select', 'name' => 'region')) {
          for ($_->look_down('_tag' => 'option')) {
            $region{$_->as_text()} = $_->attr('value');
          }
        }
        $tree->delete();
        my @choices = sort keys %region;
        $service->{region}
          = ask_choice("Select your region for free channels",
                       $choices[0], @choices);
        $service->{regionid} = $region{$service->{region}};
        $page = with_progress
          ( "getting list of channels free service in " .
            "$service->{region}", sub { set_service($service) });

      } else {

        # --- find regionid for service
        ($page, $base) = with_progress
          ( "getting regionid and channels for service $service->{name}",
            sub { get_content_base("$TV?channel=$service->{name}") } );
        $service->{region} = "Australia";
        ($service->{regionid})
          = $base =~ /_(\d+).asp/ or die "cannot find regionid";
        # page now has channel list too
      }

      # --- now append channels for this service
      my %skip;

      # --- find the channels
      my $tree = HTML::TreeBuilder->new_from_content($page);
      for ($tree->look_down('_tag' => 'table', 'class' => 'tv',
                            'width' => '100%') # only one table of this type
           ->look_down('_tag' => 'tr') # ..first row has channels
           ->look_down('_tag' => 'b')) { # ..in bold tags
        my $channel = $_->as_text;
        push @channellist, $channel;
        unless ($chanid{lc $channel}) { # check/define xmltvid
          my $id = lc($channel);
          $id =~ s/( ^\s+ | \s+$ | \W )//gx;
          $id .= ".$service->{name}.au"; # e.g., "foxtel.au", "free.au"
          print "Warning, unknown channel '$channel', using '$id' as id\n";
          $chanid{lc $channel} = $id;
        }
      }
      $tree->delete();
    }
  }

  my @select = ask_many_boolean
    (1, map { "subscribe to channel $_  ->  $chanid{lc $_}" } @channellist);
  
  for (0..$#channellist) {
    next unless $select[$_];
    my $name = $channellist[$_];
    $channels->{$name} = $chanid{lc $name};
  }

  my @channels = sort keys %$channels;

  # --- does the user want the slow option turned on by default?
  $conf->{slow} =
    ask_boolean("Show descriptions, ratings, genres and more accurate\n" .
                "time information is available by downloading individual\n" .
                "pages for each show, but this takes a lot longer\n\n" .
                "Do you want this (--slow) option to be on by default?");

  # --- report configuration and ask for confirmation
  my $channel_count = @channels;
  my $services_info;
  for my $service (@{$conf->{services}}) {
    $services_info .= "service: name=$service->{name}, " .
      " region=$service->{region} (id=$service->{regionid})\n";
  }

  die "aborting configuration" unless
    ask_boolean( "Please confirm the following configuration:\n" .
                 "  TZ = $conf->{TZ}\n" .
                 "  $services_info\n" .
                 "  ($channel_count subscribed channels)\n\n" .
                 "[ use the '--list-channels' option for the\n" .
                 "  xmltvids to use in mythtvsetup ]\n\n" .
                 " Continue?\n");

  # --- open config file and write the configuration
  -d dirname($config_file) or mkdir dirname($config_file)
    or die "cannot create directory for $config_file: $!";

  # --- dump as perl code using Data::Dumper
  open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
  print CONF Data::Dumper->Dump([$conf, $channels, $duplicate],
                                ["conf", "channels", "duplicate"]);
  close CONF;

  print "wrote config_file: $config_file\n";
}


# ---------------------------------------------------------------------------
# --- we can avoid redirections by spoofing random MSIDs in the URLs
use Digest::MD5 qw{md5_hex};
sub MSID { $spoofMSID ? "&MSID=" . md5_hex(rand) : "" }

# ---------------------------------------------------------------------------

my $ua;
BEGIN {
  $ua = LWP::UserAgent->new
    ('timeout' => 30,
     'keep_alive' => 1,
     'agent' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)');
  $ua->env_proxy;
  $ua->cookie_jar({});
}

sub set_service {
  my $service = shift;
  my $url = $service->{name} eq 'free' ?
    "${NMSN}/setlocation.asp?region=$service->{regionid}" .
     "&returnURL=${TV}?channel=$service->{name}" :
        $TV . "?channel=$service->{name}&region=$service->{regionid}";
  print "SET_SERVICE($url)\n" if $debug;
  get_page($url);
}

# ---------------------------------------------------------------------------
# --- get NMSN program details for given pid

sub get_details_page {
  my $url = shift() . MSID();
  my $page;
  print "GET_DETAILS_PAGE($url)\n" if $debug;
  $page = get_page($url)
    or print "Warning: Failed to get program details from $url\n";
  return $page;
}

# ---------------------------------------------------------------------------
# --- descend a structure and clean up various things, including stripping
# leading/trailing spaces in strings, translations of html stuff etc

my %amp;
BEGIN { %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ) }
sub cleanup {
  my $x = shift;
  if    (ref $x eq "REF")   { cleanup($_) }
  elsif (ref $x eq "HASH")  { cleanup(\$_) for values %$x }
  elsif (ref $x eq "ARRAY") { cleanup(\$_) for @$x }
  elsif (defined $$x) {
    $$x =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg;
    $$x =~ s/[^\x20-\x7f]/ /g;
    $$x =~ s/(^\s+|\s+$)//g;
  }
}

# ---------------------------------------------------------------------------
# --- get a page
sub get_page {
  my $url = shift;
  $url =~ s#^http://#$WW# if $opt_warper;
  my $page = (get_content_base($url))[0];
  return $page;
}

# ---------------------------------------------------------------------------
# get a page and its base (and report all redirections if debugging)
# we attempt 5 times with 3 second pauses between failures.

sub get_content_base {
  my $url = shift;
  my ($raw, $page, $base);
  if ($opt_replay) {
    $url  =~ s/&MSID=.*//;
    $base = $url;
    $raw  = $replay->{$url}->{raw};
    $page = $replay->{$url}->{page} unless $debugjs;
    print "REPLAY($url)\n"   if $debug;
    print "IGNORE DECODED\n" if $debug and $debugjs;
  } else {
    my $response;
    my $request = HTTP::Request->new(GET => $url);
    $request->header('Accept-Encoding' => 'gzip');
    for (1..5) {
      $response = $ua->request($request);
      last if $response->is_success;
      sleep 3;
    }
    unless ($response->is_success) {
      print "Warning: failed to read page $url in 5 attempts\n";
      return undef;
    }
    $stats{total} += do {use bytes; length($response->content)};
    if ($response->header('Content-Encoding') &&
        $response->header('Content-Encoding') eq 'gzip') {
      $stats{comp} += do {use bytes; length($response->content)};
      $response->content(Compress::Zlib::memGunzip($response->content));
    }
    if ($debug and (my $r = $response)->previous) { # track redirections
      print "GET_CONTENT_BASE redirection backtrace:\n";
      while ($r) { print "    ", $r->base, "\n"; $r = $r->previous }
    }
    $raw  = $response->content;
    $base = $response->base;
  }

  if ($raw and not $page) {
    $page = $raw;
    $page = deobfuscate($page);
    $page =~ s!<SCRIPT.*?>!!ig;            # strip these useless tags
    $page =~ s/&(#(\d+)|(nbsp|amp|quot|apos));/$2?chr($2):$amp{$3}||' '/eg;
    $page =~ s/[^\x20-\x7f]/ /g;
  }

  if ($opt_store) {
    $url =~ s/&MSID=.*//;
    $replay->{$url}->{raw}  = $raw;
    $replay->{$url}->{page} = $page;
  }

  return $page, $base;
}

# --- with the following code we look for sections of javascript that
# contain the obfuscation code and evaluate it using a JavaScript runtime
# context (using the Mozilla libjs library).
my $jsc;
BEGIN {
  $jsc = new JavaScript::Runtime->create_context();
  $jsc->set_error_handler( sub { } );
  $jsc->eval(qq{
     var doc = '';
     function Location() { this.href  = 'http://ninemsn.com.au'; }
     function Document() { this.write = function(x) { doc += x; } }
     function Window()   { this.___ww = 0 }
     location = new Location;
     document = new Document;
     window   = new Window;
  });
}
sub deobfuscate {
  my $data = shift;
  $data =~ s{<script language="?Javascript"?[^>]*>(.*?)</script>}{
    my $x = $1;
    $jsc->eval(qq{ doc = '' });
    $jsc->eval($x);
    $jsc->eval(qq{ doc }) || '';
  }isge;
  $data;
}

# ---------------------------------------------------------------------------
# show a progress message during call to code (given by closure)
sub with_progress {
  my ($message, $sub) = @_;
  my $bar = new XMLTV::ProgressBar($message, 1);
  my @results = $sub->();
  $bar->update, short_pause(), $bar->finish;
  return wantarray ? @results : $results[0];
}

sub short_pause { select(undef, undef, undef, 0.33) }


# ---------------------------------------------------------------------------
# misc/pedantic stuff...

sub title { (shift)->{title}->[0]->[0] }
sub Ymd { $_[0]->ymd('') or die "problem in Ymd($_[0])" }
sub dmY { $_[0]->dmy('') or die "problem in dmY($_[0])" }
sub dtxmltv { # Convert date time to yyyymmddhhmm +hhmm format
    my $offset = $_[0]->time_zone->offset_for_datetime($_[0]);
    sprintf("%s%s +%02d%02d", $_[0]->ymd('') , $_[0]->hms(''), $offset/3600,
            ($offset/60)%60);
}
sub fixplural { # hardly seems worth it sometimes... but, standards...
  local $_ = shift;
  s/(\d+) (\s+) (\w+)s (\s)/$1 . $2 . $3 . ($1==1?"":"s") . $4/xe; $_ }

# ---------------------------------------------------------------------------
# here is the default channel list... comments welcome :-)

sub channel_mappings {
  return grep ! /^#/, split "\n", qq{

    # --- Free channels
    ABC NSW                      : nsw.abc.gov.au
    ABC QLD                      : qld.abc.gov.au
    ABC TAS                      : tas.abc.gov.au
    ABC ACT                      : act.abc.gov.au
    ABC SA                       : sa.abc.gov.au
    ABC2                         : abc2.abc.gov.au

    Channel Seven Sydney         : sydney.seven.com.au
    Channel Seven Queensland     : queensland.seven.com.au
    Prime Southern               : southern.prime.com.au
    CHANNEL SEVEN BRISBANE       : brisbane.seven.com.au
    CHANNEL SEVEN ADELAIDE       : adelaide.seven.com.au

    SBS Sydney                   : sydney.sbs.com.au
    SBS Queensland               : queensland.sbs.com.au
    SBS News                     : news.sbs.com.au
    SBS EASTERN                  : eastern.sbs.com.au
    SBS SA                       : sa.sbs.com.au

    Network TEN Sydney           : sydney.ten.com.au
    NETWORK TEN BRISBANE         : brisbane.ten.com.au
    Southern Cross TEN Capital    : capital.southerncrossten.com.au
    Southern Cross TEN Queensland : queensland.southerncrossten.com.au
    NETWORK TEN ADELAIDE         : adelaide.ten.com.au

    Channel Nine Sydney          : sydney.nine.com.au
    CHANNEL NINE BRISBANE METRO  : brisbane.nine.com.au
    WIN Television NSW           : nsw.win.com.au
    WIN Television QLD           : qld.win.com.au
    CHANNEL NINE ADELAIDE  : adelaide.nine.com.au

    # --- Foxtel

    Arena TV                     : arena.foxtel.com.au
    BBC World                    : bbcworld.foxtel.com.au
    Cartoon Network              : cartoon.foxtel.com.au
    Channel [V]                  : v.foxtel.com.au
    CNBC                         : cnbc.foxtel.com.au
    CNN                          : cnn.foxtel.com.au
    Discovery Channel            : discovery.foxtel.com.au
    FOX News                     : foxnews.foxtel.com.au
    FOX8                         : fox8.foxtel.com.au
    MAX                          : max.foxtel.com.au
    National Geographic Channel  : natgeo.foxtel.com.au
    Nickelodeon                  : nickelodeon.foxtel.com.au
    Showtime                     : showtime.foxtel.com.au
    Showtime 2                   : showtime2.foxtel.com.au
    Sky News                     : skynews.foxtel.com.au
    TV1                          : tv1.foxtel.com.au
    UKTV                         : uktv.foxtel.com.au
    World Movies                 : worldmovies.foxtel.com.au

    A1                           : a1.foxtel.com.au
    ACC                          : acc.foxtel.com.au
    ADULTS ONLY                  : adultsonly.foxtel.com.au
    ANIMAL PLANET                : animalplanet.foxtel.com.au
    ANTENNA PACIFIC              : antennapacific.foxtel.com.au
    ARENA+2                      : arena2.foxtel.com.au
    AURORA                       : aurora.foxtel.com.au
    BLOOMBERG                    : bloomberg.foxtel.com.au
    BOOMERANG                    : boomerang.foxtel.com.au
    CLUB [V]                     : clubv.foxtel.com.au
    CMC                          : cmc.foxtel.com.au
    CRIME & INVESTIGATION NETWORK : crime.foxtel.com.au
    DISCOVERY HEALTH             : health.discovery.foxtel.com.au
    DISCOVERY SCIENCE            : science.discovery.foxtel.com.au
    DISCOVERY TRAVEL & ADVENTURE : travel.discovery.foxtel.com.au
    DISNEY CHANNEL               : disney.foxtel.com.au
    E!                           : e.foxtel.com.au
    ESPN                         : espn.foxtel.com.au
    EUROSPORT NEWS               : eurosportnews.foxtel.com.au
    FOOD                         : food.foxtel.com.au
    FOX CLASSICS                 : classics.foxtel.com.au
    FOX CLASSICS+2               : classics2.foxtel.com.au
    FOX SPORTS 1                 : sports1.foxtel.com.au
    FOX SPORTS 2                 : sports2.foxtel.com.au
    FOX8+2                       : fox82.foxtel.com.au
    FTV                          : ftv.foxtel.com.au
    FUEL                         : fuel.foxtel.com.au
    HALLMARK CHANNEL             : hallmark.foxtel.com.au
    HOW TO                       : howto.foxtel.com.au
    MAIN EVENT                   : mainevent.foxtel.com.au
    MOVIE EXTRA                  : movieextra.foxtel.com.au
    MOVIE GREATS                 : moviegreats.foxtel.com.au
    MOVIE ONE                    : movieone.foxtel.com.au
    MOVIE ONE TAKE 2             : movieonetake2.foxtel.com.au
    MTV                          : mtv.foxtel.com.au
    NICK JNR                     : nickjnr.foxtel.com.au
    OVATION                      : ovation.foxtel.com.au
    RAI INTERNATIONAL            : rai.foxtel.com.au
    SHOWTIME GREATS              : showtimegreats.foxtel.com.au
    SKY RACING                   : skyracing.foxtel.com.au
    TCM                          : tcm.foxtel.com.au
    THE BIOGRAPHY CHANNEL        : biography.foxtel.com.au
    THE COMEDY CHANNEL           : comedy.foxtel.com.au
    THE COMEDY CHANNEL+2         : comedy2.foxtel.com.au
    THE HISTORY CHANNEL          : history.foxtel.com.au
    THE LIFESTYLE CHANNEL        : lifestyle.foxtel.com.au
    THE LIFESTYLE CHANNEL+2      : lifestyle2.foxtel.com.au
    THE WEATHER CHANNEL          : weather.foxtel.com.au
    TV1+2                        : tv12.foxtel.com.au
    TVSN                         : tvsn.foxtel.com.au
    UKTV+2                       : uktv2.foxtel.com.au
    VH1                          : vh1.foxtel.com.au
    W                            : w.foxtel.com.au
  };
}
    

# Additional documentation:
#
# Downloading of details pages can be controlled by the --slow option
# (possibly set by default during configuration) and regular expressions
# contained in the following two files (one regexp per line):
#
# ~/.tv_grab_au-detailed    contains a list of regular expressions matching
#                           "chanid:titles" of shows for which we want details
#                           e.g.:   Doctor Who
#                                   Simpsons
#                                   abc.gov.au:.*Chef
#
# ~/.tv_grab_au-undetailed  regular expressions matching "chanid:titles" to 
#                           skip details downloads
#                           e.g.:   News
#                                   news.sbs.com.au
#
# NB: a match against the detailed list overrides any undetailed matches.
# When changing these files, use the --ignore-cache option to ensure
# all new details are downloaded.
#
# *** Local Variables: ***
# *** mode: cperl ***
# *** indent-tabs-mode: nil ***
# *** tab-width: 2 ***
# *** End: ***