#!/usr/bin/perl -w eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}' if 0; # not running under some shell # =========================================================================== # --- NMSN Australian TV grabber by Michael 'Immir' Smith... # --- $Id: tv_grab_au,v 2.14 2006/07/20 22:29:32 michael Exp michael $ # # A current version of this script should usually be available here: # # # # pod documentation perhaps coming later... in brief: # # --configure configure the grabber # --config-file= use specified config file # --list-channels show subscribed channels and xmltvids # --show-config show configuration details # --slow download a details page for every show # --fast opposite of --slow (to override config) # --days= days to grab # --warper use webwarper.net anonymizer # --output= xml output file # --ignore-cache ignore cached information # # The following config-file excerpt demonstrates which options can be # set by default in the config-file: # # $conf = { # 'TZ' => 'Australia/Canberra', # 'services' => [ ... ], # 'days' => 5, # 'output' => '/tmp/xmltv.xml', # 'output_dir' => '/var/local/tv_grab_au', # for cache file & debugging # 'slow' => 1, # 'warper' => 1, # }; # # When run with mythfilldatabase, the config file may end up as # ~/.mythtv/tv_grab_au.xmltv and this is where xmltvids for channels can # be modified. # # Listing information can be duplicated for more than one channel id # using a construction like this (be careful of syntax) in the config file: # # $duplicate = { # 'act.abc.gov.au' => { 'ABC HD' => 'hd.abc.gov.au', # 'ABC CHAN 22' => '22.abc.gov.au' }, # 'eastern.sbs.com.au' => { 'SBS HD' => 'hd.eastern.sbs.com.au' } # }; # # Downloading of details pages can be controlled by the --slow option # (possibly set by default during configuration) and regular expressions # in additional config files (see end of script for more information). # # A dump of the data from failed details pages will be placed into the # debug subdirectory of the output_dir if it exists. In most cases this # will either be /var/local/tv_grab_au/debug or ~/.tv_grab_au/debug. # --------------------------------------------------------------------------- # Additional debug options: # # --static write to /dev/null (for testing) # --store= store page data to file for replay use # --replay= replay page data from file # # e.g. tv_grab_au --static --ignore-cache --store=/tmp/PAGES # tv_grab_au --static --ignore-cache --replay=/tmp/PAGES # # The first of these will do a full update, ignoring the cache, and # storing every (decoded) page in the file /tmp/PAGES. The second one # repeats the update, but simply replays the pages from /tmp/PAGES instead # of fetching them again. # # additional config option example: 'store' => '/tmp/PAGES' # --------------------------------------------------------------------------- # Recent changes: # # 1.27: TZ change added as suggested by Greg Boundy # 1.31: use parsed details url to make it more resilient to site changes # 1.34: manual configuration of duplication for channels # 1.36: user agent fix for proxy (thanks Carl Lewis) # 1.37: fixed various issues with start/stop times around midnight/6am # (thanks to Darryl Young for suggestions of the cause) # 1.39: fixed dupes from last-then-first listed programs across 6am # 1.42: ridiculous encrypted data (standard java functions + Caesar slide # by 1!); javascript modification of closeup url (Michael Cowell) # 1.43: move deobfuscate to get_content_base to cope with --config mode # 1.44: make channel names matched on web page regular expressions/substrings # to alleviate problems when NMSN changes column names # 1.45: fix problem with regexp match of channel name on webpage matching # shorter names (eg tv12 matched with tv1 instead of tv2) (Dave Oxley) # 1.47: show version number when running (requested by William Kenworthy) # 1.48: if timezone empty, don't append timezone info # 1.49: use index instead of regexp match for channels (stupid me) # 1.50: make index match case insensitive # 1.51: include channel id in key for cached data # 1.52: user agent set to look like IE # 1.53: fix problems with nbsp and stop times... # 1.54: incorportate JavaScript module to cope with obfuscation. # 1.56: JavaScript tested and closeup parsing fixed (for the moment) plus # fixed memory leak due to TreeBuilder tree in get_details not being # deleted. Fixed problems with some closeup pages empty producing # empty title fields causing XMLTV to barf! Bad karma. # 1.59: cleanup shows using a recursive routine. Fix stripping of non- # printables (use bytes). # 1.61: fix for deobfuscate (Ian Dall). # 1.62: working on cleanup of data... # 1.63: deobfuscate adapted to new random function obfuscation... # 1.65: fiddling with cleanup... # 2.0: some major improvements from a patch by Ian Dall, including: # proper date/time/timezone processing; webwaper.net anonymizer # option with gzipped data support; smarter caching logic; # optional data dump for debugging. # 2.1: fix for programs straddling 6am (bug introduced last version), # plus store/replay options for debugging assistance # 2.3: js init only once; some js debugging additions # 2.4: fix for changes to guide page urls from Paul Andreassen. Nice one! # 2.5: better cleanup code... more fixes/additions from PA. # 2.6: major mods to guide page fetching to solve problem introduced in # recent patch. Leaner debug code. # 2.7: minor bug fixes (mainly debug variable and gui init moved) # 2.8: couple of fixes from Andrew C. (conf output_dir and regionid fix) # 2.12: various fixes supplied by various people... working again # 2.13: refetch guide pages for changed pids, and pause before details... # # ----- patches from others (I'm no longer in the game...) # # 2.14: Ian Dall use strict; use Getopt::Long; use Compress::Zlib; use LWP::UserAgent; use Date::Parse; use Date::Manip; use DateTime::TimeZone; use DateTime::Format::DateManip; use File::Path; use File::Basename; use Data::Dumper; use HTML::TreeBuilder; use JavaScript; use XMLTV; use XMLTV::Ask; use XMLTV::ProgressBar; use XMLTV::Config_file; # --------------------------------------------------------------------------- # --- version checking { last if $JavaScript::VERSION ge 0.55; warn "JavaScript version $JavaScript::VERSION too old. " . "Please upgrade to at least 0.55.\n" } # --------------------------------------------------------------------------- # --- global parameters/constants (read in from config) my $conf = {}; # ref to hash of configuration parameters my $channels = {}; # ref to hash of subscribed channel names to xmltvids my $duplicate = {}; # hashref for channel duplication my %stats = ( total => 0, comp => 0); my $lang = "en"; my $spoofMSID = 1; # spoof random MSIDs to avoid redirects? big speed up! my ($output_dir) = grep { -d && -w _ } split ' ',qq{ /var/local/tv_grab_au $ENV{HOME}/.tv_grab_au }; my $want_details_file = "$ENV{HOME}/.tv_grab_au-detailed"; my $skip_details_file = "$ENV{HOME}/.tv_grab_au-undetailed"; my (@want_details, @skip_details); # --- some global counters for reporting my %count = map { $_,0 } qw{ guide detail bad adjust cached }; # --- NMSN site URLs my $NMSN = "http://tvguide.ninemsn.com.au"; my $TV = "http://tvguide.ninemsn.com.au/todaytv/default.asp"; my $WW = "http://webwarper.net/ww/"; my ($Revision) = '$Revision: 2.14 $' =~ /Revision:\s*(\S+)/; my ($debug, $debugjs); $Data::Dumper::Sortkeys = 1; $Data::Dumper::Indent = 1; $| = 1; # --------------------------------------------------------------------------- # --- Command-line options my ($opt_configfile, $opt_configure, $opt_list_channels); my ($opt_ignore_cache, $opt_static, $opt_show_config); my ($opt_slow, $opt_fast, $opt_max_tries); my ($opt_output, $opt_days, $opt_warper, $opt_nowarper); my ($opt_replay, $opt_store); my $opt_gui = 1; GetOptions( 'days=i' => \$opt_days, 'list-channels' => \$opt_list_channels, 'output=s' => \$opt_output, 'config-file=s' => \$opt_configfile, 'configure' => \$opt_configure, 'gui=s' => \$opt_gui, 'slow' => \$opt_slow, 'fast' => \$opt_fast, 'ignore-cache' => \$opt_ignore_cache, 'static' => \$opt_static, 'warper' => \$opt_warper, 'nowarper' => \$opt_nowarper, 'show-config' => \$opt_show_config, 'max-tries=i' => \$opt_max_tries, 'debug+' => \$debug, 'debugjs' => \$debugjs, 'store=s' => \$opt_store, 'replay=s' => \$opt_replay); my $config_file = XMLTV::Config_file::filename ($opt_configfile, 'tv_grab_au', not $debug ); print "CONFIG_FILE = $config_file\n"; $debug ||= 0; # --- read and parse configuration --- must do this before configuring # in case the user has chosen non-default xmltvids for some channels... if (-r $config_file) { local (@ARGV, $/) = ($config_file); no warnings 'all'; eval <>; die "error in conf file: $@" if $@ and not $opt_configure; print "unable to read configuration file... configuring anyway..." if $@; } elsif (not $opt_configure) { print qq{\nThis version of tv_grab_au is not configured!\n If you are using mythfilldatabase, you'll need to configure from within mythtv-setup (or with an appropriate --config-file option), otherwise simply run with the --configure option.\n\n}; exit(1); } # --- output dir in conf file? $output_dir = $conf->{output_dir} if $conf->{output_dir}; my $debug_dir = "$output_dir/debug" if $output_dir and -d "$output_dir/debug" && -w _; my $cache_file = "$output_dir/cached.pl"; # --- extract sorted subscribed channel list from config-file hash; # also compute canonicalised lowercased channel to xmltvid hash my %chanid = map { lc $_, $channels->{$_} } keys %$channels; # --- are we configuring? if ($opt_configure) { configure($config_file); exit 0 } # --- are we just listing channels? if ($opt_list_channels) { print " $_ -> $channels->{$_}\n" for sort keys %$channels; exit 0; } # --- we must be grabbing program information... my $runtime = time(); # --- explicitly Set Timezone my $TZ = $conf->{TZ} || 'local'; my $tz = DateTime::TimeZone->new( name => $TZ ); # --- flag for whether we get individual program details die "cannot do slow and fast\n" if $opt_slow and $opt_fast; my $slow = $opt_fast ? 0 : ($opt_slow || $conf->{slow}); # --- override slow for some shows (read list of regexps from file) if (-f $want_details_file) { local (@ARGV) = ($want_details_file); chomp(@want_details = <>); } if (-f $skip_details_file) { local (@ARGV) = ($skip_details_file); chomp(@skip_details = <>); } # --- other configuration options $opt_max_tries ||= 5; $opt_days ||= $conf->{days} || 5; $opt_output ||= $conf->{output} || "$output_dir/guide.xml"; die "cannot do webwarper and notwebwarper\n" if $opt_warper and $opt_nowarper; $opt_warper ||= !$opt_nowarper && $conf->{warper} || 0; $opt_store ||= $conf->{store}; if ($debug or $opt_show_config) { print "\ntv_grab_au revision $Revision\n"; print "configuration file: $config_file\n"; print "output directory: $output_dir\n"; print "debug directory: $debug_dir\n" if $debug_dir; print "** using webwarper.net anonymizer **\n" if $opt_warper; print "TZ = $TZ slow = $conf->{slow}\n"; print "output: $conf->{output}\n" if $conf->{output}; print "services chosen:\n"; print " $_->{name}: " . " region=$_->{region} id=$_->{regionid}\n" for @{$conf->{services}}; print "channel list:\n"; print " $_ -> $channels->{$_}\n" for keys %$channels; print "duplication list:\n"; while (my ($a,$b) = each %$duplicate) { while (my ($c,$d) = each %$b) { print " $a -> $d ($c)\n" } } print "\n"; exit 0 if $opt_show_config; } else { print "tv_grab_au $Revision: "; } print fixplural("grabbing $opt_days days into $opt_output\n"); # --- load replay hash if replaying last capture my $replay; if ($opt_replay) { local (@ARGV, $/) = ($opt_replay); no warnings 'all'; eval <>; die "$@" if $@; } # --- first get cached list of shows from last time (if any) my ($cached, $newcache); if (-r $cache_file and not $opt_ignore_cache) { local (@ARGV, $/) = ($cache_file); no warnings 'all'; eval <>; die "$@" if $@; } # --- now, on with the shows... # showlists is a hash of refs to arrays holding shows for each channel # which we use to infer endtimes from starttimes since the rowspan # information is generally unreliable (in fast mode) --- actually, they # seem to get completely off-cut sometimes; everything shifted forward # or back by 5 minutes or more --- must try and do something about that. my %showlists; for my $service (@{$conf->{services}}) { # --- first, get a list of the pages of guide available for this # service; we set the region cookie with set_service, then parse # the returned page for days of guide to grab my @pages; { my $page = set_service($service); my $tree = HTML::TreeBuilder->new_from_content($page); my $sel = $tree->look_down(qw{ _tag select name day id both }); @pages = map { [ $_->as_text(), $TV . "?day=" . $_->attr('value') . "&channel=" . $service->{name} . "®ion=" . $service->{regionid} ] } $sel->look_down('_tag' => 'option'); $tree->delete(); } # --- now loop over the pages to fetch... for my $x (@pages[0..$opt_days-1]) { my ($date_str, $guide_url) = @$x or next; # --- the date parsed from the link text for the guide page my $date = DateTime->from_epoch( epoch => str2time($date_str), time_zone => $tz); my $date6am = $date->clone->set(hour=>6); my $tree; my (%done, %failed); GUIDE: my $guidedata = get_page($guide_url) or next; ++$count{guide}; $tree->delete if $tree; $tree = HTML::TreeBuilder->new_from_content($guidedata); for ($tree->look_down('_tag' => 'table', 'class' => 'tv')) { # extract channel names from the first row of the table # (this row has align=middle and the channels are in bold) my @hdr = map { $_->as_text } $_->look_down('_tag' => 'tr', 'align' => 'middle') ->look_down('_tag' => 'b'); my @span = (0) x @hdr; # rowspans to infer columns my $row = 0; # row number (to compute start times) for ($_->look_down('_tag' => 'tr', 'valign' => 'top')) { my @idx = grep { $span[$_] == 0 } 0..$#hdr; # columns for this row for ($_->look_down('_tag' => 'td', 'class' => 'tv')) { my $col = shift @idx; my $rowspan = $_->attr("rowspan") || 1; $span[$col] = $rowspan; my $channel_hdr = $hdr[$col]; my $channel = find_channel($channel_hdr) or next; my $chanid = $chanid{lc $channel}; my ($e) = $_->content_list; next unless ref($e) eq 'HTML::Element'; next unless $e->tag eq 'a'; my $html= $e->as_HTML; my ($pid) = $html =~ /pid=(\w+)/; next unless $pid; # sometimes null programs at bottom of table my ($title, $subtitle) = $e->as_text() =~ /(.*\S)\s*(?:\s*-\s*(.*))?/; $title ||= ''; $subtitle ||= ''; # NMSN changed their details urls; let's do it this way to # cope with future changes my @link = @{ $e->extract_links() }; die "too many links:\n" . $html if @link > 1; my $url = $link[0]->[0]; if ($opt_warper) { # Unmangle the webwarper URL. $url =~ s#^$WW#http://#; $url =~ s#^$NMSN##; } $url = $NMSN . $url; # --- NMSN javascript pop function translates closeup to cu $url =~ s/closeup/cu/; # --- check (pid, row, rowspan, title) against old cached data my $xmldate = dtxmltv($date); my $cache_id = "$xmldate:$chanid:$row:$rowspan:$title"; next if $done{$cache_id}; my $show; if (my $cached_show = $newcache->{$cache_id} || $cached->{$cache_id}) { $show = $newcache->{$cache_id} = $cached_show; ++$count{cached} if $cached->{$cache_id}; } else { # --- compute start and stop times based on row of table and # rowspan --- although this appears problematic for some days my $start = $date6am->clone->add(minutes => $row*5); my $stop = $start->clone->add( minutes => $rowspan*5 ); if ($title =~ s/\s*\[\s* (\d+):(\d+) \s* (am|pm) \s*\]\s* //x) { my ($hr, $min, $ampm) = ($1, $2, lc($3)); my $hrdt = $hr + (($ampm eq "am")? 0: 12); $start = $date6am->clone; $start->set(hour => $hrdt, minute=>$min); $start->add( days => 1 ) if $hr < 6 and $ampm eq "am"; } $show = { 'title' => [[$title, $lang]], 'start' => dtxmltv($start), 'stop' => dtxmltv($stop), 'channel' => $chanid }; $show->{subtitle} = [[$subtitle, $lang]] if $subtitle; # --- fill in more details? --- if (want_details($show)) { sleep 5+rand(3); if (my $details = (get_details_page($url) or get_details_page($url.'_CloseUp'))) { if ($details =~ /unable to retrieve your request/i) { ++$count{bad}; if ($failed{$cache_id}++ < $opt_max_tries) { if ($failed{$cache_id} > 2) { print "pausing..." if $debug; sleep 30; } goto GUIDE; } } else { $newcache->{$cache_id} = $show if parse_closeup_details($date6am,$show,$details, $row,$pid,$url); } } } } # --- that's it! cleanup($show); # cleans cached version too... push @{ $showlists{$chanid} }, $show; $done{$cache_id} = 1; abbr_dump($show, $cached->{$cache_id}) if $debug==1; print Dumper($show) if $debug>1; } ++ $row; @span = map { $_ - 1} @span; # update rowspan counts } } $tree->delete(); } } # --- check for static (i.e., don't write xml or cache) if ($opt_static) { print "(static) writing xml and cache to /dev/null\n"; $opt_output = "/dev/null"; $cache_file = "/dev/null"; } # --- save to cache before massaging dates open(CACHE, "> $cache_file") or die "cannot open $cache_file: $!"; print CACHE Data::Dumper->Dump([$newcache], ["cached"]); close CACHE; # --- store replay pages? if ($opt_store and not $opt_replay) { open (STORE, "> $opt_store") or die "cannot open $opt_store: $!"; print STORE Data::Dumper->Dump([$replay], ["replay"]); close STORE; } # --- check start and stop times and mark duplicates for my $channel (keys %showlists) { my @shows = @{ $showlists{$channel} }; for my $i (0 .. @shows-2) { # make stop time consistent with following start time $shows[$i]->{stop} = $shows[$i+1]->{start} if not defined $shows[$i]->{stop}; if (Date_Cmp($shows[$i+1]->{start}, $shows[$i]->{stop}) < 0) { if ($shows[$i+1]->{start} eq $shows[$i]->{start} and $shows[$i+1]->{stop} eq $shows[$i]->{stop} and title($shows[$i+1]) eq title($shows[$i])) { $shows[$i]->{dupe} = 1; # duplicate show } else { # --- just adjust previous stop time ++$count{adjust}; $shows[$i]->{stop} = $shows[$i+1]->{start}; } } } } # --- append timezone info and strip fields that XMLTV doesn't need/want for my $channel (keys %showlists) { for my $show (@{ $showlists{$channel} }) { next if exists $show->{dupe}; $show->{title}->[0]->[0] ||= 'nil'; } } # --- now write to xml my %writer_args = ( encoding => 'ISO-8859-1' ); if ($opt_output) { my $fh = new IO::File(">$opt_output") or die "can't open $opt_output: $!"; $writer_args{OUTPUT} = $fh; } my $writer = new XMLTV::Writer(%writer_args); $writer->start ( { 'source-info-url' => $NMSN, 'source-info-name' => "NMSN TV Guide", 'generator-info-name' => "XMLTV - tv_grab_au NMSN v$Revision"} ); for my $channel (sort keys %$channels) { my $chanid = $chanid{lc $channel}; $writer->write_channel( { 'display-name' => [[$channel, $lang]], 'id' => $chanid } ); # --- write duplicated channel definitions while (my ($name, $otherid) = each %{ $duplicate->{$chanid} }) { $writer->write_channel( { 'display-name' => [[$name, $lang]], 'id' => $otherid } ); } } for my $chanid (keys %showlists) { for my $show (@{ $showlists{$chanid} }) { next if exists $show->{dupe}; $writer->write_programme($show); for my $otherid (values %{ $duplicate->{$chanid} }) { $writer->write_programme({ %$show, 'channel' => $otherid }); } } } $writer->end(); system "cp -f $opt_output $output_dir/guide.xml" if $opt_output and $opt_output ne "/dev/null" and $opt_output ne "$output_dir/guide.xml"; # --- report statistics and runtime printf qq{tv_grab_au: downloads = %d guide pages, %d detail pages %d failed detail pages, %d pages found in cache, %d stop times adjusted }, @count{qw{guide detail bad cached adjust}}; printf "tv_grab_au: %.2f KB downloaded", $stats{total}/1024; printf " (%.2f KB compressed)", $stats{comp}/1024 if $stats{comp}; printf "\ntv_grab_au: finished in %d seconds\n", time() - $runtime; exit 0; # Game over, man! # =========================================================================== # --- subroutines sub abbr_dump { my $show = shift; my $cached = shift; print "SHOW: $show->{title}->[0]->[0]", ($show->{'sub-title'}?" - $show->{'sub-title'}->[0]->[0]":""), "\n"; print " $show->{start} -- $show->{stop} $show->{channel}", ($cached?" (cached)":""), "\n"; if ($show->{'desc'}) { my $d = $show->{'desc'}->[0]->[0]; printf " '%s'\n", length($d)>72 ? substr($d,0,69)."..." : $d; } print "\n"; } # --- find a channel that matches the channel heading sub find_channel { my $chan_heading = shift; for (sort { length($b) <=> length($a) } keys %chanid) { return $_ if index(lc $chan_heading, lc $_) == 0; } for (grep { length($_) >= 3 } sort { length($b) <=> length($a) } keys %chanid) { return $_ if index(lc $chan_heading, lc $_) >= 0; } undef; } # --- determine whether to get details sub want_details { my $show = shift; my $title = title($show); my $channel = $show->{channel}; my $string = "$channel:$title"; my $want = $slow; for my $expr (@skip_details) { $want = 0 if $string =~ /$expr/ } for my $expr (@want_details) { $want = 1 if $string =~ /$expr/ } return $want; } # --- get details from the closeup page for given show sub parse_closeup_details { my ($date6am, $show, $details, $row, $pid, $url) = @_; my $title = title($show); # --- use HTML::TreeBuilder to parse the details from the page... my $tree = HTML::TreeBuilder->new_from_content($details); # --- the details are in a two row table: first row is the header # ('Time', 'Program'), second row is the body of the table containing # the information we want. my $tb = $tree->look_down('_tag' => 'table', 'borderColor' => '#003366'); unless ($tb) { print "Unable to find table for details url=$url\n"; print "details: ", $tree->as_text, "\n\n" if $debug; goto FAIL; } my ($hd,$bd) = $tb->content_list(); unless ($hd and $bd) { print "Unable to extract 2 rows of table details url=$url\n"; print "details: ", $tree->as_text, "\n\n" if $debug; goto FAIL; } # --- sanity check the header row unless ($hd->as_text =~ /Time .* Program/x) { print "Table header missing for details url=$url\n"; print "details: ", $tree->as_text, "\n\n" if $debug; goto FAIL; } $_->replace_with_content for $bd->look_down('_tag' => 'b'); $_->replace_with_content for $bd->look_down('_tag' => 'font'); # --- OK, this is a pain in the neck... I'd like to parse the HTML cleanly # using TreeBuilder, but the data is formatted in an irritating manner # with multiple fields in the same table cells and things separated by #
tags etc. So we'll work with the HTML strings of the contents. my @td = map { $_->as_HTML } $bd->look_down('_tag' => 'td'); # --- split cells that contain multiple values delimited by
@td = map { split '
', $_ } @td; # --- strip remaining markup and clean things up a little for (@td) { s/<.*?>//g; s/(^\s+|\s+$)//g; } my ($start0, $channel_hdr, $genre, $desc) = @td[0,1,5,7]; my ($title1, $title2, $duration, $rating) = $td[4] =~ /(.*?)(?: - (.*))?\(([^(]*?mins)(,.*)?\)/; # --- ensure all fields are defined $$_ ||= '' for \($title1, $title2, $rating, $duration, $genre, $desc); $genre =~ s/Genre: *//; $rating =~ s/, *//; $title2 =~ s/\s+$//; # --- extract (text) content lists of the 2 row cells unless ($channel_hdr) { print "Warning: parsed data incomplete for details url=$url\n"; print map { "td[$_] = <$td[$_]>\n" } 0..$#td; print "body: q{$bd->as_text}\n"; goto FAIL; } cleanup(\$channel_hdr); my $channel = find_channel($channel_hdr); # --- is this a channel we know about? is it consistent with the guide? $channel = $chanid{lc $channel} if $channel; # -- xmltv channel id unless ($channel eq $show->{channel}) { print "channel mismatch for '$title' (details url = $url)\n"; goto FAIL; } # --- now clean up a few things $title2 =~ s/^\s*-\s*//; $rating =~ s/Rated:\s*//; $duration =~ s/mins//; # --- compute start and stop times my ($start, $stop); if ($start0) { Date_Init("TZ=+0000"); # So parse doesn't assume a timezone. my $dt = eval {DateTime::Format::DateManip->parse_datetime("$start0")}; if ($dt) { $dt->set_time_zone('floating'); $start= $date6am->clone->set(hour=>$dt->hour, minute=>$dt->minute); # --- first, check for shows starting past midnight... $start->add(days=>1) if $row > 0 and $dt->hour < 6; # --- then for shows starting the day before... (11pm say) $start->subtract(days=>1) if $row == 0 and $dt->hour >= 9; my $ddt = eval{DateTime::Format::DateManip ->parse_duration($duration . ' minutes')}; $stop = $start + $ddt if $ddt; } } $show->{title} = [[$title1, $lang]] if $title1; $show->{start} = dtxmltv($start) if $start; $show->{stop} = dtxmltv($stop) if $stop; $show->{'sub-title'} = [[$title2, $lang]] if $title2; $show->{desc} = [[$desc, $lang]] if $desc; $show->{category} = [[$genre, $lang]] if $genre; $show->{rating} = [[$rating, "CTVA", undef]] if $rating; ++$count{detail}; $tree->delete; return 1; FAIL: # --- CLOSEUP DETAILS FAILED --- ++$count{bad}; $tree->delete; print join("", "===DEBUG===\ndetails url=$url\n", "---PAGE--------\n$details\n", "---ENDPAGE-----\n===ENDDEBUG===\n") if ($debug); # --- Details file didn't parse: save a copy for an autopsy of there is # an appropriate debug directory... if ($debug_dir) { my $dbgfile = "$debug_dir/$pid"; if (open(CT, '>', $dbgfile)) { print CT $details; close CT } else { print "can't write $dbgfile\n" } } return undef; } # --- configure: query for region, services, and channels and write config sub configure { die "Sorry, configuration is not currently supported."; my $config_file = shift; my $firstpage; XMLTV::Ask::init($opt_gui); XMLTV::Config_file::check_no_overwrite($config_file); # --- extract user's ids for channels (if there were any in # the config file), add the defaults then clear the channels hash $chanid{lc $_} = $channels->{$_} for keys %$channels; for (channel_mappings()) { my ($name, $id) = / \s* (.+?) \s* : \s* (\S+) /x or next; $chanid{lc $name} = $id unless $chanid{lc $name}; # use user's if defined } $channels = {}; $conf = {}; # --- get timezone my @zones = DateTime::TimeZone->all_names(); my $zone = ask(join "\n", 'Timezone may be Olson DB time zone name ("Australia/Sydney"),', 'an offset string ("+1000"), or "local" which is the timezone of', 'your operating system.', 'Type ? to get select from valid Olson DB timezones,', 'or enter your timezone.', '', ' (default "local", "?" for list) :'); $zone =~ s/^\s*(.*)\s*$/$1/; if ( $zone eq "") { $zone = 'local'; } elsif ($zone ne 'local' && $zone !~ /^\+\d\d\d\d$/x && !grep { $zone eq $_ } @zones) { my @regions = DateTime::TimeZone->categories; my $region = ask_choice("Select your timezone region", $regions[5], @regions); if ($region) { my @region_zones = grep (/^$region/, @zones); # my @region_zones = DateTime::TimeZone->names_in_category($region); $zone = ask_choice("Select your timezone", $region_zones[0], @region_zones); } } $conf->{TZ} = $zone; my $date = DateTime->today(time_zone => $zone )->dmy(''); my @channellist; # --- now find list of services - note that this appears to be invariant, # so perhaps we should always offer the same list and skip the get? { my %servicenames = ( free => 1); $firstpage = with_progress("getting list of services", sub { get_page("$TV?channel=free") }); ++$servicenames{$1} while $firstpage =~ /channel=(\w+)/g; my @choices = sort keys %servicenames; my @flag = ask_many_boolean (0, map { "Grab listings for $_ channels" } @choices); for (0..$#choices) { next unless $flag[$_]; push @{$conf->{services}}, { name => $choices[$_] }; } # --- now loop over services; find region/regionid and list of # channels for my $service (@{$conf->{services}}) { my ($page, $base); if ($service->{name} eq 'free') { # --- get list of regions my %region; my $tree = HTML::TreeBuilder->new_from_content($firstpage); for ($tree->look_down('_tag' => 'select', 'name' => 'region')) { for ($_->look_down('_tag' => 'option')) { $region{$_->as_text()} = $_->attr('value'); } } $tree->delete(); my @choices = sort keys %region; $service->{region} = ask_choice("Select your region for free channels", $choices[0], @choices); $service->{regionid} = $region{$service->{region}}; $page = with_progress ( "getting list of channels free service in " . "$service->{region}", sub { set_service($service) }); } else { # --- find regionid for service ($page, $base) = with_progress ( "getting regionid and channels for service $service->{name}", sub { get_content_base("$TV?channel=$service->{name}") } ); $service->{region} = "Australia"; ($service->{regionid}) = $base =~ /_(\d+).asp/ or die "cannot find regionid"; # page now has channel list too } # --- now append channels for this service my %skip; # --- find the channels my $tree = HTML::TreeBuilder->new_from_content($page); for ($tree->look_down('_tag' => 'table', 'class' => 'tv', 'width' => '100%') # only one table of this type ->look_down('_tag' => 'tr') # ..first row has channels ->look_down('_tag' => 'b')) { # ..in bold tags my $channel = $_->as_text; push @channellist, $channel; unless ($chanid{lc $channel}) { # check/define xmltvid my $id = lc($channel); $id =~ s/( ^\s+ | \s+$ | \W )//gx; $id .= ".$service->{name}.au"; # e.g., "foxtel.au", "free.au" print "Warning, unknown channel '$channel', using '$id' as id\n"; $chanid{lc $channel} = $id; } } $tree->delete(); } } my @select = ask_many_boolean (1, map { "subscribe to channel $_ -> $chanid{lc $_}" } @channellist); for (0..$#channellist) { next unless $select[$_]; my $name = $channellist[$_]; $channels->{$name} = $chanid{lc $name}; } my @channels = sort keys %$channels; # --- does the user want the slow option turned on by default? $conf->{slow} = ask_boolean("Show descriptions, ratings, genres and more accurate\n" . "time information is available by downloading individual\n" . "pages for each show, but this takes a lot longer\n\n" . "Do you want this (--slow) option to be on by default?"); # --- report configuration and ask for confirmation my $channel_count = @channels; my $services_info; for my $service (@{$conf->{services}}) { $services_info .= "service: name=$service->{name}, " . " region=$service->{region} (id=$service->{regionid})\n"; } die "aborting configuration" unless ask_boolean( "Please confirm the following configuration:\n" . " TZ = $conf->{TZ}\n" . " $services_info\n" . " ($channel_count subscribed channels)\n\n" . "[ use the '--list-channels' option for the\n" . " xmltvids to use in mythtvsetup ]\n\n" . " Continue?\n"); # --- open config file and write the configuration -d dirname($config_file) or mkdir dirname($config_file) or die "cannot create directory for $config_file: $!"; # --- dump as perl code using Data::Dumper open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; print CONF Data::Dumper->Dump([$conf, $channels, $duplicate], ["conf", "channels", "duplicate"]); close CONF; print "wrote config_file: $config_file\n"; } # --------------------------------------------------------------------------- # --- we can avoid redirections by spoofing random MSIDs in the URLs use Digest::MD5 qw{md5_hex}; sub MSID { $spoofMSID ? "&MSID=" . md5_hex(rand) : "" } # --------------------------------------------------------------------------- my $ua; BEGIN { $ua = LWP::UserAgent->new ('timeout' => 30, 'keep_alive' => 1, 'agent' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'); $ua->env_proxy; $ua->cookie_jar({}); } sub set_service { my $service = shift; my $url = $service->{name} eq 'free' ? "${NMSN}/setlocation.asp?region=$service->{regionid}" . "&returnURL=${TV}?channel=$service->{name}" : $TV . "?channel=$service->{name}®ion=$service->{regionid}"; print "SET_SERVICE($url)\n" if $debug; get_page($url); } # --------------------------------------------------------------------------- # --- get NMSN program details for given pid sub get_details_page { my $url = shift() . MSID(); my $page; print "GET_DETAILS_PAGE($url)\n" if $debug; $page = get_page($url) or print "Warning: Failed to get program details from $url\n"; return $page; } # --------------------------------------------------------------------------- # --- descend a structure and clean up various things, including stripping # leading/trailing spaces in strings, translations of html stuff etc my %amp; BEGIN { %amp = ( nbsp => ' ', qw{ amp & lt < gt > apos ' quot " } ) } sub cleanup { my $x = shift; if (ref $x eq "REF") { cleanup($_) } elsif (ref $x eq "HASH") { cleanup(\$_) for values %$x } elsif (ref $x eq "ARRAY") { cleanup(\$_) for @$x } elsif (defined $$x) { $$x =~ s/&(#(\d+)|(.*?));/ $2 ? chr($2) : $amp{$3}||' ' /eg; $$x =~ s/[^\x20-\x7f]/ /g; $$x =~ s/(^\s+|\s+$)//g; } } # --------------------------------------------------------------------------- # --- get a page sub get_page { my $url = shift; $url =~ s#^http://#$WW# if $opt_warper; my $page = (get_content_base($url))[0]; return $page; } # --------------------------------------------------------------------------- # get a page and its base (and report all redirections if debugging) # we attempt 5 times with 3 second pauses between failures. sub get_content_base { my $url = shift; my ($raw, $page, $base); if ($opt_replay) { $url =~ s/&MSID=.*//; $base = $url; $raw = $replay->{$url}->{raw}; $page = $replay->{$url}->{page} unless $debugjs; print "REPLAY($url)\n" if $debug; print "IGNORE DECODED\n" if $debug and $debugjs; } else { my $response; my $request = HTTP::Request->new(GET => $url); $request->header('Accept-Encoding' => 'gzip'); for (1..5) { $response = $ua->request($request); last if $response->is_success; sleep 3; } unless ($response->is_success) { print "Warning: failed to read page $url in 5 attempts\n"; return undef; } $stats{total} += do {use bytes; length($response->content)}; if ($response->header('Content-Encoding') && $response->header('Content-Encoding') eq 'gzip') { $stats{comp} += do {use bytes; length($response->content)}; $response->content(Compress::Zlib::memGunzip($response->content)); } if ($debug and (my $r = $response)->previous) { # track redirections print "GET_CONTENT_BASE redirection backtrace:\n"; while ($r) { print " ", $r->base, "\n"; $r = $r->previous } } $raw = $response->content; $base = $response->base; } if ($raw and not $page) { $page = $raw; $page = deobfuscate($page); $page =~ s!!!ig; # strip these useless tags $page =~ s/&(#(\d+)|(nbsp|amp|quot|apos));/$2?chr($2):$amp{$3}||' '/eg; $page =~ s/[^\x20-\x7f]/ /g; } if ($opt_store) { $url =~ s/&MSID=.*//; $replay->{$url}->{raw} = $raw; $replay->{$url}->{page} = $page; } return $page, $base; } # --- with the following code we look for sections of javascript that # contain the obfuscation code and evaluate it using a JavaScript runtime # context (using the Mozilla libjs library). my $jsc; BEGIN { $jsc = new JavaScript::Runtime->create_context(); $jsc->set_error_handler( sub { } ); $jsc->eval(qq{ var doc = ''; function Location() { this.href = 'http://ninemsn.com.au'; } function Document() { this.write = function(x) { doc += x; } } function Window() { this.___ww = 0 } location = new Location; document = new Document; window = new Window; }); } sub deobfuscate { my $data = shift; $data =~ s{}{ my $x = $1; $jsc->eval(qq{ doc = '' }); $jsc->eval($x); $jsc->eval(qq{ doc }) || ''; }isge; $data; } # --------------------------------------------------------------------------- # show a progress message during call to code (given by closure) sub with_progress { my ($message, $sub) = @_; my $bar = new XMLTV::ProgressBar($message, 1); my @results = $sub->(); $bar->update, short_pause(), $bar->finish; return wantarray ? @results : $results[0]; } sub short_pause { select(undef, undef, undef, 0.33) } # --------------------------------------------------------------------------- # misc/pedantic stuff... sub title { (shift)->{title}->[0]->[0] } sub Ymd { $_[0]->ymd('') or die "problem in Ymd($_[0])" } sub dmY { $_[0]->dmy('') or die "problem in dmY($_[0])" } sub dtxmltv { # Convert date time to yyyymmddhhmm +hhmm format my $offset = $_[0]->time_zone->offset_for_datetime($_[0]); sprintf("%s%s +%02d%02d", $_[0]->ymd('') , $_[0]->hms(''), $offset/3600, ($offset/60)%60); } sub fixplural { # hardly seems worth it sometimes... but, standards... local $_ = shift; s/(\d+) (\s+) (\w+)s (\s)/$1 . $2 . $3 . ($1==1?"":"s") . $4/xe; $_ } # --------------------------------------------------------------------------- # here is the default channel list... comments welcome :-) sub channel_mappings { return grep ! /^#/, split "\n", qq{ # --- Free channels ABC NSW : nsw.abc.gov.au ABC QLD : qld.abc.gov.au ABC TAS : tas.abc.gov.au ABC ACT : act.abc.gov.au ABC SA : sa.abc.gov.au ABC2 : abc2.abc.gov.au Channel Seven Sydney : sydney.seven.com.au Channel Seven Queensland : queensland.seven.com.au Prime Southern : southern.prime.com.au CHANNEL SEVEN BRISBANE : brisbane.seven.com.au CHANNEL SEVEN ADELAIDE : adelaide.seven.com.au SBS Sydney : sydney.sbs.com.au SBS Queensland : queensland.sbs.com.au SBS News : news.sbs.com.au SBS EASTERN : eastern.sbs.com.au SBS SA : sa.sbs.com.au Network TEN Sydney : sydney.ten.com.au NETWORK TEN BRISBANE : brisbane.ten.com.au Southern Cross TEN Capital : capital.southerncrossten.com.au Southern Cross TEN Queensland : queensland.southerncrossten.com.au NETWORK TEN ADELAIDE : adelaide.ten.com.au Channel Nine Sydney : sydney.nine.com.au CHANNEL NINE BRISBANE METRO : brisbane.nine.com.au WIN Television NSW : nsw.win.com.au WIN Television QLD : qld.win.com.au CHANNEL NINE ADELAIDE : adelaide.nine.com.au # --- Foxtel Arena TV : arena.foxtel.com.au BBC World : bbcworld.foxtel.com.au Cartoon Network : cartoon.foxtel.com.au Channel [V] : v.foxtel.com.au CNBC : cnbc.foxtel.com.au CNN : cnn.foxtel.com.au Discovery Channel : discovery.foxtel.com.au FOX News : foxnews.foxtel.com.au FOX8 : fox8.foxtel.com.au MAX : max.foxtel.com.au National Geographic Channel : natgeo.foxtel.com.au Nickelodeon : nickelodeon.foxtel.com.au Showtime : showtime.foxtel.com.au Showtime 2 : showtime2.foxtel.com.au Sky News : skynews.foxtel.com.au TV1 : tv1.foxtel.com.au UKTV : uktv.foxtel.com.au World Movies : worldmovies.foxtel.com.au A1 : a1.foxtel.com.au ACC : acc.foxtel.com.au ADULTS ONLY : adultsonly.foxtel.com.au ANIMAL PLANET : animalplanet.foxtel.com.au ANTENNA PACIFIC : antennapacific.foxtel.com.au ARENA+2 : arena2.foxtel.com.au AURORA : aurora.foxtel.com.au BLOOMBERG : bloomberg.foxtel.com.au BOOMERANG : boomerang.foxtel.com.au CLUB [V] : clubv.foxtel.com.au CMC : cmc.foxtel.com.au CRIME & INVESTIGATION NETWORK : crime.foxtel.com.au DISCOVERY HEALTH : health.discovery.foxtel.com.au DISCOVERY SCIENCE : science.discovery.foxtel.com.au DISCOVERY TRAVEL & ADVENTURE : travel.discovery.foxtel.com.au DISNEY CHANNEL : disney.foxtel.com.au E! : e.foxtel.com.au ESPN : espn.foxtel.com.au EUROSPORT NEWS : eurosportnews.foxtel.com.au FOOD : food.foxtel.com.au FOX CLASSICS : classics.foxtel.com.au FOX CLASSICS+2 : classics2.foxtel.com.au FOX SPORTS 1 : sports1.foxtel.com.au FOX SPORTS 2 : sports2.foxtel.com.au FOX8+2 : fox82.foxtel.com.au FTV : ftv.foxtel.com.au FUEL : fuel.foxtel.com.au HALLMARK CHANNEL : hallmark.foxtel.com.au HOW TO : howto.foxtel.com.au MAIN EVENT : mainevent.foxtel.com.au MOVIE EXTRA : movieextra.foxtel.com.au MOVIE GREATS : moviegreats.foxtel.com.au MOVIE ONE : movieone.foxtel.com.au MOVIE ONE TAKE 2 : movieonetake2.foxtel.com.au MTV : mtv.foxtel.com.au NICK JNR : nickjnr.foxtel.com.au OVATION : ovation.foxtel.com.au RAI INTERNATIONAL : rai.foxtel.com.au SHOWTIME GREATS : showtimegreats.foxtel.com.au SKY RACING : skyracing.foxtel.com.au TCM : tcm.foxtel.com.au THE BIOGRAPHY CHANNEL : biography.foxtel.com.au THE COMEDY CHANNEL : comedy.foxtel.com.au THE COMEDY CHANNEL+2 : comedy2.foxtel.com.au THE HISTORY CHANNEL : history.foxtel.com.au THE LIFESTYLE CHANNEL : lifestyle.foxtel.com.au THE LIFESTYLE CHANNEL+2 : lifestyle2.foxtel.com.au THE WEATHER CHANNEL : weather.foxtel.com.au TV1+2 : tv12.foxtel.com.au TVSN : tvsn.foxtel.com.au UKTV+2 : uktv2.foxtel.com.au VH1 : vh1.foxtel.com.au W : w.foxtel.com.au }; } # Additional documentation: # # Downloading of details pages can be controlled by the --slow option # (possibly set by default during configuration) and regular expressions # contained in the following two files (one regexp per line): # # ~/.tv_grab_au-detailed contains a list of regular expressions matching # "chanid:titles" of shows for which we want details # e.g.: Doctor Who # Simpsons # abc.gov.au:.*Chef # # ~/.tv_grab_au-undetailed regular expressions matching "chanid:titles" to # skip details downloads # e.g.: News # news.sbs.com.au # # NB: a match against the detailed list overrides any undetailed matches. # When changing these files, use the --ignore-cache option to ensure # all new details are downloaded. # # *** Local Variables: *** # *** mode: cperl *** # *** indent-tabs-mode: nil *** # *** tab-width: 2 *** # *** End: ***