#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell

=pod

=head1 NAME

tv_grab_fi - Grab TV listings for Finland.

=head1 SYNOPSIS

tv_grab_fi --help

tv_grab_fi [--config-file FILE] --configure [--gui OPTION]

tv_grab_fi [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--quiet]

tv_grab_fi --list-channels

tv_grab_fi --capabilities

tv_grab_fi --version

=head1 DESCRIPTION

Output TV listings for several channels available in Finland.
The data comes from www.telkku.com. The grabber relies on parsing HTML 
so it might stop working at any time.

First run B<tv_grab_fi --configure> to choose, which channels you want
to download. Then running B<tv_grab_fi> with no arguments will output
listings in XML format to standard output.

B<--configure> Prompt for which channels,
and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_fi.conf>.  This is the file written by
B<--configure> and read when grabbing.

B<--gui OPTION> Use this option to enable a graphical interface to be used.
OPTION may be 'Tk', or left blank for the best available choice.
Additional allowed values of OPTION are 'Term' for normal terminal output
(default) and 'TermNoProgressBar' to disable the use of XMLTV::ProgressBar.

B<--output FILE> Write to FILE rather than standard output.

B<--days N> Grab N days.  The default is 14 (22 days should be available).

B<--offset N> Start N days in the future.  The default is to start
from today.

B<--quiet> Suppress the progress messages normally written to standard
error.

B<--list-channels> Write output giving <channel> elements for every
channel available (ignoring the config file), but no programmes.

B<--capabilities> Show which capabilities the grabber supports. For more
information, see L<http://membled.com/twiki/bin/view/Main/XmltvCapabilities>

B<--version> Show the version of the grabber.

B<--help> Print a help message and exit.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Ville Ahonen, (ville dot ahonen at iki dot fi). Based on previous version of tv_grab_fi by Matti Airas.

=head1 BUGS

The data source does not include full channels information and the
channels are identified by short names rather than the RFC2838 form
recommended by the XMLTV DTD.

=cut

######################################################################
# initializations

use strict;
use XMLTV::Version '$Id: tv_grab_fi,v 1.53 2007/12/25 15:35:49 va1210 Exp $ ';
use XMLTV::Capabilities qw/baseline manualconfig cache/;
use XMLTV::Description 'Finland';
use Getopt::Long;
use Date::Manip;
use HTML::Entities;
use HTML::TreeBuilder;
use IO::File;

use XMLTV;
use XMLTV::Memoize;
use XMLTV::ProgressBar;
use XMLTV::Ask;
use XMLTV::Config_file;
use XMLTV::DST;
use XMLTV::Get_nice;
use XMLTV::Mode;
use XMLTV::Date;
# Todo: perhaps we should internationalize messages and docs?
use XMLTV::Usage <<END
$0: get Finnish television listings in XMLTV format
To configure: $0 --configure [--config-file FILE]
To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
        [--offset N] [--quiet]
To list channels: $0 --list-channels
To show capabilities: $0 --capabilities
To show version: $0 --version
END
  ;

my $DOMAIN = 'telkku.com';
my $SITE = "http://www.$DOMAIN";

# Attributes of the root element in output.
my $HEAD = { 'source-info-url'     => "$SITE/",
	     'source-data-url'     => "$SITE/",
	     'generator-info-name' => 'XMLTV',
	     'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
	   };

# Whether zero-length programmes should be included in the output.
my $WRITE_ZERO_LENGTH = 0;

# The winter timezone in Finland.  Summer time is one hour ahead of this.
my $TZ="+0200";

# default language
my $LANG="fi";

# Global channel data.
our @ch_all;


######################################################################
# get options

# Get options, including undocumented --cache option.
XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
my ($opt_days, $opt_offset, $opt_help, $opt_output,
    $opt_configure, $opt_config_file, $opt_gui,
    $opt_quiet, $opt_list_channels);
$opt_days  = 14; # default
$opt_offset = 0; # default
$opt_quiet  = 0; # default
GetOptions('days=i'        => \$opt_days,
	   'offset=i'      => \$opt_offset,
	   'help'          => \$opt_help,
	   'configure'     => \$opt_configure,
	   'config-file=s' => \$opt_config_file,
       'gui:s'         => \$opt_gui,
	   'output=s'      => \$opt_output,
	   'quiet'         => \$opt_quiet,
	   'list-channels' => \$opt_list_channels,
	  )
  or usage(0);
die 'number of days must not be negative'
  if (defined $opt_days && $opt_days < 0);
usage(1) if $opt_help;

XMLTV::Ask::init($opt_gui);

my $mode = XMLTV::Mode::mode('grab', # default
			     $opt_configure => 'configure',
			     $opt_list_channels => 'list-channels',
			    );

# File that stores which channels to download.
my $config_file
  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_fi', $opt_quiet);

my @config_lines; # used only in grab mode
if ($mode eq 'configure') {
    XMLTV::Config_file::check_no_overwrite($config_file);
}
elsif ($mode eq 'grab') {
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}
elsif ($mode eq 'list-channels') {
    # Config file not used.
}
else { die }

# Whatever we are doing, we need the channels data.
my %channels = get_channels(); # sets @ch_all
my @channels;

######################################################################
# write configuration

if ($mode eq 'configure') {
    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";

    # Ask about each channel.
    my @chs = sort keys %channels;
    my @names = map { $channels{$_} } @chs;
    my @qs = map { "add channel $_?" } @names;
    my @want = ask_many_boolean(1, @qs);
    foreach (@chs) {
	my $w = shift @want;
	warn("cannot read input, stopping channel questions"), last
	  if not defined $w;
	# No need to print to user - XMLTV::Ask is verbose enough.

	# Print a config line, but comment it out if channel not wanted.
	print CONF '#' if not $w;
	my $name = shift @names;
	print CONF "channel $_ $name\n";
	# TODO don't store display-name in config file.
    }

    close CONF or warn "cannot close $config_file: $!";
    say("Finished configuration.");

    exit();
}

# Not configuration, we must be writing something, either full
# listings or just channels.
#
die if $mode ne 'grab' and $mode ne 'list-channels';

# Options to be used for XMLTV::Writer.
my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'ISO-8859-1';
my $writer = new XMLTV::Writer(%w_args);
$writer->start($HEAD);

if ($mode eq 'list-channels') {
    # Write channels mode.
    $writer->write_channel($_) foreach @ch_all;
    $writer->end();
    exit();
}

######################################################################
# We are producing full listings.
die if $mode ne 'grab';

# Read configuration.
my %title;
my %description;
my $line_num = 1;
foreach (@config_lines) {
    ++ $line_num;
    next if not defined;
    if (/^channel:?\s+(\S+)\s+([^\#]+)/) {
	my $ch_did = $1;
	my $ch_name = $2;
	$ch_name =~ s/\s*$//;
	push @channels, $ch_did;
	$channels{$ch_did} = $ch_name;
    }
    elsif (/^series:?\s+title:?\s+([^\#]+)/) {
	my $name = $1;
	$name =~ s/\s*$//;
	$title{$name}++;
    }
    elsif (/^series:?\s+description:?\s+([^\#]+)/) {
	my $name = $1;
	$name =~ s/\s*$//;
	$description{$name}++;
    }
    else {
	warn "$config_file:$line_num: bad line\n";
    }
}

######################################################################
# begin main program

# Assume the listings source uses EET (see BUGS above).
my $now = DateCalc(parse_date('now'), "$opt_offset days");
die "No channels specified, run me with --configure\n"
  if not keys %channels;
my @to_get;

# the order in which we fetch the channels matters
foreach my $ch_did (@channels) {
    my $ch_name=$channels{$ch_did};
    my $ch_xid="$ch_did.$DOMAIN";
    $writer->write_channel({ id => $ch_xid,
			     'display-name' => [ [ $ch_name ] ] });
    my $day=UnixDate($now,'%Q');
    for (my $i=0;$i<$opt_days;$i++) {
	push @to_get, [ $day, $ch_xid, $ch_did ];
	#for each day
	$day=nextday($day); die if not defined $day;
    }
}

# This progress bar is for both downloading and parsing.  Maybe
# they could be separate stages.
#
my $bar = new XMLTV::ProgressBar( {
   name => 'getting listings', 
   count => scalar @to_get,
 } ) if not $opt_quiet;
foreach (@to_get) {
    foreach (process_table($_->[0], $_->[1], $_->[2])) {
	$writer->write_programme($_);
    }
    update $bar if not $opt_quiet;
}
$bar->finish() if not $opt_quiet;
$writer->end();

######################################################################
# subroutine definitions

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
	*t = sub {};
	*d = sub { '' };
    }
    else {
	*t = \&Log::TraceMessages::t;
	*d = \&Log::TraceMessages::d;
	Log::TraceMessages::check_argv();
    }
}

my $warned_bad_chars;
sub tidy( $ ) {
    for (my $tmp = shift) {
	tr/\t\205/ /d;
	if (s/([^\012\015\040-\176\240-\377]+)//g) {
	    warn "removing bad characters: '$1'"
	      unless $warned_bad_chars++;
	}
	return $_;
    }
}

####
# process_table: fetch a URL and process it
#
# arguments:
#    Date::Manip object giving the day to grab
#    xmltv id of channel
#    their id of channel
#
# returns: list of programme hashes to write
#
sub process_table {
    my ($date, $ch_xmltv_id, $ch_their_id) = @_;
    my $today = UnixDate($date, '%Y%m%d');
    my $url = "$SITE/telkku?tila=knvt&kan=$ch_their_id&p=$today";
    t "getting URL: $url";
    my $tree = get_nice_tree $url, \&tidy;
    local $SIG{__WARN__} = sub {
	warn "$url: $_[0]";
    };

    my @program_data = get_program_data($tree);
    my $bump_start_day=0;

    my @r;
    while (@program_data) {
	my $cur = shift @program_data;
	my $next = shift @program_data;
	unshift @program_data,$next if $next;
	push @r, make_programme_hash($date, $ch_xmltv_id, $ch_their_id, $cur, $next);
	if (!$bump_start_day && bump_start_day($cur,$next)) {
	    $bump_start_day=1;
	    $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q');
	}
    }
    return @r;
}

sub make_programme_hash {
    my ($date, $ch_xmltv_id, $ch_their_id, $cur, $next) = @_;

    my %prog;

    $prog{channel}=$ch_xmltv_id;

    my $cur_time = $cur->{time};
    t 'raw time for programme: ' . d $cur_time;
    t 'with base (winter) timezone: ' . d $TZ;
    my $start=parse_local_date("$date $cur_time", $TZ);
    t 'parse_local_date() returned: ' . d $start;
    my ($start_base, $start_tz) = @{date_to_local($start, $TZ)};
    t "date_to_local() returned time $start_base, timezone $start_tz";
    $prog{start}=UnixDate($start_base, '%q') . " $start_tz";
    t 'set programme start time to: ' . d $prog{start};

    my $next_time = $next ? $next->{time} : undef;
    if (defined $next_time) {
	t '$cur_time=' . d $cur_time;
	t '$next_time=' . d $next_time;
	my $stop_date;
	if ($next_time lt $cur_time) {
	    # Must span midnight.  (Don't worry about start being
	    # summer time and stop being winter: we assume the site is
	    # sane enough to put them both in the same timezone and
	    # avoid looking like stop < start.)
	    #
	    t '$next_time appears sooner, must be next day';
	    $stop_date = nextday($date);
	}
	else {
	    $stop_date = $date;
	}
	t '$stop_date set to: ' . d $stop_date;
	my $stop = parse_local_date("$stop_date $next_time", $TZ);
	t 'stop time in UTC: ' . d $stop;
	my ($stop_base, $stop_tz) = @{date_to_local($stop, $TZ)};
	t 'converted back to Finnish: ' . d [ $stop_base, $stop_tz ];
	$prog{stop}=UnixDate($stop_base, '%q') . " $stop_tz";
    }	

    # Check for series.
    #
    # Check 1: episode name of series in title.
    # If title contains a colon (:), check to see if the string on the 
    # left-hand side of the colon has been defined as a series in the 
    # conf-file. If it has, assume that the string on the left-hand side
    # of the colon is the name of the series, and the string on the
    # right-hand side is the name of the episode. For example, if the 
    # following line has been defined in the tv_grab_fi.conf-file:
    # "series title Prisma", and the title of the program is 
    # "Prisma: Totuus tappajadinosauruksista", then the script will assume 
    # that the title of the program is actually "Prisma", and the episode 
    # name/sub-title is "Totuus tappajadinosauruksista".
    if (($cur->{title} =~ m/([^:]+):\s*(.*)/) &&
	(exists $title{$1})) {
	my $new_title = $1;
	my $episode = $2;
	t "series $new_title, episode title $episode"; 
	$prog{title}=[ [ $new_title, $LANG ] ];
	$prog{'sub-title'} = [ [ $episode, $LANG ] ];	
    }
    else {
	$prog{title}=[ [ $cur->{title}, $LANG ] ];
    }
    # Check 2: episode name of series in description.
    # Check if the program has a description. If so, also check if the title
    # of the program has been defined as a series in the conf-file. If it
    # has, assume that the first sentence (i.e. the text before the first
    # period) marks the name of the episode. For example, if the following
    # line has been defined in the tv_grab_fi.conf-file:
    # "series description Batman", the title of of the program is "Batman",
    # and the description of the program is "Pingviinin paluu. Amerikkalainen
    # animaatiosarja. Outojen rystjen sarja johdattaa Batmanin Pingviinin
    # jljille.", then the script will assume that the episode name/sub-title
    # is "Pingviinin paluu", and that the description is actually 
    # "Amerikkalainen animaatiosarja. Outojen rystjen sarja johdattaa 
    # Batmanin Pingviinin jljille."
    if ((defined $cur->{desc}) &&
	(exists $description{$cur->{title}})   &&
	($cur->{desc} =~ s/^\s*([^.]+)\.\s*//)) {
	my $episode = $1;
	t "series $cur->{title}, episode title $episode"; 
	$prog{'sub-title'} = [ [ $episode, $LANG ] ];	
    }
    $prog{desc}=[ [ $cur->{desc}, $LANG ] ] if defined $cur->{desc};

    return \%prog;
}
sub bump_start_day {
    my ($cur,$next) = @_;
    if (!defined($next)) {
	return undef;
    }
    my $start = UnixDate($cur->{time},'%H:%M');
    my $stop = UnixDate($next->{time},'%H:%M');
    if (Date_Cmp($start,$stop)>0) {
	return 1;
    } else {
	return 0;
    }
}

#####
# All program info is contained within a table cell with the
# following properties:
#
# <td style="height: 500px; padding:5px 15px 5px 15px;" valign="top">
#
# For each program there is a entry that looks as follows:
#
# <b>hh:mm Program Name</b><br />Description<br /><br />
# 
# After processing the html with get_nice_tree, the space and slash in the 
# <br /> tags sometimes fall off (with TreeBuilder v. 3.21 and below).
# Also, all special characters (e.g. ) are replaced with their 
# corresponding html special characters. This means that the following 
# program element
#
# <b>12.10 Tnn otsikoissa</b><br />Aamu-tv:n ajankohtaiset aiheet koosteena.<br /><br />
# 
# ends up looking like this (with TreeBuilder v. 3.21 and below):
#
# <b>12.10 T&auml;n&auml;&auml;n otsikoissa</b><br>Aamu-tv:n ajankohtaiset aiheet koosteena.<br><br>
#
# or with TreeBuilder v. 3.22 and above:
#
# <b>12.10 T&auml;n&auml;&auml;n otsikoissa</b><br />Aamu-tv:n ajankohtaiset aiheet koosteena.<br /><br />
#
sub get_program_data {
    my $tree = shift;
    t "get_program_data() ENTRY for tree: $tree";
    my @data;
    # Dump the html-tree to a string for matching
    my $html = $tree->as_HTML;
    while ($html =~ m/popup\(this\.href\)">([0-9]{2})\.([0-9]{2}) (.+?)<\/a><br( \/)?>(.*?)<br( \/)?><br( \/)?>/g) {
	# Use decode_entities() to convert html characters 
	# to ascii (e.g &auml; to )
	my %keys = (time => $1 . ':' . $2,
		    title => decode_entities($3),
		    desc => decode_entities($5),
		    );
	my %h;
	foreach my $k (keys %keys) {
	    my $v = $keys{$k};
	    # Only record entry if it isn't empty (actually time 
	    # and title are required, but we don't check that.)
	    if (length ($v) > 0) {
		t "got a result from sub for $k: $v";
		$h{$k} = $v;
	    }
	}
	t 'after running all subs, got data: ' . d \%h;
	push @data, \%h;
    }
    t 'get_program_data() RETURNING ' . d \@data;
    return @data;
}

# get channel listing
sub get_channels {
    my $bar = new XMLTV::ProgressBar({
       name => 'getting list of channels', 
       count => 1,
    } ) if not $opt_quiet;
    my %channels;

    # Channels are retrieved from the channel "Suosikkikanava" (149),
    # to avoid listing it as a channel.

    my $url="$SITE/telkku?tila=knvt&kan=149";
    my $tree = get_nice_tree $url;

    # FIXME commonize this
    local $SIG{__WARN__} = sub {
	warn "$url: $_[0]";
    };
    local $SIG{__DIE__} = sub {
	die "$url: $_[0]";
    };

    # All channels are listed after the tag <h4>A-Z</h4> within <li>-tags

    my $html = $tree->as_HTML;
    $html =~ /<h3>A-Z<\/h3>\s*<ul>(.*)<\/ul>/;
    my $trunc_html = $1;
    while ($trunc_html =~ m/<li><a href=\"telkku\?tila=knvt&amp;kan=([0-9]+)\">(.+?)<\/a>/g) {
	my $channel_id = $1;
	my $channel_name = $2;
	$channels{$channel_id} = $channel_name;
	push @ch_all, { 'display-name' => [ [ $channel_name, $LANG ] ],
			'id' => $channel_id };    
    }
    die "no channels could be found" if not keys %channels;
    update $bar if not $opt_quiet;
    $bar->finish() if not $opt_quiet;
    return %channels;
}

# Bump a YYYYMMDD date by one.
sub nextday {
    my $d = shift;
    my $p = parse_date($d);
    my $n = DateCalc($p, '+ 1 day');
    return UnixDate($n, '%Q');
}
