#!/usr/bin/perl -w
# gnuhtml2latex html to latex converter
# Copyright (c) 1999 Tomasz Wegrzanowski <maniek@beer.com>
# Taken over (for maintenance only) by Gunnar Wolf <gwolf@debian.org>, 2005
#
# gnuhtml2latex is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# On Debian GNU/Linux systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#
# THIS IS VERY ALPHA

use strict;
use Getopt::Std;

# Parse the command line into %main::opts.  Options followed by a colon
# in the spec (a, f, h, i, o, t) take an argument; the rest are flags.
getopts( 'a:bcf:h:i:no:pst:', \%main::opts );

# Supply defaults for the string options that were not given:
# document class (-o), header text (-h) and footer text (-f).
foreach my $default ( [ o => '{article}' ], [ h => '' ], [ f => '' ] ) {
    my ( $key, $value ) = @$default;
    $main::opts{$key} = $value unless defined $main::opts{$key};
}

# Suffix appended to the sectioning commands: empty when -n asks for
# numbered sections, '*' (the unnumbered starred form) otherwise.
$main::num = $main::opts{n} ? '' : '*';

{
# LaTeX markup printed when an HTML start tag is seen inside <body>.
# Tags without an entry are silently dropped by start().
#
# Most entries open a brace group that the matching entry in
# %tagstable_end closes.  'dt' opens the optional argument of \item and
# 'dd' closes it, so neither has an end-tag entry.
#
# The heading commands get $main::num appended ('*' for unnumbered
# sections, '' when -n was given).
# NOTE(review): the default document class is {article}; as far as I know
# that class does not define \chapter, so <h1> needs -o '{report}' or
# similar to compile -- confirm before changing the mapping.
#
# kbd/dfn/cite use the NFSS declarations (\ttfamily, \bfseries, \itshape,
# \scshape) instead of the old two-letter switches: \tt, \bf and \it each
# reset the font first, so "\tt\bf" produced plain bold and "\bf\it"
# produced plain italic, and some document classes no longer define the
# old commands at all.
my %tagstable_start = (
    'p'       => '\\par ',
    'b'       => '\\textbf{',
    'i'       => '\\textit{',
    'u'       => '\\underline{',
    'dt'      => '\\item[',
    'dd'      => ']',
    'br'      => '\\\\',
    'em'      => '\\emph{',
    'h1'      => "\\chapter${main::num}\{",
    'h2'      => "\\section${main::num}\{",
    'h3'      => "\\subsection${main::num}\{",
    'h4'      => "\\subsubsection${main::num}\{",
    'h5'      => "\\paragraph${main::num}\{",
    'h6'      => "\\subparagraph${main::num}\{",
    'li'      => '\\item ',
    'ul'      => '\\begin{itemize}',
    'ol'      => '\\begin{enumerate}',
    'dl'      => '\\begin{description}',
    'tt'      => '\\texttt{',
    'kbd'     => '{\\ttfamily\\bfseries ',
    'var'     => '\\textit{',
    'dfn'     => '{\\bfseries\\itshape ',
    'cite'    => '{\\scshape ',
    'samp'    => '\\texttt{',
    'strong'  => '\\textbf{',
    'listing' => '\\begin{verbatim}',
    'code'    => '\\texttt{',
    'pre'     => '\\begin{verbatim}'
);

# LaTeX markup printed when an HTML end tag is seen inside <body>.
# Inline and heading tags simply close the brace group opened by their
# %tagstable_start entry; list and verbatim tags close an environment.
my %tagstable_end = (
    (
        map { ( $_ => '}' ) }
            qw(b i u em h1 h2 h3 h4 h5 h6 tt kbd var dfn cite samp strong code)
    ),
    'ul'      => '\\end{itemize}',
    'ol'      => '\\end{enumerate}',
    'dl'      => '\\end{description}',
    'listing' => '\\end{verbatim}',
    'pre'     => '\\end{verbatim}',
);

# State shared (as closed-over lexicals) by the HTML::LatexMaker subs:
#   $mode         - 0 outside <html>, 1 inside <html>, 2 inside <head>,
#                   3 inside <body>; only mode 3 produces output
#   $firstfile    - true if the document preamble should be printed
#   $lastfile     - true if \end{document} should be printed
#   $substitution - true while LaTeX special characters are escaped;
#                   cleared between <pre>/<listing> and their end tags
my ( $mode, $firstfile, $lastfile, $substitution ) = ( 0, 1, 1, 1 );

# Everything from here to the end of the enclosing block belongs to the
# parser subclass.
package HTML::LatexMaker;
use HTML::Parser;
use HTML::Entities;
our @ISA = ('HTML::Parser');

# Setter: record whether the next parsed file is the first one, i.e.
# whether start_mode() should print the document preamble.
# Returns the value just stored.
sub firstfile {
    my ( $self, $flag ) = @_;
    $firstfile = $flag;
}
# Setter: record whether the next parsed file is the last one, i.e.
# whether end_mode() should print the footer and \end{document}.
# Returns the value just stored.
sub lastfile {
    my ( $self, $flag ) = @_;
    $lastfile = $flag;
}
    
# Start-tag callback.
#
# <html>, <head> and <body> only switch the converter mode (1, 2 and 3
# respectively) through start_mode().  <pre> and <listing> turn off the
# escaping of LaTeX special characters.  For any other tag the opening
# markup from %tagstable_start is printed, but only while inside <body>
# and only if the tag has an entry.
#
# $attr and $attrseq are accepted but not used.
sub start {
    my ( $self, $tag, $attr, $attrseq ) = @_;

    my %mode_for = ( html => 1, head => 2, body => 3 );
    if ( exists $mode_for{$tag} ) {
        start_mode( $mode_for{$tag} );
        return;
    }

    $substitution = 0 if $tag eq 'pre' or $tag eq 'listing';

    return unless $mode == 3 and defined $tagstable_start{$tag};
    print $tagstable_start{$tag};
}

# End-tag callback.
#
# </html> drops the converter back to mode 0, </head> and </body> to
# mode 1, all through end_mode().  </pre> and </listing> re-enable the
# escaping of LaTeX special characters.  For any other tag the closing
# markup from %tagstable_end is printed, but only while inside <body>
# and only if the tag has an entry.
sub end {
    my ( $self, $tag ) = @_;

    my %mode_after = ( html => 0, head => 1, body => 1 );
    if ( exists $mode_after{$tag} ) {
        end_mode( $mode_after{$tag} );
        return;
    }

    $substitution = 1 if $tag eq 'pre' or $tag eq 'listing';

    return unless $mode == 3 and defined $tagstable_end{$tag};
    print $tagstable_end{$tag};
}

# Text callback: convert one chunk of HTML text to LaTeX and print it.
#
# Nothing is printed unless the converter is inside <body> (mode 3).
#
# The conversion works with `!NAME;' placeholders: a literal `!' is first
# turned into `!bang;', so every other `!NAME;' sequence in the string is
# known to be one of ours.  Entities that need special LaTeX output are
# turned into placeholders before decode_entities() runs, the remaining
# entities are decoded, LaTeX special characters are escaped, and only
# then are the placeholders replaced by their final LaTeX text (which
# therefore is not escaped a second time).
#
# Inside <pre>/<listing> ($substitution false) the output lands in a
# verbatim environment, where LaTeX commands would be printed literally,
# so there the characters are passed through unescaped and the
# placeholders are replaced by plain-text equivalents.
sub text {
    my ( $self, $text ) = @_;
    return unless ( $mode == 3 );

    # Handle some things that decode_entities doesn't.
    # (This needs to be done *before* calling decode_entities: otherwise
    # there'd be no way of distinguishing `&FOO;' from `&amp;FOO;'.)

    # We use `!' for internal purposes during entity translation.
    $text =~ s/!|&\#(?:0*33|x0*21);/!bang;/g;

    # Handle `&lsquo;&ldquo;', `&ndash;&mdash;' and so on by inserting
    # thin space between the translations in such cases.
    $text =~ s/&\#(?:x0*2d|0*45);/-/g;
    $text =~ s/(&mdash;|&ndash;|-)(?=(?:&mdash;|&ndash;|-))/$1!thinsp;/g;
    $text =~ s/(&[lr][sd]quo;)(?=(?:&[lr][sd]quo;))/$1!thinsp;/g;

    # There are many things that decode_entities doesn't handle.
    # A few of those things we handle ourselves.  The final replacement
    # happens later (so that we correctly handle the various quotes
    # whether they're literal, numeric character ref, or symbolic ref).
    # In the meantime we change from `&FOO;' to `!FOO;'.
    $text =~ s/&([mn]dash|[lr][sd]quo|hellip);/!$1;/g;

    decode_entities($text);

    # Park backslashes in a placeholder so that the escaping below does
    # not mangle the markup they are finally replaced with.
    $text =~ s/\\/!backslash;/g;

    if ($substitution) {
        # Normal text: escape LaTeX specials, then expand the
        # placeholders that need LaTeX commands.
        $text =~ s/([_&%\{\}\#])/\\$1/g;
        $text =~ s/\$/\\\$/g;
        $text =~ s/\^/\\^{}/g;
        $text =~ s/!backslash;/\$\\backslash\$/g;

        # The backslash must be doubled: the replacement part is
        # interpolated, and a single `\l' there means "lowercase the next
        # character", which used to turn this into a bare `dots{}'.
        $text =~ s/!hellip;/\\ldots{}/g;
        $text =~ s/!thinsp;/\$\\,\$/g;
        $text =~ s/\xa0/~/g;
    }
    else {
        # Verbatim text: everything is printed as typed, so emit the
        # characters themselves rather than LaTeX commands.
        $text =~ s/!backslash;/\\/g;
        $text =~ s/!hellip;/.../g;
        $text =~ s/!thinsp;//g;
        $text =~ s/\xa0/ /g;
    }

    # These expand to plain characters and are safe in both modes.
    $text =~ s/!mdash;/---/g;
    $text =~ s/!ndash;/--/g;
    $text =~ s/!lsquo;/`/g;     #`;
    $text =~ s/!rsquo;/'/g;     #';
    $text =~ s/!ldquo;/``/g;
    $text =~ s/!rdquo;/''/g;

    # Must come last: restores the literal `!' characters that were
    # protected at the top.
    $text =~ s/!bang;/!/g;

    print $text;
}

# Enter converter mode $mode_new (1 = <html>, 2 = <head>, 3 = <body>).
#
# Only when $firstfile is set does this print anything:
#   * entering mode 1 prints the generated-by comment and \documentclass
#     with the class given by -o;
#   * entering mode 3 prints \begin{document}, the -h header text, the
#     title block (when -a or -t was given), the table of contents (-c)
#     and a page break (-p).
# The new mode is stored in $mode in every case.
sub start_mode {
    my ($mode_new) = @_;

    if ($firstfile) {
        if ( $mode_new == 1 ) {
            print
'% This file was converted from HTML to LaTeX with
% Tomasz Wegrzanowski\'s <maniek@beer.com> gnuhtml2latex program
% Version : '.$main::version.'
\documentclass'.$main::opts{o}."\n";
        }
        elsif ( $mode_new == 3 ) {
            my $opening = '\begin{document}' . "\n" . $main::opts{h};

            # Title block: \author is always written once either -a or -t
            # is present, \title only for a true -t value.
            if ( defined $main::opts{a} or defined $main::opts{t} ) {
                $opening .= '\\title{' . $main::opts{t} . '}'
                    if $main::opts{t};
                $opening .= '\\author{'
                    . ( $main::opts{a} or '' )
                    . "}\n\\maketitle";
            }

            $opening .= "\n\\tableofcontents\n" if $main::opts{c};
            $opening .= "\n\\newpage"           if $main::opts{p};

            print $opening;
        }
    }

    $mode = $mode_new;
}

# Leave the current converter mode and switch to $mode_new.
#
# When the mode being left is 3 (<body>) and $lastfile is set, the -f
# footer text and \end{document} are printed first.
sub end_mode {
    my ($mode_new) = @_;

    my $closing_document = ( $mode == 3 && $lastfile );
    print( $main::opts{f} . '\end{document}' . "\n" ) if $closing_document;

    $mode = $mode_new;
}

}

# Version string reported in the comment header of the generated file.
$main::version = '0.1';

# -i FILE: take the list of input files from FILE (one name per line)
# instead of from the command line.
if ( $main::opts{i} ) {
    # Report the system error ($!); the old message interpolated $1,
    # which is never set here.
    open FILE, $main::opts{i}
        or die "$main::opts{i}  $!";
    @ARGV = <FILE>;
    close FILE;

    # Strip the line terminators and surrounding blanks.  Without this
    # each name kept its newline, which ended up in the middle of the
    # derived output name ("foo\n.tex").
    for my $name (@ARGV) {
        $name =~ s/^\s+//;
        $name =~ s/\s+$//;
    }

    # Ignore blank lines in the list.
    @ARGV = grep { length } @ARGV;
}

# Convert the input files named in @ARGV.
#
# Input files are opened with the two-argument form of open on purpose:
# it keeps the historical handling of names such as "-" (standard input)
# and of stray surrounding whitespace.
if ( $main::opts{b} ) {

    # Batch mode (-b): all inputs are concatenated into ONE LaTeX
    # document.  The output file is named after the first input (or
    # STDOUT is used with -s); the preamble is written for the first
    # file only and \end{document} for the last file only.
    #
    # Iterating over the indices handles the single-file case correctly:
    # that file is both first and last and is parsed exactly once.
    my $doc = HTML::LatexMaker->new;

    for my $idx ( 0 .. $#ARGV ) {
        my $filename = $ARGV[$idx];
        my $is_first = ( $idx == 0 );
        my $is_last  = ( $idx == $#ARGV );

        unless ( open FILE, $filename ) {
            # The first and last files carry the document frame, so
            # missing one of them is fatal; others are skipped.
            die "$filename $!" if $is_first or $is_last;
            warn "$filename $!, skipped\n";
            next;
        }

        # Open the single output file once, after the first input is
        # known to be readable.
        if ( $is_first and not $main::opts{s} ) {
            ( my $outfile = $filename ) =~ s/\.html?$//;
            $outfile .= ".tex";
            open STDOUT, '>', $outfile or die "$outfile $!";
        }

        $doc->firstfile( $is_first ? 1 : 0 );
        $doc->lastfile( $is_last   ? 1 : 0 );
        $doc->parse_file( \*FILE );
        close FILE;
    }
}
else {

    # Default mode: every input becomes its own LaTeX document, written
    # to "<name>.tex" (or to STDOUT with -s).
    foreach my $filename (@ARGV) {
        unless ( open FILE, $filename ) {
            warn "$filename $!, skipped\n";
            next;
        }

        unless ( $main::opts{s} ) {
            ( my $outfile = $filename ) =~ s/\.html?$//;
            $outfile .= ".tex";
            open STDOUT, '>', $outfile or die "$outfile $!";
        }

        my $doc = HTML::LatexMaker->new;
        $doc->parse_file( \*FILE );
        close FILE;
    }
}

=head1 NAME

gnuhtml2latex - html to latex converter

=head1 SYNOPSIS

B<gnuhtml2latex> F<[options]> F<filename>

F<-a> F<[author]> - specify author of document

F<-b>          -  Process more than one input HTML file (they all get 
concatenated and written to a single output file, or to STDOUT if F<-s> is set)

F<-c>          - use table of contents

F<-f> F<[string]> - specify footer (text inserted just before the end of the document)

F<-h> F<[string]> - specify header

F<-i> F<filename> - Get the list of files to be converted from the specified 
filename

F<-n>          - use numbered sections

F<-o> F<[string]> - specify document style

F<-p>          - break page after title / table of contents

F<-s>          - write to STDOUT

F<-t> F<[title]>  - specify title of document

=head1 DESCRIPTION

This aims to be a replacement for html2latex.

The program takes an HTML file, foo.html or foo.htm,
and makes the LaTeX file foo.tex from it.

=head1 NOT VERY AMBITIOUS TODO

For people who want only functionality of original html2latex

 bugfixes - I'm sure there are plenty of bugs inside
 clueful backslash escaping
 more entities from outside of iso-8859-1
 tables
 performance boost
 and a lot more

=head1 MORE AMBITIOUS TODO

For people who want a real tool

 make it part of some html processor
 rewrite in flex

=head1 FUTURE OF THIS PACKAGE

It is quite possible that the functionality of this package will be
included in some more general project. This package was made mainly to
make the world a bit more free.

=cut
