#!/bin/sh

# This is a shell script that calls a perl script;
if [ -e /usr/local/bin/perl ] ; then
	exec /usr/local/bin/perl -x $0 ${1+"$@"}
else 
	exec perl -x $0 ${1+"$@"}
fi

# The reason for this is because MaraDNS is built on Centos 3.X
# (RHEL 3 clone) and the Perl Centos comes with takes over 30 seconds to make
# the changelog; Perl 5.8.7 does it in under 4 seconds.

# /usr/local/bin/perl is a local compile of Perl which, on MaraDNS' build
# system, is able to quickly process the changelog.  The script looks for
# Perl at /usr/local/bin/perl then anywhere else in the user's path.

#!/usr/local/bin/perl
# Convert an ej-formatted doc in to a man page
# Input: First argument or standard input
# Output: Standard output

$FILENAME = shift || "/////";

# Tmp dir (used for running iconv on non-8859-1 pages)
$TMP = $ENV{'HOME'} . "/tmp";
# Make this just /tmp at your own risk.  You have been warned.

if(! -d $TMP ) {
    die "Fatal: Please create a directory entitled " . $TMP . "\n";
    }

# Read in the doc

# This makes the script happy when run with both Perl 5.8.0 and
# Perl 5.8.8; basically 5.8.0 had a lot of unhappy Unicode bugs
# and so they changed the behavior for later releases.  It is possible
# to make a Unicode-happy script that runs unchanged Unicode-happy
# on both 5.8.0 and 5.8.8, but the contortions I had to do were
# amazing.

# I would like to thank all of the helpful people in the newsgroup
# comp.lang.perl.misc for their assistance; I couldn't have done it
# without them.

use utf8;
if($FILENAME ne "/////") {
    close(STDIN);
    open(STDIN,"< $FILENAME");
    }

binmode(STDIN,":utf8");

while(<STDIN>){$doc .= $_}

#$* = 1; # Match multiple lines

# Get rid of <!-- ... --> comments
$doc =~ s|<\!\-\-.*?\-\->||msg;

# Grab the header
if($doc =~ m|<head>(.*?)</head>|ims) {
    $header = $1;
    }
else {
    die "Fatal: Document must have a heading section\n";
    }

# Make sure the header has 
# <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=XXX">
# Where XXX is any character set
if($header !~ 
m|meta\s+http\-equiv\=\"content\-type\"\s+content\=\"text\/html\;\s+charset=|i) 
  {
  print "Please have somthing like this:\n";
  print
    '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; CHARSET=utf-8">';
  print "\n";
  die "Fatal: Header must declare charset\n";
  }
else {
   if($header =~ /charset=([^"]+)/i) {
       $charset = $1;
       #print "Charset: $charset\n";
       }
   else {
       die "Fatal: Error determining charset of document\n";
       }
   }

# Sanitize charset; only allow letters, numbers, and the dash
$charset =~ s/[^A-Za-z0-9\-]//g;

#$doc = conv_to_latin1($doc); # Not in Fedora core 2

# OK, see if we have a DTWIDTH header in the ej document.  If so, use that
# value to determine how wide to make "dt" values in the formatted man page

if($header =~ m|<dtwidth>(.*?)</dtwidth>|ims) {
    $width = $1;
    if($width =~ /\D/) {
        die "Fatal: DTWIDTH tag can only have a numeric argument.\n";
	}
    $DTROFF = ".TP $1";
    }
else {
    # The nroff to convert a DT tag in to
    $DTROFF = ".TP 4";
    }

# OK, the header looks kosher.  Start generating nroff

print '.\" Do *not* edit this file; it was automatically generated by ej2man';
print "\n";
print '.\" Look for a name.ej file with the same name as this filename';
print "\n";
print '.\"' . "\n";
#print '.\" Process this file with the following on iso-8859-1 terminals:'."\n";
#print '.\" nroff -man -Tlatin1 maradns.8' . "\n";
print '.\" Process this file with the following' . "\n";
print '.\" nroff -man -Tutf8 maradns.8 | tr \'\020\' \' \'' . "\n";
print '.\"' . "\n";
# Timestamp
$ts = localtime(time());
print '.\" Last updated ' . $ts . "\n";
print '.\"' . "\n";

# OK, see if we have a TH header in the ej document.  If so, add that to the
# man page.  If not, generate a generic TH

if($header =~ m|<th>(.*?)</th>|ims) {
    print ".TH $1\n";
    }
else {
    print ".TH \n";
    }

print '.\" We don\'t want hyphenation (it\'s too ugly)' . "\n";
print '.\" We also disable justification when using nroff' . "\n";
print '.\" Due to the way the -mandoc macro works, this needs to be placed'
      . "\n";
print '.\" after the .TH heading' . "\n";
print ".hy 0\n";
print ".if n .na\n";
print '.\"' . "\n";
print '.\" We need the following stuff so that we can have single quotes' .
      "\n";
print '.\" In both groff and other UNIX *roff processors' . "\n";
print '.if \n(.g .mso www.tmac' . "\n";
print '.ds aq \(aq' . "\n";
print '.if !\\n(.g .if \'\\(aq\'\' .ds aq \\\'' . "\n";


# Enough of header processing; let's get to the body of the document

# Grab the body
if($doc =~ m|<body>(.*?)</body>|ims) {
    $body = $1;
    }
else {
    die "Fatal: Document must have a body section\n";
    }

$body = process_body($body,0);

print($body);

print "\n";

exit(0);

# And this processes the body (we do this way so we can recursively handle 
# those pesky PRE flags)
sub process_body {
   my($body,$inrecurse) = @_;
   my($hack,$rest,$filename);
   my(@parts);

   # The INCLUDE tag
   while($body =~ m|\<include\s+\"([^"]+)\"\s*\>|ims) {
       $filename = $1;
       open(FILE,"< $filename") || die "Can not find file $filename\n";
       $hack = "";
       while(<FILE>) {$hack .= $_}
       close(FILE);
       #$hack = conv_to_latin1($hack);
       #$hack = process_body($hack);
       $body =~ s|\<include\s+\"([^"]+)\"\s*\>|$hack|ims;
       }

   # Get rid of any </?BLOCKQUOTE> tags in bulletted lists; the NROFF macros
   # can not handle nesting 
   if($inrecurse == 0) {
       @parts = split(m|</?ul>|im,$body);
       if($#parts > 0) {
           $body = "";
           for($hack = 0; $hack <= $#parts; $hack++) {
               if($hack % 2 == 0) { # If we are not in a bulleted list
	           $body .= $parts[$hack];
	           if($hack < $#parts) {
	              $body .= "\n<ul>\n";
		      }
                   }
               else {
	           $parts[$hack] =~ s|</?blockquote>||g;
	           $body .= $parts[$hack];
	           $body .= "\n</ul>\n";
	           }
               }
           }
       }

   # The HIBIT tag (replace with ALT text then remove)
   while($body =~ m|<hibit alt=\"([^"]*)\">(.*?)</hibit>|ims) {
       $body =~ s|<hibit alt=\"([^"]*)\">(.*?)</hibit>|$1|ims;
       }
   $body =~ s|<hibit[^>]*>(.*?)</hibit>||imsg;
   # Because of how PRE is handled, we need to delete anything starting
   # with <hibit> to the end of string and anything from the beginning
   # of the string to </hibit>, since we may only be processing a subpart
   # of the whole text. (Disabled because this is above the PRE processing
   #$body =~ s|<hibit alt=\"([^"]*)\">.*$|$1|is;
   #$body =~ s|<hibit[^>]*>.*$||is;
   #$body =~ s|^.*</hibit>||is;

   # The PRE tag
   @parts = split(m|</?pre>|im,$body);
   if($#parts > 0) {
        $body = "";
        for($hack=0;$hack <= $#parts; $hack++) {
           if($hack %2 == 0) { # If we are not in a <pre> section
	       $body .= process_body($parts[$hack],1);
               }
           else {
	       $body .= "\n.nf";
	       # Deal with back slashes
	       $parts[$hack] =~ s/\\/\\\\/g;
   	       # Make single quotes literal single quotes ('\(aq' in troff)
	       $parts[$hack] =~ s/\'/\\\(aq/g;
               # Handle the á character (Debian's lint complains if the 
               # man page has raw hi-bit characters)
               $parts[$hack] =~ s|\xc3\xa1|\\\(\'a|g;
	       $body .= $parts[$hack];
	       $body .= ".fi\n";
	       }
           }

       # Make á \('a so *roff can digest this.  We have to put this here
       # so the sequence remains a *roff command
       $body =~ s|á|\\\(\'a|g;

       return($body);
       }

   # Backslashes need to be escaped in *roff source
   $body =~ s/\\/\\\\/g;
   # As do single quotes
   $body =~ s/\'/\\\(aq/g;

   # The H1 tag
   while($body =~ m|<h1>(.*?)</h1>|ims) {
       $hack = $1;
       $hack =~ s/\s+/ /g;
       $hack =~ s/\"\'//g;
       $body =~ s|<h1>(.*?)</h1>|\n.SH "$hack"\n.PP\n|ims;
       }

   # The H2 tag
   while($body =~ m|<h2>(.*?)</h2>|ims) {
       $hack = $1;
       $hack =~ s/\s+/ /g;
       $hack =~ s/\"\'//g;
       $body =~ s|<h2>(.*?)</h2>|\n.PP\n.in -3\n\\fB$hack\\fR\n.PP\n|ims;
       }

   # The A tag (and /A closer)
   $body =~ s|</?a[^>]+>||img;
   $body =~ s|</?a>||img;

   # The TT tag (and /TT closer)
   $body =~ s|</?tt>||img;

   # The HR tag
   $body =~ s|<hr>|\n.PP\n.RS 28\n* * *\n.RE\n.PP\n|img;

   # The HINCLUDE tag
   $body =~ s|<hinclude[^>]+>||img;

   # The BLOCKQUOTE tag
   $body =~ s|<blockquote>|\n.PP\n.RS 4\n|img;
   $body =~ s|</blockquote>|\n.RE\n.PP\n|img;

   # The B tag
   while($body =~ m|<b>(.*?)</b>(\S+)?|ims) {
      $hack = $1;
      $rest = $2;
      if($rest =~ /[<>]/) {
         die "ej2man can't handle a tag immediately after a B tag\nthe offending text is $rest\n";
	 }
      if($hack =~ m|<\?i>|) {
         die "No I tags are allowed inside B tags\n";
	 }
      $hack =~ s/\s+/ /g;
      $hack =~ s/\"\'//g;
      $rest =~ s/\"\'//g;
      if($rest) {
        $body =~ s|<b>(.*?)</b>\S+|\n.BR "$hack" "$rest"\n|ims;
	}
      else {
        $body =~ s|<b>(.*?)</b>|\n.B "$hack"\n|ims;
	}
      }

   # The I tag
   while($body =~ m|<i>(.*?)</i>(\S+)?|ims) {
      $hack = $1;
      $rest = $2;
      if($rest =~ /[<>]/) {
         print "The stuff in the I: $hack\n";
         die "ej2man can't handle a tag immediately after a I tag\nthe offending text is $rest\n";
	 }
      if($hack =~ m|<\?b>|) {
         die "No B tags are allowed inside I tags\n";
	 }
      $hack =~ s/\s+/ /g;
      $hack =~ s/\"\'//g;
      $rest =~ s/\"\'//g;
      if($rest) {
        $body =~ s|<i>(.*?)</i>\S+|\n.IR "$hack" "$rest"\n|ims;
	}
      else {
        $body =~ s|<i>(.*?)</i>|\n.I "$hack"\n|ims;
	}
      }

   # Get rid of any multiple newlines
   $body =~ s/\n(\s*)\n/\n/msg;

   # The P tag
   $body =~ s/<p>\s*/\n.PP\n/img;

   # The UL and tags (just nuke them)
   $body =~ s/<[du]l>//img;

   # The LI tag
   #$body =~ s/<li>\n?/\n.TP 2\n•\n/img; # Can't do because of groff bug
   #$body =~ s/<li>\n?/\n.TP 2\nʘ\n/img; # World isn't ready for unicode
   $body =~ s/<li>\n?/\n.TP 2\n*\n/img;

   # The DT and DD tag
   while($body =~ /<dt>(.*?)<dd>\n?/sim) {
       $hack = $1;
       $hack =~ s/\s+/ /g;
       $body =~ s/<dt>(.*?)<dd>\n?/\n$DTROFF\n$hack\n/sim;
       }

   # The /DT and /DD tags (just nuke them)
   $body =~ s|</d[td]>||img;

   # The /UL and /DL tag (which we don't ignore)
   $body =~ s|</[du]l>|\n.PP\n|img;

   # Get rid of leading space; this confuses nroff
   $body =~ s/\n[ \t]+/\n/msg;

   # Get rid of empty lines before a .TP or .PP flag; this never looks nice
   $body =~ s/\n+(\n\.[TP]P)/$1/msg;

   # Get rid of empty lines at the beginning of the segment which come
   # before a .TP; this covers the case of a </pre> before a <li> in the
   # EJ source
   $body =~ s/^\n+(\.[TP]P)/$1/ms;

   # Same with empty lines before an .RE flag; this does not look nice
   $body =~ s/^\n+(\.RE)/$1/ms;

   # Get rid of multiple empty lines together; this never looks nice
   # when formatted by Nroff
   $body =~ s/\n\n\n+/\n\n/msg;

   # Get rid of empty lines at the end of a segment after a .TP or .PP
   # flag to work around how <pre> tags are handled
   $body =~ s/(\n\.[TP]P)\s*$/$1/ms;

   # Put a newline before the .RE flag; this looks nicer
   $body =~ s/(\n\.RE)/\n$1/msg;
   $body =~ s/^(\.RE)/\n$1/msg;

   # Get rid of empty lines after a .RE flag; this does not look nice either
   $body =~ s/(\n\.RE.*?\n)\n+/$1/msg;

   # Put a newline before the .in flag; this looks nicer
   $body =~ s/(\n\.in)/\n$1/msg;

   # Get rid of empty lines after a .TP or .PP flag; this never looks nice
   $body =~ s/(\n\.[TP]P.*?\n)\n+/$1/msg;

   # Get rid of multiple spaces; nroff (unlike EJ) honors them
   $body =~ s/[ \t]+/ /sg;

   # The TABLE tags (TABLE, TD, TR, /TABLE)
   $body =~ s|<table>|.ta +5 +7 +7|ig;
   $body =~ s|<td>|\t|ig;
   # We also process .br tags
   $body =~ s|<[tb]r>\n?|\n.br\n|ig;
   $body =~ s|</table>||ig;

   # Break long lines so the nroff source is more legible
   $body = fmt($body);

   $body;
   }

# This takes a string, and braks any lines longer than 75 columns; otherwise
# it performs no other formatting
# Input: The string to format
# Output: The formatted string

sub fmt {
   my($input) = @_;
   my($place,$lastspace,$column,$linebegin);
  
   $place = $lastspace = $column = $linebegin = 0;

   # Get rid of trailing white space, which confuses this algorithm
   $input =~ s/[ \t]+\n/\n/sg;

   # The core algorithm
   while($place < length($input)) {
       # If we hit a whitespace, remember that this is where the last
       # (previous) space character is
       if(substr($input,$place,1) =~ /[ \t]/) {
           $lastspace = $place;
	   }
       # If we hit the end of a line reset the counters which tell us when
       # to break a line
       if(substr($input,$place,1) =~ /\n/) {
           $column = -1;
	   $linebegin = $lastspace = $place + 1;
	   }
       # This adds the newline as needed.  Note that we do not break
       # lines which start with a .; this means the line has a man macro 
       # and breaking the line will change the formatting of the page
       if($column > 70 && $linebegin != $lastspace && 
          substr($input,$linebegin,1) !~ /\./) {
           substr($input,$lastspace,1,"\n");
	   $place = $lastspace;
	   $column = -1;
	   $linebegin = $lastspace = $place + 1;
	   }
       $column++;
       $place++;
       }

   $input;
   } 

# Convert a given string from whatever encoding to latin1, since groff is
# still in the dark ages and can not handle utf-8 input (not true in
# Fedoa core 2)
# Input: String to convert
# Output: Converted string
# Global variables used: $charset (assumed to already be sanitized)
# $TMP (also assumed to be sanitized)
sub conv_to_latin1 {
    my($string) = @_;
    if($charset =~ /latin1/i || $charset =~ /8859.1/) {
        return $string;
	}
    open(TFILE,"> $TMP/conv.$$") || die "Can not open $TMP/conv.$$: $!\n";
    print TFILE $string;
    close(TFILE);
#   if($charset !~ /utf.?8/i) {
        system("iconv -f $charset -t latin1 $TMP/conv.$$ > $TMP/converted.$$");
#	}
#   else {
#      system("utf8tol1 < $TMP/conv.$$ > $TMP/converted.$$");
#      }
    open(TFILE,"< $TMP/converted.$$") || 
        die "Can not open $TMP/converted.$$: $!\n";
    $string = "";
    while(<TFILE>) {
        $string .= $_;
        }
    close(TFILE);

    # Delete the trash
    #unlink("$TMP/conv.$$") || die "Can not erase $TMP/conv.$$: $!\n";
    #unlink("$TMP/converted.$$") ||die "Can not erase $TMP/convierted.$$: $!\n";
    return $string;
    }

