#!%PERLPROG%
#
# hindent %VERSION%
#
# Properly indent HTML code and convert tags to uppercase like the Gods intended.
# Understands all nesting tags defined under the HTML 3.2 standard.
#
# by Paul Balyoz <pab@domtools.com>
#
# Usage:
#	hindent [-fslcv] [-i num] [-t num] [file ...] > newfile
#
# Options:
#	-f	Flow - just prints tags _without_args_, for visual checking.
#		NOTE: This option DAMAGES the HTML code.  The output is for
#		human debugging use ONLY.  Keep your original file!!
#	-s	Strict - prints 1 tag per line with proper indenting.
#		Helpful for deciphering HTML code that's all on one line.
#		NOTE: This slightly DAMAGES the HTML code because it introduces
#		whitespace around tags that had none before, which will mess up
#		formatting somewhat on the page (links will have extra spaces, etc).
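#	-i num	Set indentation to this many characters (0..10).
#	-t num	Set the tab stop to this many characters (0..12); 0 turns
#		tabs off so that only spaces are used for indenting.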
#	-l	List all the tags we recognize and exit.
#	-c      Lowercase HTML tags. (Uppercase is default)
#	-v	Print version of hindent and exit.
#
# Copyright (C) 1993-1999 Paul A. Balyoz <pab@domtools.com>
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#


# Indentation width: the number of columns added per nesting level.
# Any value works; common choices are 8, 4, 3 or 2.
# (Each full run of $tabstop columns in the indent is written as one tab.)
$spacesperlevel = 2;

# Width of a "tab" on your display.  Unix normally uses 8-column tabs, but
# most editors let the user change that.  One tab character is output for
# every $tabstop columns of indentation; a value of 0 (-t0) turns tabs off.
$tabstop = 8;

# Tags that require a matching end tag (<TAG>...</TAG>) and therefore take
# part in nesting.  WARNING: names here must be lower-case.
# Every tag that is not listed is ignored for indenting purposes.
%nesttag = map { ($_ => 1) } (
	qw(html head body title),

	qw(a),

	qw(table tr th td),

	qw(form select textarea),

	# 'p' is deliberately left out: many people use <P> but not </P>
	qw(ul ol dl blockquote center div),

	qw(font pre tt i b u strike big small sub sup),
	qw(em strong dfn code samp kbd var cite),

	qw(h1 h2 h3 h4 h5 h6),

	qw(applet),

	qw(map),

	qw(frameset noframes),
);


#-------------------\
# END CONFIGURATIONS ===================================================================
#-------------------/

use Getopt::Std;


#
# Parse args
#

# Write the one-line usage synopsis to STDERR and terminate with exit
# status 1.  Never returns.
sub usageexit {
	my $synopsis = "usage: hindent [-fslcv] [-i num] [-t num] [file ...] > newfile\n";
	print STDERR $synopsis;
	exit 1;
}

# Parse the command line.  getopts() stores each switch in $opt_<letter>;
# -i and -t take an argument.  An unknown switch, or an argument that is not
# a whole number inside the permitted range, prints the usage and exits.
getopts('fsi:lvt:c') || &usageexit;
if (defined $opt_i) {
	# -i: columns per indentation level.  The digit test rejects negative
	# and non-numeric input, which would otherwise be treated as 0.
	if ($opt_i !~ /^\d+$/ || $opt_i > 10) {
		print STDERR "$0: error: indentation factor '$opt_i' not in range 0..10.\n";
		&usageexit;
	} else {
		$spacesperlevel = $opt_i;
	}
}
if (defined $opt_t) {
	# -t: tab stop width; 0 turns tab output off.
	if ($opt_t !~ /^\d+$/ || $opt_t > 12) {
		print STDERR "$0: error: tab stop '$opt_t' not in range 0..12.\n";
		&usageexit;
	} else {
		$tabstop = $opt_t;
	}
}


#
# -l option: print every tag name from %nesttag, upper-cased and in sorted
# order, one per line, then exit successfully.
#

if ($opt_l) {
	print "hindent recognizes these HTML tags:\n";
	foreach my $name (sort keys %nesttag) {
		(my $shown = $name) =~ tr/a-z/A-Z/;
		print $shown . "\n";
	}
	exit 0;
}


#
# -v option: report the hindent version on standard output and exit
# successfully.
#

if ($opt_v) {
	my $banner = "hindent version %VERSION%\n";
	print $banner;
	exit 0;
}


#
# Main HTML parsing code
#
# Reads the input one line at a time.  Every <...> tag found on the line is
# rewritten with its tag name case-folded (upper case, or lower case with -c)
# and its arguments kept verbatim; the text between tags is copied through
# unchanged (unless -f is given, in which case only bare tags are output).
# Tags listed in %nesttag are tracked on @tagstack so that &printout can
# indent each output line according to the current nesting depth.
#

$level = 0;			# indentation level
$changelevel = 0;		# change in indentation level (delta)
$out = "";			# accumulated output string
while (<>) {
	chomp;			# some HTML has no newline on last line, chop mangles it.
	s/^\s+//;		# remove ALL preceding whitespace, we rebuild it ourselves
	$line++;		# running input line count, used in diagnostics

	$end = -1;
	$start = $len = 0;
	while (/<(.*?)>/g) {
		$end = $start+$len-1;	# last character of the previous tag (-1 if none)
		$start = length($`);	# offset of this tag's '<' within the line
		$len = 1 + length($1) + 1;	# tag length including '<' and '>'
		($tag,$arg) = split(/\s+/,$1,2);
		if (!$opt_f) {
			$out .= substr($_, $end+1, $start-($end+1));   # print stuff from last tag to here
		}
		if ($opt_c) {
			$tag =~ tr/A-Z/a-z/;
		} else {
			$tag =~ tr/a-z/A-Z/;
		}
		# Test for a non-empty argument string rather than a true one, so
		# that an argument consisting of just "0" is not silently dropped.
		if (defined($arg) && length($arg) && !$opt_f) {
			$out .= "<$tag $arg>";
		} else {
			$out .= "<$tag>";
		}

		# if regular tag, push it on stack; if end-tag, pop it off stack.
		# but don't do any of this if it's not a special "nesting" tag!
		if ($tag !~ m,^/,) {
			if ($nesttag{lc($tag)}) {
				push @tagstack,$tag;
				$changelevel++;		# remember how much for later
			}
		} else {
			$tag =~ s,^/,,;		# convert this end-tag to a begin-tag
			$tag = lc($tag);
			if ($nesttag{$tag}) {
				# Throw away tags until we find a match (or run out of
				# tags).  The level drops by exactly one for every tag
				# removed, so $level+$changelevel always equals the
				# number of tags still on the stack.
				while (@tagstack) {
					$changelevel--;
					last if $tag eq lc(pop @tagstack);
				}
				# Safety net: never let the indentation go negative.
				if ($level+$changelevel < 0) {
					print STDERR "line $line: saw more end tags than begin ones!\n";
					$changelevel = -$level;
				}
			}
		}
		&printout if $opt_s;		# -s -> print every tag on new line
	}

	#
	# Print rest of line after the last match, and newline.
	# (not part of Flow)
	#
	if (!$opt_f) {
		$end = $start+$len-1;
		$out .= substr($_,$end+1,length($_)-($end+1));
	}

	&printout;
}

# Any tags left on the stack?
# A positive final level means some begin-tags were never closed.  Report
# them on STDERR, most recently opened first, on a single tab-separated line.
if ($level > 0) {
	print STDERR "WARNING: level=$level, ", scalar(@tagstack)," tags left on stack after done parsing!  Specifically:\n";
	while (@tagstack) {
		print STDERR "\t", pop @tagstack;
	}
	print STDERR "\n";		# terminate the list of tags
}

exit 0;


#
# Write the pending text in $out as one output line, indented for the
# current nesting level, then clear $out.
#
# Uses the globals $level, $changelevel, $spacesperlevel, $tabstop and $out.
# A negative $changelevel is applied before the line is printed, a positive
# one after it; either way $changelevel is reset to 0.
#
sub printout {
	# Outdent first, so that the line being printed moves back out.
	if ($changelevel < 0) {
		($level, $changelevel) = ($level + $changelevel, 0);
	}

	# Split the indent into whole tabs plus left-over spaces.
	# With $tabstop == 0 no tabs are used at all.
	$spaces = " " x ($level * $spacesperlevel);
	my $width    = length($spaces);
	my $tabcount = $tabstop ? int($width / $tabstop) : 0;
	my $padding  = $width - $tabcount * $tabstop;

	print(("\t" x $tabcount) . (" " x $padding) . "$out\n");
	$out = "";

	# Indent afterwards, so that only the following lines move in.
	if ($changelevel > 0) {
		($level, $changelevel) = ($level + $changelevel, 0);
	}
}
