#!/usr/bin/perl

=head1 NAME

pristine-tar - regenerate pristine tarballs

=head1 SYNOPSIS

B<pristine-tar> [-vdk] gentar delta tarball

B<pristine-tar> [-vdk] gendelta tarball delta

B<pristine-tar> [-vdk] commit tarball [upstream]

B<pristine-tar> [-vdk] checkout tarball

=head1 DESCRIPTION

pristine-tar can regenerate a pristine upstream tarball using only a small
binary delta file and a revision control checkout of the upstream branch.
 
The delta file is designed to be checked into revision control along-side
the upstream branch, thus allowing Debian packages to be built entirely
using sources in revision control, without the need to keep copies of
upstream tarballs.

pristine-tar supports compressed tarballs, calling out to pristine-gz(1)
and pristine-bz2(1) to produce the pristine gzip and bzip2 files.

=head1 COMMANDS

=over 4

=item pristine-tar gendelta

This takes the specified upstream tarball, and generates a small binary
delta file that can later be used by pristine-tar gentar to recreate the
tarball.

If the delta filename is "-", it is written to standard output.

=item pristine-tar gentar 

This takes the specified delta file, and the files in the current
directory, which must have identical content to those in the upstream
tarball, and uses these to regenerate the pristine upstream tarball.

If the delta filename is "-", it is read from standard input.

=item pristine-tar commit

B<pristine-tar commit> generates a pristine-tar delta file for the specified
tarball, and commits it to version control. The B<pristine-tar checkout>
command can later be used to recreate the original tarball based only
on the information stored in version control.

For B<pristine-tar checkout> to work, you also need to store the precise
contents of the tarball in version control. To specify in which tag (or
branch or other treeish object) it's stored, use the B<upstream> parameter.
This defaults to "refs/heads/upstream", or if there's no such branch, any
branch matching "upstream". The name of the tree it points to will be
recorded for later use by B<pristine-tar checkout>.

The delta files are stored in a branch named "pristine-tar", with filenames
corresponding to the input tarball, with ".delta" appended. This
branch is created or updated as needed to add each new delta.

=item pristine-tar checkout

This regenerates a copy of the specified tarball using information
previously saved to revision control by B<pristine-tar commit>.

=back

=head1 OPTIONS

=over 4

=item -v

Verbose mode, show each command that is run.

=item -d

Debug mode.

=item -k

Don't clean up the temporary directory on exit.

=back

=head1 LIMITATIONS

Only .tar, .tar.gz, and .tar.bz2 files are currently supported.

It could fail on certain spectacularly strange files. If it fails, it will
fail during creation of the delta, since the delta is tested to make sure it
reproduces the pristine tarball.

Currently only the git revision control system is supported by the
"checkout" and "commit" commands. It's ok if the working copy
is not clean or has uncommitted changes, or has changes staged in the
index; none of that will be touched by "checkout" or "commit".

=head1 AUTHOR

Joey Hess <joeyh@debian.org>

Licensed under the GPL, version 2 or above.

=cut

use warnings;
use strict;
use File::Temp;
use File::Path;
use File::Basename;
use Getopt::Long;
use Cwd qw{getcwd abs_path};

# magic identification
# The first two bytes of a file are compared against these pairs by
# gendelta to recognise gzip and bzip2 input.
use constant GZIP_ID1            => 0x1F;
use constant GZIP_ID2            => 0x8B;
use constant BZIP2_ID1           => 0x42;
use constant BZIP2_ID2           => 0x5a;

# compression methods, 0x00-0x07 are reserved
# (compared against the third byte of a gzip file)
use constant GZIP_METHOD_DEFLATE => 0x08;

# compression methods, 'h' for Bzip2 ('H'uffman coding), '0' for Bzip1 (deprecated)
# (compared against the third byte of a bzip2 file; 0x68 is ASCII 'h')
use constant BZIP2_METHOD_HUFFMAN => 0x68;

# Command-line flags, filled in by GetOptions at the bottom of the file:
#   $verbose (-v) makes vprint/doit show each command that is run,
#   $debug   (-d) enables debug() messages,
#   $keep    (-k) stops tempdir() from cleaning up temporary directories.
my $verbose=0;
my $debug=0;
my $keep=0;
	
# Force locale to C since tar may output utf-8 filenames differently
# depending on the locale.
# NOTE(review): LC_ALL and LC_CTYPE, if set in the environment, presumably
# still take precedence over LANG -- confirm whether they need overriding.
$ENV{LANG}='C';

# Print the synopsis of all four subcommands to standard error and
# terminate the program with exit status 1. Never returns.
sub usage {
	print STDERR
		"Usage: pristine-tar [-vdk] gentar delta tarball\n",
		"       pristine-tar [-vdk] gendelta tarball delta\n",
		"       pristine-tar [-vdk] commit tarball [upstream]\n",
		"       pristine-tar [-vdk] checkout tarball\n";
	exit 1;
}

# Abort by dying with the arguments, joined by single spaces, prefixed
# with the program name. The trailing newline keeps perl from appending
# the file and line number to the message.
sub error {
	my $text=join(" ", @_);
	die "pristine-tar: $text\n";
}

# Print an informational message, prefixed with the program name.
#
# The message goes to standard error, not standard output: gendelta can
# stream a binary delta to standard output (delta name "-"), and commit
# reads that stream through a pipe, so any diagnostics printed on standard
# output in verbose or debug mode would end up inside the delta.
sub message {
	print STDERR "pristine-tar: @_\n";
}

# Emit the arguments through message(), but only when debug mode (-d)
# is switched on.
sub debug {
	if ($debug) {
		message(@_);
	}
}

# Emit the arguments through message(), but only when verbose mode (-v)
# is switched on.
sub vprint {
	if ($verbose) {
		message(@_);
	}
}

# Run an external command, showing it first in verbose mode.
# The arguments are handed to system() unchanged, so a single string is
# run via the shell while a list is executed directly.
# Aborts through error() if system() reports anything other than 0.
sub doit {
	my @command=@_;

	vprint(@command);
	my $status=system(@command);
	error "command failed: @command" if $status != 0;
}

# Create a fresh temporary directory under the system temp location and
# return its path. The directory is scheduled for removal at exit unless
# the -k (keep) flag was given.
sub tempdir {
	my %settings=(
		TMPDIR => 1,
		CLEANUP => !$keep,
	);
	return File::Temp::tempdir("pristine-tar.XXXXXXXXXX", %settings);
}

# Build a reproducible tarball from a source tree.
#
# Parameters:
#   $tempdir - working directory. It must contain a file "manifest" that
#              lists, one per line, the members to put in the tarball.
#              The result is written to "$tempdir/recreatetarball".
#   $source  - directory holding the files named in the manifest.
#   %options - clobber_source: when true, $source is moved into the work
#                directory; otherwise it is copied with "cp -a".
#              create_missing: when true, a manifest entry with no
#                counterpart in the source is created as a directory so
#                that tar does not fail on it.
#
# Dies if the manifest cannot be read, if a file operation fails, or
# (through doit) if an external command fails.
sub recreatetarball {
	my $tempdir=shift;
	my $source=shift;
	my %options=@_;
	
	my @manifest;
	open (my $in, "<", "$tempdir/manifest") || die "$tempdir/manifest: $!";
	while (my $line=<$in>) {
		chomp $line;
		push @manifest, $line;
	}
	close $in;

	# The manifest and source should have the same filenames,
	# but the manifest probably has all the files under a common
	# subdirectory. Check if it does.
	my $subdir="";
	foreach my $file (@manifest) {
		#debug("file: $file");
		if ($file=~m!^(/?[^/]+)(/|$)!) {
			if (length $subdir && $subdir ne $1) {
				debug("found file not in subdir $subdir: $file");
				$subdir="";
				last;
			}
			elsif (! length $subdir) {
				$subdir=$1;
				debug("set subdir to $subdir");
			}
		}
		else {
			debug("found file not in subdir: $file");
			$subdir="";
			last;
		}
	}
		
	# If every manifest entry can already be found below $source, the
	# source tree contains the common subdirectory itself and must not
	# be nested inside another one.
	if (length $subdir && -d "$source/$subdir") {
		my $all_in_subdir=1;
		foreach my $file (@manifest) {
			if (! -e "$source/$file" &&
			    ! -l "$source/$file") {
				$all_in_subdir=0;
				last;
			}
		}
		if ($all_in_subdir) {
			debug("source already in subdir $subdir");
			$subdir="";
		}
	}

	if (length $subdir) {
		debug("subdir is $subdir");
		doit("mkdir", "$tempdir/workdir");
		$subdir="/$subdir";
	}

	if (! $options{clobber_source}) {
		doit("cp", "-a", $source, "$tempdir/workdir$subdir");
	}
	else {
		doit("mv", $source, "$tempdir/workdir$subdir");
	}

	# It's important that this create an identical tarball each time
	# for a given set of input files. So don't include file metadata
	# in the tarball, since it can easily vary.
	my $full_sweep=0;
	foreach my $file (@manifest) {
		if (-l "$tempdir/workdir/$file") {
			# Can't set timestamp of a symlink, so
			# replace the symlink with an empty file.
			unlink("$tempdir/workdir/$file") || die "unlink: $!";
			open(my $out, ">", "$tempdir/workdir/$file") || die "open: $!";
			close $out;
		}
		elsif (! -e "$tempdir/workdir/$file") {
			debug("$file is listed in the manifest but may not be present in the source directory");
			$full_sweep=1;

			if ($options{create_missing}) {
				# Avoid tar failing on the nonexistent item by
				# creating a dummy directory.
				debug("creating missing $file");
				mkpath "$tempdir/workdir/$file";
			}
		}
		
		if (-d "$tempdir/workdir/$file" && (-u _ || -g _ || -k _)) {
			# tar behaves weirdly for some special modes
			# and ignores --mode, so clear them.
			debug("chmod $file");
			chmod(0755, "$tempdir/workdir/$file") ||
				die "chmod: $!";
		}
	}

	# Set the timestamps only after all changes to the directory
	# contents are done: replacing a symlink or creating a missing
	# entry updates the mtime of the containing directory, which would
	# undo a timestamp set on that directory earlier.
	# The call is parenthesised so that "||" tests utime's result
	# rather than binding to the filename string.
	foreach my $file (@manifest) {
		if (-e "$tempdir/workdir/$file") {
			utime(0, 0, "$tempdir/workdir/$file") ||
				die "utime: $file: $!";
		}
	}
	
	# If some files couldn't be matched up with the manifest,
	# it's possible they do exist, but just with names that make sense
	# to tar, but not to this program. Work around this and make sure
	# such files have their metadata tweaked, by doing a full sweep of
	# the tree.
	if ($full_sweep) {
		debug("doing full tree sweep to catch missing files");
		use File::Find;
		find(sub {
			if (-l $_) {
				unlink($_) || die "unlink: $!";
				open(my $out, ">", $_) || die "open: $!";
				close $out;
			}
			if (-d $_ && (-u _ || -g _ || -k _)) {
				chmod(0755, $_) ||
					die "chmod: $!";
			}
		}, "$tempdir/workdir");
		# Second pass, for the same reason as above: timestamps
		# go last so directory mtimes are not disturbed again.
		find(sub {
			utime(0, 0, $_) || die "utime: $_: $!";
		}, "$tempdir/workdir");
	}

	doit("tar", "cf", "$tempdir/recreatetarball", "--owner", 0, "--group", 0, 
		"--numeric-owner", "-C", "$tempdir/workdir",
		"--no-recursion", "--mode", "0644", 
		"--files-from", "$tempdir/manifest"); 
}

# Regenerate a pristine tarball from a delta and the files in the current
# directory.
#
# Parameters:
#   $delta   - path of the delta file, or "-" to read it from standard
#              input.
#   $tarball - path of the tarball to write.
#   %opts    - extra options handed on to recreatetarball (they override
#              the default clobber_source => 0).
#
# Aborts through error()/die if the delta cannot be unpacked, has an
# unsupported version or type, or if any external command fails.
sub gentar {
	my $delta=shift;
	my $tarball=shift;
	my %opts=@_;

	my $tempdir=tempdir();
	
	if ($delta eq "-") {
		# Spool standard input to a file so tar can read it. The
		# delta is binary data, so copy it in fixed-size chunks
		# with both handles in binary mode.
		$delta="$tempdir/in";
		open (my $out, ">", $delta) || die "$delta: $!";
		binmode($out);
		binmode(STDIN);
		my $buf;
		while (read(STDIN, $buf, 65536)) {
			print {$out} $buf;
		}
		close($out) || die "$delta: $!";
	}

	doit("tar", "xf", File::Spec->rel2abs($delta), "-C", $tempdir);
	if (! -e "$tempdir/type") {
		error "failed to gentar delta $delta";
	}

	open (my $in, "<", "$tempdir/version") || error "delta lacks version number ($!)";
	my $version=<$in>;
	close $in;
	# Strip the newline and reject anything that is not a plain number
	# before comparing numerically, so the error message stays on one
	# line and an empty or garbled file does not trigger warnings.
	$version="" unless defined $version;
	chomp $version;
	if ($version !~ /^\s*\d+(?:\.\d+)?\s*$/ ||
	    $version >= 3 || $version < 2) {
		error "delta is version $version, not supported";
	}
	if (open ($in, "<", "$tempdir/type")) {
		my $type=<$in>;
		$type="" unless defined $type;
		chomp $type;
		if ($type ne "tar") {
			error "delta is for a $type, not a tar";
		}
		close $in;
	}

	recreatetarball($tempdir, getcwd, clobber_source => 0, %opts);
	# With a compression wrapper present, xdelta produces the
	# uncompressed tarball, which is recompressed below.
	my $out=(-e "$tempdir/wrapper") ? $tarball.".tmp" : $tarball;
	doit("xdelta", "patch", "$tempdir/delta", "$tempdir/recreatetarball", $out);

	if (-e "$tempdir/wrapper") {
		# Read the "type" member of the wrapper. A list-form pipe
		# open avoids passing the temporary path through a shell.
		open (my $pipe, "-|", "tar", "xOzf", "$tempdir/wrapper", "type")
			|| error "failed to run tar on $tempdir/wrapper: $!";
		my $type=do { local $/; <$pipe> };
		close $pipe;
		$type="" unless defined $type;
		chomp $type;

		# Subcommand of pristine-gz / pristine-bz2 to call.
		my %generator=(gz => "gengz", bz2 => "genbz2");
		if (! exists $generator{$type}) {
			error "unknown wrapper file type: $type";
		}
		doit("pristine-$type",
			($verbose ? "-v" : "--no-verbose"),
			($debug ? "-d" : "--no-debug"),
			($keep ? "-k" : "--no-keep"),
			$generator{$type}, "$tempdir/wrapper", $out);
		doit("mv", "-f", "$out.$type", $tarball);
	}
}
	
# Write the list of members of a tarball to a manifest file.
#
# Parameters:
#   $tarball  - path of the tarball to list (with "tar tf").
#   $manifest - path of the manifest file to create; it receives one
#               member name per line, with any leading "./" or "/"
#               removed and empty names dropped.
#
# Dies if either file cannot be opened or written; aborts through error()
# if tar exits nonzero.
sub genmanifest {
	my $tarball=shift;
	my $manifest=shift;

	# List-form pipe open: the tarball name is handed to tar as-is and
	# never interpreted by a shell, so names containing spaces or shell
	# metacharacters work.
	open(my $in, "-|", "tar", "tf", $tarball) || die "tar tf: $!";
	open(my $out, ">", $manifest) || die "$manifest: $!";
	while (<$in>) {
		chomp;
		# ./ or / in the manifest just confuses tar
		s/^\.?\/+//;
		print {$out} "$_\n" if length $_;
	}
	close($in) || error "tar tf exited nonzero";
	close($out) || die "$manifest: $!";
}

# Generate a delta file from which gentar can later recreate a tarball.
#
# Parameters:
#   $tarball - path of the upstream tarball (.tar, or gzip/bzip2
#              compressed, detected from its first bytes).
#   $delta   - path of the delta file to write, or "-" to write it to
#              standard output.
#
# The delta is a gzipped tar archive containing the xdelta output, the
# manifest, a version file and a type file, plus a "wrapper" produced by
# pristine-gz/pristine-bz2 when the tarball was compressed.
#
# Aborts through error()/die if the tarball cannot be read or any external
# command fails.
sub gendelta {
	my $tarball=shift;
	my $delta=shift;

	my $tempdir=tempdir();

	my $stdout=0;
	if ($delta eq "-") {
		$stdout=1;
		$delta="$tempdir/out";
	}

	my @files=qw(delta manifest version type);

	# Check to see if it's compressed.
	my $compression=undef;
	open (my $in, "<", $tarball) || error "Cannot read $tarball: $!";
	binmode($in);
	my ($chars, $id1, $id2, $method);
	if ((read($in, $chars, 10) || 0) == 10 &&
	    (($id1, $id2, $method) = unpack("CCC", $chars)) &&
	    $id1 == GZIP_ID1 && $id2 == GZIP_ID2 &&
	    $method == GZIP_METHOD_DEFLATE) {
		$compression='gz';
	}
	else {
		seek($in, 0, 0) || die "seek: $!";
		if ((read($in, $chars, 3) || 0) == 3 &&
		    (($id1, $id2, $method) = unpack("CCC", $chars)) &&
		    $id1 == BZIP2_ID1 && $id2 == BZIP2_ID2 &&
		    $method == BZIP2_METHOD_HUFFMAN) {
			$compression='bz2';
		}
	}
	close $in;
	
	if (defined $compression) {
		# Decompress to a plain tarball. The redirection needs a
		# shell, so both paths are escaped with \Q..\E.
		my %decompressor=(gz => "zcat", bz2 => "bzcat");
		doit("$decompressor{$compression} \Q$tarball\E > \Q$tempdir\E/origtarball");

		# Generate a wrapper file to recreate the compressed file.
		doit("pristine-$compression",
			($verbose ? "-v" : "--no-verbose"),
			($debug ? "-d" : "--no-debug"),
			($keep ? "-k" : "--no-keep"),
			"gendelta", $tarball, "$tempdir/wrapper");
		push @files, "wrapper";
		$tarball="$tempdir/origtarball";
	}

	my $sourcedir="$tempdir/tmp";
	doit("mkdir", $sourcedir);
	doit("tar", "xf", File::Spec->rel2abs($tarball), "-C", $sourcedir);
	# if all files were in a subdir, use the subdir as the sourcedir
	# (readdir rather than glob, which would split a path that
	# contains whitespace)
	opendir(my $dh, $sourcedir) || die "opendir $sourcedir: $!";
	my @out=map { "$sourcedir/$_" }
		grep { $_ ne "." && $_ ne ".." } readdir($dh);
	closedir($dh);
	if (@out == 1 && -d $out[0]) {
		$sourcedir=$out[0];
	}

	genmanifest($tarball, "$tempdir/manifest");
	recreatetarball($tempdir, $sourcedir, clobber_source => 1);

	# Run xdelta without a shell, and inspect $? in full: shifting it
	# right by 8 alone would make a run killed by a signal look like
	# exit status 0.
	my @xdelta=("xdelta", "delta", "-0", "--pristine",
		"$tempdir/recreatetarball", $tarball, "$tempdir/delta");
	vprint(@xdelta);
	system(@xdelta);
	if ($? == -1) {
		error "failed to run xdelta: $!";
	}
	if ($? & 127) {
		error "xdelta died with signal ".($? & 127);
	}
	my $ret=$? >> 8;
	# xdelta exits 1 on success if there were differences
	if ($ret != 1 && $ret != 0) {
		error "xdelta failed with return code $ret";
	}

	open(my $out, ">", "$tempdir/version") || die "$tempdir/version: $!";
	print {$out} "2.0\n";
	close($out) || die "$tempdir/version: $!";
	open($out, ">", "$tempdir/type") || die "$tempdir/type: $!";
	print {$out} "tar\n";
	close($out) || die "$tempdir/type: $!";

	doit("tar", "czf", $delta, "-C", $tempdir, @files);

	if ($stdout) {
		doit("cat", $delta);
	}
}

# Identify the revision control system of the current directory.
# Returns 'git' when a ".git" directory is present; any other case
# aborts through error().
sub vcstype {
	return 'git' if -d ".git";
	error("cannot determine type of vcs used for the current directory");
}

# Export a tree from revision control into a fresh temporary directory.
#
# Parameters:
#   $upstream - optional. A string containing a run of 40 alphanumeric
#               characters is used directly as the object id. Anything
#               else is looked up with "git show-ref". When undefined,
#               "refs/heads/upstream" is tried first, then "upstream".
#
# Returns the list ($dest, $id): the directory the tree was unpacked
# into, and the id that was exported.
#
# Aborts through error() if the ref is ambiguous or cannot be found, or
# if the export command fails.
sub export {
	my $upstream=shift;

	my $dest=tempdir();
	my $id;
	
	my $vcs=vcstype();
	if ($vcs eq "git") {
		if (defined $upstream && $upstream =~ /[A-Za-z0-9]{40}/) {
			$id=$upstream;
		}
		else {
			my @totry;
			if (! defined $upstream) {
				# try a specific name first, if that's not
				# present fall back to any upstream branch
				@totry=("refs/heads/upstream", "upstream");
			}
			else {
				@totry=($upstream);
			}
			foreach my $branch (@totry) {
				my @reflines=map { chomp; $_ } `git show-ref \Q$branch\E`;
				if (@reflines > 1) {
					error "more than one ref matches \"$branch\":\n".
						join("\n", @reflines);
				}
				elsif (@reflines) {
					# show-ref prints "<id> <refname>"
					($id)=$reflines[0]=~/^([A-Za-z0-9]+)\s/;
					last;
				}
			}
			if (! defined $id) {
				error "failed to find ref using: git show-ref @totry";
			}
		}

		# Both values are escaped the same way, so a temporary
		# directory path containing a quote or other shell
		# metacharacter cannot break the command.
		doit("git archive --format=tar \Q$id\E | (cd \Q$dest\E && tar x)");
	}
	else {
		die "unsupported vcs $vcs";
	}

	return ($dest, $id);
}

# Fetch the stored delta and tree id for a tarball from revision control.
#
# Parameters:
#   $tarball - tarball name; only its basename is used, to form the names
#              "<basename>.delta" and "<basename>.id".
#
# Returns the list ($delta, $id): the raw delta contents and the id
# recorded alongside it.
#
# The files are read from the local "pristine-tar" branch if one exists;
# otherwise from the single other ref that "git show-ref pristine-tar"
# reports. Aborts through error() if no such ref exists, if several
# candidates exist, or if either file cannot be read or is empty.
sub checkoutdelta {
	my $tarball=shift;

	my $branch="pristine-tar";
	my $deltafile=basename($tarball).".delta";
	my $idfile=basename($tarball).".id";

	my ($delta, $id);

	my $vcs=vcstype();
	if ($vcs eq "git") {
		# Check that a local $branch exists
		my @reflines=map { chomp; $_ } `git show-ref \Q$branch\E`;
		my @remotes=grep { ! m/ refs\/heads\/\Q$branch\E$/ } @reflines;
		if ($#reflines != $#remotes) {
			$branch="refs/heads/$branch";
		}
		else {
			if (@reflines == 0) {
				error "no $branch branch found, use \"pristine-tar commit\" first";
			}
			elsif (@remotes == 1) {
				($branch)=$remotes[0]=~/^[A-Za-z0-9]+\s(.*)/;
			}
			else {
				error "Several remote $branch branches exist. Run \"git branch --track $branch <remote>\" to create a local $branch branch\n".
					join("\n", @remotes);
			}
		}

		# $branch may now be a ref name taken from show-ref output,
		# so it is escaped like the file names before it reaches
		# the shell.
		$delta=`git show \Q$branch\E:\Q$deltafile\E`;
		if ($?) {
			error "git show $branch:$deltafile failed";
		}
		if (! length $delta) {
			error "git show $branch:$deltafile returned no content";
		}
		$id=`git show \Q$branch\E:\Q$idfile\E`;
		if ($?) {
			error "git show $branch:$idfile failed";
		}
		chomp $id;
		if (! length $id) {
			error "git show $branch:$idfile returned no id";
		}
	}
	else {
		die "unsupported vcs $vcs";
	}

	return ($delta, $id);
}

# Commit a delta and its tree id to the "pristine-tar" branch.
#
# Parameters:
#   $delta   - raw contents of the delta file.
#   $id      - id of the tree the delta applies to.
#   $tarball - tarball name; only its basename is used, to form the file
#              names "<basename>.delta" and "<basename>.id" and the
#              commit message.
#
# Side effects: sets GIT_INDEX_FILE and GIT_DIR in the environment and
# changes the current directory to a temporary directory.
#
# Aborts through error()/die if a file cannot be written, the fork fails,
# or any git command fails.
sub commitdelta {
	my $delta=shift;
	my $id=shift;
	my $tarball=shift;
	
	my $branch="pristine-tar";
	my $deltafile=basename($tarball).".delta";
	my $idfile=basename($tarball).".id";
	my $commit_message="pristine-tar data for ".basename($tarball);

	my $vcs=vcstype();
	if ($vcs eq "git") {
		my $tempdir=tempdir();
		# Three-argument open keeps unusual characters in the
		# tarball name from being read as part of the open mode;
		# close is checked so a failed write is not committed.
		open(my $out, ">", "$tempdir/$deltafile") || die "$tempdir/$deltafile: $!";
		binmode($out);
		print {$out} $delta;
		close($out) || die "$tempdir/$deltafile: $!";
		open($out, ">", "$tempdir/$idfile") || die "$tempdir/$idfile: $!";
		print {$out} "$id\n";
		close($out) || die "$tempdir/$idfile: $!";
			
		# Commit the delta to a branch in git without affecting the
		# index, and without touching the working tree. Aka deep 
		# git magick.
		$ENV{GIT_INDEX_FILE}="$tempdir/index";
		$ENV{GIT_DIR}=getcwd()."/.git";
		chdir($tempdir) || die "chdir: $!";
		my $branch_exists=(system("git show-ref --quiet --verify refs/heads/$branch") == 0);
		if ($branch_exists) {
			# Seed the temporary index with what is already on
			# the branch, so existing deltas are kept.
			doit("git ls-tree -r --full-name $branch | git update-index --index-info");
		}
		doit("git", "update-index", "--add", "--", $deltafile, $idfile);
		my $sha=`git write-tree`;
		if ($?) {
			error("git write-tree failed");
		}
		$sha=~s/\n//sg;
		if (! length $sha) {
			error("git write-tree did not return a sha");
		}
		# Fork a child whose standard input is fed by the parent;
		# git commit-tree in the child reads the commit message
		# from it.
		my $pid = open(my $commit, "|-");
		if (! defined $pid) {
			# open returns undef when the fork fails; without
			# this check the parent would run the child code.
			error("fork failed: $!");
		}
		if (! $pid) {
			# child
			my $commitopts=$branch_exists ? "-p $branch" : "";
			my $commitsha=`git commit-tree $sha $commitopts`;
			if ($?) {
				error("git commit-tree failed");
			}
			$commitsha=~s/\n//sg;
			if (! length $commitsha) {
				error("git commit-tree did not return a sha");
			}
			doit("git", "update-ref", "refs/heads/$branch", $commitsha);
			exit 0;
		}
		else {
			# parent
			print {$commit} $commit_message."\n";
			close($commit) || error("git commit-tree failed");
		}
		
		message("committed $deltafile to branch $branch");
	}
	else {
		die "unsupported vcs $vcs";
	}
}

# Implement "pristine-tar commit": generate a delta for a tarball and
# commit it to revision control.
#
# Parameters:
#   $tarball  - path of the upstream tarball.
#   $upstream - optional ref or id of the tree holding the tarball's
#               contents; passed to export().
#
# Aborts through error() if the fork fails or the delta cannot be
# generated; the helpers it calls abort on their own failures.
sub commit {
	my $tarball=shift;
	my $upstream=shift;

	my $tempdir=tempdir();
	my ($sourcedir, $id)=export($upstream);
	genmanifest($tarball, "$tempdir/manifest");
	recreatetarball($tempdir, $sourcedir, clobber_source => 1,
		create_missing => 1);
	# Fork a child that writes the delta to its standard output, which
	# the parent reads through the pipe.
	my $pid = open(my $gendelta, "-|");
	if (! defined $pid) {
		# open returns undef when the fork fails; without this
		# check the parent would run the child code and exit 0.
		error "fork failed: $!";
	}
	if (! $pid) {
		# child
		gendelta($tarball, "-");
		exit 0;
	}
	binmode($gendelta);
	# Slurp the whole delta. $/ is only undefined inside this block so
	# the setting does not leak into commitdelta.
	my $delta=do { local $/; <$gendelta> };
	close($gendelta) || error "failed to generate delta";
	commitdelta($delta, $id, $tarball);
}

# Implement "pristine-tar checkout": regenerate a tarball from the delta
# and tree id stored in revision control.
#
# Parameters:
#   $tarball - path of the tarball to create; its basename selects the
#              stored delta.
#
# Aborts through error() if the fork fails or the tarball cannot be
# generated; the helpers it calls abort on their own failures.
sub checkout {
	my $tarball=shift;
	
	my ($delta, $id)=checkoutdelta($tarball);
	my ($sourcedir, undef)=export($id);
	# Fork a child that runs gentar inside the exported tree and reads
	# the delta from its standard input, which the parent feeds.
	my $pid = open(my $gentar, "|-");
	if (! defined $pid) {
		# open returns undef when the fork fails; without this
		# check the parent would run the child code and exit 0.
		error "fork failed: $!";
	}
	if (! $pid) {
		# child
		# Resolve the output path before leaving the directory it
		# is relative to.
		$tarball=abs_path($tarball);
		chdir($sourcedir) || die "chdir $sourcedir: $!";
		gentar("-", $tarball, clobber_source => 1, create_missing => 1);
		exit 0;
	}
	# If the child dies before reading the whole delta, have the write
	# fail (and close report it) instead of SIGPIPE killing this
	# process silently. Set only in the parent, after the fork, so the
	# child and the commands it runs keep the default disposition.
	local $SIG{PIPE}='IGNORE';
	binmode($gentar);
	print {$gentar} $delta;
	close($gentar) || error "failed to generate tarball";

	message("successfully generated $tarball");
}

# Command-line entry point: parse the -v/-d/-k flags, then dispatch to
# the handler of the requested subcommand after checking its argument
# count. Any usage problem ends in usage(), which exits with status 1.
Getopt::Long::Configure("bundling");
GetOptions(
	"v|verbose!" => \$verbose,
	"d|debug!" => \$debug,
	"k|keep!" => \$keep,
) || usage();

usage() if ! @ARGV;
my $command=shift;

# Each subcommand maps to its handler and a check of the number of
# arguments left in @ARGV.
my %subcommands=(
	gentar   => [ \&gentar,   sub { @ARGV == 2 } ],
	gendelta => [ \&gendelta, sub { @ARGV == 2 } ],
	commit   => [ \&commit,   sub { @ARGV >= 1 } ],
	checkout => [ \&checkout, sub { @ARGV == 1 } ],
);

my $entry=$subcommands{$command};
if (! defined $entry) {
	print STDERR "Unknown subcommand \"$command\"\n";
	usage();
}

my ($handler, $args_ok)=@$entry;
usage() unless $args_ok->();
$handler->(@ARGV);
