#!/usr/bin/perl

# This is based on eg/hrefsub with HTML::Parser.

# The idea of grabbing the text between the style tags is from
# http://www.foo.be/docs/tpj/issues/vol5_1/tpj0501-0003.html

# Copyright (C) 2008  Nick Urbanik <nicku@nicku.org>

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

# How we want to process the URLs:

# If this is a link or other URL tag:

# Leave it if the link is to "./" and the class is topParent or parent.

# If we are inside a style tag, edit the URL in the @import url(...)
# thing.

# if this is a comment, if it is of the form #include virtual="...",
# edit the URL.  This follows a convention I set up some time ago with
# my web sites.  Stuff that I include using Apache SSI is in a filename
# of the form _top_level_xxxxx.html or _level_1_xxxxx.html, etc.
# Here we munge that filename to match that convention.


use strict;
use warnings;

use HTML::Parser ();
use URI;
use Getopt::Long;
use Data::Dumper;

my $level_num = 0;
my $debug = 0;

sub usage {
    ( my $prog = $0 ) =~ s{.*/}{};
    print <<END_USAGE;
Usage: $prog [--level=LEVEL_NUM] <filename>

The LEVEL_NUM is the number of levels of directory below the top level.
For example, http://linus.nicku.org/ is at the top level (LEVEL_NUM = 0)
while http://linus.nicku.org/BFG/ is at level 1 (LEVEL_NUM = 1)
while http://linus.nicku.org/BFG/giants/ is at level 2 (LEVEL_NUM = 2)

This code will blindly strip all leading ./ or ../ from the start of any URL
and replace it with '../' x LEVEL_NUM
It blindly assumes that all URLs refer to items at the top level.

Default LEVEL_NUM: $level_num
END_USAGE
    exit 1;
}

GetOptions(
    'level=i' => \$level_num,
    'debug+' => \$debug,
    help => sub { usage },
) or usage;

# Construct a hash of tag names that may have links.
my %link_attr;
{
    # To simplify things, reformat the %HTML::Tagset::linkElements
    # hash so that it is always a hash of hashes.
    require HTML::Tagset;
    no warnings;
    while (my($k,$v) = each %HTML::Tagset::linkElements) {
	if (ref($v)) {
	    $v = { map {$_ => 1} @$v };
	}
	else {
	    $v = { $v => 1};
	}
	$link_attr{$k} = $v;
    }
    use warnings;
    # To see what HTML::Tagset::linkElements thinks are
    # the tags with link attributes
    use Data::Dump; Data::Dump::dump(\%link_attr) if $debug
}

my $input_html_file;

# This munging is very simple minded.
# Generally we run this program in the directory where the html file is.
# The following does not apply when we are working with the top level files
# beginning with '_' such as _level_1_sidebar.html.

# What we should really do is to see if the file is at the level
# that is currently there, otherwise see if it exists at the munged level,
# else warn.
sub munge_url {
    my ( $url ) = @_;
    $url =~ s{^(?:\.?\.?/)*}{};
    # Leave it if it seems to be absolute:
    return $url if $url =~ m{^(?:https?:|mailto:|ftp:|#)};
    ( my $bare_url = $url ) =~ s{#.*}{};
    return $url if -e $bare_url and $input_html_file !~ m{^_};
    $url = ( '../' x $level_num ) . $url;
    warn "Cannot find '$url'\n" unless -e ( '../' x $level_num ) . $bare_url;
    return $url;
}

# Flag indicates if we are between start and end tags of style.
# We want to search the text for @import url(...)
my $in_style;

# The handler that is called when a starting tag is found
sub start_tag {
    my( $tagname, $pos, $text, $attr ) = @_;
    print "tagname='$tagname', ", Dumper( $pos ), ", text='$text', ",
	Dumper( $attr ), "===\n" if $debug;

    if ( lc $tagname eq 'style' ) {
	$in_style = 1;
    }

    # Now just select tags which provide links:
    if ( my $link_attr = $link_attr{$tagname} ) {
	# Don't modify favicon links:
	#if ( $tagname eq 'link' && $attr->{rel} =~ m{\bicon\b} ) {
	#    print $text;
	#    return;
	#}
	# Leave parent tags that point to "./" alone.
	# These are menu items that just exist as parent menu items.
	if ( lc $tagname eq 'a'
		 && exists $attr->{class}
		 && defined $attr->{class}
		 && $attr->{class} =~ m{parent}xmsi
		 && $attr->{href} eq q{./}
		 ) {
	    print $text;
	    return;
	}
	while ( 4 <= @$pos ) {
	    # use attribute sets from right to left
	    # to avoid invalidating the offsets
	    # when replacing the values
	    my ( $k_offset, $k_len, $v_offset, $v_len ) = splice( @$pos, -4 );
	    my $attrname = lc substr($text, $k_offset, $k_len);
	    next unless $link_attr->{$attrname};
	    next unless $v_offset; # 0 v_offset means no value
	    my $v = substr( $text, $v_offset, $v_len );
	    $v =~ s/^([\'\"])(.*)\1$/$2/;
	    my $new_v = munge_url($v, $attrname, $tagname);
	    next if $new_v eq $v;
	    $new_v =~ s/\"/&quot;/g;  # since we quote with ""
	    substr( $text, $v_offset, $v_len ) = qq("$new_v");
	}
    }
    print $text;
}


# Set up the parser.
my $p = HTML::Parser->new(api_version => 3);

$p->handler(
    text => sub {
	my ( $text ) = @_;
	if ( $in_style && $text =~ m{\burl\(\s*.*?\s*\)}xms ) {
	    #print "\n*********MATCH**********\n";
	    $text =~s{\burl\(\s*(.*?)\s*\)}{
		'url(' . munge_url($1) . ')'}gexms;
	}
	print $text;
    },
    'text'
);

$p->handler(
    end => sub {
	my ( $tagname, $text ) = @_;
	if ( lc $tagname eq 'style' ) {
	    $in_style = 0;
	}
	print $text;
    },
    'tagname, text'
);

$p->handler(
    comment => sub {
	my ( $text ) = @_;
	if ( $text =~ m{#include virtual="(.*?)"} ) {
	    my $url = munge_url $1;
	    if ( $level_num == 0 ) {
		$url =~ s{_level_\d+}{_top_level};
	    } elsif ( $url =~ s{_top_level}{_level_$level_num}xms ) {
	    } elsif ( $url =~ s{_level_\d+}{_level_$level_num}xms ) {
	    }
	    $text = qq{<!--#include virtual="$url" -->};
	} elsif ( $text =~ m{\burl\(.*\)}xms ) {
	    # We have some crap to work around IE in a comment:
	    $text =~s{\burl\(\s*(.*?)\s*\)}{
		'url(' . munge_url($1) . ')'}gexms;
	}
	print $text;
    },
    'text'
);

# The default is to print everything as is.
$p->handler(default => sub { print @_ }, "text");


# All links are found in start tags.
# This handler 
$p->handler(
    start => \&start_tag,
    'tagname, tokenpos, text, attr'
);

# Parse the file passed in from the command line
$input_html_file = shift || usage();
$p->parse_file( $input_html_file )
    or die "Can't open file $input_html_file: $!";