#!/usr/bin/perl # This is based on eg/hrefsub with HTML::Parser. # The idea of grabbing the text between the style tags is from # http://www.foo.be/docs/tpj/issues/vol5_1/tpj0501-0003.html # Copyright (C) 2008 Nick Urbanik # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # How we want to process the URLs: # If this is a link or other URL tag: # Leave it if the link is to "./" and the class is topParent or parent. # If we are inside a style tag, edit the URL in the @import url(...) # thing. # if this is a comment, if it is of the form #include virtual="...", # edit the URL. This follows a convention I set up some time ago with # my web sites. Stuff that I include using Apache SSI is in a filename # of the form _top_level_xxxxx.html or _level_1_xxxxx.html, etc. # Here we munge that filename to match that convention. use strict; use warnings; use HTML::Parser (); use URI; use Getopt::Long; use Data::Dumper; my $level_num = 0; my $debug = 0; sub usage { ( my $prog = $0 ) =~ s{.*/}{}; print < The LEVEL_NUM is the number of levels of directory below the top level. For example, http://linus.nicku.org/ is at the top level (LEVEL_NUM = 0) while http://linus.nicku.org/BFG/ is at level 1 (LEVEL_NUM = 1) while http://linus.nicku.org/BFG/giants/ is at level 2 (LEVEL_NUM = 2) This code will blindly strip all leading ./ or ../ from the start of any URL and replace it with '../' x LEVEL_NUM It blindly assumes that all URLs refer to items at the top level. Default LEVEL_NUM: $level_num END_USAGE exit 1; } GetOptions( 'level=i' => \$level_num, 'debug+' => \$debug, help => sub { usage }, ) or usage; # Construct a hash of tag names that may have links. my %link_attr; { # To simplify things, reformat the %HTML::Tagset::linkElements # hash so that it is always a hash of hashes. require HTML::Tagset; no warnings; while (my($k,$v) = each %HTML::Tagset::linkElements) { if (ref($v)) { $v = { map {$_ => 1} @$v }; } else { $v = { $v => 1}; } $link_attr{$k} = $v; } use warnings; # To see what HTML::Tagset::linkElements thinks are # the tags with link attributes use Data::Dump; Data::Dump::dump(\%link_attr) if $debug } my $input_html_file; # This munging is very simple minded. # Generally we run this program in the directory where the html file is. # The following does not apply when we are working with the top level files # beginning with '_' such as _level_1_sidebar.html. # What we should really do is to see if the file is at the level # that is currently there, otherwise see if it exists at the munged level, # else warn. sub munge_url { my ( $url ) = @_; $url =~ s{^(?:\.?\.?/)*}{}; # Leave it if it seems to be absolute: return $url if $url =~ m{^(?:https?:|mailto:|ftp:|#)}; ( my $bare_url = $url ) =~ s{#.*}{}; return $url if -e $bare_url and $input_html_file !~ m{^_}; $url = ( '../' x $level_num ) . $url; warn "Cannot find '$url'\n" unless -e ( '../' x $level_num ) . $bare_url; return $url; } # Flag indicates if we are between start and end tags of style. # We want to search the text for @import url(...) my $in_style; # The handler that is called when a starting tag is found sub start_tag { my( $tagname, $pos, $text, $attr ) = @_; print "tagname='$tagname', ", Dumper( $pos ), ", text='$text', ", Dumper( $attr ), "===\n" if $debug; if ( lc $tagname eq 'style' ) { $in_style = 1; } # Now just select tags which provide links: if ( my $link_attr = $link_attr{$tagname} ) { # Don't modify favicon links: #if ( $tagname eq 'link' && $attr->{rel} =~ m{\bicon\b} ) { # print $text; # return; #} # Leave parent tags that point to "./" alone. # These are menu items that just exist as parent menu items. if ( lc $tagname eq 'a' && exists $attr->{class} && defined $attr->{class} && $attr->{class} =~ m{parent}xmsi && $attr->{href} eq q{./} ) { print $text; return; } while ( 4 <= @$pos ) { # use attribute sets from right to left # to avoid invalidating the offsets # when replacing the values my ( $k_offset, $k_len, $v_offset, $v_len ) = splice( @$pos, -4 ); my $attrname = lc substr($text, $k_offset, $k_len); next unless $link_attr->{$attrname}; next unless $v_offset; # 0 v_offset means no value my $v = substr( $text, $v_offset, $v_len ); $v =~ s/^([\'\"])(.*)\1$/$2/; my $new_v = munge_url($v, $attrname, $tagname); next if $new_v eq $v; $new_v =~ s/\"/"/g; # since we quote with "" substr( $text, $v_offset, $v_len ) = qq("$new_v"); } } print $text; } # Set up the parser. my $p = HTML::Parser->new(api_version => 3); $p->handler( text => sub { my ( $text ) = @_; if ( $in_style && $text =~ m{\burl\(\s*.*?\s*\)}xms ) { #print "\n*********MATCH**********\n"; $text =~s{\burl\(\s*(.*?)\s*\)}{ 'url(' . munge_url($1) . ')'}gexms; } print $text; }, 'text' ); $p->handler( end => sub { my ( $tagname, $text ) = @_; if ( lc $tagname eq 'style' ) { $in_style = 0; } print $text; }, 'tagname, text' ); $p->handler( comment => sub { my ( $text ) = @_; if ( $text =~ m{#include virtual="(.*?)"} ) { my $url = munge_url $1; if ( $level_num == 0 ) { $url =~ s{_level_\d+}{_top_level}; } elsif ( $url =~ s{_top_level}{_level_$level_num}xms ) { } elsif ( $url =~ s{_level_\d+}{_level_$level_num}xms ) { } $text = qq{}; } elsif ( $text =~ m{\burl\(.*\)}xms ) { # We have some crap to work around IE in a comment: $text =~s{\burl\(\s*(.*?)\s*\)}{ 'url(' . munge_url($1) . ')'}gexms; } print $text; }, 'text' ); # The default is to print everything as is. $p->handler(default => sub { print @_ }, "text"); # All links are found in start tags. # This handler $p->handler( start => \&start_tag, 'tagname, tokenpos, text, attr' ); # Parse the file passed in from the command line $input_html_file = shift || usage(); $p->parse_file( $input_html_file ) or die "Can't open file $input_html_file: $!";