#! /usr/bin/perl # count-elements-in-training-packages.pl # The TAFE qualification "Certificate 4 in Workplace Training and # Assessment" is required for all TAFE teachers. # Part of this course requires an assignment that involves workplace # assessment. That requires that you select a "Training Package" from # http://www.ntis.gov.au/. There is a vast number of these; I # estimated about 22400. To reduce the complexity of the assignment, # it is better to select a training package that has a minimal number # of elements of competency. This program downloads the html pages # about each training package, and counts the number of elements of # competency. The format of the output (to standard output) is: # num_elements tab national_code tab name_of_unit tab URL_of_unit # This can be sorted with # sort -s -k1,1nr # to sort the units by order of number of elements. # TODO: it would be nice to count the number of bullet points under # each element, or perhaps the total number of bullet points for each unit. # Usage: # call with a filename containing html from START (URL defined below), # otherwise it will download the huge main web page as well as the # individual web pages for the training packages. # Copyright (C) 2005 Nick Urbanik # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. use warnings; use strict; use URI; use LWP::Simple; use HTML::TableContentParser; use Getopt::Long; use File::Basename; $| = 1; use constant START => 'http://www.ntis.gov.au/cgi-bin/waxhtml/' . '~ntis2/std.wxh?page=8&usedname=unit' . '&standard_name=&standard_code=&submit=Search'; use constant SHORT_START => "http://www.ntis.gov.au/cgi-bin/waxhtml/~ntis2/" . "pkg.wxh?page=82&inputRef=11"; use constant DEBUG => 0; sub usage() { my $prog = basename $0; die <([^<]+)<\s*/a\s*>!s ) { my ( $url, $text ) = ( $1, $2 ); return ( $url, $text ); } warn "Cannot parse link '$link'\n"; return; } sub read_units_of_competency($) { my ( $url ) = @_; my $document; if ( @ARGV ) { my $filename = shift @ARGV; open UNITS, '<', $filename or die "Cannot open $filename: $!"; { local $/; $document = ; } close UNITS or die "Cannot close $filename: $!"; } else { $document = get $url or die "Unable to get $url: $!"; } return $document; } sub complete_url($$) { my ( $urltext, $host ) = @_; my $url = URI->new( $urltext ); unless ( $url->scheme ) { $url->scheme( 'http' ); } return unless $url->scheme eq 'http'; unless ( $url->host ) { $url->host( $host ); } return $url; } # Each unit of competency can be a single table; for example, START # has about 22439 tables. This goes against the idea of examining all # the rows of one particular table :-) # We can avoid downloading all the elements of competence by reading # it from a file. # The strategy here is to look at each row of each table after finding # that the first element of the first row of any table contains # "National Code". For all rows of all tables after that, if the row # matches the pattern of a unit of competence, follow the link, pasing # the name of the unit of competency. # To process a unit of competency, we look at each row of each table # after finding that the first element is "National Code", and if the # row matches the pattern of an element of competence, we increment # the count of elements. # We print the number of elements, the national code, and the name of # the unit of competency. sub process_one_unit($$$$) { my ( $document, $code, $name, $url ) = @_; my $tcp = HTML::TableContentParser->new; my $tables = $tcp->parse( $document ); my $found_start; my $element_count = 0; TABLE: foreach my $t ( @$tables ) { next TABLE unless $found_start or $t->{rows}[0]{cells}[0]{data} =~ /National Code/; $found_start = 1; ROW: foreach my $row ( @{$t->{rows}} ) { next ROW if $row->{cells}[0]{data} =~ /National Code/; my ( $ecode, $name ) = ( $row->{cells}[0]{data}, $row->{cells}[1]{data} ); print "ECODE: '$ecode', NAME: '$name'\n" if DEBUG > 1; next ROW unless $ecode =~ s!^(?:<.*?>\s*)?([A-Z\d./]+)(?:\s*<.*?>)?$!$1!; ++$element_count; } } print "$element_count\t$code\t$name\t$url\n"; } sub fetch_and_process_one_unit($$$$) { my ( $url, $host, $code, $name ) = @_; $url = complete_url $url, $host; my $document = get $url or warn "Unable to get $url: $!" and return; # my $document = ""; print "EXAMINING URL: $url:\n" if DEBUG > 1; process_one_unit $document, $code, $name, $url; } sub process_units($$$) { my ( $document, $host, $skipto ) = @_; my $tablecount; my $tcp = HTML::TableContentParser->new; my $tables = $tcp->parse( $document ); my $found_start; my $found_skipto = 1 unless $skipto; TABLE: foreach my $t ( @$tables ) { print "Table @{[++$tablecount]}:\n" if DEBUG > 1; next TABLE unless $found_start or $t->{rows}[0]{cells}[0]{data} =~ /National Code/; $found_start = 1; ROW: foreach my $row ( @{$t->{rows}} ) { my ( $code, $link ) = ( $row->{cells}[0]{data}, $row->{cells}[1]{data} ); if ( $code =~ /National Code/ ) { if ( $code =~ m!National Code: (\S+)!) { $code = $1; } else { warn "Cannot parse '$code'\n"; } my ( undef, $name ) = parse_link $link; warn "bad name '$name' with $code\n" and next ROW unless $name; print "NAME: $code\t$name\n"; next ROW unless $found_skipto or $name =~ /$skipto/io; $found_skipto = 1; next ROW; } next ROW unless $found_skipto; print "CODE: '$code', LINK: '$link'\n" if DEBUG > 1; next ROW unless $code =~ s!^(?:<.*?>\s*)?([A-Z\d./]+)(?:\s*<.*?>)?$!$1!; my ( $url, $text ) = parse_link $link; if ( DEBUG > 1 ) { print "URL: '$url' " if $url; print "TEXT: '$text'" if $text; print "\n" if $url or $text; } next unless $url; print "CODE is '$code', URL is '$url', TEXT IS '$text'\n" if DEBUG > 1; fetch_and_process_one_unit $url, $host, $code, $text if $found_start; } } } # Return the host part of the URL passed to it. # Assumes that URL *does* have a host part. sub host($) { my ( $url ) = @_; $url = URI->new( $url ); return $url->host; } sub main() { my $skipto; GetOptions( "skipto=s" => \$skipto ) or usage; my $host = host START; process_units read_units_of_competency( START ), $host, $skipto; } main; __END__