#! /usr/bin/perl

# count-elements-in-training-packages.pl
# The TAFE qualification "Certificate 4 in Workplace Training and
# Assessment" is required for all TAFE teachers.

# Part of this course requires an assignment that involves workplace
# assessment.  That requires that you select a "Training Package" from
# http://www.ntis.gov.au/.  There is a vast number of these; I
# estimated about 22400.  To reduce the complexity of the assignment,
# it is better to select a training package that has a minimal number
# of elements of competency.  This program downloads the html pages
# about each training package, and counts the number of elements of
# competency.  The format of the output (to standard output) is:

# num_elements tab national_code tab name_of_unit tab URL_of_unit

# This can be sorted with

# sort -s -k1,1nr

# to sort the units by order of number of elements.

# TODO: it would be nice to count the number of bullet points under
# each element, or perhaps the total number of bullet points for each unit.

# Usage:

# call with a filename containing html from START (URL defined below),
# otherwise it will download the huge main web page as well as the
# individual web pages for the training packages.

# Copyright (C) 2005  Nick Urbanik <nicku@nicku.org>

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

use warnings;
use strict;
use URI;
use LWP::Simple;
use HTML::TableContentParser;
use Getopt::Long;
use File::Basename;

$| = 1;

use constant START => 'http://www.ntis.gov.au/cgi-bin/waxhtml/'
    . '~ntis2/std.wxh?page=8&usedname=unit'
    . '&standard_name=&standard_code=&submit=Search';
use constant SHORT_START => "http://www.ntis.gov.au/cgi-bin/waxhtml/~ntis2/"
            . "pkg.wxh?page=82&inputRef=11";

use constant DEBUG => 0;

sub usage() {
    my $prog = basename $0;
    die <<USAGE;
usage: $prog --skipto=SKIPTO [html-file]
SKIPTO is the starting point of the search as a Competency Standard Name,
such as "Information Technology".  Skips all Units of Competency before
finding that title.

html-file is a filename containing html from the URL
@{[START]}
START (URL defined below),
otherwise it will download the huge main web page from that URL as well as the
individual web pages for the training packages.

The TAFE qualification "Certificate 4 in Workplace Training and
Assessment" is required for all TAFE teachers.

Part of this course requires an assignment that involves workplace
assessment.  That requires that you select a "Training Package" from
http://www.ntis.gov.au/.  There is a vast number of these; I
estimated about 22400.  To reduce the complexity of the assignment,
it is better to select a training package that has a minimal number
of elements of competency.  This program downloads the html pages
about each training package, and counts the number of elements of
competency.  The format of the output (to standard output) is:

num_elements tab national_code tab name_of_unit tab URL_of_unit

This can be sorted with

sort -s -k1,1nr

to sort the units by order of number of elements.
USAGE
}

sub parse_link($) {
    my ( $link ) = @_;
    if ( $link =~ m!<a\s+href="([^"]+)"\s*>([^<]+)<\s*/a\s*>!s ) {
        my ( $url, $text ) = ( $1, $2 );
        return ( $url, $text );
    }
    warn "Cannot parse link '$link'\n";
    return;
}

sub read_units_of_competency($) {
    my ( $url ) = @_;
    my $document;
    if ( @ARGV ) {
        my $filename = shift @ARGV;
        open UNITS, '<', $filename or die "Cannot open $filename: $!";
        {
            local $/;
            $document = <UNITS>;
        }
        close UNITS or die "Cannot close $filename: $!";
    } else {
        $document = get $url or die "Unable to get $url: $!";
    }
    return $document;
}

sub complete_url($$) {
    my ( $urltext, $host ) = @_;
    my $url = URI->new( $urltext );
    unless ( $url->scheme ) {
        $url->scheme( 'http' );
    }
    return unless $url->scheme eq 'http';
    unless ( $url->host ) {
        $url->host( $host );
    }
    return $url;
}

# Each unit of competency can be a single table; for example, START
# has about 22439 tables.  This goes against the idea of examining all
# the rows of one particular table :-)

# We can avoid downloading all the elements of competence by reading
# it from a file.

# The strategy here is to look at each row of each table after finding
# that the first element of the first row of any table contains
# "National Code".  For all rows of all tables after that, if the row
# matches the pattern of a unit of competence, follow the link, pasing
# the name of the unit of competency.

# To process a unit of competency, we look at each row of each table
# after finding that the first element is "National Code", and if the
# row matches the pattern of an element of competence, we increment
# the count of elements.

# We print the number of elements, the national code, and the name of
# the unit of competency.

sub process_one_unit($$$$) {
    my ( $document, $code, $name, $url ) = @_;
    my $tcp = HTML::TableContentParser->new;
    my $tables = $tcp->parse( $document );
    my $found_start;
    my $element_count = 0;
 TABLE: foreach my $t ( @$tables ) {
        next TABLE unless $found_start
            or $t->{rows}[0]{cells}[0]{data} =~ /National Code/;
        $found_start = 1;
    ROW: foreach my $row ( @{$t->{rows}} ) {
            next ROW if $row->{cells}[0]{data} =~ /National Code/;
            my ( $ecode, $name ) = ( $row->{cells}[0]{data},
                                     $row->{cells}[1]{data} );
            print "ECODE: '$ecode', NAME: '$name'\n" if DEBUG > 1;
            next ROW unless
                $ecode =~ s!^(?:<.*?>\s*)?([A-Z\d./]+)(?:\s*<.*?>)?$!$1!;
            ++$element_count;
        }
    }
    print "$element_count\t$code\t$name\t$url\n";
}

sub fetch_and_process_one_unit($$$$) {
    my ( $url, $host, $code, $name ) = @_;
    $url = complete_url $url, $host;
    my $document = get $url
        or warn "Unable to get $url: $!" and return;
    # my $document = "";
    print "EXAMINING URL: $url:\n" if DEBUG > 1;
    process_one_unit $document, $code, $name, $url;
}

sub process_units($$$) {
    my ( $document, $host, $skipto ) = @_;
    my $tablecount;
    my $tcp = HTML::TableContentParser->new;
    my $tables = $tcp->parse( $document );
    my $found_start;
    my $found_skipto = 1 unless $skipto;
 TABLE: foreach my $t ( @$tables ) {
        print "Table @{[++$tablecount]}:\n" if DEBUG > 1;
        next TABLE unless $found_start
            or $t->{rows}[0]{cells}[0]{data} =~ /National Code/;
        $found_start = 1;
    ROW: foreach my $row ( @{$t->{rows}} ) {
            my ( $code, $link ) = ( $row->{cells}[0]{data},
                                    $row->{cells}[1]{data} );
            if ( $code =~ /National Code/ ) {
                if ( $code =~ m!National Code:&nbsp</b>(\S+)!) {
                    $code = $1;
                } else {
                    warn "Cannot parse '$code'\n";
                }
                my ( undef, $name ) = parse_link $link;
                warn "bad name '$name' with $code\n" and next ROW unless $name;
                print "NAME: $code\t$name\n";
                next ROW unless $found_skipto or $name =~ /$skipto/io;
                $found_skipto = 1;
                next ROW;
            }
            next ROW unless $found_skipto;
            print "CODE: '$code', LINK: '$link'\n" if DEBUG > 1;
            next ROW
                unless $code =~ s!^(?:<.*?>\s*)?([A-Z\d./]+)(?:\s*<.*?>)?$!$1!;
            my ( $url, $text ) = parse_link $link;
            if ( DEBUG > 1 ) {
                print "URL: '$url' " if $url;
                print "TEXT: '$text'" if $text;
                print "\n" if $url or $text;
            }
            next unless $url;
            print "CODE is '$code', URL is '$url', TEXT IS '$text'\n"
                if DEBUG > 1;
            fetch_and_process_one_unit $url, $host, $code, $text
                if $found_start;
        }
    }
}

# Return the host part of the URL passed to it.
# Assumes that URL *does* have a host part.
sub host($) {
    my ( $url ) = @_;
    $url = URI->new( $url );
    return $url->host;
}

sub main() {
    my $skipto;
    GetOptions( "skipto=s" => \$skipto ) or usage;
    my $host = host START;
    process_units read_units_of_competency( START ), $host, $skipto;
}

main;

__END__