#! /usr/bin/perl -w # (C) 2003 Nick Urbanik # Copyright terms: GNU General Public Licence (GPL). # Uses the MSRP1107R format PDF files after run through pdftotext -layout # Copyright (C) 2004 Nick Urbanik # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # BUGS: doesn't rearrange the headers # Company and address are wrapped to a fixed constant WRAP_BY rather than # one character less than the column width # Dates are fixed and don't vary use warnings; use strict; use FindBin; use FindBin qw($RealBin); use lib "$RealBin/"; use HKID; use Rand::Dist::Weighted qw(&rand_dist_weighted); use constant PROCESS_OLD_CLIPPER_RECORDS => 0; use constant PROCESS_MSRP1107R => 1; # This is a hard coded width to wrap company names and addresses by: use constant WRAP_BY => 16; use constant MAX_CLASS => 150; use constant MAX_PAGE => 10; use constant RAND_PAGE => 4; use constant RAND => 20; use constant PERCENT_COMPANIES => 95; my $course; my $year; open COMP, "< companylist.txt" or die "Cannot open companylist.txt: $!"; our @companies = ; close COMP; open NAMES, "< names-not-ict.txt" or die "Cannot open names-not-ict.txt: $!"; our @names = ; close NAMES; open ADDRESSES, "<", "addresslist.txt" or die "Cannot open addresslist.txt: $!"; our @addresses = ; close ADDRESSES; sub gen_company() { my $company = $companies[ rand @companies ]; chomp $company; return $company; } sub gen_address() { my $address = $addresses[ rand @addresses ]; chomp $address; return $address; } sub gen_name() { my $name = $names[ rand @names ]; chomp $name; # Remove any commas after the first name only: $name =~ s/([^\s,]+),?\s(.*)/$1 $2/; return $name; } # Probability weights of relation being: my %relationships = ( PARENT => 10, FATHER => 21, MOTHER => 60, SISTER => 5, BROTHER => 1, GRANDMOTHER => 2, GRANDFATHER => 1 ); # The percentage chance that the relative has the same family name # as the student: my %same_name_chance = ( PARENT => 30, FATHER => 99, MOTHER => 10, SISTER => 100, BROTHER => 100, GRANDMOTHER => 5, GRANDFATHER => 50 ); my @relationships_order = sort { $relationships{$a} <=> $relationships{$b} } keys %relationships; my $relationships_weight = 0; foreach ( @relationships_order ) { $relationships_weight += $relationships{$_}; } sub gen_relationship() { return rand_dist_weighted( \%relationships, \@relationships_order, $relationships_weight ); } # Pass the student name, and the relationship. # Relative names are all in capitals. sub gen_relative($$) { my ( $student_name, $relationship ) = @_; my $relative_name = gen_name; return unless $relationship and exists $same_name_chance{$relationship}; if ( rand 100 <= $same_name_chance{$relationship} ) { # Get the student's family name: ( my $family_name = $student_name ) =~ s/^([^\s,]+),?\s.*/$1/; # Leave the space before the given names: $relative_name =~ s/\S+,?(\s.*)/$1/; $relative_name = $family_name . $relative_name; } return uc $relative_name; } sub gen_hkid() { my $hkid = join '', ( 'A'..'Z' )[ rand 26 ], ( '0'..'9' )[ rand 10, rand 10, rand 10, rand 10, rand 10, rand 10 ], '(?)'; return scalar HKID::verify_hkids( $hkid ); } my %second_stunum_digit_weights = ( 0 => 1, 1 => 5, 2 => 10, 3 => 20, 4 => 10, ); sub gen_student_id() { return join '', '0', rand_dist_weighted( \%second_stunum_digit_weights ), ( '0'..'9' )[ rand 10, rand 10, rand 10, rand 10, rand 10, rand 10, rand 10 ]; } sub gen_phone_num() { my %phone_digit_weights = ( 2 => 20, 3 => 1, 9 => 2 ); return join '', rand_dist_weighted( \%phone_digit_weights ), ( '0'..'9' )[ rand 10, rand 10, rand 10, rand 10, rand 10, rand 10, rand 10 ]; } sub gen_ext() { return '' unless rand 6 > 4; return ( '0'..'9' )[ rand 10, rand 10, rand 10, rand 10 ] if rand 2 > 1; return ( '0'..'9' )[ rand 10, rand 10, rand 10 ]; } sub gen_registration_status() { my %registration_status_weights = ( L => 20, P => 1, D => 1, S => 1, ' ' => 900 ); return rand_dist_weighted( \%registration_status_weights ); } sub gen_academic_status() { my %academic_status_weights = ( N => 5, T => 1, R => 4, ' ' => 90 ); return rand_dist_weighted( \%academic_status_weights ); } sub gen_gender() { my %gender_weights = ( 'M' => 7, 'F' => 3 ); return rand_dist_weighted( \%gender_weights ); } sub gen_home_phone() { return gen_phone_num() } sub gen_guardian_phone() { return gen_phone_num() } sub gen_class_number($) { return shift; } sub gen_registration_date() { return "10-AUG-03"; } sub gen_effective_date() { return "10-AUG-03"; } # The format is an array of arrays of hashes. our @msrp1107r_format = ( # First line of record: [ { name => "class_number", col => 1, }, { name => "registration_status", col => 4, }, { name => "academic_status", col => 7, }, { name => "name", col => 10, }, { name => "gender", col => "38=>10,39=>90", shift => 1, }, { name => "student_id", col => 41, }, { name => "hkid", col => 52, }, { name => "relative", col => 98, }, ], # Second line of record: [ { name => "relationship", col => 98, }, ], # Third line of record: [ { name => "registration_date", col => 10, }, { name => "effective_date", col => 23, }, { name => "home_phone", col => 51, }, ], # Fourth line of record: [ { name => "guardian_phone", col => 98, }, ] ); our %gen_functions = ( class_number => \&gen_class_number, registration_status => \&gen_registration_status, academic_status => \&gen_academic_status, name => \&gen_name, gender => \&gen_gender, student_id => \&gen_student_id, hkid => \&gen_hkid, relative => \&gen_relative, relationship => \&gen_relationship, registration_date => \&gen_registration_date, effective_date => \&gen_effective_date, home_phone => \&gen_home_phone, guardian_phone => \&gen_guardian_phone, company1 => \&gen_company, company2 => \&gen_company, company3 => \&gen_company, company4 => \&gen_company, address1 => \&gen_address, address2 => \&gen_address, address3 => \&gen_address, address4 => \&gen_address, ); sub read_format($) { my $format_file_name = shift; my @format; open FMT, "<", $format_file_name or die "Unable to open $format_file_name: $!"; # Paragraph mode: local $/ = ""; while ( ) { my @line = (); # split the record into lines, thowing away comment lines: my @lines_in = grep !/^\s*#/, split( "\n" ); foreach my $line ( @lines_in ) { my ( $name, $col, $shift ) = split " ", $line; warn "Don't know $name\n" unless exists $gen_functions{$name}; push @line, { name => $name, col => $col, shift => $shift }; } push @format, [ @line ]; } close FMT; return \@format; } sub find_col($) { my $dist = shift; return $dist if $dist =~ /^\d+$/; $dist =~ s/=>/,/g; my @pairs = split /,/, $dist; return rand_dist_weighted( { @pairs } ); } # Return the column that has the highest weight: sub most_likely_col($) { my $dist = shift; return $dist if $dist =~ /^\d+$/; my @pairs = split /,/, $dist; my ( $most_likely_col, $max_weight ) = split /=>/, shift @pairs; foreach my $pair ( @pairs ) { my ( $col, $weight ) = split /=>/, $pair; ( $most_likely_col, $max_weight ) = ( $col, $weight ) if $weight > $max_weight; } return $most_likely_col; } use Text::Wrap; # Note that any newlines are thrown away here. Each line has no # newline at the end. Returns a list of lines. sub wrap_text($$) { my ( $text, $width ) = @_; local $Text::Wrap::columns = $width; return split /\n/, wrap('', '', $text ); } # Note that this scheme fails in one particular respect: when one data # item depends on or is equal to another, such as the relationship and # the relative's name, or the registration and effective dates (which # are the same in the data I have) # A smarter way would be to make all data "exceptional" by generating # it ahead of formatting it. That would be much more flexible and do # away with special cases. For example, having multiline items would # be easier. The problem is that we don't know what data is required # until we read the format. our $percent_companies = PERCENT_COMPANIES; sub gen_one_msr1107r_student_record($*\@) { my ($last_student_class_number, $outfh, $format ) = @_; my $relationship = gen_relationship; my $name = gen_name; my $relative = gen_relative( $name, $relationship ); # 20% chance of having a company: my @company = wrap_text gen_company, WRAP_BY; my @address = wrap_text gen_address, WRAP_BY; if ( rand 100 > $percent_companies ) { @company = @address = ( " " ) x 4; } my %exceptional_data_items = ( relative => $relative, relationship => $relationship, name => $name, company1 => $company[ 0 ], company2 => $company[ 1 ], company3 => $company[ 2 ], company4 => $company[ 3 ], address1 => $address[ 0 ], address2 => $address[ 1 ], address3 => $address[ 2 ], address4 => $address[ 3 ], ); # print "$relative, $name, $relationship\n"; foreach my $line ( @$format ) { my $pack = ""; my @data = (); # Pad beginning of line if start after column zero: my $col = find_col $$line[ 0 ]->{col}; if ( $col ) { $pack = "A$col "; push @data, " "; } my $prev_col = $col; # loop to index of the second last element: for ( my $i = 0; $i < $#$line; ++$i ) { my $shift = $$line[ $i ]->{shift}; $prev_col = $shift ? most_likely_col $$line[ $i ]->{col} : $col; $col = find_col $$line[ $i + 1 ]->{col}; $pack .= "A" . ( $col - $prev_col ) . " "; if ( exists $exceptional_data_items{$$line[ $i ]->{name}} ) { push @data, $exceptional_data_items{$$line[ $i ]->{name}}; } else { #print $$line[ $i ]->{name}; push @data, &{$gen_functions{$$line[ $i ]->{name}}}( $last_student_class_number ); } } # No limit on width of the last data item: $pack .= "A*"; if ( exists $exceptional_data_items{$$line[ $#$line ]->{name}} ) { push @data, $exceptional_data_items{$$line[ $#$line ]->{name}}; } else { push @data, &{$gen_functions{$$line[ $#$line ]->{name}}}(); } foreach ( @data ) { $_ = " " unless defined; } # Only print the line if its not blank: print $outfh pack( $pack, @data ) . "\n" if grep { /\S/ } @data; } } our $format_file; our $max_page = MAX_PAGE; our $max_class = MAX_CLASS; our $page_jitter = RAND_PAGE; our $class_jitter = RAND; use File::Basename; sub usage() { my $prog = basename $0; print <2,23=100,24=>1 which means: 2 cases out of (2 + 100 + 1), put the data starting at column 22; in 100 cases out of 103, put the data starting at column 23; in 1 case out of 103, start the data at column 24. Note that the left-most column is column zero. Finally, after additional space is an optional 1 or 0. Let us call this "shift". This last option indicates whether subsequent data items on this row are to be shifted together with the change of this data item from its most probable position. In the example above, the column 23 has the greatest probability of occuring, so if "shift" is true, then if the above mentioned data item is placed at column 24, then all other data items to the right of this on this line only will be shifted right by 24 - 23 columns (i.e., one column to the right). EOF exit 1; } use Getopt::Long; our $format = \@msrp1107r_format; GetOptions ( "format-file=s" => \$format_file, "max-page=i" => \$max_page, "max-class=i" => \$max_class, "page-jitter=i" => \$page_jitter, "class-jitter=i" => \$class_jitter, "percent-companies=i" => \$percent_companies, ) or usage(); $format = read_format $format_file if $format_file; if ( PROCESS_OLD_CLIPPER_RECORDS ) { foreach my $file ( @ARGV ) { open IN, "< $file" or die "Cannot open file $file: $!"; my $outfile = "$file.out"; die "Oh dear, $outfile exists already!" if -f $outfile; open OUT, "> $outfile" or die "Cannot open $outfile for writing: $!"; while ( ) { chomp; if ( m!^(\d+)/(\d)\s! ) { $course = $1; $year = $2; print OUT "$_\n"; next; } if ( m{ \s+ # at leaset 1 space ( # this matches $name [A-Z]+,? # family name is upper case with comma (?:\s[A-Z][a-z]*)+ # one or more given names ) \s\s+ # at leaset 2 spaces ([MF]) # gender \s+ # at least one space (\d{9}) # student id is 9 digits \s\s+ # at leaset 2 spaces ([a-zA-Z]\d{6}$[\dA-Z]$) # HK ID }x ) { my ( $class_num, $new, $lval, $name, $sex, $stud_no, $HK_id, $home_tel, $company, $company_addr1, $company_addr2, $company_addr3, $company_tel, $company_ext, ) = unpack "A3 A1 A2 A37 A3 A11 A11 A11 A36 A31 A31 A31 A11 A4", $_; $home_tel = gen_phone_num(); $HK_id = gen_hkid(); $stud_no = gen_student_id(); $name = gen_name(); if ( $company_addr1 or $company or $company_tel or $company_ext ) { $company = gen_company(); $company_tel = gen_phone_num(); $company_ext = gen_ext(); $company_addr1 = ""; $company_addr2 = ""; # $company_addr3 = ""; } $_ = pack "A3 A1 A2 A37 A3 A11 A11 A11 A36 A31 A31 A31 A11 A4", ( $class_num, $new, $lval, $name, $sex, $stud_no, $HK_id, $home_tel, $company, $company_addr1, $company_addr2, $company_addr3, $company_tel, $company_ext, ); # trim any trailing white space: s/\s+$//; print OUT $_ . "\r\n"; next; } else { print OUT "$_\n"; } warn "POSSIBLE UNMATCHED STUDENT: $_\n" if m!^\s*\d+\s+!; } close IN; close OUT or die "Cannot close $outfile: $!"; } } elsif ( PROCESS_MSRP1107R ) { foreach my $file ( @ARGV ) { my $this_max_class = $max_class + int( rand( $class_jitter ) - $class_jitter / 2 ); my $this_max_page = $max_page + int( rand( $page_jitter ) - $page_jitter / 2 ); my $this_page_num = 0; open IN, "< $file" or die "Cannot open file $file: $!"; my $outfile = "$file.fictionalised"; die "Oh dear, $outfile exists already!" if -f $outfile; open OUT, "> $outfile" or die "Cannot open $outfile for writing: $!"; my ( $course, $year, $class ); # Initialise to impossible values so can compare even # for first /^\sCourse/: my ( $prev_course, $prev_year, $prev_class ) = qw( a a a ); my $in_header; my $class_number = 0; while ( ) { $in_header = 1 if /^ Student Records/ or /^#R : Registration Status/; #$in_header = 1 if /^\s{43,}Page\s\d/; #$in_header = 1 if /^#R : Registration Status/; #$in_header = 1 if /^#A : Academic Status/; #$in_header = 1 if /^\f/; #$in_header = 1 if /^\sMSRP1107R/; #$in_header = 1 if /^\sCampus/; #$in_header = 1 if /^\sDepartment of\s/; #$in_header = 1 if /^\sCourse/; if ( m!^\sCourse\s:\s(\d{5}[A-C]?)(?:/(\d)([A-Z]))?! ) { ( $course, $year, $class ) = ( $1, $2, $3 ); if ( $course ne $prev_course or $year ne $prev_year or $class ne $prev_class ) { $class_number = 0; $this_max_class = $max_class + int( rand( $class_jitter ) - $class_jitter / 2 ); print "\$this_max_class=$this_max_class\n"; ( $prev_course, $prev_year, $prev_class) = ( $course, $year, $class ); } } #$in_header = 1 if /^\s{95,}Name of Guardian/; #$in_header = 1 if /^\s{95,}Next of Kin,/; if ( /^\s{9,}Reg. Date/ ) { undef $in_header; $this_page_num = 0; $this_max_page = $max_page + int( rand( $page_jitter ) - $page_jitter / 2 ); print OUT; # One blank line before student records begin: print OUT "\n"; next; } print OUT and next if $in_header; # So now we should generate student records. # This actually determines how many on each page. # Should decide how many per class separately. # Let's do 30 for each class, as a sign of things to come! if ( ++$this_page_num <= $this_max_page and ++$class_number <= $this_max_class ) { gen_one_msr1107r_student_record $class_number, *OUT, @$format; } elsif ( $this_page_num == $this_max_page + 1 or $class_number == $this_max_class + 1 ) { # We are at the end of the page: print OUT "\n\n"; } } } }