zeroclickinfo-goodies/share/goodie/independence_day/data_parser.pl

71 lines
1.9 KiB
Perl

package MyParser;
use base qw(HTML::Parser);
# ABSTRACT: Parses specific data from html table on wikipedia into json
# uses the the table from https://en.wikipedia.org/wiki/List_of_national_independence_days
# as data source
# parser state information
my $column = 0;
my $field;
# prints out relevant data (as json)
sub text{
my ($self, $origtext) = @_;
if($field eq 'country'){
# lowercase country names
$origtext = lc($origtext);
$origtext =~ s/(.*), (.*)/$2 $1/;
print "\t'$origtext' => [{";
}elsif($field eq 'date'){
# replace html entities with regular space
$origtext =~ s/ / /g;
# add proper suffix to day number
$origtext =~ s/ 1$/ 1st/;
$origtext =~ s/ 2$/ 2nd/;
$origtext =~ s/ 3$/ 3rd/;
$origtext =~ s/( [0987654321]+)$/$1th/;
print "$field => \"$origtext\", ";
}elsif($field eq 'year'){
print "$field => \"$origtext\"}],\n";
}
# make sure we dont print unintended data form subsequent fields
$field = '';
}
# triggered when parser encouters tag start
sub start {
my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
# stated parsing new table row
if ($tagname eq 'tr'){
$column = 0;
}
# parser reached next column
if ($tagname eq 'td') {
$column++;
}
# country names are inside link tags in the first column
if ($tagname eq 'a' && $column eq 1) {
$field = 'country';
}
# dates are inside span with class 'sorttext' in column 2
elsif ($tagname eq 'span' && $attr->{ class } eq 'sorttext' && $column eq 2){
$field = 'date';
}
# years are just plain in the 3rd column
elsif ($tagname eq 'td' && $column eq 3){
$field = 'year';
}
}
package main;
my $file = "wikipedia_html_table";
my $parser = MyParser->new;
print "my \%data = (\n";
$parser->parse_file( $file );
print ");\n";