zeroclickinfo-goodies/share/goodie/name_days/preprocess.pl

#!/usr/bin/perl
# Preprocess the source files and output data structures for slurping

use strict;
use DateTime;
use Path::Class;
use Locale::Country;

# Build the HTML <span> element that shows a small flag icon for a country.
#
# Parameter: the country name, as understood by Locale::Country.
# Returns:   the markup string; the CSS class suffix is whatever
#            country2code() returns for that name.
sub get_flag {
    my ($country_name) = @_;

    my $code = country2code($country_name);

    return join('', '<span class="flag-sm flag-sm-', $code, '"></span>');
}

# Load the data file
my @names = (); # Names indexed by day
my %dates = (); # Days indexed by name

# File format: 366 lines (one for each day of a leap year).
# Each line contains names separated by spaces.
# A line may also contain names in the genitive case or variations of a name.
# These variations are placed after a vertical bar character (|); they are
# not shown when searching for a day, but they can be searched for.

# Load one country's name-day file into the file-level @names and %dates.
#
# Parameter: the file name (e.g. 'Czech_Republic.txt'). The country label
#            used in the output is derived from it by dropping the '.txt'
#            suffix and replacing underscores with spaces.
#
# Effects:
#   %dates - for every name on a line (including the variations after the
#            vertical bar), lower-cased, a "country|day_of_year" entry is
#            appended to the list stored under that name.
#   @names - element (day_of_year - 1) gets "country: names" appended, with
#            "; " separating the contributions of different countries. The
#            part of the line after the vertical bar is not included.
#
# Dies if the file does not contain exactly 366 lines.
sub load_days_file {
    my ($file_name) = @_;

    my @lines = file($file_name)->slurp(iomode => '<:encoding(UTF-8)');

    # Validate before the name is rewritten, so that the message refers to
    # the real file on disk and tells how many lines were actually found.
    my $line_count = scalar(@lines);
    die "The text file $file_name must include 366 lines (found $line_count)"
        unless $line_count == 366;

    # 'Czech_Republic.txt' -> 'Czech Republic'
    (my $country = $file_name) =~ s/\.txt$//;
    $country =~ s/_/ /g;

    my $day_of_year = 1;

    for my $line (@lines) {
        # Drop the line terminator together with any trailing whitespace.
        # A plain chomp would keep the "\r" of a CRLF-terminated file and
        # let it leak into @names.
        (my $text = $line) =~ s/\s+\z//;

        # Index all names, including the names after the vertical bar.
        # Vertical bars and commas count as separators, just like spaces.
        (my $searchable = lc($text)) =~ s/[|,]/ /g;
        for my $name (split(' ', $searchable)) {
            push(@{$dates{$name}}, $country . '|' . $day_of_year);
        }

        # Only the part before the vertical bar (|) is shown for a day
        (my $shown = $text) =~ s/\s*\|.*$//;

        if (length($shown)) {
            $names[$day_of_year - 1] .= "; " if ($names[$day_of_year - 1]);
            $names[$day_of_year - 1] .= $country . ': ' . $shown;
        }

        # Advance to the next day
        $day_of_year++;
    }
}

# Build the answer string stored for a single name.
#
# Parameters:
#   $dates_by_country           - hash ref: country => text listing the dates
#   $dates_by_country_and_month - hash ref: country => array ref indexed by
#                                 month (0 = Jan) holding the days as text;
#                                 false entries are skipped
#
# Returns the plain-text answer ("Country: dates; Country: dates") followed
# by a vertical bar and then one HTML table row per country. Countries are
# emitted in sorted order in both parts.
sub prepare_dates {
    my ($dates_by_country, $dates_by_country_and_month) = @_;
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    # Plain-text part: every country contributes "Country: dates; ", then
    # the final separator is turned into the vertical bar.
    my $res = join('',
        map { $_ . ': ' . $dates_by_country->{$_} . "; " }
        sort keys %{$dates_by_country});
    $res =~ s/; $/\|/;

    # HTML part: a table row per country with a tile for each month in use
    for my $country (sort keys %{$dates_by_country_and_month}) {
        $res .= '<tr><td class="name-days-country">'
              . get_flag($country)
              . ' <span class="name-days-country-name">'
              . $country
              . '</span></td><td class="name-days-dates">';

        my @tiles;
        my $month_index = 0;
        for my $days (@{$dates_by_country_and_month->{$country}}) {
            my $month = $month_names[$month_index++];
            next unless $days;
            push(@tiles,
                '<div class="name-days-tile"><span>' . $month . ' ' .
                $days . '</span></div>');
        }

        $res .= join('', @tiles) . '</td></tr>';
    }

    return $res;
}

# Convert every value of the file-level %dates from a list of
# "country|day_of_year" entries into the final answer string produced by
# prepare_dates().
#
# Dies with 'Internal error' if an entry does not have that form.
sub finish_loading {
    for my $name (keys %dates) {
        my %dates_by_country;           # country => "day month, day month"
        my %dates_by_country_and_month; # country => [days of Jan, days of Feb, ...]

        for my $entry (@{$dates{$name}}) {
            my ($country, $day_of_year) = $entry =~ /^(.*?)\|(\d+)$/;
            die 'Internal error' unless defined $day_of_year;

            # The year only has to be a leap year, because the text files
            # include February 29.
            my $d = DateTime->from_day_of_year(year => 2000, day_of_year => $day_of_year);

            # Flat list of dates for the plain-text answer
            my $label = $d->strftime('%e %b');
            $dates_by_country{$country} = exists $dates_by_country{$country}
                ? join(', ', $dates_by_country{$country}, $label)
                : $label;

            # Days grouped by month for the HTML answer
            my $month_slot = \$dates_by_country_and_month{$country}[$d->month - 1];
            $$month_slot .= ', ' if $$month_slot;
            $$month_slot .= $d->day;
        }

        $dates{$name} = prepare_dates(\%dates_by_country, \%dates_by_country_and_month);
    }
}


# Load the source files into %dates and @names

# One source file per supported country, loaded in this order; the order
# decides how the countries are sequenced within each entry of @names.
my @source_files = qw(
    Bulgaria.txt
    Croatia.txt
    Czech_Republic.txt
    Denmark.txt
    France.txt
    Greece.txt
    Hungary.txt
    Latvia.txt
    Poland.txt
    Slovakia.txt
    Sweden.txt
);

for my $source_file (@source_files) {
    load_days_file($source_file);
}

# All files are in: turn the collected entries of %dates into answer strings
finish_loading();


# Output the array and the hash.
# Spew does not work for hashes, so use two loops here.

# preprocessed_dates.txt: for every name (sorted), one line with the name
# followed by one line with its prepared answer string.
open(my $fd, '>:encoding(UTF-8)', 'preprocessed_dates.txt')
    or die "Cannot open preprocessed_dates.txt for writing: $!";

for my $name (sort keys %dates) {
    print {$fd} $name . "\n" . $dates{$name} . "\n"
        or die "Cannot write to preprocessed_dates.txt: $!";
}

# Output is buffered, so a failed write may only be reported by close
close($fd) or die "Cannot close preprocessed_dates.txt: $!";


# preprocessed_names.txt: one line per element of @names, in array order.
open(my $fn, '>:encoding(UTF-8)', 'preprocessed_names.txt')
    or die "Cannot open preprocessed_names.txt for writing: $!";

for my $names_of_day (@names) {
    print {$fn} $names_of_day . "\n"
        or die "Cannot write to preprocessed_names.txt: $!";
}

close($fn) or die "Cannot close preprocessed_names.txt: $!";