DatesRole: support standard output of date(1).

- Convert to ISO-8601 for the parse.
- Support short month to number in service of above.
- Support TZ to UTC offset in service of above.

Where a TZ abbreviation is ambiguous, I chose the zone I judged to be
the most populous or the most likely to match a DDG user's intent.
I may, of course, be wrong.

Fixes #621.
master
Matt Miller 2014-09-07 17:42:03 -04:00
parent f101ef6ebf
commit 97f737e0be
2 changed files with 210 additions and 1 deletions

View File

@ -13,6 +13,21 @@ use Try::Tiny;
# This appears to parse most/all of the big ones, however it doesn't present a regex
use DateTime::Format::HTTP;
# Lower-case three-letter month abbreviation => month number (jan is 1, dec is 12).
# The numbers come from each name's position in the list, so the order matters.
my %short_month_to_number = do {
    my $month_number = 0;
    map { $_ => ++$month_number } qw(jan feb mar apr may jun jul aug sep oct nov dec);
};
# Day-of-week alternations reused by the date patterns further down.
# Both are compiled case-insensitively.
my $short_day_of_week = do {
    my $abbreviated_days = join '|', qw(Mon Tue Wed Thu Fri Sat Sun);
    qr#$abbreviated_days#i;
};
my $full_day_of_week = do {
    my $day_names = join '|', qw(Monday Tuesday Wednesday Thursday Friday Saturday Sunday);
    qr#$day_names#i;
};
@ -35,7 +50,191 @@ my $ambiguous_dates_matches = qr#^(?<m>$date_number)$date_delim(?<d>$date_number
# Ordinal suffixes (as in 1st, 2nd, 3rd, 4th), matched case-insensitively.
my $number_suffixes = qr#(?:st|nd|rd|th)#i;
# Timezones: https://en.wikipedia.org/wiki/List_of_time_zone_abbreviations
my $tz_suffixes = qr#(?:[+-][0-9]{4})|ACDT|ACST|ACT|ADT|AEDT|AEST|AFT|AKDT|AKST|AMST|AMT|ART|AST|AWDT|AWST|AZOST|AZT|BDT|BIOT|BIT|BOT|BRT|BST|BTT|CAT|CCT|CDT|CEDT|CEST|CET|CHADT|CHAST|CHOT|CHUT|CIST|CIT|CKT|CLST|CLT|COST|COT|CST|CT|CVT|CWST|CXT|ChST|DAVT|DDUT|DFT|EASST|EAST|EAT|ECT|EDT|EEDT|EEST|EET|EGST|EGT|EIT|EST|FET|FJT|FKST|FKT|FNT|GALT|GAMT|GET|GFT|GILT|GIT|GMT|GST|GYT|HADT|HAEC|HAST|HKT|HMT|HOVT|HST|ICT|IDT|IOT|IRDT|IRKT|IRST|IST|JST|KGT|KOST|KRAT|KST|LHST|LINT|MAGT|MART|MAWT|MDT|MEST|MET|MHT|MIST|MIT|MMT|MSK|MST|MUT|MVT|MYT|NCT|NDT|NFT|NPT|NST|NT|NUT|NZDT|NZST|OMST|ORAT|PDT|PET|PETT|PGT|PHOT|PHT|PKT|PMDT|PMST|PONT|PST|PYST|PYT|RET|ROTT|SAKT|SAMT|SAST|SBT|SCT|SGT|SLST|SRT|SST|SYOT|TAHT|TFT|THA|TJT|TKT|TLT|TMT|TOT|TVT|UCT|ULAT|UTC|UYST|UYT|UZT|VET|VLAT|VOLT|VOST|VUT|WAKT|WAST|WAT|WEDT|WEST|WET|WIT|WST|YAKT|YEKT|Z#i;
# Timezone abbreviation => UTC offset, written as a signed "+HHMM"/"-HHMM" string.
# The keys of this hash are also used to build the timezone alternation below,
# so an abbreviation missing here is not recognised at all.
# Several abbreviations are shared by more than one real-world zone; for those
# a single zone was picked by judgement, so an entry may not match every
# reader's expectation.
my %tz_offsets = (
ACDT => '+1030',
ACST => '+0930',
ACT => '+0800',
ADT => '-0300',
AEDT => '+1100',
AEST => '+1000',
AFT => '+0430',
AKDT => '-0800',
AKST => '-0900',
AMST => '-0300',
AMT => '-0400',
ART => '-0300',
AST => '+0300',
AWDT => '+0900',
AWST => '+0800',
AZOST => '-0100',
AZT => '+0400',
BDT => '+0800',
BIOT => '+0600',
BIT => '-1200',
BOT => '-0400',
BRT => '-0300',
BST => '+0100',
BTT => '+0600',
CAT => '+0200',
CCT => '+0630',
CDT => '-0500',
CEDT => '+0200',
CEST => '+0200',
CET => '+0100',
CHADT => '+1345',
CHAST => '+1245',
CHOT => '+0800',
CHUT => '+1000',
CIST => '-0800',
CIT => '+0800',
CKT => '-1000',
CLST => '-0300',
CLT => '-0400',
COST => '-0400',
COT => '-0500',
CST => '-0600',
CT => '+0800',
CVT => '-0100',
CWST => '+0845',
CXT => '+0700',
ChST => '+1000',
DAVT => '+0700',
DDUT => '+1000',
DFT => '+0100',
EASST => '-0500',
EAST => '-0600',
EAT => '+0300',
ECT => '-0400',
EDT => '-0400',
EEDT => '+0300',
EEST => '+0300',
EET => '+0200',
EGST => '+0000',
EGT => '-0100',
EIT => '+0900',
EST => '-0500',
FET => '+0300',
FJT => '+1200',
FKST => '-0300',
FKT => '-0400',
FNT => '-0200',
GALT => '-0600',
GAMT => '-0900',
GET => '+0400',
GFT => '-0300',
GILT => '+1200',
GIT => '-0900',
GMT => '+0000',
GST => '-0200',
GYT => '-0400',
HADT => '-0900',
HAEC => '+0200',
HAST => '-1000',
HKT => '+0800',
HMT => '+0500',
HOVT => '+0700',
HST => '-1000',
ICT => '+0700',
IDT => '+0300',
IOT => '+0300',
IRDT => '+0430',
IRKT => '+0900',
IRST => '+0330',
IST => '+0530',
JST => '+0900',
KGT => '+0600',
KOST => '+1100',
KRAT => '+0700',
KST => '+0900',
LHST => '+1030',
LINT => '+1400',
MAGT => '+1200',
MART => '-0930',
MAWT => '+0500',
MDT => '-0600',
MEST => '+0200',
MET => '+0100',
MHT => '+1200',
MIST => '+1100',
MIT => '-0930',
MMT => '+0630',
MSK => '+0400',
MST => '-0700',
MUT => '+0400',
MVT => '+0500',
MYT => '+0800',
NCT => '+1100',
NDT => '-0230',
NFT => '+1130',
NPT => '+0545',
NST => '-0330',
NT => '-0330',
NUT => '-1100',
NZDT => '+1300',
NZST => '+1200',
OMST => '+0700',
ORAT => '+0500', # Oral Time is east of UTC; the sign was wrong ('-0500')
PDT => '-0700',
PET => '-0500',
PETT => '+1200',
PGT => '+1000',
PHOT => '+1300',
PHT => '+0800', # Philippine Time; was in the hand-written pattern but missing here
PKT => '+0500',
PMDT => '-0200',
PMST => '-0300',
PONT => '+1100',
PST => '-0800',
PYST => '-0300',
PYT => '-0400',
RET => '+0400',
ROTT => '-0300',
SAKT => '+1100',
SAMT => '+0400',
SAST => '+0200',
SBT => '+1100',
SCT => '+0400',
SGT => '+0800',
SLST => '+0530',
SRT => '-0300',
SST => '-1100',
SYOT => '+0300',
TAHT => '-1000',
TFT => '+0500',
THA => '+0700',
TJT => '+0500',
TKT => '+1300',
TLT => '+0900',
TMT => '+0500',
TOT => '+1300',
TVT => '+1200', # Tuvalu Time; was '+0500', apparently copied from a neighbouring entry
UCT => '+0000',
ULAT => '+0800',
UTC => '+0000',
UYST => '-0200',
UYT => '-0300',
UZT => '+0500',
VET => '-0430',
VLAT => '+1000',
VOLT => '+0400',
VOST => '+0600',
VUT => '+1100',
WAKT => '+1200',
WAST => '+0200',
WAT => '+0100',
WEDT => '+0100',
WEST => '+0100',
WET => '+0000',
WIT => '+0700',
WST => '+0800',
YAKT => '+1000',
YEKT => '+0600',
Z => '+0000',
);
# Alternation of every abbreviation known to %tz_offsets.
# The keys are sorted longest-first (ties broken alphabetically) for two reasons:
# hash key order is not stable between runs, so an unsorted join yields a
# different pattern each time; and alternation takes the first alternative that
# works, so an abbreviation that is a prefix of another (PET / PETT) must not
# come before the longer one.
my $tz_strings = join('|', sort { length($b) <=> length($a) or $a cmp $b } keys %tz_offsets);
# A timezone is either an explicit numeric offset (e.g. +0100) or a known abbreviation.
my $tz_suffixes = qr#(?:[+-][0-9]{4})|$tz_strings#i;
# Default output format of date(1), e.g. "Sun Sep 7 15:57:56 EDT 2014".
my $date_standard = qr#$short_day_of_week $short_month\s{1,2}$date_number $time_24h $tz_suffixes [0-9]{4}#i;
# Same format with named captures: m (month), d (day), t (time), tz (timezone), y (year).
# NOTE(review): parse_formatted_datestring_to_date looks the captured tz up in
# %tz_offsets verbatim, but this pattern also accepts numeric offsets and, being
# case-insensitive, lower-case abbreviations -- neither has a key there. Confirm
# whether that lookup should normalise its input.
my $date_standard_matches = qr#$short_day_of_week (?<m>$short_month)\s{1,2}(?<d>$date_number) (?<t>$time_24h) (?<tz>$tz_suffixes) (?<y>[0-9]{4})#i;
# formats parsed by vague datestring, without colouring
# the context of the code using it
@ -104,6 +303,9 @@ sub build_datestring_regex {
# RFC850 08-Feb-94 14:15:29 GMT
push @regexes, qr#[0-9]{2}-$short_month-(?:[0-9]{2}|[0-9]{4}) $time_24h?(?: ?$tz_suffixes)#i;
# date(1) default format Sun Sep 7 15:57:56 EDT 2014
push @regexes, $date_standard;
# month-first date formats
push @regexes, qr#$date_number$date_delim$short_month$date_delim[0-9]{4}#i;
push @regexes, qr#$date_number$date_delim$full_month$date_delim[0-9]{4}#i;
@ -145,6 +347,9 @@ sub parse_formatted_datestring_to_date {
}
$d = sprintf("%04d-%02d-%02d", $year, $month, $day);
} elsif ($d =~ $date_standard_matches) {
# To ISO8601 for parsing
$d = sprintf('%04d-%02d-%02dT%s%s', $+{'y'}, $short_month_to_number{lc $+{'m'}}, $+{'d'}, $+{'t'}, $tz_offsets{$+{'tz'}});
}
$d =~ s/(\d+)\s?$number_suffixes/$1/i; # Strip ordinal text.

View File

@ -86,6 +86,10 @@ subtest 'Dates' => sub {
'Sat, 09 Aug 2014 18:20:00' => 1407608400,
# RFC850
'08-Feb-94 14:15:29 GMT' => 760716929,
# date(1) default
'Sun Sep 7 15:57:56 EDT 2014' => 1410119876,
'Sun Sep 14 15:57:56 UTC 2014' => 1410710276,
'Sun Sep 7 20:11:44 BST 2014' => 1410117104,
#Undefined/Natural formats:
'13/12/2011' => 1323734400, #DMY
'01/01/2001' => 978307200, #Ambiguous, but valid