2014-10-28 11:34:36 -07:00
package DDG::Goodie::HTMLEntitiesDecode ;
# ABSTRACT: Decode HTML Entities.
# HTML Entity Encoding has been moved to a separate module
2015-02-22 12:09:29 -08:00
use strict ;
2014-10-28 11:34:36 -07:00
use DDG::Goodie ;
use HTML::Entities 'decode_entities' ;
use Unicode::UCD 'charinfo' ;
2015-03-15 17:33:39 -07:00
use Text::Trim ;
2014-10-28 11:34:36 -07:00
use warnings ;
use strict ;
# Instant-answer metadata: the answer type identifier and the caching flag.
zci answer_type => 'html_entity';
zci is_cached   => 1;

# Keywords registered with the `any` trigger type for this goodie.
triggers any => qw(html entity htmldecode decodehtml htmlentity);
# Decode a single HTML entity found in the query remainder.
#
# Accepted forms (the trailing ";" is always optional):
#   decimal      &#39;   or  #39     (leading "&" optional)
#   hexadecimal  &#x27;  or  #x27    (leading "&" optional)
#   named        &cent;  &frac12;    (leading "&" required)
# Question-like phrasing ("what is the ... ?") and filler words are stripped
# before matching.
#
# Returns nothing when the remainder is not a recognisable, decodable entity.
# Otherwise returns the plain-text answer followed by a structured answer.
handle remainder => sub {
    # Work on a lexical copy rather than repeatedly mutating the global $_.
    my $query = trim($_);

    $query =~ s/\bwhat\s*is\s*(?:the)?//ig;    # remove "what is the" (optional: the)
    $query =~ s/\b(?:the|for|of|is|entity|decode|decoded|code|character)\b//ig;    # remove filler words
    $query = trim($query);                      # whitespace exposed by the removals above
    $query =~ s/\s*\?$//;                       # remove ending question mark

    # Capture only the entity body; the delimiters are re-added below.
    # Named entities may contain digits after the first letter
    # (e.g. frac12, sup2, there4), so digits are allowed there.
    my $body;
    if ( $query =~ /^&?(#[0-9]+|#[xX][0-9A-Fa-f]+);?$/ ) {
        $body = $1;    # decimal (&#39;) or hex (&#x27;)
    }
    elsif ( $query =~ /^&([a-zA-Z][a-zA-Z0-9]*);?$/ ) {
        $body = $1;    # named (&cent;)
    }
    else {
        return;
    }

    # Standardize the query so it works well with library decoding functions.
    my $entity = "&$body;";

    # Attempt to decode, exit if unsuccessful.
    # decode_entities() returns its input when it cannot decode it, so an
    # unchanged string means the entity is not real (e.g. "&enchantedbunny;").
    my $decoded = decode_entities($entity);
    return if lc $entity eq lc $decoded;

    # charinfo() returns undef if the code point is not a "real" character.
    my $info = charinfo( ord $decoded );
    return unless defined $info;

    # If the character is invisible, describe it instead of displaying it.
    if ( $info->{name} eq '<control>' ) {
        $decoded = "Unicode control character (no visual representation)";
    }
    elsif ( substr( $info->{category}, 0, 1 ) eq 'C' ) {
        $decoded = "Special character (no visual representation)";
    }

    return "Decoded HTML Entity: $decoded",
        structured_answer => {
            data => {
                title    => $decoded,
                subtitle => 'HTML Entity Decode: ' . $query,
            },
            templates => {
                group => 'text',
            },
        };
};
1 ;