Add separate modules HTMLEntitiesEncode, HTMLEntitiesDecode and URLEncode

– Add HTMLEntitiesDecode.pm and .t
– Add HTMLEntitiesEncode.pm and .t
– Add URLEncode.pm and .t
– Remove HTMLEntities.pm and HTMLEntitiesCodes.pm and their test files
master
nishanths 2014-06-13 19:29:01 +05:30
parent 9af4fdc4b5
commit c0b15421eb
9 changed files with 250 additions and 165 deletions

View File

@ -1,62 +0,0 @@
package DDG::Goodie::HTMLEntities;
# ABSTRACT: Decode HTML Entities.

use DDG::Goodie;
use HTML::Entities;
use Unicode::UCD 'charinfo';

zci answer_type => 'html_entity';
zci is_cached => 1;

# Two alternatives, each with one capture group:
#   1. an entity such as "&#33;" or "&amp", optionally preceded by a keyword
#      (the decoding case);
#   2. "html", "htmlentity" or "htmlencode" followed by 1-50 characters of
#      text (the encoding case).
# Presumably matched against the query with whitespace removed, going by the
# trigger name -- TODO confirm against DDG::Goodie.
triggers query_nowhitespace => qr/^(?:
    (?:html|entity|htmlentity|htmldecode)?(&\#?\w+;?) |
    html(?:entity|encode)?(.{1,50})
)$/ix;

# A bare '!' cannot satisfy the trigger pattern above, so the example uses the
# entity form instead.
primary_example_queries '&#33;';
secondary_example_queries 'html entity &';
description 'decode HTML entities';
name 'HTMLEntities';
code_url 'https://github.com/duckduckgo/zeroclickinfo-goodies/blob/master/lib/DDG/Goodie/HTMLEntities.pm';
category 'computing_tools';
topics 'programming';
attribution twitter => 'crazedpsyc',
            cpan    => 'CRZEDPSYC' ;

# Handler for the trigger above.
#
# Arguments (the two capture groups of the trigger regex):
#   $entity  - the entity to decode; undefined when the encoding alternative matched
#   $decoded - the text to encode; undefined when the decoding alternative matched
#
# Returns a plain-text answer plus an `html` variant, or nothing when
# encoding/decoding left the input unchanged or the leading character has no
# Unicode character information.
handle matches => sub {
    my ($entity, $decoded) = @_;
    my $html;
    my $encoding = 0;

    if (defined $entity) { # decoding
        $entity =~ s/;?$/;/; # append a semicolon (some entities like &mdash do not work without one)
        $decoded = decode_entities($entity);
        $html = $entity;
    }
    else { # encoding
        $encoding = 1;
        $entity = encode_entities($decoded);
        $html = encode_entities($entity); # encoded a second time so the entity itself is displayed
    }

    # Only the first character of $decoded is inspected.
    my $decimal = ord($decoded);
    my $info = charinfo($decimal);

    # charinfo() yields undef for unassigned code points and non-characters;
    # dereferencing that would abort the handler, so give no answer instead.
    return unless defined $info;

    # Characters without a visual representation are described, not printed.
    if ($info->{name} eq '<control>') {
        $html = "<a href='https://en.wikipedia.org/wiki/Unicode_control_characters'>Unicode control character</a> (no visual representation)";
        $decoded = "Unicode control character (no visual representation)";
    }
    elsif (substr($info->{category}, 0, 1) eq 'C') {
        $decoded = "Special character (no visual representation)";
        $html = "Special character (no visual representation)";
    }

    my $hex = sprintf("%04x", $decimal);
    my $label = $encoding ? "Encoded HTML: " : "Decoded HTML Entity: ";

    # decode_entities will return the input if it cannot be decoded
    return $label . ($encoding ? "$entity" : "$decoded, decimal: $decimal, hexadecimal: $hex"),
        html => $label.$html.($encoding ? "" : ", decimal: $decimal, hexadecimal: <a href=\"/?q=U%2B$hex\">$hex</a>") unless $entity eq $decoded;
    return;
};

1;

View File

@ -0,0 +1,62 @@
package DDG::Goodie::HTMLEntitiesDecode;
# ABSTRACT: Decode HTML Entities.

use DDG::Goodie;
use HTML::Entities 'decode_entities';
use Unicode::UCD 'charinfo';
use warnings;
use strict;

zci answer_type => 'html_entity';

primary_example_queries 'html decode &#33;', 'html decode &amp';
secondary_example_queries 'html decode &#x21' , '#36 html decode';
description 'Decode HTML entities';
name 'HTMLEntitiesDecode';
code_url 'https://github.com/duckduckgo/zeroclickinfo-goodies/blob/master/lib/DDG/Goodie/HTMLEntitiesDecode.pm';
category 'computing_tools';
topics 'programming';
attribution twitter => 'crazedpsyc',
            cpan    => 'CRZEDPSYC' ,
            twitter => ['nshanmugham', 'Nishanth Shanmugham'],
            web     => ['http://nishanths.github.io', 'Nishanth Shanmugham'],
            github  => ['https://github.com/nishanths', 'Nishanth Shanmugham'];

triggers startend => 'html decode', 'decode html';

# Prefix shared by the plain-text and HTML answers.
my $label = "Decoded HTML Entity: ";

# Decodes a single HTML entity found in the query remainder ($_).
#
# Accepts decimal ("&#33;"), named ("&amp;") and hexadecimal ("&#x21;")
# entities; the leading "&" and trailing ";" are optional in the query.
# Returns a plain-text answer plus an `html` variant, or nothing when the
# remainder is empty, is not a single entity, cannot be decoded, or decodes to
# a character without Unicode character information.
handle remainder => sub {
    $_ =~ s/^\s*//g;          # remove front whitespace
    $_ =~ s/^(for|of)\s+//g;  # remove filler words at the start
    $_ =~ s/\s*$//g;          # remove back whitespace.
    return unless $_;         # guard against (now) empty string

    # Regex guard - capture only if the query is exactly one entity
    # (examples: &#8271; , &bsol;, but NOT: &#54h;), otherwise the answer may
    # be a false positive.  The three patterns cover decimal, named and
    # hexadecimal entities respectively.
    if (   (/^(&?#?(?:[0-9]+(?!_))+;?)$/)
        || (/^(&?(?:[a-zA-Z]+(?!_))+;?)$/)
        || (/^(&?#?x{1}(?:[0-9A-Fa-f]+(?!_))+;?)$/) )
    {
        my $entity = $1;
        $entity =~ s/^&?/&/;   # make sure there is an ampersand in front (more freedom in input)
        $entity =~ s/;?$/;/;   # make sure there is a semicolon at the end (some entities like &mdash do not work without one)

        my $decoded = decode_entities($entity); # decode_entities will return the input if it cannot be decoded
        my $decimal = ord($decoded);
        my $hex = sprintf("%04x", $decimal);
        my $info = charinfo($decimal);
        return unless (defined $info);

        # If $decoded is an invisible character, describe it (with a link in
        # the HTML answer) instead of printing it on screen.
        if ($$info{name} eq '<control>') {
            $decoded = "Unicode control character (no visual representation)";
            $entity = "<a href='https://en.wikipedia.org/wiki/Unicode_control_characters'>Unicode control character</a> (no visual representation)";
        }
        elsif (substr($$info{category}, 0, 1) eq 'C') {
            $decoded = "Special character (no visual representation)";
            # The anchor is closed here; leaving it open made the rest of the
            # answer part of the link.
            $entity = "<a href='https://en.wikipedia.org/wiki/Special_characters'>Special character</a> (no visual representation)";
        }

        # $entity eq $decoded means decode_entities could not decode the input.
        return $label . "$decoded, decimal: $decimal, hexadecimal: $hex",
            html => $label . $entity . ", decimal: $decimal, hexadecimal: <a href=\"/?q=U%2B$hex\">$hex</a>" unless $entity eq $decoded;
    }
    return;
};

1;

View File

@ -1,7 +1,8 @@
package DDG::Goodie::HTMLEntitiesCodes;
package DDG::Goodie::HTMLEntitiesEncode;
# ABSTRACT: Displays the HTML entity code for the query name.
use DDG::Goodie;
use HTML::Entities qw(encode_entities);
use warnings;
use strict;
@ -86,7 +87,8 @@ my %codes = (
# Currency
'cent' => [['Cent','cent']],
'dollar' => [['Dollar','#36']],
'dollar' => [['Dollar sign','#36']],
'peso' => [['Peso','#36']],
'yen' => [['Yen', 'yen']],
'japanese yen' => [['Yen', 'yen']],
'euro' => [['Euro','euro']],
@ -107,6 +109,7 @@ my %codes = (
'plus/minus' => [['Plus/minus','#177']],
'+-' => => [['Plus/minus','#177']],
'percent' => [['Percent sign','#37']],
'percentage' => [['Percent sign','#37']],
'per mil' => [['Per mil','permil']],
'per mille' => [['Per mil','permil']],
'per ten thousand' => [['Per ten thousand','#8241']],
@ -130,7 +133,7 @@ my %codes = (
'not congruent' => [['Not congruent','#8802']],
'sum' => [['Summation (mathematics)','#8721']],
'summation' => [['Summation (mathematics)','#8721']],
'pi' => [['pi (mathematics)','#960']],
'pi' => [['Pi','#960']],
'reals' => [['Reals (mathematics)','#8477']],
'complexes' => [['Complexes','#8450']],
'imaginary' => [['Imaginary (mathematics)','#8520']],
@ -197,21 +200,18 @@ my %accented_chars = (
'Uacute' => [['U-acute','Uacute']],
);
# The existing HTML entity decoder (HTMLEntities.pm) and this module have the same triggers but different input queries.
# HTMLEntities.pm performs entity (query) --> name (answer); this module performs name (query) --> entity (answer).
triggers startend => 'html code', 'html entity', 'html character code', 'html encode';
primary_example_queries 'html code em dash', 'html entity A-acute';
secondary_example_queries 'html encode backward semicolon', 'html entity for E grave', 'html encode pound symbol', 'html code of trademark sign';
name 'HTMLEntitiesCodes';
triggers startend => 'html code', 'html entity', 'html character code', 'html encode', 'encode html';
primary_example_queries 'html code em dash', 'html entity A-acute', 'html encode &';
secondary_example_queries 'html code em-dash', 'html entity for E grave', 'html entity $', 'html encode pound sign', 'html character code for trademark symbol';
name 'HTMLEntitiesEncode';
description 'Displays the HTML entity code for the query name';
category 'cheat_sheets';
topics 'programming', 'web_design';
attribution web => ["http://nishanths.github.io", "Nishanth Shanmugham"],
github => [ "https://github.com/nishanths", "Nishanth Shanmugham"],
twitter => ["nshanmugham", "Nishanth Shanmugham"];
code_url "https://github.com/duckduckgo/zeroclickinfo-spice/blob/master/lib/DDG/Goodie/HTMLEntitiesCodes.pm";
zci answer_type => 'HTML_Entity';
# This is a DDG::Goodie package, so the source link points at the
# zeroclickinfo-goodies repository rather than zeroclickinfo-spice.
code_url "https://github.com/duckduckgo/zeroclickinfo-goodies/blob/master/lib/DDG/Goodie/HTMLEntitiesEncode.pm";
zci answer_type => 'html_entity';
my $url = "http://dev.w3.org/html5/html-author/charref";
@ -228,12 +228,12 @@ sub make_html {
# Returns a html formatted string containing the HTML character name, entity, and a link
my $html = "";
if (scalar(@{$_[0]}) == 1) { # single line answer
$html = "<div>(&$_[0][0][1];) $_[0][0][0]: &<span>$_[0][0][1]</span>;&nbsp;&nbsp;<a href=\"$_[1]\">More at W3</a></div>" ; # link in the same line for single line answers
$html = "<div>$_[0][0][0] (&$_[0][0][1];): &<span>$_[0][0][1]</span>;&nbsp;&nbsp;<a href=\"$url\">More at W3</a></div>" ; # link in the same line for single line answers
} else {
foreach my $i (0 .. scalar(@{$_[0]}) - 1) { # multiple line answer
$html = "$html" . "<div>(&$_[0][$i][1];) $_[0][$i][0]: &<span>$_[0][$i][1]</span>;</div>";
$html = "$html" . "<div>$_[0][$i][0] (&$_[0][$i][1];): &<span>$_[0][$i][1]</span>;</div>";
}
$html = "$html" . "<div><a href=\"$_[1]\">More at W3</a></div>";
$html = "$html" . "<div><a href=\"$url\">More at W3</a></div>";
}
return $html;
};
@ -242,28 +242,52 @@ handle remainder => sub {
my $key;
my $value;
my $query = shift;
$query =~ s/^\s*//g; # remove front whitespace.
$query =~ s/^(for|of)\s+//g; # remove filler words at the start
$query =~ s/\-/ /g; # change '-' to ' '
$query =~ s/\s+(symbol|sign)//g; # remove 'symbol' and 'sign'
# $query =~ s/"//g; # remove double quote
# $query =~ s/'//g; # remove single quote
$query =~ s/\s*$//g; # remove back whitespace.
return unless $query; # guard against (now) empty query strings
$_ =~ s/^\s*//g; # remove front whitespace
$_ =~ s/\s*$//g; # remove back whitespace.
if ($query =~ /^(a|A|e|E|i|I|o|O|u|U)\s*(grave|acute)$/) { # search query is for an accented character Example: $query is now "A acute". Things that would also work: "A acute". Things that don't: "Aacute", "A acute"
$query =~ s/\s*//g; # remove in between spaces
$key = $query; # capitalization matters for accented characters lookup
$value = $accented_chars{$key};
} else {
$key = lc $query;
$value = $codes{$key};
# HASHES LOOKUP
my $hashes_query = $_;
$hashes_query =~ s/^(for|of)\s+//g; # remove filler words at the start
$hashes_query =~ s/\s+(symbol|sign)//g; # remove 'symbol' and 'sign'
$hashes_query =~ s/\-/ /g; # change '-' to ' '
$hashes_query =~ s/"//g; # remove double quote
$hashes_query =~ s/'//g; # remove single quote
# If a string still exists after the stripping, lookup the accented_chars hash if it's an accented character query and if it's not an accented char look up the codes hash
if ($hashes_query) {
if ($hashes_query =~ /^(a|A|e|E|i|I|o|O|u|U)\s*(grave|acute)$/) { # search query is for an accented character
$hashes_query =~ s/\s*//g; # remove in between spaces
$key = $hashes_query; # capitalization matters for accented characters lookup
$value = $accented_chars{$key};
} else { # not an accented char -- so lookup codes hash
$key = lc $hashes_query;
$value = $codes{$key};
}
# If we found a value in the hashes, we have a positive hit. Return.
if (defined $value) {
my $text = make_text($value);
my $html = make_html($value);
return $text, html => $html;
}
}
return unless $value;
my $text = make_text($value);
my $html = make_html($value, $url);
return $text, html => $html;
# SINGLE CHARACTER ENCODING
# If we have gotten this far, there were no hits above
# Use the encode function of HTML::Entities
if (length($_) == 1){
my $entity = encode_entities($_);
if ($entity eq $_) { # encode_entities returns the same if it fails
$entity = ord($_); # get the decimal
$entity = '#' . $entity; # dress it up like a decimal
}
$entity =~ s/^&//;
$entity =~ s/;$//;
my $text = "Encoded HTML Entity: &$entity;";
my $html = "<div>Encoded HTML Entity (&$entity;): &<span>$entity</span>;&nbsp;&nbsp;<a href=\"$url\">More at W3</a></div>";
return $text, html => $html;
}
return;
};
1;

View File

@ -0,0 +1,27 @@
package DDG::Goodie::URLEncode;
# ABSTRACT: Displays the percent-encoded url.

use DDG::Goodie;
use URI::Encode qw(uri_encode);
use warnings;
use strict;

zci answer_type => 'encoded_url';

primary_example_queries 'url encode http://nospaces.duckduckgo.com/hook em horns' , 'encode url xkcd.com/a webcomic of%romance+math+sarcasm+language';
secondary_example_queries 'http://arstechnica.com/spaces after end url encode', 'apple.com/mac encode URL';
description 'Displays the percent-encoded url';
name 'URLEncode';
code_url 'https://github.com/duckduckgo/zeroclickinfo-goodies/blob/master/lib/DDG/Goodie/URLEncode.pm';
category 'computing_tools';
topics 'programming', 'web_design';
attribution twitter => ['nshanmugham', 'Nishanth Shanmugham'],
            web     => ['http://nishanths.github.io', 'Nishanth Shanmugham'],
            github  => ['https://github.com/nishanths', 'Nishanth Shanmugham'];

triggers startend => 'url encode', 'encode url';

# Target of the "More at Wikipedia" link in the HTML answer.
my $url = "https://en.wikipedia.org/wiki/Url_encoding";

# Percent-encodes the query remainder ($_).
#
# Returns a plain-text answer plus an `html` variant, or nothing when there is
# no remainder to encode.  The remainder is deliberately not trimmed, so any
# whitespace it carries is encoded as well.
handle remainder => sub {
    # A query made of the trigger phrase alone has nothing to encode.
    return unless defined($_) && length($_);

    my $encoded = uri_encode($_);

    # NOTE(review): uri_encode presumably leaves reserved characters such as
    # "&" untouched by default -- confirm against the URI::Encode docs.  An
    # unescaped "&" could be read as the start of an entity once embedded in
    # markup, so it is escaped for the HTML answer only.
    (my $encoded_for_html = $encoded) =~ s/&/&amp;/g;

    return "Encoded URL: $encoded",
        html => "<div>Encoded URL: $encoded_for_html</div><div>More at <a href=\"$url\">Wikipedia</a></div>";
};

1;

View File

@ -1,24 +0,0 @@
#!/usr/bin/env perl

# Tests for DDG::Goodie::HTMLEntities: decoding entities and encoding text.

use strict;
use warnings;
use Test::More;
use DDG::Test::Goodie;

zci answer_type => 'html_entity';
zci is_cached => 1;

# Builds the argument list for test_zci() describing a successful decode:
# the plain-text answer followed by its `html` counterpart.
sub decoded {
    my ($char, $entity, $decimal, $hex) = @_;
    my $numbers = "decimal: ${decimal}, hexadecimal: ";
    return (
        "Decoded HTML Entity: ${char}, ${numbers}${hex}",
        html => "Decoded HTML Entity: ${entity}, ${numbers}<a href=\"/?q=U%2B${hex}\">${hex}</a>",
    );
}

ddg_goodie_test(
    ['DDG::Goodie::HTMLEntities'],

    # Decoding: decimal, hexadecimal (without semicolon) and named entities.
    '&#33;'             => test_zci(decoded('!', '&#33;',  33, '0021')),
    '&#x21'             => test_zci(decoded('!', '&#x21;', 33, '0021')),
    'html entity &amp;' => test_zci(decoded('&', '&amp;',  38, '0026')),

    # Encoding: the html answer carries the entity encoded a second time.
    'html encode <foo>' => test_zci(
        'Encoded HTML: &lt;foo&gt;',
        html => 'Encoded HTML: &amp;lt;foo&amp;gt;',
    ),
    'html encode amp;' => undef,    # nothing to encode, so no answer
    'html encode &' => test_zci(
        'Encoded HTML: &amp;',
        html => 'Encoded HTML: &amp;amp;',
    ),
);

done_testing;

View File

@ -1,44 +0,0 @@
#!/usr/bin/env perl

# Tests for DDG::Goodie::HTMLEntitiesCodes: character name -> HTML entity code.

use strict;
use warnings;
use Test::More;
use DDG::Test::Goodie;

zci answer_type => 'HTML_Entity';

# Reference page linked from every answer.
my $w3 = 'http://dev.w3.org/html5/html-author/charref';

# Builds the argument list for test_zci() for an answer with a single entity:
# the plain-text answer followed by its `html` counterpart.
sub single_answer {
    my ($name, $code) = @_;
    return (
        "${name}: &${code};",
        html => "<div>(&${code};) ${name}: &<span>${code}</span>;&nbsp;&nbsp;<a href=\"${w3}\">More at W3</a></div>",
    );
}

ddg_goodie_test(
    ['DDG::Goodie::HTMLEntitiesCodes'],

    # Test 1
    'html code em dash' => test_zci(single_answer('Em dash', 'mdash')),

    # Test 2
    'html entity A-acute' => test_zci(single_answer('A-acute', 'Aacute')),

    # Test 3
    'html encode backward semicolon' => test_zci(single_answer('Backward semicolon', '#8271')),

    # Test 4
    'html entity for E grave' => test_zci(single_answer('E-grave', 'Egrave')),

    # Test 5 - a name with two candidates yields one line per entity
    'html encode pound symbol' => test_zci(
        "British Pound Sterling: &pound;\nNumber sign: &#35;",
        html => '<div>(&pound;) British Pound Sterling: &<span>pound</span>;</div>'
              . '<div>(&#35;) Number sign: &<span>#35</span>;</div>'
              . '<div><a href="http://dev.w3.org/html5/html-author/charref">More at W3</a></div>',
    ),

    # Test 6
    'html code of trademark sign' => test_zci(single_answer('Trademark', '#8482')),
);

done_testing;

22
t/HTMLEntitiesDecode.t Normal file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env perl

# Tests for DDG::Goodie::HTMLEntitiesDecode: HTML entity -> character.

use strict;
use warnings;
use Test::More;
use DDG::Test::Goodie;

zci answer_type => 'html_entity';

# Builds the argument list for test_zci() describing a successful decode:
# the plain-text answer followed by its `html` counterpart.
sub decoded {
    my ($char, $entity, $decimal, $hex) = @_;
    my $numbers = "decimal: ${decimal}, hexadecimal: ";
    return (
        "Decoded HTML Entity: ${char}, ${numbers}${hex}",
        html => "Decoded HTML Entity: ${entity}, ${numbers}<a href=\"/?q=U%2B${hex}\">${hex}</a>",
    );
}

ddg_goodie_test(
    ['DDG::Goodie::HTMLEntitiesDecode'],

    # Test 1 - decimal entity
    'html decode &#33;' => test_zci(decoded('!', '&#33;', 33, '0021')),

    # Test 2 - named entity
    'html decode &amp;' => test_zci(decoded('&', '&amp;', 38, '0026')),

    # Test 3 - hexadecimal entity without semicolon, trigger at the end
    '&#x21 decode html' => test_zci(decoded('!', '&#x21;', 33, '0021')),

    # Test 4 - decimal entity without ampersand or semicolon
    '#36 html decode' => test_zci(decoded('$', '&#36;', 36, '0024')),
);

done_testing;

54
t/HTMLEntitiesEncode.t Normal file
View File

@ -0,0 +1,54 @@
#!/usr/bin/env perl

# Tests for DDG::Goodie::HTMLEntitiesEncode: character or name -> HTML entity.

use strict;
use warnings;
use Test::More;
use DDG::Test::Goodie;

zci answer_type => 'html_entity';

# Reference page linked from every answer.
my $w3 = 'http://dev.w3.org/html5/html-author/charref';

# Builds the argument list for test_zci() for an answer with a single entity:
# the plain-text answer followed by its `html` counterpart.
sub single_answer {
    my ($name, $code) = @_;
    return (
        "${name}: &${code};",
        html => "<div>${name} (&${code};): &<span>${code}</span>;&nbsp;&nbsp;<a href=\"${w3}\">More at W3</a></div>",
    );
}

ddg_goodie_test(
    ['DDG::Goodie::HTMLEntitiesEncode'],

    # Test 1
    'html code em dash' => test_zci(single_answer('Em dash', 'mdash')),

    # Test 2
    'html entity A-acute' => test_zci(single_answer('A-acute', 'Aacute')),

    # Test 3 - single character
    'html encode &' => test_zci(single_answer('Encoded HTML Entity', 'amp')),

    # Test 4 - hyphenated name
    'html code em-dash' => test_zci(single_answer('Em dash', 'mdash')),

    # Test 5
    'html entity for E grave' => test_zci(single_answer('E-grave', 'Egrave')),

    # Test 6 - single character answered with a decimal code
    'html encode $' => test_zci(single_answer('Encoded HTML Entity', '#36')),

    # Test 7 - a name with two candidates yields one line per entity
    'html encode pound symbol' => test_zci(
        "British Pound Sterling: &pound;\nNumber sign: &#35;",
        html => '<div>British Pound Sterling (&pound;): &<span>pound</span>;</div>'
              . '<div>Number sign (&#35;): &<span>#35</span>;</div>'
              . '<div><a href="http://dev.w3.org/html5/html-author/charref">More at W3</a></div>',
    ),

    # Test 8
    'html character code for trademark sign' => test_zci(single_answer('Trademark', '#8482')),
);

done_testing;

26
t/URLEncode.t Normal file
View File

@ -0,0 +1,26 @@
#!/usr/bin/env perl

# Tests for DDG::Goodie::URLEncode: query remainder -> percent-encoded URL.

use strict;
use warnings;
use Test::More;
use DDG::Test::Goodie;

zci answer_type => 'encoded_url';

# Builds the argument list for test_zci() for an encoded URL: the plain-text
# answer followed by its `html` counterpart.
sub encoded_answer {
    my ($encoded) = @_;
    return (
        "Encoded URL: ${encoded}",
        html => "<div>Encoded URL: ${encoded}</div><div>More at <a href=\"https://en.wikipedia.org/wiki/Url_encoding\">Wikipedia</a></div>",
    );
}

ddg_goodie_test(
    ['DDG::Goodie::URLEncode'],

    # Test 1 - spaces in the path
    'url encode http://nospaces.duckduckgo.com/hook em horns' =>
        test_zci(encoded_answer('http://nospaces.duckduckgo.com/hook%20em%20horns')),

    # Test 2 - a literal percent sign is encoded, plus signs are kept
    'encode url xkcd.com/a webcomic of%romance+math+sarcasm+language' =>
        test_zci(encoded_answer('xkcd.com/a%20webcomic%20of%25romance+math+sarcasm+language')),

    # Test 3 - trigger at the end; the expected value ends in an encoded space
    'http://arstechnica.com/space after end url encode' =>
        test_zci(encoded_answer('http://arstechnica.com/space%20after%20end%20')),

    # Test 4 - nothing to encode
    'apple.com/mac encode URL' =>
        test_zci(encoded_answer('apple.com/mac')),
);

done_testing;