2012-03-10 00:54:44 -08:00
|
|
|
package DDG::Goodie::Unicode;
|
2014-08-20 11:45:33 -07:00
|
|
|
# ABSTRACT: unicode character information lookup
|
2012-03-10 00:54:44 -08:00
|
|
|
|
2015-02-22 12:09:29 -08:00
|
|
|
use strict;
|
2012-03-10 00:54:44 -08:00
|
|
|
use DDG::Goodie;
|
2014-08-20 11:45:33 -07:00
|
|
|
|
2012-03-10 00:54:44 -08:00
|
|
|
use Unicode::UCD qw/charinfo/;
|
2012-05-01 14:47:32 -07:00
|
|
|
use Unicode::Char (); # For name -> codepoint lookup
|
2012-03-11 00:39:45 -08:00
|
|
|
use Encode qw/encode_utf8/;
|
2012-03-10 00:54:44 -08:00
|
|
|
|
2012-05-01 14:47:32 -07:00
|
|
|
use constant {
    # A bare code point in one of three notations: "U+263A", "\u263a" or
    # "\x{263a}" (4-6 hex digits, case-insensitive). The lookahead makes sure
    # an opening "\x{" has a closing brace somewhere later in the string.
    # Literal braces are escaped: an unescaped literal "{" in a pattern is
    # deprecated and is rejected or warned about by newer perls.
    CODEPOINT_RE => qr/^ \s* (?:U \+|\\(?:u|x\{(?=.*\}))) (?<codepoint> [a-f0-9]{4,6})\}? \s* $/xi,

    # A character name: letters and whitespace only, at least two characters.
    NAME_RE => qr/^ (?<name> [A-Z][A-Z\s]+) $/xi,

    # Exactly one character, optionally surrounded by whitespace.
    CHAR_RE => qr/^ \s* (?<char> .) \s* $/x,

    # A keyword ("unicode", "emoji", "utf-8", "utf-16", "utf-32") followed by
    # the thing to look up, which is captured in $1.
    UNICODE_RE => qr/^ (?:unicode|emoji|utf-(?:8|16|32)) \s+ (.+) $/xi,

    # Input classifications returned by input_type().
    CODEPOINT => 1,
    NAME      => 2,
    CHAR      => 3,
};
|
|
|
|
|
|
|
|
triggers query_raw => CODEPOINT_RE;
# Also allows open-ended queries like: "LATIN SMALL LETTER X"
triggers query_raw => UNICODE_RE;

zci is_cached   => 1;
zci answer_type => "unicode_conversion";

# Entry point for a triggered query. A query introduced by one of the
# UNICODE_RE keywords is classified and looked up by unicode_lookup();
# any other query is handed to codepoint_description() unchanged.
handle sub {
    my ($query) = @_;

    # UNICODE_RE has a single capture group: the text after the keyword.
    my ($subject) = $query =~ UNICODE_RE
        or return codepoint_description($query);

    my $answer = unicode_lookup($subject);
    return $answer if $answer;
    return;
};
|
2012-05-01 14:47:32 -07:00
|
|
|
|
2015-05-29 06:13:41 -07:00
|
|
|
# Performs a lookup for a codepoint input and returns the description.
#
# Argument: a string holding a hexadecimal code point. Any prefix that is not
#   a hex digit ("U+", "\u", "\x{") is skipped, because the first run of hex
#   digits in the string is taken as the code point.
# Returns: "<char> U+<code> <NAME>" followed by ", <key>: <value>" pairs
#   (decimal, HTML, UTF-8, script, block, decomposition, title, upper, lower,
#   each only when available). Returns nothing when the argument is false,
#   contains no hex digits, or charinfo() has no named entry for it.
sub codepoint_description {
    my ($term) = @_;
    return unless $term;

    my ($hex) = $term =~ m{([a-f0-9]+)}i
        or return;
    my $c = hex $hex;

    # charinfo() returns undef for unassigned or out-of-range code points;
    # dereferencing that directly would die under strict refs.
    my $info = charinfo($c)
        or return;

    # Work on a copy: the title entry may be deleted below.
    my %i = %{$info};
    return unless $i{name};

    my $info_str = join ' ', chr($c), 'U+' . $i{code}, $i{name};
    my %extra;

    if (defined $i{script}) {
        (my $s = $i{script}) =~ tr/_/ /;

        # Only mention the script when it is informative and not already
        # part of the character name. \Q..\E keeps the name a literal.
        if (   $s ne 'Common'
            && $s ne 'Inherited'
            && $s ne 'Unknown'
            && $i{name} !~ /\Q$s\E/i) {
            $extra{script} = $i{script};
        }
    }

    $extra{decimal} = $c;

    # General categories starting with "C" get no HTML entity.
    $extra{HTML} = substr($i{category}, 0, 1) eq 'C'
        ? "No visual representation"
        : "&#$c;";

    $extra{'UTF-8'} = join ' ',
        map { sprintf '0x%02X', ord $_ }
        split //, encode_utf8(chr($c));

    if ($i{decomposition}) {
        # Prefix the bare hex code points with "U+", leaving <tags> alone.
        ($extra{decomposition} = $i{decomposition})
            =~ s/\b(?<!<)([0-9a-fA-F]{4,6})\b(?!>)/U+$1/g;
    }

    $extra{block} = $i{block};

    # A title-case mapping equal to the upper-case one adds nothing.
    delete $i{title} if $i{title} eq $i{upper};

    for (qw/upper title lower/) {
        $extra{$_} = 'U+' . $i{$_} if exists $i{$_} && length $i{$_};
    }

    # Fixed output order.
    for (qw/decimal HTML UTF-8 script block decomposition title upper lower/) {
        $info_str .= ", $_: $extra{$_}" if exists $extra{$_};
    }

    return $info_str;
}
|
|
|
|
|
2015-05-29 06:13:41 -07:00
|
|
|
# Converts a character input to a codepoint.
#
# Argument: a string; only its first character is considered.
# Returns: the code point as "U+" followed by upper-case hex digits without
#   leading zeros (e.g. "U+41"), or nothing for an undefined or empty input.
sub char_to_codepoint {
    my ($c) = @_;

    return if !defined $c or $c eq "";

    my $hex = sprintf '%X', ord $c;

    # Only relevant for NUL: its hex part has always been empty ("U+").
    $hex =~ s{^ 0+ }{}x;

    return 'U+' . $hex;
}
|
|
|
|
|
2015-05-29 06:13:41 -07:00
|
|
|
# Determines whether an input is a codepoint, name or character based on
# regular expressions.
#
# Argument: the text to classify (the ($) prototype is kept for existing
#   callers).
# Returns: a two-element list (value, type). The patterns are tried in the
#   order CODEPOINT_RE, NAME_RE, CHAR_RE; on a match, value is the named
#   capture and type is CODEPOINT, NAME or CHAR. When nothing matches, the
#   input is returned unchanged with an undefined type.
sub input_type ($) {
    # Defined-or rather than ||: the single character "0" is false but is
    # still a valid CHAR input and must not be replaced by the empty string.
    my $input = $_[0] // q{};

    return ($+{codepoint}, CODEPOINT) if $input =~ CODEPOINT_RE;
    return ($+{name},      NAME)      if $input =~ NAME_RE;
    return ($+{char},      CHAR)      if $input =~ CHAR_RE;

    return ($input, undef);
}
|
2012-05-01 14:47:32 -07:00
|
|
|
|
2015-05-29 06:13:41 -07:00
|
|
|
# Converts a name input to a character.
#
# Argument: a character name such as "LATIN SMALL LETTER X".
# Returns: whatever Unicode::Char's n() method returns for that name.
sub name_to_char {
    my ($name) = @_;

    return Unicode::Char->new()->n($name);
}
|
|
|
|
|
2015-05-29 06:13:41 -07:00
|
|
|
# Performs a unicode lookup based on type of input - codepoint, name or char.
#
# Argument: the text that followed the trigger keyword.
# Returns: the string produced by codepoint_description(), or nothing when
#   the input is undefined, empty or cannot be classified by input_type().
sub unicode_lookup {
    my ($term) = @_;

    return unless defined $term && $term ne "";

    my ($value, $type) = input_type($term);
    return unless defined $type;

    # One resolver per input type, each turning the classified value into
    # the string handed to codepoint_description(). The "()" after the
    # constant names stops "=>" from quoting them as plain strings.
    my %resolver_for = (
        CODEPOINT() => sub { return $_[0] },
        NAME()      => sub {
            my $char = name_to_char($_[0]);
            return char_to_codepoint($char);
        },
        CHAR()      => sub { return char_to_codepoint($_[0]) },
    );

    my $description;
    if (my $resolve = $resolver_for{$type}) {
        my $codepoint = $resolve->($value);
        $description = codepoint_description($codepoint);
    }

    return $description;
}
|
2012-03-10 00:54:44 -08:00
|
|
|
|
|
|
|
1;
|