zeroclickinfo-goodies/lib/DDG/Goodie/Unicode.pm

161 lines
3.9 KiB
Perl

package DDG::Goodie::Unicode;
# ABSTRACT: unicode character information lookup
use strict;
use DDG::Goodie;
use Unicode::UCD qw/charinfo/;
use Unicode::Char (); # For name -> codepoint lookup
use Encode qw/encode_utf8/;
use constant {
CODEPOINT_RE => qr/^ \s* (?:U \+?|\\(?:u|x\{(?=.*\}))) (?<codepoint> [a-f0-9]{4,6})\}? \s* $/xi,
NAME_RE => qr/^ (?<name> [A-Z][A-Z\s]+) $/xi,
CHAR_RE => qr/^ \s* (?<char> .) \s* $/x,
UNICODE_RE => qr/^ (?:unicode|emoji|utf-(?:8|16|32)) \s+ (.+) $/xi,
CODEPOINT => 1,
NAME => 2,
CHAR => 3,
};
triggers query_raw => CODEPOINT_RE;
# Also allows open-ended queries like: "LATIN SMALL LETTER X"
triggers query_raw => UNICODE_RE;
zci is_cached => 1;
zci answer_type => "unicode_conversion";
handle sub {
my $term = $_[0];
# Search term starts with "unicode "
if ($term =~ UNICODE_RE) {
return unless my $result = unicode_lookup($1);
return $result;
}
return codepoint_description($term);
};
# Performs a lookup for a codepoint input and returns the description
sub codepoint_description {
my $term = $_[0];
return unless $term;
if ($term !~ m{([a-f0-9]+)}i) {
return;
}
my $c = hex $1;
my %i = %{ charinfo($c) };
return unless $i{name};
my $info_str = join ' ', chr($c), 'U+' . $i{code}, $i{name};
my %extra;
if (defined $i{script}) {
my $s = $i{script};
$s =~ tr/_/ /;
if ($s ne 'Common' && $s ne 'Inherited' && $s ne 'Unknown'
&& $i{name} !~ /$s/i) {
$extra{script} = $i{script};
}
}
$extra{decimal} = $c;
$extra{HTML} = substr($i{category},0,1) eq 'C' ? "No visual representation" : "&#$c;";
$extra{'UTF-8'} = join ' ',
map { sprintf '0x%02X', ord $_ }
split //, encode_utf8(chr($c));
if ($i{decomposition}) {
($extra{decomposition} = $i{decomposition}) =~ s/\b(?<!<)([0-9a-fA-F]{4,6})\b(?!>)/U+$1/g;
}
$extra{block} = $i{block};
delete $i{title} if $i{title} eq $i{upper};
for (qw/upper title lower/) {
$extra{$_} = 'U+' . $i{$_} if exists $i{$_} && length $i{$_};
}
for (qw/decimal HTML UTF-8 script block decomposition title upper lower/) {
$info_str .= ", $_: $extra{$_}" if exists $extra{$_};
}
return $info_str;
}
# Converts a character input to a codepoint
sub char_to_codepoint {
my $c = $_[0];
my $u = Unicode::Char->new();
return if ! defined $c or $c eq "";
my $cp = unpack('H*', pack('N', ord($c)));
$cp =~ s{^ 0+ }{}x;
$cp = uc ('u+' . $cp);
return $cp;
}
# Determines whether an input is a codepoint, name or character based on regular expressions
sub input_type ($) {
my $input = $_[0] || q{};
my $type;
if ($input =~ CODEPOINT_RE) {
$input = $+{codepoint};
$type = CODEPOINT;
}
elsif ($input =~ NAME_RE) {
$input = $+{name};
$type = NAME;
}
elsif ($input =~ CHAR_RE) {
$input = $+{char};
$type = CHAR;
}
return ($input, $type);
}
# Converts a name input to a character
sub name_to_char {
my $name = $_[0];
my $u = Unicode::Char->new();
return $u->n($name);
}
# Performs a unicode lookup based on type of input - codepoint, name or char
sub unicode_lookup {
my $term = $_[0];
if (! defined $term or $term eq "") {
return;
}
my $result;
my $type;
($term, $type) = input_type($term);
if (! defined $type) {
return;
}
if ($type == CODEPOINT) {
$result = codepoint_description($term);
}
elsif ($type == NAME) {
my $char = name_to_char($term);
my $cp = char_to_codepoint($char);
$result = codepoint_description($cp);
}
elsif ($type == CHAR) {
my $cp = char_to_codepoint($term);
$result = codepoint_description($cp);
}
return $result;
}
1;