package DDG::Goodie::Unicode; # ABSTRACT: unicode character information lookup use strict; use DDG::Goodie; use Unicode::UCD qw/charinfo/; use Unicode::Char (); # For name -> codepoint lookup use Encode qw/encode_utf8/; use constant { CODEPOINT_RE => qr/^ \s* (?:U \+|\\(?:u|x{(?=.*}))) (? [a-f0-9]{4,6})}? \s* $/xi, NAME_RE => qr/^ (? [A-Z][A-Z\s]+) $/xi, CHAR_RE => qr/^ \s* (? .) \s* $/x, UNICODE_RE => qr/^ (?:unicode|emoji|utf-(?:8|16|32)) \s+ (.+) $/xi, CODEPOINT => 1, NAME => 2, CHAR => 3, }; triggers query_raw => CODEPOINT_RE; # Also allows open-ended queries like: "LATIN SMALL LETTER X" triggers query_raw => UNICODE_RE; zci is_cached => 1; zci answer_type => "unicode_conversion"; handle sub { my $term = $_[0]; # Search term starts with "unicode " if ($term =~ UNICODE_RE) { return unless my $result = unicode_lookup($1); return $result; } return codepoint_description($term); }; # Performs a lookup for a codepoint input and returns the description sub codepoint_description { my $term = $_[0]; return unless $term; if ($term !~ m{([a-f0-9]+)}i) { return; } my $c = hex $1; my %i = %{ charinfo($c) }; return unless $i{name}; my $info_str = join ' ', chr($c), 'U+' . $i{code}, $i{name}; my %extra; if (defined $i{script}) { my $s = $i{script}; $s =~ tr/_/ /; if ($s ne 'Common' && $s ne 'Inherited' && $s ne 'Unknown' && $i{name} !~ /$s/i) { $extra{script} = $i{script}; } } $extra{decimal} = $c; $extra{HTML} = substr($i{category},0,1) eq 'C' ? "No visual representation" : "&#$c;"; $extra{'UTF-8'} = join ' ', map { sprintf '0x%02X', ord $_ } split //, encode_utf8(chr($c)); if ($i{decomposition}) { ($extra{decomposition} = $i{decomposition}) =~ s/\b(?)/U+$1/g; } $extra{block} = $i{block}; delete $i{title} if $i{title} eq $i{upper}; for (qw/upper title lower/) { $extra{$_} = 'U+' . $i{$_} if exists $i{$_} && length $i{$_}; } for (qw/decimal HTML UTF-8 script block decomposition title upper lower/) { $info_str .= ", $_: $extra{$_}" if exists $extra{$_}; } return $info_str; } # Converts a character input to a codepoint sub char_to_codepoint { my $c = $_[0]; my $u = Unicode::Char->new(); return if ! defined $c or $c eq ""; my $cp = unpack('H*', pack('N', ord($c))); $cp =~ s{^ 0+ }{}x; $cp = uc ('u+' . $cp); return $cp; } # Determines whether an input is a codepoint, name or character based on regular expressions sub input_type ($) { my $input = $_[0] || q{}; my $type; if ($input =~ CODEPOINT_RE) { $input = $+{codepoint}; $type = CODEPOINT; } elsif ($input =~ NAME_RE) { $input = $+{name}; $type = NAME; } elsif ($input =~ CHAR_RE) { $input = $+{char}; $type = CHAR; } return ($input, $type); } # Converts a name input to a character sub name_to_char { my $name = $_[0]; my $u = Unicode::Char->new(); return $u->n($name); } # Performs a unicode lookup based on type of input - codepoint, name or char sub unicode_lookup { my $term = $_[0]; if (! defined $term or $term eq "") { return; } my $result; my $type; ($term, $type) = input_type($term); if (! defined $type) { return; } if ($type == CODEPOINT) { $result = codepoint_description($term); } elsif ($type == NAME) { my $char = name_to_char($term); my $cp = char_to_codepoint($char); $result = codepoint_description($cp); } elsif ($type == CHAR) { my $cp = char_to_codepoint($term); $result = codepoint_description($cp); } return $result; } 1;