170 lines
4.2 KiB
Perl
170 lines
4.2 KiB
Perl
package DDG::Goodie::Unicode;
|
|
# ABSTRACT: unicode character information lookup
|
|
|
|
use strict;
|
|
use DDG::Goodie;
|
|
|
|
use Unicode::UCD qw/charinfo/;
|
|
use Unicode::Char (); # For name -> codepoint lookup
|
|
use Encode qw/encode_utf8/;
|
|
|
|
attribution github => ['cosimo', 'Cosimo Streppone'];
|
|
primary_example_queries 'U+590c';
|
|
secondary_example_queries 'unicode white smiling face';
|
|
description 'get information about a unicode character';
|
|
code_url 'https://github.com/duckduckgo/zeroclickinfo-goodies/blob/master/lib/DDG/Goodie/Unicode.pm';
|
|
name 'Unicode';
|
|
category 'computing_info';
|
|
topics 'programming';
|
|
|
|
|
|
use constant {
|
|
CODEPOINT_RE => qr/^ \s* (?:U \+|\\(?:u|x{(?=.*}))) (?<codepoint> [a-f0-9]{4,6})}? \s* $/xi,
|
|
NAME_RE => qr/^ (?<name> [A-Z][A-Z\s]+) $/xi,
|
|
CHAR_RE => qr/^ \s* (?<char> .) \s* $/x,
|
|
UNICODE_RE => qr/^ unicode \s+ (.+) $/xi,
|
|
CODEPOINT => 1,
|
|
NAME => 2,
|
|
CHAR => 3,
|
|
};
|
|
|
|
triggers query_raw => CODEPOINT_RE;
|
|
|
|
# Also allows open-ended queries like: "LATIN SMALL LETTER X"
|
|
triggers query_raw => UNICODE_RE;
|
|
|
|
zci is_cached => 1;
|
|
zci answer_type => "unicode_conversion";
|
|
|
|
handle sub {
|
|
my $term = $_[0];
|
|
|
|
# Search term starts with "unicode "
|
|
if ($term =~ UNICODE_RE) {
|
|
return unicode_lookup($1);
|
|
}
|
|
|
|
return codepoint_description($term);
|
|
};
|
|
|
|
# Performs a lookup for a codepoint input and returns the description
|
|
sub codepoint_description {
|
|
my $term = $_[0];
|
|
return unless $term;
|
|
|
|
if ($term !~ m{([a-f0-9]+)}i) {
|
|
return;
|
|
}
|
|
|
|
my $c = hex $1;
|
|
my %i = %{ charinfo($c) };
|
|
return unless $i{name};
|
|
|
|
my $info_str = join ' ', chr($c), 'U+' . $i{code}, $i{name};
|
|
my %extra;
|
|
if (defined $i{script}) {
|
|
my $s = $i{script};
|
|
$s =~ tr/_/ /;
|
|
if ($s ne 'Common' && $s ne 'Inherited' && $s ne 'Unknown'
|
|
&& $i{name} !~ /$s/i) {
|
|
$extra{script} = $i{script};
|
|
}
|
|
}
|
|
$extra{decimal} = $c;
|
|
$extra{HTML} = substr($i{category},0,1) eq 'C' ? "No visual representation" : "&#$c;";
|
|
$extra{'UTF-8'} = join ' ',
|
|
map { sprintf '0x%02X', ord $_ }
|
|
split //, encode_utf8(chr($c));
|
|
|
|
if ($i{decomposition}) {
|
|
($extra{decomposition} = $i{decomposition}) =~ s/\b(?<!<)([0-9a-fA-F]{4,6})\b(?!>)/U+$1/g;
|
|
}
|
|
$extra{block} = $i{block};
|
|
|
|
delete $i{title} if $i{title} eq $i{upper};
|
|
|
|
for (qw/upper title lower/) {
|
|
$extra{$_} = 'U+' . $i{$_} if exists $i{$_} && length $i{$_};
|
|
}
|
|
|
|
for (qw/decimal HTML UTF-8 script block decomposition title upper lower/) {
|
|
$info_str .= ", $_: $extra{$_}" if exists $extra{$_};
|
|
}
|
|
return $info_str;
|
|
}
|
|
|
|
# Converts a character input to a codepoint
|
|
sub char_to_codepoint {
|
|
my $c = $_[0];
|
|
|
|
my $u = Unicode::Char->new();
|
|
return if ! defined $c or $c eq "";
|
|
|
|
my $cp = unpack('H*', pack('N', ord($c)));
|
|
$cp =~ s{^ 0+ }{}x;
|
|
$cp = uc ('u+' . $cp);
|
|
return $cp;
|
|
}
|
|
|
|
# Determines whether an input is a codepoint, name or character based on regular expressions
|
|
sub input_type ($) {
|
|
my $input = $_[0] || q{};
|
|
my $type;
|
|
|
|
if ($input =~ CODEPOINT_RE) {
|
|
$input = $+{codepoint};
|
|
$type = CODEPOINT;
|
|
}
|
|
elsif ($input =~ NAME_RE) {
|
|
$input = $+{name};
|
|
$type = NAME;
|
|
}
|
|
elsif ($input =~ CHAR_RE) {
|
|
$input = $+{char};
|
|
$type = CHAR;
|
|
}
|
|
|
|
return ($input, $type);
|
|
}
|
|
|
|
# Converts a name input to a character
|
|
sub name_to_char {
|
|
my $name = $_[0];
|
|
my $u = Unicode::Char->new();
|
|
return $u->n($name);
|
|
}
|
|
|
|
# Performs a unicode lookup based on type of input - codepoint, name or char
|
|
sub unicode_lookup {
|
|
my $term = $_[0];
|
|
|
|
if (! defined $term or $term eq "") {
|
|
return;
|
|
}
|
|
|
|
my $result;
|
|
my $type;
|
|
|
|
($term, $type) = input_type($term);
|
|
if (! defined $type) {
|
|
return;
|
|
}
|
|
|
|
if ($type == CODEPOINT) {
|
|
$result = codepoint_description($term);
|
|
}
|
|
elsif ($type == NAME) {
|
|
my $char = name_to_char($term);
|
|
my $cp = char_to_codepoint($char);
|
|
$result = codepoint_description($cp);
|
|
}
|
|
elsif ($type == CHAR) {
|
|
my $cp = char_to_codepoint($term);
|
|
$result = codepoint_description($cp);
|
|
}
|
|
|
|
return $result;
|
|
}
|
|
|
|
1;
|