zeroclickinfo-goodies/lib/DDG/Goodie/Unicode.pm

package DDG::Goodie::Unicode;
# ABSTRACT: unicode character information lookup

use strict;
use DDG::Goodie;

use Unicode::UCD qw/charinfo/;
use Unicode::Char ();              # For name -> codepoint lookup
use Encode qw/encode_utf8/;

use constant {
    CODEPOINT_RE => qr/^ \s* (?:U \+?|\\(?:u|x\{(?=.*\}))) (?<codepoint> [a-f0-9]{4,6})\}? \s* $/xi,
    NAME_RE      => qr/^ (?<name> [A-Z][A-Z\s]+) $/xi,
    CHAR_RE      => qr/^ \s* (?<char> .) \s* $/x,
    UNICODE_RE   => qr/^ (?:unicode|emoji|utf-(?:8|16|32)) \s+ (.+) $/xi,
    CODEPOINT    => 1,
    NAME         => 2,
    CHAR         => 3,
};

triggers query_raw => CODEPOINT_RE;

# Also allows open-ended queries like: "LATIN SMALL LETTER X"
triggers query_raw => UNICODE_RE;

zci is_cached => 1;
zci answer_type => "unicode_conversion";

handle sub {
    my $term = $_[0];

    # Search term starts with "unicode "
    if ($term =~ UNICODE_RE) {
        return unless my $result = unicode_lookup($1);
        return $result;
    }

    return codepoint_description($term);
};

# Performs a lookup for a codepoint input and returns the description
sub codepoint_description {
    my $term = $_[0];
    return unless $term;

    if ($term !~ m{([a-f0-9]+)}i) {
        return;
    }

    my $c = hex $1;
    my %i = %{ charinfo($c) };
    return unless $i{name};

    my $info_str = join ' ', chr($c), 'U+' . $i{code}, $i{name};
    my %extra;
    if (defined $i{script}) {
        my $s = $i{script};
        $s =~ tr/_/ /;
        if ($s ne 'Common' && $s ne 'Inherited' && $s  ne 'Unknown'
            && $i{name} !~ /$s/i) {
            $extra{script} = $i{script};
        }
    }
    $extra{decimal} = $c;
    $extra{HTML}    = substr($i{category},0,1) eq 'C' ? "No visual representation" : "&#$c;";
    $extra{'UTF-8'} = join ' ',
    map { sprintf '0x%02X', ord $_ }
    split //, encode_utf8(chr($c));

    if ($i{decomposition}) {
        ($extra{decomposition} = $i{decomposition}) =~ s/\b(?<!<)([0-9a-fA-F]{4,6})\b(?!>)/U+$1/g;
    }
    $extra{block} = $i{block};

    delete $i{title} if $i{title} eq $i{upper};

    for (qw/upper title lower/) {
        $extra{$_} = 'U+' . $i{$_} if exists $i{$_} && length $i{$_};
    }

    for (qw/decimal HTML UTF-8 script block decomposition title upper lower/) {
        $info_str .= ", $_: $extra{$_}" if exists $extra{$_};
    }
    return $info_str;
}

# Converts a character input to a codepoint
sub char_to_codepoint {
    my $c = $_[0];

    my $u = Unicode::Char->new();
    return if ! defined $c or $c eq "";

    my $cp = unpack('H*', pack('N', ord($c)));
    $cp =~ s{^ 0+ }{}x;
    $cp = uc ('u+' . $cp);
    return $cp;
}

# Determines whether an input is a codepoint, name or character based on regular expressions
sub input_type ($) {
    my $input = $_[0] || q{};
    my $type;

    if ($input =~ CODEPOINT_RE) {
        $input = $+{codepoint};
        $type = CODEPOINT;
    }
    elsif ($input =~ NAME_RE) {
        $input = $+{name};
        $type = NAME;
    }
    elsif ($input =~ CHAR_RE) {
        $input = $+{char};
        $type = CHAR;
    }

    return ($input, $type);
                }

# Converts a name input to a character
sub name_to_char {
    my $name = $_[0];
    my $u = Unicode::Char->new();
    return $u->n($name);
}

# Performs a unicode lookup based on type of input - codepoint, name or char
sub unicode_lookup {
    my $term = $_[0];

    if (! defined $term or $term eq "") {
        return;
    }

    my $result;
    my $type;

    ($term, $type) = input_type($term);
    if (! defined $type) {
        return;
    }

    if ($type == CODEPOINT) {
        $result = codepoint_description($term);
    }
    elsif ($type == NAME) {
        my $char = name_to_char($term);
        my $cp = char_to_codepoint($char);
        $result = codepoint_description($cp);
    }
    elsif ($type == CHAR) {
        my $cp = char_to_codepoint($term);
        $result = codepoint_description($cp);
    }

    return $result;
}

1;