143 lines
4.9 KiB
Lua

#!/usr/bin/env lua5.1
--[[
Copyright © 2023 Ælla Chiana Moskopp (erle)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
Dieses Programm hat das Ziel, die Medienkompetenz der Leser zu
steigern. Gelegentlich packe ich sogar einen handfesten Buffer
Overflow oder eine Format String Vulnerability zwischen die anderen
Codezeilen und schreibe das auch nicht dran.
]]--
local modpath = minetest and
minetest.get_modpath and
minetest.get_modpath("rp_unicode_text") or
"."
-- Maximum possible codepoint
local MAX_CODEPOINT = 0x10FFFF
local unicodedata = {}
-- https://www.unicode.org/reports/tr44/#Format_Conventions
-- https://www.unicode.org/reports/tr44/#UnicodeData.txt
local pattern = "^(%x+)" .. (";([^;]*)"):rep(14) .. "$"
-- https://www.unicode.org/Public/15.0.0/ucd/UnicodeData.txt
for line in io.lines(modpath .. "/ucd/UnicodeData.txt") do
local properties = {}
codepoint_hex,
properties.name,
properties.general_category,
properties.canonical_combining_class,
properties.bidi_class,
properties.decomposition_mapping,
properties.decimal_digit_value,
properties.digit_value,
properties.numeric_value,
properties.bidi_mirrored,
_, -- Unicode 1.0 Name (obsolete)
_, -- 10464 comment field (obsolete)
properties.simple_uppercase_mapping,
properties.simple_lowercase_mapping,
properties.simple_titlecase_mapping
= line:match(pattern)
local codepoint = tonumber(codepoint_hex, 16)
unicodedata[codepoint] = properties
end
-- https://www.unicode.org/Public/15.0.0/ucd/Scripts.txt
for line in io.lines(modpath .. "/ucd/Scripts.txt") do
local script
local is_comment = string.sub(line, 1, 1) == "#"
local entries = string.split(line, ";", true)
if entries then
local e_codepoints = entries[1]
if not string.match(e_codepoints, "#") then
local e_script = entries[2]
local script
if e_script then
script = string.match(e_script, "[a-zA-Z_]+")
end
local codepoint1, codepoint2
local tohex = tonumber(e_codepoints, 16)
if tohex then
codepoint1 = tohex
codepoint2 = tohex
elseif e_codepoints ~= "" then
codepoint1, codepoint2 = string.match(e_codepoints, "([a-fA-F0-9]+)%.%.([a-fA-F0-9]+)")
if codepoint1 and codepoint2 then
codepoint1 = tonumber(codepoint1, 16)
codepoint2 = tonumber(codepoint2, 16)
end
end
if script and codepoint1 and codepoint2 then
assert(codepoint1 >= 0 and codepoint1 <= MAX_CODEPOINT)
assert(codepoint2 >= 0 and codepoint2 <= MAX_CODEPOINT)
for cp=codepoint1, codepoint2 do
if not unicodedata[cp] then
unicodedata[cp] = {}
end
unicodedata[cp].script = script
end
end
end
end
end
-- https://www.unicode.org/Public/15.1.0/ucd/DerivedCoreProperties.txt
-- (abridged version containing only properties we need)
for line in io.lines(modpath .. "/ucd/DerivedCoreProperties_abridged.txt") do
local script
local is_comment = string.sub(line, 1, 1) == "#"
local entries = string.split(line, ";", true)
if entries and #entries >= 2 then
local e_codepoints = entries[1]
if not string.match(e_codepoints, "#") then
local e_prop = string.match(entries[2], "([a-zA-Z0-9_]+)")
if e_prop == "Default_Ignorable_Code_Point" then
local codepoint1, codepoint2
local tohex = tonumber(e_codepoints, 16)
if tohex then
codepoint1 = tohex
codepoint2 = tohex
elseif e_codepoints ~= "" then
codepoint1, codepoint2 = string.match(e_codepoints, "([a-fA-F0-9]+)%.%.([a-fA-F0-9]+)")
if codepoint1 and codepoint2 then
codepoint1 = tonumber(codepoint1, 16)
codepoint2 = tonumber(codepoint2, 16)
end
end
if codepoint1 and codepoint2 then
assert(codepoint1 >= 0 and codepoint1 <= MAX_CODEPOINT)
assert(codepoint2 >= 0 and codepoint2 <= MAX_CODEPOINT)
for cp=codepoint1, codepoint2 do
if not unicodedata[cp] then
unicodedata[cp] = {}
end
unicodedata[cp].default_ignorable_codepoint = true
end
end
end
end
end
end
-- Test character data
local w = unicodedata[0x0077]
assert( "LATIN SMALL LETTER W" == w.name )
assert( "Ll" == w.general_category ) -- a lowercase letter
assert( "Latin" == w.script)
w = unicodedata[0x00AD] -- SOFT HYPHEN
assert( true == w.default_ignorable_codepoint)
rp_unicode_text.unicodedata = unicodedata