Add UTF8 support for string.len

This commit is contained in:
MoNTE48 2020-03-23 18:50:42 +01:00
parent f884d0ace9
commit 232904ed1c

View File

@ -2,6 +2,7 @@
$Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $ $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $
Provides UTF-8 aware string functions implemented in pure lua: Provides UTF-8 aware string functions implemented in pure lua:
* string.len(s)
* string.upper(s) * string.upper(s)
* string.lower(s) * string.lower(s)
@ -35,6 +36,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Based on: https://github.com/Planimeter/grid-sdk/blob/master/public/utf8.lua Based on: https://github.com/Planimeter/grid-sdk/blob/master/public/utf8.lua
https://github.com/Stepets/utf8.lua
Changed by: MultiCraft Development Team (2019) Changed by: MultiCraft Development Team (2019)
Note: Now used very minimal version, with the support of only lower and upper. Note: Now used very minimal version, with the support of only lower and upper.
Only latin and russian letters are supported. Only latin and russian letters are supported.
@ -43,16 +45,16 @@ Note: Now used very minimal version, with the support of only lower and upper.
-- returns the number of bytes used by the UTF-8 character at byte i in s -- returns the number of bytes used by the UTF-8 character at byte i in s
-- also doubles as a UTF-8 character validator -- also doubles as a UTF-8 character validator
function utf8charbytes(s, i) local function utf8charbytes(s, i)
-- argument defaults -- argument defaults
i = i or 1 i = i or 1
-- argument checking -- argument checking
if type(s) ~= "string" then if type(s) ~= "string" then
error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")") error("bad argument #1 to 'utf8charbytes' (string expected, got " .. type(s) .. ")")
end end
if type(i) ~= "number" then if type(i) ~= "number" then
error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")") error("bad argument #2 to 'utf8charbytes' (number expected, got " .. type(i) .. ")")
end end
local c = s:byte(i) local c = s:byte(i)
@ -135,20 +137,40 @@ function utf8charbytes(s, i)
end end
end end
-- counts the UTF-8 characters (not the bytes) that make up s
-- the stock string.len is saved in originlen beforehand, because the byte
-- count of s is still needed to know where the string ends
local originlen = string.len
local function utf8len(s)
	-- argument checking
	if type(s) ~= "string" then
		error("bad argument #1 to 'utf8len' (string expected, got " .. type(s) .. ")")
	end

	local total = originlen(s) -- size of s in bytes
	local count = 0            -- characters seen so far
	local i = 1                -- byte offset of the character being examined

	-- jump from the first byte of one character to the first byte of the next
	-- NOTE(review): the utf8charbytes call is not protected, so any error it
	-- raises propagates to the caller; confirm how it treats malformed input
	while i <= total do
		i = i + utf8charbytes(s, i)
		count = count + 1
	end

	return count
end
dofile(core.get_builtin_path() .. "utf8lib" .. DIR_DELIM .. "utf8data.lua") dofile(core.get_builtin_path() .. "utf8lib" .. DIR_DELIM .. "utf8data.lua")
-- replace UTF-8 characters based on a mapping table -- replace UTF-8 characters based on a mapping table
local function utf8replace(s, mapping) local function utf8replace(s, mapping)
-- argument checking -- argument checking
if type(s) ~= "string" then if type(s) ~= "string" then
error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")") error("bad argument #1 to 'utf8replace' (string expected, got " .. type(s) .. ")")
end end
if type(mapping) ~= "table" then if type(mapping) ~= "table" then
error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")") error("bad argument #2 to 'utf8replace' (table expected, got " .. type(mapping) .. ")")
end end
local pos = 1 local pos = 1
local bytes = s:len() local bytes = originlen(s)
local charbytes local charbytes
local newstr = "" local newstr = ""
@ -162,13 +184,18 @@ local function utf8replace(s, mapping)
return newstr return newstr
end end
-- identical to string.upper except it knows about unicode simple case conversions -- identical to string.len with UTF-8 support
-- Replacement for string.len that returns the number of UTF-8 characters in
-- s instead of the number of bytes.
-- String methods are looked up in the string table, so s:len() uses this
-- replacement as well; the # operator is not affected and keeps returning
-- the byte count.
-- s: a string, or a number (measured through its string form)
-- returns: the character count computed by utf8len
-- raises: the utf8len argument error when s is neither a string nor a number
function string.len(s)
	-- The stock string.len accepts numbers and measures their string form
	-- (string.len(123) == 3), while utf8len rejects everything that is not a
	-- string. Convert numbers first so existing callers keep working.
	if type(s) == "number" then
		s = tostring(s)
	end
	return utf8len(s)
end
-- UTF-8 aware replacement for string.upper
-- s is first run through utf8replace with the utf8_lc_uc table, then the
-- stock string.upper (saved in origupper before being replaced) is applied
-- to the result
-- NOTE(review): utf8_lc_uc is presumably the lowercase -> uppercase mapping
-- provided by utf8data.lua; confirm against that file
local origupper = string.upper local origupper = string.upper
function string.upper(s) function string.upper(s)
return origupper(utf8replace(s, utf8_lc_uc)) return origupper(utf8replace(s, utf8_lc_uc))
end end
-- identical to string.lower except it knows about unicode simple case conversions -- identical to string.lower with UTF-8 support
local origlower = string.lower local origlower = string.lower
function string.lower(s) function string.lower(s)
return origlower(utf8replace(s, utf8_uc_lc)) return origlower(utf8replace(s, utf8_uc_lc))