ReStructuredText: fix parsing of titles containing UTF-8 characters

If a title contained multi-byte UTF-8 characters, it wasn't properly recognized because the title was longer (in bytes) than the underline. So, fix the title length computation to count characters rather than bytes. Note that this fix only handles ASCII, one-byte charsets and UTF-8; it won't help with other multi-byte encodings. However, the whole parser expects an ASCII-compatible encoding anyway, and in most situations it will be fed Geany's UTF-8 buffer. Closes #3578050.
2012-10-18 17:02:39 +02:00 · 2012-10-18 17:02:39 +02:00 · b626cc93e3
commit b626cc93e3
parent 6e8e0c7bfb
1 changed files with 35 additions and 1 deletions
--- a/tagmanager/ctags/rest.c
+++ b/tagmanager/ctags/rest.c
@ -123,6 +123,35 @@ static int get_kind(char c)
 }


+/* counts the characters in the buf_len bytes starting at buf, read as UTF-8 (lead bytes only)
+ * returns that count (0 if buf_len is 0), or -1 on an invalid lead byte or truncated sequence */
+static int utf8_strlen(const char *buf, int buf_len)
+{
+	int len = 0;
+	const char *end = buf + buf_len;
+
+	for (len = 0; buf < end; len ++)
+	{
+		/* size each character from its lead byte alone; continuation bytes are skipped unchecked */
+		if (! (*buf & 0x80))
+			buf ++; /* 0xxxxxxx: one byte */
+		else if ((*buf & 0xe0) == 0xc0)
+			buf += 2; /* 110xxxxx: two bytes */
+		else if ((*buf & 0xf0) == 0xe0)
+			buf += 3; /* 1110xxxx: three bytes */
+		else if ((*buf & 0xf8) == 0xf0)
+			buf += 4; /* 11110xxx: four bytes */
+		else /* 10xxxxxx or 11111xxx: not a valid leading UTF-8 byte, abort */
+			return -1;
+
+		if (buf > end) /* last character needs more bytes than remain */
+			return -1;
+	}
+
+	return len; /* one increment per character consumed */
+}
+
+
 /* TODO: parse overlining & underlining as distinct sections. */
 static void findRestTags (void)
 {
@ -135,7 +164,12 @@ static void findRestTags (void)
 	while ((line = fileReadLine ()) != NULL)
 	{
 		int line_len = strlen((const char*) line);
-		int name_len = vStringLength(name);
+		int name_len_bytes = vStringLength(name);
+		int name_len = utf8_strlen(vStringValue(name), name_len_bytes);
+
+		/* if the name doesn't look like UTF-8, assume one-byte charset */
+		if (name_len < 0)
+			name_len = name_len_bytes;

 		/* underlines must be the same length or more */
 		if (line_len >= name_len && name_len > 0 &&