diff --git a/sys/include/ape/utf.h b/sys/include/ape/utf.h index 6302166fb..bba977100 100644 --- a/sys/include/ape/utf.h +++ b/sys/include/ape/utf.h @@ -14,7 +14,8 @@ enum UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff --git a/sys/include/libc.h b/sys/include/libc.h index 833a3f50b..de6c3105d 100644 --- a/sys/include/libc.h +++ b/sys/include/libc.h @@ -45,6 +45,7 @@ enum Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff --git a/sys/src/9/pc/cga.c b/sys/src/9/pc/cga.c index 2d8c42f2b..b07652b0c 100644 --- a/sys/src/9/pc/cga.c +++ b/sys/src/9/pc/cga.c @@ -99,7 +99,9 @@ cgascreenputc(Rune c) int i; uchar *p; - if(c == '\n'){ + if(c == '\0') + return; + else if(c == '\n'){ cgapos = cgapos/Width; cgapos = (cgapos+1)*Width; } @@ -138,8 +140,10 @@ cgascreenputc(Rune c) static void cgascreenputs(char* s, int n) { + static char rb[UTFmax]; + static int nrb; + char *e; Rune r; - int i; if(!islo()){ /* @@ -152,11 +156,14 @@ cgascreenputs(char* s, int n) else lock(&cgascreenlock); - while(n > 0){ - i = chartorune(&r, s); - cgascreenputc(r); - s += i; - n -= i; + e = s + n; + while(s < e){ + rb[nrb++] = *s++; + if(nrb >= UTFmax || fullrune(rb, nrb)){ + chartorune(&r, rb); + cgascreenputc(r); + nrb = 0; + } } unlock(&cgascreenlock); diff --git a/sys/src/9/pc/vga.c b/sys/src/9/pc/vga.c index 1dae2d193..c325fd4e2 100644 --- a/sys/src/9/pc/vga.c +++ b/sys/src/9/pc/vga.c @@ -119,9 +119,10 @@ vgascreenputc(VGAscr* scr, char* buf, Rectangle *flushr) static void vgascreenputs(char* s, int n) { - int i, gotdraw; - Rune r; - char buf[4]; + static 
char rb[UTFmax+1]; + static int nrb; + char *e; + int gotdraw; VGAscr *scr; Rectangle flushr; @@ -146,13 +147,14 @@ vgascreenputs(char* s, int n) flushr = Rect(10000, 10000, -10000, -10000); - while(n > 0){ - i = chartorune(&r, s); - memmove(buf, s, i); - buf[i] = 0; - n -= i; - s += i; - vgascreenputc(scr, buf, &flushr); + e = s + n; + while(s < e){ + rb[nrb++] = *s++; + if(nrb >= UTFmax || fullrune(rb, nrb)){ + rb[nrb] = 0; + vgascreenputc(scr, rb, &flushr); + nrb = 0; + } } flushmemscreen(flushr); diff --git a/sys/src/9/port/lib.h b/sys/src/9/port/lib.h index dce00b07a..cf235388e 100644 --- a/sys/src/9/port/lib.h +++ b/sys/src/9/port/lib.h @@ -38,7 +38,8 @@ enum UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff --git a/sys/src/ape/lib/ap/gen/mbwc.c b/sys/src/ape/lib/ap/gen/mbwc.c index 66a982193..416ab03cf 100644 --- a/sys/src/ape/lib/ap/gen/mbwc.c +++ b/sys/src/ape/lib/ap/gen/mbwc.c @@ -1,4 +1,5 @@ #include +#include /* * Use the FSS-UTF transformation proposed by posix. @@ -7,12 +8,14 @@ * Tx 10xxxxxx 6 free bits * T1 110xxxxx 5 free bits * T2 1110xxxx 4 free bits + * T3 11110xxx 3 free bits * * Encoding is as follows. 
* From hex Thru hex Sequence Bits * 00000000 0000007F T0 7 * 00000080 000007FF T1 Tx 11 * 00000800 0000FFFF T2 Tx Tx 16 + * 00010000 0010FFFF T3 Tx Tx Tx 20 (and change) */ int @@ -25,7 +28,7 @@ mblen(const char *s, size_t n) int mbtowc(wchar_t *pwc, const char *s, size_t n) { - int c, c1, c2; + int c, c1, c2, c3; long l; if(!s) @@ -70,7 +73,25 @@ mbtowc(wchar_t *pwc, const char *s, size_t n) return 3; } - /* + if(n < 4) + goto bad; + if(UTFmax >= 4) { + c3 = (s[3] ^ 0x80) & 0xff; + if(c3 & 0xC0) + goto bad; + if(c < 0xf8) { + l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff; + if(l < 0x10000) /* overlong form; 0x10000 itself is the first valid 4-byte value */ + goto bad; + if(l > Runemax) + goto bad; + if(pwc) + *pwc = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -86,7 +107,10 @@ wctomb(char *s, wchar_t wchar) if(!s) return 0; - c = wchar & 0xFFFF; + c = wchar; + if(c > Runemax) + c = Runeerror; + if(c < 0x80) { s[0] = c; return 1; @@ -98,10 +122,18 @@ wctomb(char *s, wchar_t wchar) return 2; } - s[0] = 0xE0 | (c >> 12); - s[1] = 0x80 | ((c >> 6) & 0x3F); - s[2] = 0x80 | (c & 0x3F); - return 3; + if(c < 0x10000) { + s[0] = 0xE0 | (c >> 12); + s[1] = 0x80 | ((c >> 6) & 0x3F); + s[2] = 0x80 | (c & 0x3F); + return 3; + } + + s[0] = 0xf0 | c >> 18; + s[1] = 0x80 | (c >> 12) & 0x3F; + s[2] = 0x80 | (c >> 6) & 0x3F; + s[3] = 0x80 | (c & 0x3F); + return 4; } size_t @@ -117,7 +149,7 @@ mbstowcs(wchar_t *pwcs, const char *s, size_t n) break; s++; } else { - d = mbtowc(pwcs, s, 3); + d = mbtowc(pwcs, s, UTFmax); if(d <= 0) return (size_t)((d<0) ? 
-1 : i); s += d; @@ -133,10 +165,10 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n) int i, d; long c; char *p, *pe; - char buf[3]; + char buf[UTFmax]; p = s; - pe = p+n-3; + pe = p+n-UTFmax; while(p < pe) { c = *pwcs++; if(c < 0x80) @@ -146,17 +178,14 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n) if(c == 0) return p-s; } - while(p < pe+3) { + while(p < pe+UTFmax) { c = *pwcs++; d = wctomb(buf, c); - if(p+d <= pe+3) { - *p++ = buf[0]; - if(d > 1) { - *p++ = buf[2]; - if(d > 2) - *p++ = buf[3]; - } - } + if(p+d <= pe+UTFmax) { + for(i = 0; i < d; i++) + p[i] = buf[i]; + p += d; + } if(c == 0) break; } diff --git a/sys/src/ape/lib/fmt/dofmt.c b/sys/src/ape/lib/fmt/dofmt.c index 014b66257..7f314e93e 100644 --- a/sys/src/ape/lib/fmt/dofmt.c +++ b/sys/src/ape/lib/fmt/dofmt.c @@ -546,12 +546,15 @@ __flagfmt(Fmt *f) int __badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + Rune r; + int n; + r = f->r; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - __fmtcpy(f, (const void*)x, 3, 3); + n = 1+runetochar(x+1, &r); + x[n++] = '%'; + f->prec = n; + __fmtcpy(f, x, n, n); return 0; } diff --git a/sys/src/ape/lib/utf/rune.c b/sys/src/ape/lib/utf/rune.c index e1aaa9be9..66a9b2527 100644 --- a/sys/src/ape/lib/utf/rune.c +++ b/sys/src/ape/lib/utf/rune.c @@ -23,16 +23,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 
1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -101,11 +123,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -125,17 +150,29 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); @@ -155,7 +192,10 @@ runenlen(Rune *r, int nrune) if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -165,13 +205,15 @@ fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/1c/swt.c b/sys/src/cmd/1c/swt.c index 66f4d36dc..85622a4df 100644 --- a/sys/src/cmd/1c/swt.c +++ b/sys/src/cmd/1c/swt.c @@ -244,26 +244,26 @@ outstring(char *s, long n) } long 
-outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(Rune)]; + int c, i; long r; - while(nstring & 1) + while(nstring % sizeof buf) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = sizeof buf; i > 0; c >>= 8) + buf[--i] = c; } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof buf; c >>= 8) + buf[i++] = c; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof buf); + n -= sizeof buf; } return r; } diff --git a/sys/src/cmd/2c/swt.c b/sys/src/cmd/2c/swt.c index a2a94ea08..ce180d67f 100644 --- a/sys/src/cmd/2c/swt.c +++ b/sys/src/cmd/2c/swt.c @@ -324,26 +324,26 @@ outstring(char *s, long n) } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(Rune)]; + int c, i; long r; - while(nstring & 1) + while(nstring % sizeof buf) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = sizeof buf; i > 0; c >>= 8) + buf[--i] = c; } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof buf; c >>= 8) + buf[i++] = c; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof buf); + n -= sizeof buf; } return r; } diff --git a/sys/src/cmd/acme/regx.c b/sys/src/cmd/acme/regx.c index f18d395f0..09ad0290b 100644 --- a/sys/src/cmd/acme/regx.c +++ b/sys/src/cmd/acme/regx.c @@ -487,7 +487,7 @@ bldcclass(void) exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -509,7 +509,7 @@ classmatch(int classno, int c, int negate) p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; diff --git a/sys/src/cmd/auth/convkeys.c b/sys/src/cmd/auth/convkeys.c index 200c5ef21..dcd2c027f 100644 --- a/sys/src/cmd/auth/convkeys.c 
+++ b/sys/src/cmd/auth/convkeys.c @@ -121,7 +121,7 @@ badname(char *s) for (; *s != '\0'; s += n) { n = chartorune(&r, s); - if (n == 1 && r == Runeerror) + if (r == Runeerror) return 1; } return 0; diff --git a/sys/src/cmd/bitsy/keyboard.c b/sys/src/cmd/bitsy/keyboard.c index 0972bcd75..aaa811436 100644 --- a/sys/src/cmd/bitsy/keyboard.c +++ b/sys/src/cmd/bitsy/keyboard.c @@ -395,7 +395,7 @@ threadmain(int argc, char *argv[]) if(strcmp(args[0], "keyboard:")==0 || strcmp(args[0], "scribble:")==0) if(strcmp(args[1], "value") == 0){ n = atoi(args[2]); - if(n <= 0xFFFF){ + if(n <= Runemax){ r = n; i = runetochar(str, &r); write(kbdfd, str, i); diff --git a/sys/src/cmd/bitsy/prompter.c b/sys/src/cmd/bitsy/prompter.c index 9a76a0d0e..df8e546f5 100644 --- a/sys/src/cmd/bitsy/prompter.c +++ b/sys/src/cmd/bitsy/prompter.c @@ -282,7 +282,7 @@ threadmain(int argc, char *argv[]) n = atoi(args[2]); if(n == '\033') /* Escape exits */ break; - if(n <= 0xFFFF){ + if(n <= Runemax){ r = n; send(kbdctl->c, &r); } diff --git a/sys/src/cmd/cc/cc.h b/sys/src/cmd/cc/cc.h index d66faaa11..01eb04562 100644 --- a/sys/src/cmd/cc/cc.h +++ b/sys/src/cmd/cc/cc.h @@ -51,7 +51,7 @@ struct Node double fconst; /* fp constant */ vlong vconst; /* non fp const */ char* cstring; /* character string */ - ushort* rstring; /* rune string */ + Rune* rstring; /* rune string */ Sym* sym; Type* type; @@ -336,6 +336,8 @@ enum TFILE, TOLD, NALLTYPES, + + TRUNE = sizeof(Rune)==4? 
TUINT: TUSHORT, }; enum { @@ -740,7 +742,7 @@ void gclean(void); void gextern(Sym*, Node*, long, long); void ginit(void); long outstring(char*, long); -long outlstring(ushort*, long); +long outlstring(Rune*, long); void sextern(Sym*, Node*, long, long); void xcom(Node*); long exreg(Type*); diff --git a/sys/src/cmd/cc/cc.y b/sys/src/cmd/cc/cc.y index 09b788598..eff930b23 100644 --- a/sys/src/cmd/cc/cc.y +++ b/sys/src/cmd/cc/cc.y @@ -855,9 +855,9 @@ lstring: LLSTRING { $$ = new(OLSTRING, Z, Z); - $$->type = typ(TARRAY, types[TUSHORT]); - $$->type->width = $1.l + sizeof(ushort); - $$->rstring = (ushort*)$1.s; + $$->type = typ(TARRAY, types[TRUNE]); + $$->type->width = $1.l + sizeof(Rune); + $$->rstring = (Rune*)$1.s; $$->sym = symstring; $$->etype = TARRAY; $$->class = CSTATIC; @@ -867,16 +867,16 @@ lstring: char *s; int n; - n = $1->type->width - sizeof(ushort); + n = $1->type->width - sizeof(Rune); s = alloc(n+$2.l+MAXALIGN); memcpy(s, $1->rstring, n); memcpy(s+n, $2.s, $2.l); - *(ushort*)(s+n+$2.l) = 0; + *(Rune*)(s+n+$2.l) = 0; $$ = $1; $$->type->width += $2.l; - $$->rstring = (ushort*)s; + $$->rstring = (Rune*)s; } zelist: diff --git a/sys/src/cmd/cc/com.c b/sys/src/cmd/cc/com.c index 8ff7c4663..a957c3acd 100644 --- a/sys/src/cmd/cc/com.c +++ b/sys/src/cmd/cc/com.c @@ -633,10 +633,11 @@ tcomo(Node *n, int f) break; case OLSTRING: - if(n->type->link != types[TUSHORT]) { + if(n->type->link != types[TRUNE]) { o = outstring(0, 0); while(o & 3) { - outlstring(L"", sizeof(ushort)); + Rune str[1] = {0}; + outlstring(str, sizeof(Rune)); o = outlstring(0, 0); } } diff --git a/sys/src/cmd/cc/dpchk.c b/sys/src/cmd/cc/dpchk.c index 99a49ee5a..aa7d468b8 100644 --- a/sys/src/cmd/cc/dpchk.c +++ b/sys/src/cmd/cc/dpchk.c @@ -67,13 +67,14 @@ getflag(char *s) { Bits flag; int f; - char *fmt; + char *fmt, *e; Rune c; fmt = fmtbuf; + e = fmtbuf + sizeof(fmtbuf)-UTFmax-1; /* runetochar may write UTFmax bytes; leave room for a whole rune */ flag = zbits; nstar = 0; - for(;;) { + while(fmt < e){ s += chartorune(&c, s); fmt += runetochar(fmt, &c); if(c == 0 || c 
>= nelem(flagbits)) @@ -175,7 +176,7 @@ pragvararg(void) { Sym *s; int n, c; - char *t; + char *t, *e; Rune r; Type *ty; @@ -225,12 +226,15 @@ cktype: if(c != '"') goto bad; t = fmtbuf; + e = t + sizeof(fmtbuf)-UTFmax-1; /* room for a whole rune plus the terminating NUL */ for(;;) { r = getr(); if(r == ' ' || r == '\n') goto bad; if(r == '"') break; + if(t >= e) + goto bad; t += runetochar(t, &r); } *t = 0; diff --git a/sys/src/cmd/cc/lex.c b/sys/src/cmd/cc/lex.c index 68a566ba0..48da7339f 100644 --- a/sys/src/cmd/cc/lex.c +++ b/sys/src/cmd/cc/lex.c @@ -467,7 +467,7 @@ l1: yyerror("missing '"); peekc = c1; } - yylval.vval = convvtox(c, TUSHORT); + yylval.vval = convvtox(c, TRUNE); return LUCONST; } if(c == '"') { @@ -541,15 +541,15 @@ l1: c = escchar('"', 1, 0); if(c == EOF) break; - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = c; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = c; + c1 += sizeof(Rune); } yylval.sval.l = c1; do { - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = 0; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = 0; + c1 += sizeof(Rune); } while(c1 & MAXALIGN); yylval.sval.s = cp; return LLSTRING; @@ -1027,7 +1027,7 @@ getnsc(void) } else c = GETC(); for(;;) { - if(!isspace(c)) + if(c >= Runeself || !isspace(c)) return c; if(c == '\n') { lineno++; diff --git a/sys/src/cmd/cc/pswt.c b/sys/src/cmd/cc/pswt.c index df1cda4a4..1eb495595 100644 --- a/sys/src/cmd/cc/pswt.c +++ b/sys/src/cmd/cc/pswt.c @@ -132,28 +132,28 @@ casf(void) } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(Rune)]; + int c, i; long r; if(suppress) return nstring; - while(nstring & 1) + while(nstring % sizeof buf) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = sizeof buf; i > 0; c >>= 8) + buf[--i] = c; } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof buf; c >>= 8) + buf[i++] = c; } - 
outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof buf); + n -= sizeof buf; } return r; } diff --git a/sys/src/cmd/disk/9660/cdrdwr.c b/sys/src/cmd/disk/9660/cdrdwr.c index 36e849377..b80195100 100644 --- a/sys/src/cmd/disk/9660/cdrdwr.c +++ b/sys/src/cmd/disk/9660/cdrdwr.c @@ -503,7 +503,6 @@ Cputrscvt(Cdimg *cd, char *s, int size) { Rune r[256]; - strtorune(r, s); Cputrs(cd, strtorune(r, s), size); } diff --git a/sys/src/cmd/disk/9660/jchar.c b/sys/src/cmd/disk/9660/jchar.c index c49da6351..9836b610c 100644 --- a/sys/src/cmd/disk/9660/jchar.c +++ b/sys/src/cmd/disk/9660/jchar.c @@ -45,8 +45,7 @@ isbadjoliet(char *s) if(utflen(s) > 64) return 1; - strtorune(r, s); - for(p=r; *p; p++) + for(p=strtorune(r, s); *p; p++) if(isjolietfrog(*p)) return 1; return 0; diff --git a/sys/src/cmd/ed.c b/sys/src/cmd/ed.c index 9864dd3bf..0f18fadc0 100644 --- a/sys/src/cmd/ed.c +++ b/sys/src/cmd/ed.c @@ -54,7 +54,7 @@ Reprog *pattern; int peekc; int pflag; int rescuing; -Rune rhsbuf[LBSIZE/2]; +Rune rhsbuf[LBSIZE/sizeof(Rune)]; char savedfile[FNSIZE]; jmp_buf savej; int subnewa; @@ -990,11 +990,11 @@ getline(int tl) lp = linebuf; bp = getblock(tl, OREAD); nl = nleft; - tl &= ~((BLKSIZE/2) - 1); + tl &= ~((BLKSIZE/sizeof(Rune)) - 1); while(*lp++ = *bp++) { nl -= sizeof(Rune); if(nl == 0) { - bp = getblock(tl += BLKSIZE/2, OREAD); + bp = getblock(tl += BLKSIZE/sizeof(Rune), OREAD); nl = nleft; } } @@ -1012,7 +1012,7 @@ putline(void) tl = tline; bp = getblock(tl, OWRITE); nl = nleft; - tl &= ~((BLKSIZE/2)-1); + tl &= ~((BLKSIZE/sizeof(Rune))-1); while(*bp = *lp++) { if(*bp++ == '\n') { bp[-1] = 0; @@ -1021,7 +1021,7 @@ putline(void) } nl -= sizeof(Rune); if(nl == 0) { - tl += BLKSIZE/2; + tl += BLKSIZE/sizeof(Rune); bp = getblock(tl, OWRITE); nl = nleft; } @@ -1048,8 +1048,8 @@ getblock(int atl, int iof) static uchar ibuff[BLKSIZE]; static uchar obuff[BLKSIZE]; - bno = atl / (BLKSIZE/2); - off = (atl<<1) & (BLKSIZE-1) & ~03; + bno = atl / (BLKSIZE/sizeof(Rune)); + off 
= (atl*sizeof(Rune)) & (BLKSIZE-1) & ~03; if(bno >= NBLK) { lastc = '\n'; error(T); @@ -1240,7 +1240,7 @@ compsub(void) if(c == '\\') { c = getchr(); *p++ = ESCFLG; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[nelem(rhsbuf)]) error(Q); } else if(c == '\n' && (!globp || !globp[0])) { @@ -1251,7 +1251,7 @@ compsub(void) if(c == seof) break; *p++ = c; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[nelem(rhsbuf)]) error(Q); } *p = 0; diff --git a/sys/src/cmd/file.c b/sys/src/cmd/file.c index dec241f96..cbc2227d8 100644 --- a/sys/src/cmd/file.c +++ b/sys/src/cmd/file.c @@ -359,7 +359,7 @@ utfconv(void) rb = malloc(nbuf+1); memmove(rb, buf+2, nbuf); p = (char*)buf; - e = p+nbuf-4; + e = p+sizeof(buf)-UTFmax-1; for(i=0; i T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(unsigned char*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -85,11 +107,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -109,34 +134,70 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); } +int +runenlen(Rune *r, int nrune) +{ + int nb, c; 
+ + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; + } + return nb; +} + int fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(unsigned char*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(unsigned char*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/postscript/common/rune.h b/sys/src/cmd/postscript/common/rune.h index 9c1fd4fd0..84301a8ea 100644 --- a/sys/src/cmd/postscript/common/rune.h +++ b/sys/src/cmd/postscript/common/rune.h @@ -14,6 +14,7 @@ enum UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a utf sequence (<) */ Runeself = 0x80, /* rune and utf sequences are the same (<) */ - Runeerror = 0xFFFD, /* decoding error in utf */ + Runeerror = 0xFFFD, /* decoding error in utf */ + Runemax = 0xFFFF, /* 16 bit rune */ }; #endif diff --git a/sys/src/cmd/sam/cmd.c b/sys/src/cmd/sam/cmd.c index d34333d18..a0e336f01 100644 --- a/sys/src/cmd/sam/cmd.c +++ b/sys/src/cmd/sam/cmd.c @@ -71,7 +71,7 @@ int inputc(void) { int n, nbuf; - char buf[3]; + char buf[UTFmax]; Rune r; Again: diff --git a/sys/src/cmd/sam/regexp.c b/sys/src/cmd/sam/regexp.c index 3fd05a0b5..4c655dda3 100644 --- a/sys/src/cmd/sam/regexp.c +++ b/sys/src/cmd/sam/regexp.c @@ -494,7 +494,7 @@ bldcclass(void) exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate) p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; diff --git a/sys/src/cmd/samterm/mesg.c b/sys/src/cmd/samterm/mesg.c index be306a0f6..99831a9e2 100644 --- 
a/sys/src/cmd/samterm/mesg.c +++ b/sys/src/cmd/samterm/mesg.c @@ -429,7 +429,7 @@ outTv(Tmesg type, vlong v1) void outTslS(Tmesg type, int s1, long l1, Rune *s) { - char buf[DATASIZE*3+1]; + char buf[DATASIZE*UTFmax+1]; char *c; outstart(type); diff --git a/sys/src/cmd/sed.c b/sys/src/cmd/sed.c index 96c3eb493..790f2ed51 100644 --- a/sys/src/cmd/sed.c +++ b/sys/src/cmd/sed.c @@ -625,7 +625,7 @@ compsub(Rune *rhs, Rune *end) while ((r = *cp++) != '\0') { if(r == '\\') { if (rhs < end) - *rhs++ = 0xFFFF; + *rhs++ = Runemax; else return 0; r = *cp++; @@ -1055,7 +1055,7 @@ dosub(Rune *rhsbuf) sp = place(sp, loc1, loc2); continue; } - if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') { + if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') { n = c-'0'; if (subexp[n].rsp && subexp[n].rep) { sp = place(sp, subexp[n].rsp, subexp[n].rep); @@ -1336,7 +1336,7 @@ void arout(void) { int c; - char *s; + char *s, *e; char buf[128]; Rune *p1; Biobuf *fi; @@ -1347,7 +1347,7 @@ arout(void) Bputrune(&fout, *p1); Bputc(&fout, '\n'); } else { - for(s = buf, p1 = (*aptr)->text; *p1; p1++) + for(s = buf, e = buf+sizeof(buf)-UTFmax-1, p1 = (*aptr)->text; *p1 && s < e; p1++) s += runetochar(s, p1); *s = '\0'; if((fi = Bopen(buf, OREAD)) == 0) diff --git a/sys/src/cmd/tcs/utf.c b/sys/src/cmd/tcs/utf.c index 56e91890a..764ef9f7b 100644 --- a/sys/src/cmd/tcs/utf.c +++ b/sys/src/cmd/tcs/utf.c @@ -93,7 +93,7 @@ isoutf_in(int fd, long *notused, struct convert *out) if(!fullisorune(buf+i, tot-i)) break; c = isochartorune(&runes[j], buf+i); - if(runes[j] == Runeerror && c == 1){ + if(runes[j] == Runeerror){ if(squawk) EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); if(clean){ diff --git a/sys/src/cmd/tr.c b/sys/src/cmd/tr.c index da6fedf5f..adea05c25 100644 --- a/sys/src/cmd/tr.c +++ b/sys/src/cmd/tr.c @@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 }; #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & 
bits[(c)&07]) -#define MAXRUNE 0xFFFF +#define MAXRUNE Runemax uchar f[(MAXRUNE+1)/8]; uchar t[(MAXRUNE+1)/8]; diff --git a/sys/src/cmd/tweak.c b/sys/src/cmd/tweak.c index 54ce2f678..ef4256889 100644 --- a/sys/src/cmd/tweak.c +++ b/sys/src/cmd/tweak.c @@ -803,13 +803,14 @@ attext(Thing *t, Point p, char *buf) } int -type(char *buf, char *tag) +type(char *buf, int nbuf, char *tag) { Rune r; - char *p; + char *p, *e; esetcursor(&busy); p = buf; + e = buf + nbuf-UTFmax-1; for(;;){ *p = 0; mesg("%s: %s", tag, buf); @@ -827,7 +828,8 @@ type(char *buf, char *tag) --p; break; default: - p += runetochar(p, &r); + if(p < e) + p += runetochar(p, &r); } } } @@ -846,7 +848,7 @@ textedit(Thing *t, char *tag) Thing *nt; buttons(Up); - if(type(buf, tag) == 0) + if(type(buf, sizeof(buf), tag) == 0) return; if(strcmp(tag, "file") == 0){ for(s=buf; *s; s++) @@ -1174,7 +1176,7 @@ cntledit(char *tag) long l; buttons(Up); - if(type(buf, tag) == 0) + if(type(buf, sizeof(buf), tag) == 0) return; if(strcmp(tag, "mag") == 0){ if(buf[0]<'0' || '9'Maxmag){ @@ -1806,7 +1808,7 @@ tchar(Thing *t) return; } } - if(type(buf, "char (hex or character or hex-hex)") == 0) + if(type(buf, sizeof(buf), "char (hex or character or hex-hex)") == 0) return; if(utflen(buf) == 1){ chartorune(&r, buf); @@ -2000,7 +2002,7 @@ menu(void) sel = emenuhit(3, &mouse, &menu3); switch(sel){ case Mopen: - if(type(buf, "file")){ + if(type(buf, sizeof(buf), "file")){ t = tget(buf); if(t) drawthing(t, 1); diff --git a/sys/src/cmd/unicode.c b/sys/src/cmd/unicode.c index a04472711..aec44b750 100644 --- a/sys/src/cmd/unicode.c +++ b/sys/src/cmd/unicode.c @@ -51,13 +51,13 @@ range(char *argv[]) return "bad range"; } min = strtoul(q, &q, 16); - if(min<0 || min>0xFFFF || *q!='-') + if(min<0 || min>Runemax || *q!='-') goto err; q++; if(strchr(hex, *q) == 0) goto err; max = strtoul(q, &q, 16); - if(max<0 || max>0xFFFF || maxRunemax || max0xFFFF || *q!=0) + if(m<0 || m>Runemax || *q!=0) goto err; Bprint(&bout, "%C", m); if(!text) 
diff --git a/sys/src/cmd/unix/drawterm/libc/dofmt.c b/sys/src/cmd/unix/drawterm/libc/dofmt.c index 826360d0a..f905c0ad8 100644 --- a/sys/src/cmd/unix/drawterm/libc/dofmt.c +++ b/sys/src/cmd/unix/drawterm/libc/dofmt.c @@ -528,12 +528,15 @@ __flagfmt(Fmt *f) int __badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + Rune r; + int n; + r = f->r; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - __fmtcpy(f, (const void*)x, 3, 3); + n = 1+runetochar(x+1, &r); + x[n++] = '%'; + f->prec = n; + _fmtcpy(f, x, n, n); return 0; } diff --git a/sys/src/cmd/unix/drawterm/libc/rune.c b/sys/src/cmd/unix/drawterm/libc/rune.c index b62da9e66..0bb49a745 100644 --- a/sys/src/cmd/unix/drawterm/libc/rune.c +++ b/sys/src/cmd/unix/drawterm/libc/rune.c @@ -8,16 +8,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -86,11 +108,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 
00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -110,10 +135,22 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +177,10 @@ runenlen(Rune *r, int nrune) if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +190,15 @@ fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/unix/drawterm/libc/utf.h b/sys/src/cmd/unix/drawterm/libc/utf.h index 623bfda94..f7c3ebd83 100644 --- a/sys/src/cmd/unix/drawterm/libc/utf.h +++ b/sys/src/cmd/unix/drawterm/libc/utf.h @@ -8,7 +8,8 @@ enum UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* diff --git a/sys/src/cmd/unix/u9fs/rune.c b/sys/src/cmd/unix/u9fs/rune.c index a0822d625..b8f73ba94 100644 --- a/sys/src/cmd/unix/u9fs/rune.c +++ b/sys/src/cmd/unix/u9fs/rune.c @@ -1,6 +1,7 @@ #include char *argv0; + enum { Bit1 = 7, @@ -8,27 +9,30 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + 
Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -86,11 +109,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -110,39 +136,70 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); } int -utflen(char *s) 
+runenlen(Rune *r, int nrune) +{ + int nb, c; + + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; + } + return nb; +} + +int +fullrune(char *str, int n) { int c; - long n; - Rune rune; - n = 0; - for(;;) { - c = *(uchar*)s; - if(c < Runeself) { - if(c == 0) - return n; - s++; - } else - s += chartorune(&rune, s); - n++; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/cmd/upas/fs/mbox.c b/sys/src/cmd/upas/fs/mbox.c index 71dab3d0c..21f786f0c 100644 --- a/sys/src/cmd/upas/fs/mbox.c +++ b/sys/src/cmd/upas/fs/mbox.c @@ -1223,12 +1223,12 @@ latin1toutf(char **out, char *in, char *e) return 0; n += e-in; - *out = p = malloc(n+1); + *out = p = malloc(UTFmax*n+1); if(p == nil) return 0; for(; in < e; in++){ - r = (uchar)*in; + r = (*in) & 0xff; p += runetochar(p, &r); } *p = 0; diff --git a/sys/src/cmd/upas/vf/vf.c b/sys/src/cmd/upas/vf/vf.c index 376488f7e..d73fbe47f 100644 --- a/sys/src/cmd/upas/vf/vf.c +++ b/sys/src/cmd/upas/vf/vf.c @@ -954,7 +954,7 @@ tokenconvert(String *t) { String *s; char decoded[1024]; - char utfbuf[2*1024]; + char utfbuf[UTFmax*1024]; int i, len; char *e; char *token; diff --git a/sys/src/cmd/vnc/screen.c b/sys/src/cmd/vnc/screen.c index e11155ac9..ce8a7e818 100644 --- a/sys/src/cmd/vnc/screen.c +++ b/sys/src/cmd/vnc/screen.c @@ -335,6 +335,8 @@ screenputc(char *buf) addflush(r); curpos.x = *xp; break; + case '\0': + break; default: p = memsubfontwidth(memdefont, buf); w = p.x; @@ -354,23 +356,19 @@ screenputc(char *buf) void screenputs(char *s, int n) { - int i; - Rune r; - char buf[4]; + static char rb[UTFmax+1]; + static int nrb; + char *e; drawlock(); - while(n > 0){ - i = chartorune(&r, s); - if(i == 0){ - s++; - --n; - continue; + e = s + n; + while(s < e){ + rb[nrb++] = 
*s++; + if(nrb >= UTFmax || fullrune(rb, nrb)){ + rb[nrb] = 0; + screenputc(rb); + nrb = 0; } - memmove(buf, s, i); - buf[i] = 0; - n -= i; - s += i; - screenputc(buf); } screenflush(); drawunlock(); diff --git a/sys/src/cmd/yacc.c b/sys/src/cmd/yacc.c index 7aef1a281..b25594df9 100644 --- a/sys/src/cmd/yacc.c +++ b/sys/src/cmd/yacc.c @@ -141,7 +141,7 @@ Biobuf* foutput; /* y.output file */ char* infile; /* input file name */ int numbval; /* value of an input number */ -char tokname[NAMESIZE+4]; /* input token name, slop for runes and 0 */ +char tokname[NAMESIZE+UTFmax+1]; /* input token name, slop for runes and 0 */ /* structure declarations */ diff --git a/sys/src/libbio/bgetrune.c b/sys/src/libbio/bgetrune.c index 3ba90384e..467b1ba4a 100644 --- a/sys/src/libbio/bgetrune.c +++ b/sys/src/libbio/bgetrune.c @@ -7,7 +7,7 @@ Bgetrune(Biobufhdr *bp) { int c, i; Rune rune; - char str[4]; + char str[UTFmax]; c = Bgetc(bp); if(c < Runeself) { /* one char */ diff --git a/sys/src/libbio/bputrune.c b/sys/src/libbio/bputrune.c index 2a625bd9a..efc4672e6 100644 --- a/sys/src/libbio/bputrune.c +++ b/sys/src/libbio/bputrune.c @@ -6,7 +6,7 @@ int Bputrune(Biobufhdr *bp, long c) { Rune rune; - char str[4]; + char str[UTFmax]; int n; rune = c; diff --git a/sys/src/libc/fmt/dofmt.c b/sys/src/libc/fmt/dofmt.c index 2a4b42959..95852f5c2 100644 --- a/sys/src/libc/fmt/dofmt.c +++ b/sys/src/libc/fmt/dofmt.c @@ -512,12 +512,15 @@ _flagfmt(Fmt *f) int _badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + Rune r; + int n; + r = f->r; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - _fmtcpy(f, x, 3, 3); + n = 1+runetochar(x+1, &r); + x[n++] = '%'; + f->prec = n; + _fmtcpy(f, x, n, n); return 0; } diff --git a/sys/src/libc/port/rune.c b/sys/src/libc/port/rune.c index b62da9e66..d69d59515 100644 --- a/sys/src/libc/port/rune.c +++ b/sys/src/libc/port/rune.c @@ -8,16 +8,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = 
((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + /* * bad decoding */ @@ -86,11 +108,14 @@ runetochar(char *str, Rune *rune) { long c; + c = *rune; + if(c > Runemax) + c = Runeerror; + /* * one character sequence * 00000-0007F => 00-7F */ - c = *rune; if(c <= Rune1) { str[0] = c; return 1; @@ -110,17 +135,29 @@ runetochar(char *str, Rune *rune) * three character sequence * 0800-FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int runelen(long c) { Rune rune; - char str[10]; + char str[UTFmax]; rune = c; return runetochar(str, &rune); @@ -140,7 +177,10 @@ runenlen(Rune *r, int nrune) if(c <= Rune2) nb += 2; else + if(c <= Rune3 || 
c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +190,15 @@ fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } + diff --git a/sys/src/libdraw/buildfont.c b/sys/src/libdraw/buildfont.c index ba32e775b..ca13d55d6 100644 --- a/sys/src/libdraw/buildfont.c +++ b/sys/src/libdraw/buildfont.c @@ -70,7 +70,7 @@ buildfont(Display *d, char *buf, char *name) } max = strtol(s, &s, 0); s = skip(s); - if(*s==0 || min>=65536 || max>=65536 || min>max){ + if(*s==0 || min>Runemax || max>Runemax || min>max){ werrstr("illegal subfont range"); Err3: freefont(fnt); diff --git a/sys/src/libdraw/event.c b/sys/src/libdraw/event.c index 1cf223c36..5f99199dd 100644 --- a/sys/src/libdraw/event.c +++ b/sys/src/libdraw/event.c @@ -199,7 +199,7 @@ static void ekeyslave(int fd) { Rune r; - char t[3], k[10]; + char t[1+UTFmax], k[10]; int kr, kn, w; if(eforkslave(Ekeyboard) < MAXSLAVE) @@ -215,10 +215,9 @@ ekeyslave(int fd) } w = chartorune(&r, k); kn -= w; + memmove(t+1, k, w); memmove(k, &k[w], kn); - t[1] = r; - t[2] = r>>8; - if(write(epipe[1], t, 3) != 3) + if(write(epipe[1], t, sizeof(t)) != sizeof(t)) break; } breakout:; @@ -302,7 +301,7 @@ loop: s->head = (Ebuf *)1; return; } - if(i == Skeyboard && n != 3) + if(i == Skeyboard && n != (1+UTFmax)) drawerror(display, "events: protocol error: keyboard"); if(i == Smouse){ if(n < 1+1+2*12) @@ -418,14 +417,13 @@ int ekbd(void) { Ebuf *eb; - int c; + Rune r; if(Skeyboard < 0) drawerror(display, "events: keyboard not initialzed"); eb = ebread(&eslave[Skeyboard]); - c = eb->buf[0] + (eb->buf[1]<<8); - free(eb); - return c; + chartorune(&r, (char*)eb->buf); + return r; } void diff --git a/sys/src/libhtml/lex.c b/sys/src/libhtml/lex.c index ca8fc77d6..70106b9b4 100644 
--- a/sys/src/libhtml/lex.c +++ b/sys/src/libhtml/lex.c @@ -1310,9 +1310,9 @@ getchar(TokenSource* ts) break; case UTF_8: ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); - n = chartorune(&r, (char*)(buf+ts->i)); if(ok) { - if(warn && c == 0x80) + n = chartorune(&r, (char*)(buf+ts->i)); + if(warn && c == Runeerror) fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); ts->i += n; c = r; diff --git a/sys/src/libhtml/utils.c b/sys/src/libhtml/utils.c index 5998913ac..7b272d904 100644 --- a/sys/src/libhtml/utils.c +++ b/sys/src/libhtml/utils.c @@ -535,7 +535,7 @@ toStr(uchar* buf, int n, int chset) // Convert buf[0:n], Unicode characters, // into an emalloc'd null-terminated string in character set chset. -// Use 0x80 for unconvertable characters. +// Use Runeerror for unconvertable characters. uchar* fromStr(Rune* buf, int n, int chset) { @@ -554,7 +554,7 @@ fromStr(Rune* buf, int n, int chset) for(i = 0; i < n; i++) { ch = buf[i]; if(ch > lim) - ch = 0x80; + ch = Runeerror; ans[i] = ch; } ans[n] = 0;