[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: LPeg support for utf-8
- From: Tony Finch <dot@...>
- Date: Fri, 8 Apr 2011 17:54:11 +0100
Matthew Frazier <leafstormrush@gmail.com> wrote:
> On 04/07/2011 03:15 PM, E. Toernig wrote:
> >
> > I think it would help (not only LPeg) to add unicode
> > escape sequences (\uXXXX) to Lua's strings.
>
> Lua itself does not have Unicode support. So, if you're suggesting that this
> would merely generate the proper UTF-8 bytes for the character...hmm, that's
> actually a pretty good idea.
Try the attached patch (with versions for 5.1 and 5.2).
Tony.
--
f.anthony.n.finch <dot@dotat.at> http://dotat.at/
Humber, Thames: Variable 3 or 4. Slight or moderate. Fair. Moderate or good,
occasionally poor later.
diff --git a/src/llex.c b/src/llex.c
index 62a89cc..8bb8922 100644
--- a/src/llex.c
+++ b/src/llex.c
@@ -277,6 +277,51 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
}
+static void saveutf8(LexState *ls, unsigned u) {
+ /* emit u UTF-8 style: 1 to 7 save() calls, one byte value each; u is not validated (no protection against malformed utf-8) */
+ if (u > 0x0000007F) {
+ if (u > 0x000007FF) {
+ if (u > 0x0000FFFF) {
+ if (u > 0x001FFFFF) {
+ if (u > 0x03FFFFFF) {
+ if (u > 0x7FFFFFFF) {
+ save(ls, 0xFE); /* 7-byte form: fixed lead byte */
+ save(ls, (u >> 30) % 0x40 + 0x80); /* continuation: bits 30 and up */
+ } else save(ls, (u >> 30) % 0x40 + 0xFC); /* 6-byte form: lead byte */
+ save(ls, (u >> 24) % 0x40 + 0x80); /* continuation: bits 24-29 */
+ } else save(ls, (u >> 24) % 0x40 + 0xF8); /* 5-byte form: lead byte */
+ save(ls, (u >> 18) % 0x40 + 0x80); /* continuation: bits 18-23 */
+ } else save(ls, (u >> 18) % 0x40 + 0xF0); /* 4-byte form: lead byte */
+ save(ls, (u >> 12) % 0x40 + 0x80); /* continuation: bits 12-17 */
+ } else save(ls, (u >> 12) % 0x40 + 0xE0); /* 3-byte form: lead byte */
+ save(ls, (u >> 6) % 0x40 + 0x80); /* continuation: bits 6-11 */
+ } else save(ls, (u >> 6) % 0x40 + 0xC0); /* 2-byte form: lead byte */
+ save(ls, (u ) % 0x40 + 0x80); /* continuation: bits 0-5 */
+ } else save(ls, (u ) ); /* u <= 0x7F: saved as-is, single byte */
+}
+
+
+static unsigned readhexaesc (LexState *ls, int n) { /* read n hex digits (n <= 8, the size of buf) and return their value */
+ char buf[8], esc = ls->current; /* characters read so far, and the escape letter; both only echoed in the error text */
+ unsigned x = 0;
+ int i, j, c;
+ for (i = 0; i < n; i++) {
+ c = next(ls); buf[i] = (char)c; /* keep c as the int from next(): a char copy cannot be compared with EOZ reliably */
+ if ('0' <= c && c <= '9') x = x*16 + c - '0';
+ else if ('A' <= c && c <= 'F') x = x*16 + c - 'A' + 10;
+ else if ('a' <= c && c <= 'f') x = x*16 + c - 'a' + 10;
+ else {
+ luaZ_resetbuffer(ls->buff); /* prepare error message */
+ save(ls, '\\'); save(ls, esc);
+ for (j = 0; j < i + (c != EOZ); j++) save(ls, buf[j]); /* echo the offending char too, unless it is the EOZ sentinel */
+ luaX_lexerror(ls, "hexadecimal digit expected", TK_STRING);
+ }
+ }
+ next(ls); /* step past the last digit */
+ return x;
+}
+
+
static void read_string (LexState *ls, int del, SemInfo *seminfo) {
save_and_next(ls);
while (ls->current != del) {
@@ -299,6 +344,9 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) {
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'v': c = '\v'; break;
+ case 'x': save(ls, readhexaesc(ls, 2)); continue;
+ case 'u': saveutf8(ls, readhexaesc(ls, 4)); continue;
+ case 'U': saveutf8(ls, readhexaesc(ls, 8)); continue;
case '\n': /* go through */
case '\r': save(ls, '\n'); inclinenumber(ls); continue;
case EOZ: continue; /* will raise an error next loop */
diff --git a/src/llex.c b/src/llex.c
index 38a2452..06bfdd7 100644
--- a/src/llex.c
+++ b/src/llex.c
@@ -284,23 +284,47 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
}
-static int hexavalue (int c) {
- if (lisdigit(c)) return c - '0';
- else if (lisupper(c)) return c - 'A' + 10;
- else return c - 'a' + 10;
+static void saveutf8(LexState *ls, unsigned u) {
+ /* emit u UTF-8 style: 1 to 7 save() calls, one byte value each; u is not validated (no protection against malformed utf-8) */
+ if (u > 0x0000007F) {
+ if (u > 0x000007FF) {
+ if (u > 0x0000FFFF) {
+ if (u > 0x001FFFFF) {
+ if (u > 0x03FFFFFF) {
+ if (u > 0x7FFFFFFF) {
+ save(ls, 0xFE); /* 7-byte form: fixed lead byte */
+ save(ls, (u >> 30) % 0x40 + 0x80); /* continuation: bits 30 and up */
+ } else save(ls, (u >> 30) % 0x40 + 0xFC); /* 6-byte form: lead byte */
+ save(ls, (u >> 24) % 0x40 + 0x80); /* continuation: bits 24-29 */
+ } else save(ls, (u >> 24) % 0x40 + 0xF8); /* 5-byte form: lead byte */
+ save(ls, (u >> 18) % 0x40 + 0x80); /* continuation: bits 18-23 */
+ } else save(ls, (u >> 18) % 0x40 + 0xF0); /* 4-byte form: lead byte */
+ save(ls, (u >> 12) % 0x40 + 0x80); /* continuation: bits 12-17 */
+ } else save(ls, (u >> 12) % 0x40 + 0xE0); /* 3-byte form: lead byte */
+ save(ls, (u >> 6) % 0x40 + 0x80); /* continuation: bits 6-11 */
+ } else save(ls, (u >> 6) % 0x40 + 0xC0); /* 2-byte form: lead byte */
+ save(ls, (u ) % 0x40 + 0x80); /* continuation: bits 0-5 */
+ } else save(ls, (u ) ); /* u <= 0x7F: saved as-is, single byte */
}
-static int readhexaesc (LexState *ls) {
- int c1, c2 = EOZ;
- if (!lisxdigit(c1 = next(ls)) || !lisxdigit(c2 = next(ls))) {
- luaZ_resetbuffer(ls->buff); /* prepare error message */
- save(ls, '\\'); save(ls, 'x');
- if (c1 != EOZ) save(ls, c1);
- if (c2 != EOZ) save(ls, c2);
- lexerror(ls, "hexadecimal digit expected", TK_STRING);
+static unsigned readhexaesc (LexState *ls, int n) { /* read n hex digits (n <= 8, the size of buf) and return their value */
+ char buf[8], esc = ls->current; /* characters read so far, and the escape letter; both only echoed in the error text */
+ unsigned x = 0;
+ int i, j, c;
+ for (i = 0; i < n; i++) {
+ c = next(ls); buf[i] = (char)c; /* classify the int from next(): a char copy can go negative for bytes >= 0x80 and hides EOZ */
+ if (lisxdigit(c)) /* NOTE(review): presumably ctype-like (unsigned-char value or EOZ only) - confirm in lctype.h */
+ x = x*16 + c - (lisdigit(c) ? '0' : lisupper(c) ? 'A' - 10 : 'a' - 10);
+ else {
+ luaZ_resetbuffer(ls->buff); /* prepare error message */
+ save(ls, '\\'); save(ls, esc);
+ for (j = 0; j < i + (c != EOZ); j++) save(ls, buf[j]); /* echo the offending char too, unless it is the EOZ sentinel */
+ lexerror(ls, "hexadecimal digit expected", TK_STRING);
+ }
}
- return (hexavalue(c1) << 4) + hexavalue(c2);
+ next(ls); /* step past the last digit */
+ return x;
}
@@ -348,7 +372,9 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) {
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'v': c = '\v'; break;
- case 'x': c = readhexaesc(ls); break;
+ case 'x': save(ls, readhexaesc(ls, 2)); continue;
+ case 'u': saveutf8(ls, readhexaesc(ls, 4)); continue;
+ case 'U': saveutf8(ls, readhexaesc(ls, 8)); continue;
case '\n':
case '\r': save(ls, '\n'); inclinenumber(ls); continue;
case EOZ: continue; /* will raise an error next loop */