lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Matthew Frazier <leafstormrush@gmail.com> wrote:
> On 04/07/2011 03:15 PM, E. Toernig wrote:
> >
> > I think it would help (not only LPeg) to add unicode
> > escape sequences (\uXXXX) to Lua's strings.
>
> Lua itself does not have Unicode support. So, if you're suggesting that this
> would merely generate the proper UTF-8 bytes for the character...hmm, that's
> actually a pretty good idea.

Try the attached patch (with versions for 5.1 and 5.2).

Tony.
-- 
f.anthony.n.finch  <dot@dotat.at>  http://dotat.at/
Humber, Thames: Variable 3 or 4. Slight or moderate. Fair. Moderate or good,
occasionally poor later.
diff --git a/src/llex.c b/src/llex.c
index 62a89cc..8bb8922 100644
--- a/src/llex.c
+++ b/src/llex.c
@@ -277,6 +277,51 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
 }
 
 
+static void saveutf8(LexState *ls, unsigned u) {
+  /* pass u to save() as a 1- to 7-byte UTF-8-style sequence (6 payload bits per continuation byte); no protection against malformed utf-8: no value of u is rejected */
+  if (u > 0x0000007F) {
+    if (u > 0x000007FF) {
+      if (u > 0x0000FFFF) {
+        if (u > 0x001FFFFF) {
+          if (u > 0x03FFFFFF) {
+            if (u > 0x7FFFFFFF) {
+                     save(ls,                    0xFE);  /* 7-byte form: fixed lead byte */
+                    save(ls, (u >> 30) % 0x40 + 0x80);
+            } else save(ls, (u >> 30) % 0x40 + 0xFC);  /* each else emits the lead byte for its length */
+                  save(ls, (u >> 24) % 0x40 + 0x80);  /* each unconditional save emits a continuation byte */
+          } else save(ls, (u >> 24) % 0x40 + 0xF8);
+                save(ls, (u >> 18) % 0x40 + 0x80);
+        } else save(ls, (u >> 18) % 0x40 + 0xF0);
+              save(ls, (u >> 12) % 0x40 + 0x80);
+      } else save(ls, (u >> 12) % 0x40 + 0xE0);
+            save(ls, (u >>  6) % 0x40 + 0x80);
+    } else save(ls, (u >>  6) % 0x40 + 0xC0);
+          save(ls, (u      ) % 0x40 + 0x80);  /* last continuation byte: low 6 bits */
+  } else save(ls, (u      )              );  /* u <= 0x7F: saved unchanged as one byte */
+}
+
+
+static unsigned readhexaesc (LexState *ls, int n) {  /* read n hexadecimal digits and return their value; raise a lexical error on a non-digit */
+  char buf[8], esc = ls->current;  /* buf: characters read so far; esc: current character on entry (presumably the escape letter); both feed the error message */
+  unsigned x = 0;
+  int i, j, c;
+  for (i = 0; i < n; i++) {  /* buf holds 8 chars; the callers in this patch pass n = 2, 4 or 8 */
+    c = buf[i] = next(ls);
+    if ('0' <= c && c <= '9') x = x*16 + c - '0';
+    else if ('A' <= c && c <= 'F') x = x*16 + c - 'A' + 10;
+    else if ('a' <= c && c <= 'f') x = x*16 + c - 'a' + 10;
+    else {
+      luaZ_resetbuffer(ls->buff);  /* prepare error message */
+      save(ls, '\\'); save(ls, esc);
+      for (j = 0; j <= i; j++) save(ls, buf[j]);  /* j <= i: include the offending character itself */
+      luaX_lexerror(ls, "hexadecimal digit expected", TK_STRING);
+    }
+  }
+  next(ls);  /* advance past the last digit; the callers `continue` without a further next() */
+  return x;
+}
+
+
 static void read_string (LexState *ls, int del, SemInfo *seminfo) {
   save_and_next(ls);
   while (ls->current != del) {
@@ -299,6 +344,9 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) {
           case 'r': c = '\r'; break;
           case 't': c = '\t'; break;
           case 'v': c = '\v'; break;
+          case 'x': save(ls, readhexaesc(ls, 2)); continue;
+          case 'u': saveutf8(ls, readhexaesc(ls, 4)); continue;
+          case 'U': saveutf8(ls, readhexaesc(ls, 8)); continue;
           case '\n':  /* go through */
           case '\r': save(ls, '\n'); inclinenumber(ls); continue;
           case EOZ: continue;  /* will raise an error next loop */
diff --git a/src/llex.c b/src/llex.c
index 38a2452..06bfdd7 100644
--- a/src/llex.c
+++ b/src/llex.c
@@ -284,23 +284,47 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
 }
 
 
-static int hexavalue (int c) {
-  if (lisdigit(c)) return c - '0';
-  else if (lisupper(c)) return c - 'A' + 10;
-  else return c - 'a' + 10;
+static void saveutf8(LexState *ls, unsigned u) {
+  /* pass u to save() as a 1- to 7-byte UTF-8-style sequence (6 payload bits per continuation byte); no protection against malformed utf-8: no value of u is rejected */
+  if (u > 0x0000007F) {
+    if (u > 0x000007FF) {
+      if (u > 0x0000FFFF) {
+        if (u > 0x001FFFFF) {
+          if (u > 0x03FFFFFF) {
+            if (u > 0x7FFFFFFF) {
+                     save(ls,                    0xFE);  /* 7-byte form: fixed lead byte */
+                    save(ls, (u >> 30) % 0x40 + 0x80);
+            } else save(ls, (u >> 30) % 0x40 + 0xFC);  /* each else emits the lead byte for its length */
+                  save(ls, (u >> 24) % 0x40 + 0x80);  /* each unconditional save emits a continuation byte */
+          } else save(ls, (u >> 24) % 0x40 + 0xF8);
+                save(ls, (u >> 18) % 0x40 + 0x80);
+        } else save(ls, (u >> 18) % 0x40 + 0xF0);
+              save(ls, (u >> 12) % 0x40 + 0x80);
+      } else save(ls, (u >> 12) % 0x40 + 0xE0);
+            save(ls, (u >>  6) % 0x40 + 0x80);
+    } else save(ls, (u >>  6) % 0x40 + 0xC0);
+          save(ls, (u      ) % 0x40 + 0x80);  /* last continuation byte: low 6 bits */
+  } else save(ls, (u      )              );  /* u <= 0x7F: saved unchanged as one byte */
 }
 
 
-static int readhexaesc (LexState *ls) {
-  int c1, c2 = EOZ;
-  if (!lisxdigit(c1 = next(ls)) || !lisxdigit(c2 = next(ls))) {
-    luaZ_resetbuffer(ls->buff);  /* prepare error message */
-    save(ls, '\\'); save(ls, 'x');
-    if (c1 != EOZ) save(ls, c1);
-    if (c2 != EOZ) save(ls, c2);
-    lexerror(ls, "hexadecimal digit expected", TK_STRING);
+static unsigned readhexaesc (LexState *ls, int n) {
+  char buf[8], esc = ls->current;
+  unsigned x = 0;
+  int i, j, c;
+  for (i = 0; i < n; i++) {
+    c = buf[i] = next(ls);
+    if (lisxdigit(c))
+      x = x*16 + c - (lisdigit(c) ? '0' : lisupper(c) ? 'A' - 10 : 'a' - 10);
+    else {
+      luaZ_resetbuffer(ls->buff);  /* prepare error message */
+      save(ls, '\\'); save(ls, esc);
+      for (j = 0; j <= i; j++) save(ls, buf[j]);
+      lexerror(ls, "hexadecimal digit expected", TK_STRING);
+    }
   }
-  return (hexavalue(c1) << 4) + hexavalue(c2);
+  next(ls);
+  return x;
 }
 
 
@@ -348,7 +372,9 @@ static void read_string (LexState *ls, int del, SemInfo *seminfo) {
           case 'r': c = '\r'; break;
           case 't': c = '\t'; break;
           case 'v': c = '\v'; break;
-          case 'x': c = readhexaesc(ls); break;
+          case 'x': save(ls, readhexaesc(ls, 2)); continue;
+          case 'u': saveutf8(ls, readhexaesc(ls, 4)); continue;
+          case 'U': saveutf8(ls, readhexaesc(ls, 8)); continue;
           case '\n':
           case '\r': save(ls, '\n'); inclinenumber(ls); continue;
           case EOZ: continue;  /* will raise an error next loop */