lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Hi all, hi Roberto.

While playing with LPeg and later LPegLabel, I found out that LPeg has (so far
undocumented (?)) internal UTF-8 support.

So, here comes my "wishlist"...

1a) Please release 1.1 with utfR() documented.

1b) I was wondering, though, with all the P, R, S (T in LPegLabel), ... why not
just call it "U"?

1c) Along those lines: I attached a patch (lpeg-unicode.diff), which provides a
"U" as a mixture of R and S, also allowing single code points or ranges to be
written as strings (cf. https://github.com/sqmedeiros/lpeglabel/issues/35 -
thanks, Sérgio).

2) "patt / function" - I'm using some form of class system in one of my
projects. I wish callable tables were supported here, too (see
attached patch lpeg-capture-callable.diff).

Apart from my wishes, thanks a lot for LPeg (and derivatives)!
Yogev
diff --git a/lptree.c b/lptree.c
index 30cef67..8a7ad6c 100644
--- a/lptree.c
+++ b/lptree.c
@@ -704,8 +704,25 @@ static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
 
 
 static int lp_utfr (lua_State *L) {
-  lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1);
-  lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
+  lua_Unsigned from, to;
+  if (lua_type(L, 1) == LUA_TSTRING) {
+    size_t len;
+    const char *s = lua_tolstring(L, 1, &len);
+    const char *e = s + len;
+    s = utf8_decode(s, &from);
+    luaL_argcheck(L, s, 1, "invalid UTF-8 sequence");
+    if (s < e) {
+      s = utf8_decode(s, &to);
+      luaL_argcheck(L, s, 1, "invalid UTF-8 sequence");
+      luaL_argcheck(L, s == e, 1, "extraneous characters after UTF-8 range");
+    }
+    else
+      to = from;
+  }
+  else {
+    from = (lua_Unsigned)luaL_checkinteger(L, 1);
+    to = (lua_Unsigned)luaL_checkinteger(L, 2);
+  }
   luaL_argcheck(L, from <= to, 2, "empty range");
   if (to <= 0x7f) {  /* ascii range? */
     TTree *tree = newcharset(L);  /* code it as a regular charset */
@@ -1315,7 +1332,7 @@ static struct luaL_Reg pattreg[] = {
   {"P", lp_P},
   {"S", lp_set},
   {"R", lp_range},
-  {"utfR", lp_utfr},
+  {"U", lp_utfr},
   {"locale", lp_locale},
   {"version", NULL},
   {"setmaxstack", lp_setmax},
diff --git a/lpvm.c b/lpvm.c
index f7c7e62..eae3d51 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -30,7 +30,7 @@ static const Instruction giveup = {{IGiveup, 0, 0}};
 /*
 ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
 */
-static const char *utf8_decode (const char *o, int *val) {
+const char *utf8_decode (const char *o, lua_Unsigned *val) {
   static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
   const unsigned char *s = (const unsigned char *)o;
   unsigned int c = s[0];  /* first byte */
@@ -237,11 +237,11 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
         continue;
       }
       case IUTFR: {
-        int codepoint;
+        lua_Unsigned codepoint;
         if (s >= e)
           goto fail;
         s = utf8_decode (s, &codepoint);
-        if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
+        if (s && ((lua_Unsigned) p[1].offset) <= codepoint && codepoint <= ((lua_Unsigned) utf_to(p)))
           p += 2;
         else
           goto fail;
diff --git a/lpvm.h b/lpvm.h
index ca625f9..67ee416 100644
--- a/lpvm.h
+++ b/lpvm.h
@@ -55,6 +55,7 @@ typedef union Instruction {
 #define utf_to(inst)	(((inst)->i.key << 8) | (inst)->i.aux)
 
 
+const char *utf8_decode (const char *o, lua_Unsigned *val);
 void printpatt (Instruction *p, int n);
 const char *match (lua_State *L, const char *o, const char *s, const char *e,
                    Instruction *op, Capture *capture, int ptop);
diff --git a/lptree.c b/lptree.c
index 30cef67..a5b7fb5 100644
--- a/lptree.c
+++ b/lptree.c
@@ -803,7 +803,13 @@ static TTree *newemptycapkey (lua_State *L, int cap, int idx) {
 static int lp_divcapture (lua_State *L) {
   switch (lua_type(L, 2)) {
     case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2);
-    case LUA_TTABLE: return capture_aux(L, Cquery, 2);
+    case LUA_TTABLE: {
+      int call = luaL_getmetafield(L, 2, "__call");
+      if (call == LUA_TFUNCTION)
+        return capture_aux(L, Cfunction, 2);
+      else
+        return capture_aux(L, Cquery, 2);
+    }
     case LUA_TSTRING: return capture_aux(L, Cstring, 2);
     case LUA_TNUMBER: {
       int n = lua_tointeger(L, 2);