[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: LPeg wishlist...
- From: Yogev Sawa <4950n2012e@...>
- Date: Thu, 13 Apr 2023 20:42:34 +0200
Hi all, hi Roberto.
While playing with LPeg and later LPegLabel, I found out that LPeg has (so far
undocumented (?)) internal UTF-8 support.
So, here comes my "wishlist"...
1a) Please release 1.1 with utfR() documented.
1b) I was wondering, though, with all the P, R, S (T in LPegLabel), ... why not
just call it "U"?
1c) Along those lines: I attached a patch (lpeg-unicode.diff), which provides a
"U" as a mixture of R and S, also allowing to write single code points or
ranges as strings (cf. https://github.com/sqmedeiros/lpeglabel/issues/35 -
thanks, Sérgio).
2) "patt / function" - I'm using some form of class system in one of my
projects. I wish a callable table were supported here, too (see
attached patch lpeg-capture-callable.diff).
Apart from my wishes, thanks a lot for LPeg (and derivatives)!
Yogev
diff --git a/lptree.c b/lptree.c
index 30cef67..8a7ad6c 100644
--- a/lptree.c
+++ b/lptree.c
@@ -704,8 +704,25 @@ static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) {
static int lp_utfr (lua_State *L) {
- lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1);
- lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2);
+ lua_Unsigned from, to;
+ if (lua_type(L, 1) == LUA_TSTRING) {
+ size_t len;
+ const char *s = lua_tolstring(L, 1, &len);
+ const char *e = s + len;
+ s = utf8_decode(s, &from);
+ luaL_argcheck(L, s, 1, "invalid UTF-8 sequence");
+ if (s < e) {
+ s = utf8_decode(s, &to);
+ luaL_argcheck(L, s, 1, "invalid UTF-8 sequence");
+ luaL_argcheck(L, s == e, 1, "extraneous characters after UTF-8 range");
+ }
+ else
+ to = from;
+ }
+ else {
+ from = (lua_Unsigned)luaL_checkinteger(L, 1);
+ to = (lua_Unsigned)luaL_checkinteger(L, 2);
+ }
luaL_argcheck(L, from <= to, 2, "empty range");
if (to <= 0x7f) { /* ascii range? */
TTree *tree = newcharset(L); /* code it as a regular charset */
@@ -1315,7 +1332,7 @@ static struct luaL_Reg pattreg[] = {
{"P", lp_P},
{"S", lp_set},
{"R", lp_range},
- {"utfR", lp_utfr},
+ {"U", lp_utfr},
{"locale", lp_locale},
{"version", NULL},
{"setmaxstack", lp_setmax},
diff --git a/lpvm.c b/lpvm.c
index f7c7e62..eae3d51 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -30,7 +30,7 @@ static const Instruction giveup = {{IGiveup, 0, 0}};
/*
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
*/
-static const char *utf8_decode (const char *o, int *val) {
+const char *utf8_decode (const char *o, lua_Unsigned *val) {
static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu};
const unsigned char *s = (const unsigned char *)o;
unsigned int c = s[0]; /* first byte */
@@ -237,11 +237,11 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
continue;
}
case IUTFR: {
- int codepoint;
+ lua_Unsigned codepoint;
if (s >= e)
goto fail;
s = utf8_decode (s, &codepoint);
- if (s && p[1].offset <= codepoint && codepoint <= utf_to(p))
+ if (s && ((lua_Unsigned) p[1].offset) <= codepoint && codepoint <= ((lua_Unsigned) utf_to(p)))
p += 2;
else
goto fail;
diff --git a/lpvm.h b/lpvm.h
index ca625f9..67ee416 100644
--- a/lpvm.h
+++ b/lpvm.h
@@ -55,6 +55,7 @@ typedef union Instruction {
#define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux)
+const char *utf8_decode (const char *o, lua_Unsigned *val);
void printpatt (Instruction *p, int n);
const char *match (lua_State *L, const char *o, const char *s, const char *e,
Instruction *op, Capture *capture, int ptop);
diff --git a/lptree.c b/lptree.c
index 30cef67..a5b7fb5 100644
--- a/lptree.c
+++ b/lptree.c
@@ -803,7 +803,13 @@ static TTree *newemptycapkey (lua_State *L, int cap, int idx) {
static int lp_divcapture (lua_State *L) {
switch (lua_type(L, 2)) {
case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2);
- case LUA_TTABLE: return capture_aux(L, Cquery, 2);
+ case LUA_TTABLE: {
+ int call = luaL_getmetafield(L, 2, "__call");
+ if (call == LUA_TFUNCTION)
+ return capture_aux(L, Cfunction, 2);
+ else
+ return capture_aux(L, Cquery, 2);
+ }
case LUA_TSTRING: return capture_aux(L, Cstring, 2);
case LUA_TNUMBER: {
int n = lua_tointeger(L, 2);