lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Lua patterns support a capture of type %bxy that captures a sequence of characters in which 'x' and 'y' are balanced. This is inadequate in cases where those characters do not only form the "bracket structure" but can also appear "escaped" as ordinary characters. I propose a patch that adds a %Bxyz capture type to Lua. It stands for "a sequence of characters with balanced 'x' and 'y' characters, where any character immediately after a 'z' is treated as an ordinary character rather than one to balance".
I have used it to parse Apache log files like this:
str = '"GET /A\\"B HTTP/1.1"'
str:match('%B""\\') -- captures the whole string
The patch follows:

diff -Naur lua-5.1.4-old/src/lstrlib.c lua-5.1.4/src/lstrlib.c
--- lua-5.1.4-old/src/lstrlib.c	2008-07-11 17:27:21.000000000 +0000
+++ lua-5.1.4/src/lstrlib.c	2011-03-04 13:50:48.000000000 +0000
@@ -298,6 +298,35 @@
 }


+static const char *matchbalanceesc (MatchState *ms, const char *s,
+                                   const char *p) {  /* %Bxyz: p points at "xyz" */
+  if (*p == 0 || *(p+1) == 0 || *(p+2) == 0 )  /* need open, close and escape chars */
+    luaL_error(ms->L, "unbalanced pattern");
+  if (*s != *p) return NULL;  /* subject must start with the opening char */
+  else {
+    int b = *p;  /* opening char */
+    int e = *(p+1);  /* closing char */
+    int m = *(p+2);  /* escape char */
+    int cont = 1;  /* nesting depth; the opening char at *s counts as 1 */
+    int esc = 0;  /* nonzero when the previous char was the escape char */
+    while (++s < ms->src_end) {  /* scan up to ms->src_end */
+      if (esc) {  /* char after an escape is skipped unexamined */
+        esc=0;
+        continue;
+      }
+      if (*s == m) {  /* escape is tested first, before close/open */
+        esc=1;
+      }
+      else if (*s == e) {  /* close is tested before open, so x == y still terminates */
+        if (--cont == 0) return s+1;  /* balanced: position just past the closing char */
+      }
+      else if (*s == b) cont++;  /* nested opening char */
+    }
+  }
+  return NULL;  /* string ends out of balance */
+}
+
+
 static const char *max_expand (MatchState *ms, const char *s,
                                  const char *p, const char *ep) {
   ptrdiff_t i = 0;  /* counts maximum expand for item */
@@ -381,6 +410,11 @@
           if (s == NULL) return NULL;
           p+=4; goto init;  /* else return match(ms, s, p+4); */
         }
+        case 'B': {  /* balanced-escaped string? */
+          s = matchbalanceesc(ms, s, p+2);
+          if (s == NULL) return NULL;
+          p+=5; goto init;  /* else return match(ms, s, p+5); */
+        }
         case 'f': {  /* frontier? */
           const char *ep; char previous;
           p += 2;