lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Hi,

Recent posts discussed EOL differences.  The lexical analyzer in Lua
has similar problems with 'foreign' EOLs.  Another problem is that it
is locale specific.  I've backported some of my changes to Sol's lexer
to lua-4.1-work4.  The patch is attached.  Here's a list of changes:

 - Accepts "\n", "\r", and "\r\n" as end of line.

 - Accepts CTRL-Z as end of file.

 - EOLs in long strings and escaped EOLs in strings are converted
   to a single "\n"

 - Has its own ctype macros to make identifiers locale independent.

These changes make sure that a Lua script gives exactly the same
compiled code on any system, regardless of EOL convention or locale
setting.  The next changes (also included in the patch) are for
convenience:

 - Adds "long comments": --[[ comment ]]  (see Lua-Wiki power patches).

 - The '#' comment is allowed not only in the first line but in all
   lines.  That way you have fewer problems preprocessing Lua scripts
   with cpp (which inserts #line directives).

 - Changes the "\123" escape from decimal to octal as it is done in
   _all_ other languages I know.

Btw, with the patch in place all systems should be able to open Lua
source files in binary mode so the re-open in ldo.c becomes unnecessary.

Ciao, ET.
--- lua-4.1-work4/src/llex.c	Fri Feb  8 23:40:27 2002
+++ lua-4.1-work4-lex/src/llex.c	Fri Apr  5 01:05:45 2002
@@ -5,7 +5,6 @@
 */
 
 
-#include <ctype.h>
 #include <stdio.h>
 #include <string.h>
 
@@ -19,6 +18,34 @@
 #include "lzio.h"
 
 
+#define CTRL_Z	'\032'
+
+#define C 1	/* control char */
+#define O 2	/* octal digit [0-7] */
+#define D 4	/* digit [89] */
+#define A 8	/* alpha [a-zA-Z_] */
+#define E 16	/* end of line [\r\n*CTRL-Z**EOZ*] */
+
+static const char sctype[257] = /* lex is not locale specific! */
+{
+  E, /* EOZ */
+  C,C,C,C, C,C,C,C, C,C,E,C, C,E,C,C, C,C,C,C, C,C,C,C, C,C,E,C, C,C,C,C,
+  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, O,O,O,O, O,O,O,O, D,D,0,0, 0,0,0,0,
+  0,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,0, 0,0,0,A,
+  0,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,0, 0,0,0,0,
+  C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+  C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+  C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+  C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+};
+
+#define lex_isdigit(c)	(sctype[(c)+1] & (O|D))
+#define lex_isodigit(c)	(sctype[(c)+1] & O)
+#define lex_isctrl(c)	(sctype[(c)+1] & (C|E))
+#define lex_isfname(c)	(sctype[(c)+1] & A)
+#define lex_isname(c)	(sctype[(c)+1] & (A|O|D))
+#define lex_iseol(c)	(sctype[(c)+1] & E)
+
 
 #define next(LS) (LS->current = zgetc(LS->z))
 
@@ -100,27 +127,30 @@
 }
 
 
-static void inclinenumber (LexState *LS) {
-  next(LS);  /* skip `\n' */
+static void inclinenumber (LexState *LS) {	/* called on '\r' and '\n' */
+  /* accept "\r", "\n" or "\r\n" as line terminator */
+  if (LS->current == '\n' || next(LS) == '\n')
+    next(LS);
   ++LS->linenumber;
   luaX_checklimit(LS, LS->linenumber, MAX_INT, "lines in a chunk");
 }
 
 
+static void eol_comment (LexState *LS) {	/* skip to (not past) EOL/EOF */
+  while (!lex_iseol(LS->current))
+    next(LS);
+}
+
+
 void luaX_setinput (lua_State *L, LexState *LS, ZIO *z, TString *source) {
   LS->L = L;
   LS->lookahead.token = TK_EOS;  /* no look-ahead token */
   LS->z = z;
   LS->fs = NULL;
-  LS->linenumber = 1;
+  LS->linenumber = 0;
   LS->lastline = 1;
   LS->source = source;
-  next(LS);  /* read first char */
-  if (LS->current == '#') {
-    do {  /* skip first line */
-      next(LS);
-    } while (LS->current != '\n' && LS->current != EOZ);
-  }
+  LS->current = '\n';
 }
 
 
@@ -150,7 +180,7 @@
   do {
     checkbuffer(L, 10, l);
     save_and_next(L, LS, l);
-  } while (isalnum(LS->current) || LS->current == '_');
+  } while (lex_isname(LS->current));
   save(L, '\0', l);
   return l-1;
 }
@@ -162,7 +192,7 @@
   size_t l = 0;
   checkbuffer(L, 10, l);
   if (comma) save(L, '.', l);
-  while (isdigit(LS->current)) {
+  while (lex_isdigit(LS->current)) {
     checkbuffer(L, 10, l);
     save_and_next(L, LS, l);
   }
@@ -176,7 +206,7 @@
                  TK_NUMBER);
     }
   }
-  while (isdigit(LS->current)) {
+  while (lex_isdigit(LS->current)) {
     checkbuffer(L, 10, l);
     save_and_next(L, LS, l);
   }
@@ -184,7 +214,7 @@
     save_and_next(L, LS, l);  /* read `E' */
     if (LS->current == '+' || LS->current == '-')
       save_and_next(L, LS, l);  /* optional exponent sign */
-    while (isdigit(LS->current)) {
+    while (lex_isdigit(LS->current)) {
       checkbuffer(L, 10, l);
       save_and_next(L, LS, l);
     }
@@ -202,14 +232,16 @@
   checkbuffer(L, 10, l);
   save(L, '[', l);  /* save first `[' */
   save_and_next(L, LS, l);  /* pass the second `[' */
-  if (LS->current == '\n')  /* string starts with a newline? */
-    inclinenumber(LS);  /* skip it */
   for (;;) {
     checkbuffer(L, 10, l);
     switch (LS->current) {
       case EOZ:
+      case CTRL_Z:
         save(L, '\0', l);
-        luaX_error(LS, "unfinished long string", TK_EOS);
+        if (seminfo)
+          luaX_error(LS, "unterminated long string", TK_EOS);
+        else
+          luaX_error(LS, "unterminated comment", TK_EOS);
         break;  /* to avoid warnings */
       case '[':
         save_and_next(L, LS, l);
@@ -226,17 +258,22 @@
           save_and_next(L, LS, l);
         }
         continue;
+      case '\r':
       case '\n':
-        save(L, '\n', l);
+        if (l != 2)	/* skip linefeed directly following the [[ */
+          save(L, '\n', l);
         inclinenumber(LS);
         continue;
       default:
-        save_and_next(L, LS, l);
+        if (seminfo)
+          save(L, LS->current, l);
+        next(LS);
     }
   } endloop:
   save_and_next(L, LS, l);  /* skip the second `]' */
   save(L, '\0', l);
-  seminfo->ts = luaS_newlstr(L, cast(char *, G(L)->Mbuffer)+2, l-5);
+  if (seminfo)
+    seminfo->ts = luaS_newlstr(L, cast(char *, G(L)->Mbuffer)+2, l-5);
 }
 
 
@@ -248,9 +285,10 @@
   while (LS->current != del) {
     checkbuffer(L, 10, l);
     switch (LS->current) {
-      case EOZ:  case '\n':
+      case EOZ: case CTRL_Z:
+      case '\n': case '\r':
         save(L, '\0', l);
-        luaX_error(LS, "unfinished string", TK_EOS);
+        luaX_error(LS, "unterminated string", TK_EOS);
         break;  /* to avoid warnings */
       case '\\':
         next(LS);  /* do not save the `\' */
@@ -262,17 +300,17 @@
           case 'r': save(L, '\r', l); next(LS); break;
           case 't': save(L, '\t', l); next(LS); break;
           case 'v': save(L, '\v', l); next(LS); break;
-          case '\n': save(L, '\n', l); inclinenumber(LS); break;
+          case '\r': case '\n': save(L, '\n', l); inclinenumber(LS); break;
           default: {
-            if (!isdigit(LS->current))
+            if (!lex_isodigit(LS->current))
               save_and_next(L, LS, l);  /* handles \\, \", \', and \? */
             else {  /* \xxx */
               int c = 0;
               int i = 0;
               do {
-                c = 10*c + (LS->current-'0');
+                c = 8*c + (LS->current-'0');
                 next(LS);
-              } while (++i<3 && isdigit(LS->current));
+              } while (++i<3 && lex_isodigit(LS->current));
               if (c > UCHAR_MAX) {
                 save(L, '\0', l);
                 luaX_error(LS, "escape sequence too large", TK_STRING);
@@ -296,18 +334,24 @@
   for (;;) {
     switch (LS->current) {
 
-      case ' ': case '\t': case '\r':  /* `\r' to avoid problems with DOS */
+      case ' ': case '\t':
         next(LS);
         continue;
 
+      case '\r':
       case '\n':
         inclinenumber(LS);
+        if (LS->current == '#')
+          eol_comment(LS);
         continue;
 
       case '-':
         next(LS);
         if (LS->current != '-') return '-';
-        do { next(LS); } while (LS->current != '\n' && LS->current != EOZ);
+        if (next(LS) == '[' && next(LS) == '[')
+          read_long_string(LS, NULL);
+        else
+          eol_comment(LS);
         continue;
 
       case '[':
@@ -353,21 +397,22 @@
           }
           else return TK_CONCAT;   /* .. */
         }
-        else if (!isdigit(LS->current)) return '.';
+        else if (!lex_isdigit(LS->current)) return '.';
         else {
           read_number(LS, 1, seminfo);
           return TK_NUMBER;
         }
 
       case EOZ:
+      case CTRL_Z:
         return TK_EOS;
 
       default: {
-        if (isdigit(LS->current)) {
+        if (lex_isdigit(LS->current)) {
           read_number(LS, 0, seminfo);
           return TK_NUMBER;
         }
-        else if (isalpha(LS->current) || LS->current == '_') {
+        else if (lex_isfname(LS->current)) {
           /* identifier or reserved word */
           size_t l = readname(LS);
           TString *ts = luaS_newlstr(LS->L, cast(char *, G(LS->L)->Mbuffer), l);
@@ -378,7 +423,7 @@
         }
         else {
           int c = LS->current;
-          if (iscntrl(c))
+          if (lex_isctrl(c))
             luaX_invalidchar(LS, c);
           next(LS);
           return c;  /* single-char tokens (+ - / ...) */