[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: lex patch
- From: Edgar Toernig <froese@...>
- Date: Fri, 05 Apr 2002 01:35:09 +0200
Hi,
recent posts discussed EOL-differences. The lexical analyzer in Lua
has similar problems with 'foreign' EOLs. Another problem is that it
is locale specific. I've backported some of my changes of Sol's lex
to lua-4.1-work4. The patch is attached. Here's a list of changes:
- Accepts "\n", "\r", and "\r\n" as end of line.
- Accepts CTRL-Z as end of file.
- EOLs in long strings and escaped EOLs in strings are converted
to a single "\n"
- Has its own ctype macros to make identifiers locale independent.
These changes make sure that a Lua script gives exactly the same
compiled code on any system, regardless of EOL convention or locale
setting. The next changes (also included in the patch) are for
convenience:
- Adds "long comments": --[[ comment ]] (see Lua-Wiki power patches).
- The '#' comment is allowed not only in the first line but in all
lines. That way you have fewer problems preprocessing Lua scripts
with cpp (which inserts #line directives).
- Changes the "\123" escape from decimal to octal as it is done in
_all_ other languages I know.
Btw, with the patch in place all systems should be able to open Lua
source files in binary mode so the re-open in ldo.c becomes unnecessary.
Ciao, ET.
--- lua-4.1-work4/src/llex.c Fri Feb 8 23:40:27 2002
+++ lua-4.1-work4-lex/src/llex.c Fri Apr 5 01:05:45 2002
@@ -5,7 +5,6 @@
*/
-#include <ctype.h>
#include <stdio.h>
#include <string.h>
@@ -19,6 +18,34 @@
#include "lzio.h"
+#define CTRL_Z '\032'
+
+#define C 1 /* control or non-ASCII char (0-31, DEL, 128-255) */
+#define O 2 /* octal digit [0-7] */
+#define D 4 /* digit [89] */
+#define A 8 /* alpha [a-zA-Z_] */
+#define E 16 /* end of line or input [\r \n CTRL-Z EOZ] */
+
+static const char sctype[257] = /* lex is not locale specific! indexed by c+1 (assumes EOZ == -1) */
+{
+ E, /* EOZ */
+ C,C,C,C, C,C,C,C, C,C,E,C, C,E,C,C, C,C,C,C, C,C,C,C, C,C,E,C, C,C,C,C,
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, O,O,O,O, O,O,O,O, D,D,0,0, 0,0,0,0,
+ 0,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,0, 0,0,0,A,
+ 0,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,A, A,A,A,0, 0,0,0,C, /* DEL (127) is a control char, as with iscntrl() in the "C" locale */
+ C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+ C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+ C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+ C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C, C,C,C,C,
+};
+
+#define lex_isdigit(c) (sctype[(c)+1] & (O|D))
+#define lex_isodigit(c) (sctype[(c)+1] & O)
+#define lex_isctrl(c) (sctype[(c)+1] & (C|E))
+#define lex_isfname(c) (sctype[(c)+1] & A)
+#define lex_isname(c) (sctype[(c)+1] & (A|O|D))
+#define lex_iseol(c) (sctype[(c)+1] & E)
+
#define next(LS) (LS->current = zgetc(LS->z))
@@ -100,27 +127,30 @@
}
-static void inclinenumber (LexState *LS) {
- next(LS); /* skip `\n' */
+static void inclinenumber (LexState *LS) { /* called on '\r' and '\n' */
+ /* accept "\r", "\n" or "\r\n" as line terminator */
+ if (LS->current == '\n' || next(LS) == '\n') /* on '\r', the next() in this test consumes it */
+ next(LS); /* consume the '\n' (alone, or the second half of "\r\n") */
++LS->linenumber;
luaX_checklimit(LS, LS->linenumber, MAX_INT, "lines in a chunk");
}
+static void eol_comment (LexState *LS) { /* skip the rest of the line; the EOL char (\r, \n, CTRL-Z or EOZ) is left in LS->current */
+ while (!lex_iseol(LS->current))
+ next(LS);
+}
+
+
void luaX_setinput (lua_State *L, LexState *LS, ZIO *z, TString *source) {
LS->L = L;
LS->lookahead.token = TK_EOS; /* no look-ahead token */
LS->z = z;
LS->fs = NULL;
- LS->linenumber = 1;
+ LS->linenumber = 0; /* the fake '\n' below makes the first inclinenumber() bump this to 1 */
LS->lastline = 1;
LS->source = source;
- next(LS); /* read first char */
- if (LS->current == '#') {
- do { /* skip first line */
- next(LS);
- } while (LS->current != '\n' && LS->current != EOZ);
- }
+ LS->current = '\n'; /* fake EOL: the lexer's '\n' case then reads the first real char and checks it for a '#' comment, like on any other line */
}
@@ -150,7 +180,7 @@
do {
checkbuffer(L, 10, l);
save_and_next(L, LS, l);
- } while (isalnum(LS->current) || LS->current == '_');
+ } while (lex_isname(LS->current));
save(L, '\0', l);
return l-1;
}
@@ -162,7 +192,7 @@
size_t l = 0;
checkbuffer(L, 10, l);
if (comma) save(L, '.', l);
- while (isdigit(LS->current)) {
+ while (lex_isdigit(LS->current)) {
checkbuffer(L, 10, l);
save_and_next(L, LS, l);
}
@@ -176,7 +206,7 @@
TK_NUMBER);
}
}
- while (isdigit(LS->current)) {
+ while (lex_isdigit(LS->current)) {
checkbuffer(L, 10, l);
save_and_next(L, LS, l);
}
@@ -184,7 +214,7 @@
save_and_next(L, LS, l); /* read `E' */
if (LS->current == '+' || LS->current == '-')
save_and_next(L, LS, l); /* optional exponent sign */
- while (isdigit(LS->current)) {
+ while (lex_isdigit(LS->current)) {
checkbuffer(L, 10, l);
save_and_next(L, LS, l);
}
@@ -202,14 +232,16 @@
checkbuffer(L, 10, l);
save(L, '[', l); /* save first `[' */
save_and_next(L, LS, l); /* pass the second `[' */
- if (LS->current == '\n') /* string starts with a newline? */
- inclinenumber(LS); /* skip it */
for (;;) {
checkbuffer(L, 10, l);
switch (LS->current) {
case EOZ:
+ case CTRL_Z:
save(L, '\0', l);
- luaX_error(LS, "unfinished long string", TK_EOS);
+ if (seminfo)
+ luaX_error(LS, "unterminated long string", TK_EOS);
+ else
+ luaX_error(LS, "unterminated comment", TK_EOS);
break; /* to avoid warnings */
case '[':
save_and_next(L, LS, l);
@@ -226,17 +258,22 @@
save_and_next(L, LS, l);
}
continue;
+ case '\r':
case '\n':
- save(L, '\n', l);
+ if (l != 2) /* skip linefeed directly following the [[ */
+ save(L, '\n', l);
inclinenumber(LS);
continue;
default:
- save_and_next(L, LS, l);
+ if (seminfo)
+ save(L, LS->current, l);
+ next(LS);
}
} endloop:
save_and_next(L, LS, l); /* skip the second `]' */
save(L, '\0', l);
- seminfo->ts = luaS_newlstr(L, cast(char *, G(L)->Mbuffer)+2, l-5);
+ if (seminfo)
+ seminfo->ts = luaS_newlstr(L, cast(char *, G(L)->Mbuffer)+2, l-5);
}
@@ -248,9 +285,10 @@
while (LS->current != del) {
checkbuffer(L, 10, l);
switch (LS->current) {
- case EOZ: case '\n':
+ case EOZ: case CTRL_Z:
+ case '\n': case '\r':
save(L, '\0', l);
- luaX_error(LS, "unfinished string", TK_EOS);
+ luaX_error(LS, "unterminated string", TK_EOS);
break; /* to avoid warnings */
case '\\':
next(LS); /* do not save the `\' */
@@ -262,17 +300,17 @@
case 'r': save(L, '\r', l); next(LS); break;
case 't': save(L, '\t', l); next(LS); break;
case 'v': save(L, '\v', l); next(LS); break;
- case '\n': save(L, '\n', l); inclinenumber(LS); break;
+ case '\r': case '\n': save(L, '\n', l); inclinenumber(LS); break;
default: {
- if (!isdigit(LS->current))
+ if (!lex_isodigit(LS->current))
save_and_next(L, LS, l); /* handles \\, \", \', and \? */
else { /* \xxx */
int c = 0;
int i = 0;
do {
- c = 10*c + (LS->current-'0');
+ c = 8*c + (LS->current-'0');
next(LS);
- } while (++i<3 && isdigit(LS->current));
+ } while (++i<3 && lex_isodigit(LS->current));
if (c > UCHAR_MAX) {
save(L, '\0', l);
luaX_error(LS, "escape sequence too large", TK_STRING);
@@ -296,18 +334,24 @@
for (;;) {
switch (LS->current) {
- case ' ': case '\t': case '\r': /* `\r' to avoid problems with DOS */
+ case ' ': case '\t':
next(LS);
continue;
+ case '\r':
case '\n':
inclinenumber(LS);
+ if (LS->current == '#')
+ eol_comment(LS);
continue;
case '-':
next(LS);
if (LS->current != '-') return '-';
- do { next(LS); } while (LS->current != '\n' && LS->current != EOZ);
+ if (next(LS) == '[' && next(LS) == '[')
+ read_long_string(LS, NULL);
+ else
+ eol_comment(LS);
continue;
case '[':
@@ -353,21 +397,22 @@
}
else return TK_CONCAT; /* .. */
}
- else if (!isdigit(LS->current)) return '.';
+ else if (!lex_isdigit(LS->current)) return '.';
else {
read_number(LS, 1, seminfo);
return TK_NUMBER;
}
case EOZ:
+ case CTRL_Z:
return TK_EOS;
default: {
- if (isdigit(LS->current)) {
+ if (lex_isdigit(LS->current)) {
read_number(LS, 0, seminfo);
return TK_NUMBER;
}
- else if (isalpha(LS->current) || LS->current == '_') {
+ else if (lex_isfname(LS->current)) {
/* identifier or reserved word */
size_t l = readname(LS);
TString *ts = luaS_newlstr(LS->L, cast(char *, G(LS->L)->Mbuffer), l);
@@ -378,7 +423,7 @@
}
else {
int c = LS->current;
- if (iscntrl(c))
+ if (lex_isctrl(c))
luaX_invalidchar(LS, c);
next(LS);
return c; /* single-char tokens (+ - / ...) */