lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


After looking at the 4.1 alpha manual and then viewing the source
code, I realized that the section giving the complete syntax of Lua is
no longer valid.  This explains some of the inconsistencies between
that section and the section describing the language, as the authors
had not changed the complete syntax section yet.

Looking at lparse.c from the 4.1 alpha release, I noticed a couple of
things. 

(1) The source file contains the comment:

       ** LL(1) Parser and code generator for Lua

    but parsers are not LL(1), languages are, and LL(1) languages are
    of interest because they are easily recognized by recursive decent
    parsers.

(2) The language being parsed does not seem to be LL(1)!  One cannot
    decide if a statement is a function call or an assignment
    statement until many tokens are read.  In Lua's case, the parser
    seems to read the statement as an expression, and then examine the
    results to decide if it is a function call or the start of an
    assignment statement.  This fact makes it challenging to write a
    complete syntax for the language accepted by the parser.

I tried to think about how one might formally specify the language
accepted by the Lua 4.1 parser.  After several failed attempts, I
decided to write a YACC parser that accepted the language.  I thought
this would be more understandable than C code.  

I have enclosed the source file for the parser.  I failed to find a
way to parse Lua's table constructor syntax, so you will find a hack
in the code.  If anyone knows how to recognize the table constructor
syntax in YACC, please let me know.  Of course, if you find other
errors, tell me about them too.  Hopefully getting this straight will
make it easier to explain the syntax in the manual.

I simulated the Lua parser's inspection of the result of parsing an
expression by computing an expression class while parsing, and if the
class is inappropriate, a rule's action invokes the YYERROR macro.
Search for "YYERROR" to find these special rules.

The exercise of constructing the parser suggests a way of removing
some of the constraints on the language.  There are various places in
the language in which table references are forced to NAMEs, and may
not be reserved words.  Thus, the following is currently illegal:

        t = { title = "Me, Myself, and I", in = "My Book" }

and
        function thing.end (x) return x end

because "in" and "end" are not names, but reserved words.

The YACC parser shows that reserved words can appear in these
locations without causing any parsing troubles.  You can see this by
searching for "key" to find where reserved words can appear in
locations currently limited to names.

John

---------------------------- lsc.y -----------------------------
%{

/* Lua Syntax Checker -- Well an try at one...
   This version does not parse numbers 
   or table constructors correctly. 

   John D. Ramsdell -- August 2001
*/

/*
Copyright (c) 2001, John D. Ramsdell

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

/* #define YYDEBUG 1 */

#include <stddef.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>

enum {
  VALUE,
  LOCATION,
  CLOSURE,
  CALL,
  TABLE
};

static int line = 1;
static int  column = 0;
static char *filename = "-";

static void yyerror(const char *s) 
{
  fprintf(stderr, "%s:%d:%d: %s\n", filename, line, column, s);
}

%}

%start block

%token AND
%token ASSIGN
%token ASTERISK
%token BREAK
%token CARET
%token COLON
%token COMMA
%token DO
%token DOT_DOT
%token ELIPSES
%token ELSE
%token ELSEIF
%token END
%token EQ
%token FOR
%token FUNCTION
%token GE
%token GLOBAL
%token GT
%token IF
%token IN
%token LBRACE
%token LBRACKET
%token LE
%token LITERAL
%token LOCAL
%token LPAR
%token LT
%token MINUS
%token NAME
%token NE
%token NIL
%token NOT
%token NUMBER
%token OR
%token PERCENT
%token PERIOD
%token PLUS
%token RBRACE
%token RBRACKET
%token REPEAT
%token RETURN
%token RPAR
%token SEMICOLON
%token SLASH
%token THEN
%token UMINUS
%token UNTIL
%token WHILE

%nonassoc NONARG
%nonassoc LPAR LBRACE
%left AND OR
%left LT GT LE GE NE EQ
%left DOT_DOT
%left PLUS MINUS
%left ASTERISK SLASH
%left NOT UMINUS
%right CARET

%%

block:
    statement_list end;

statement_list:
    /* Empty */
  | statement_list statement optional_semicolon;

end:
    /* Empty */
  | RETURN optional_expression_list optional_semicolon
  | BREAK optional_name optional_semicolon;

optional_semicolon:
    /* Empty */
  | SEMICOLON;

optional_name:
    /* Empty */
  | NAME;

statement: 
    primary rest assignment
  | DO block END
  | WHILE expression DO block END
  | REPEAT block UNTIL expression
  | IF expression THEN block optional_elseif_list optional_else END
  | FOR NAME for_statement
  | FUNCTION function_name LPAR parameter_list RPAR block END
  | LOCAL name_list initializer;

name_list:
    NAME
  | name_list COMMA NAME;

initializer:
    /* Empty */
  | ASSIGN expression_list;

parameter_list:
    /* Empty */
  | ELIPSES
  | name_list optional_elipses;

optional_elipses:
    /* Empty */
  | COMMA ELIPSES;

assignment:
    /* Empty */ { if ($0 != CALL) YYERROR; } %prec NONARG
  | ASSIGN { if ($0 != LOCATION) YYERROR; } expression_list 
  | COMMA { if ($0 != LOCATION) YYERROR; } 
      location_list ASSIGN expression_list;
  
location_list:
    location
  | location_list COMMA location;

location:
    primary rest { if ($2 != LOCATION) YYERROR; };

optional_elseif_list:
    /* Empty */
  | optional_elseif_list ELSEIF expression THEN block;

optional_else:
    /* Empty */
  | ELSE block;

for_statement:
    ASSIGN expression COMMA expression optional_comma_expression DO block END
  | COMMA NAME IN expression DO block END;

optional_comma_expression:
    /* Empty */
  | COMMA expression;

function_name:
    NAME dotted_name_list optional_colon_name;

dotted_name_list:
    /* Empty */
  | dotted_name_list PERIOD key;

key:
    NAME | AND | BREAK | DO | END | ELSE | ELSEIF
  | FOR | FUNCTION | GLOBAL | IF | IN | LOCAL | NIL
  | NOT | OR | RETURN | REPEAT | THEN | UNTIL | WHILE; 

optional_colon_name:
    /* Empty */
  | COLON key;

expression_list:
    expression
  | expression_list COMMA expression;

optional_expression_list:
    /* Empty */
  | expression_list;

expression: 
    NIL { $$ = VALUE; }
  | LITERAL { $$ = VALUE; }
  | NUMBER { $$ = VALUE; }
  | primary rest { $$ = $2; } %prec NONARG
  | expression AND expression { $$ = VALUE; }
  | expression OR expression { $$ = VALUE; }
  | expression LT expression { $$ = VALUE; }
  | expression GT expression { $$ = VALUE; }
  | expression LE expression { $$ = VALUE; }
  | expression GE expression { $$ = VALUE; }
  | expression NE expression { $$ = VALUE; }
  | expression EQ expression { $$ = VALUE; }
  | expression DOT_DOT expression { $$ = VALUE; }
  | expression PLUS expression { $$ = VALUE; }
  | expression MINUS expression { $$ = VALUE; }
  | expression ASTERISK expression { $$ = VALUE; }
  | expression SLASH expression { $$ = VALUE; }
  | NOT expression { $$ = VALUE; }
  | MINUS expression { $$ = VALUE; } %prec UMINUS
  | expression CARET expression { $$ = VALUE; };

primary:
    NAME { $$ = LOCATION; }
  | PERCENT NAME { $$ = VALUE; }
  | FUNCTION LPAR parameter_list RPAR block END { $$ = CLOSURE; }
  | table_constructor { $$ = TABLE; }
  | LPAR expression RPAR { $$ = $2; };

rest:
    /* Empty */ { $$ = $0; }
  | rest PERIOD key
    { if ($0 == CLOSURE) YYERROR; $$ = LOCATION; }
  | rest LBRACKET expression RBRACKET 
    { if ($0 == CLOSURE) YYERROR; $$ = LOCATION; }
  | rest COLON key args 
    { if ($0 == CLOSURE) YYERROR; $$ = CALL;}
  | rest args 
    { if ($0 == TABLE) YYERROR; $$ = CALL;};

args:
    LPAR optional_expression_list RPAR
  | LITERAL
  | table_constructor;

table_constructor:
    LBRACE table_contents RBRACE;

table_contents: 
    optional_table_pair_list optional_table_item_list;

optional_table_pair_list:
    /* Empty */
  | table_pair_list optional_comma; 

table_pair_list:
    table_pair
  | table_pair_list COMMA table_pair;

table_pair:
    LBRACKET expression RBRACKET ASSIGN expression
  | key ASSIGN expression;

optional_comma:
    /* Empty */
  | COMMA;

optional_table_item_list:
    /* Empty */
  | SEMICOLON optional_table_item_list2 optional_comma;

optional_table_item_list2:
    /* Empty */
  | optional_table_item_list2 COMMA expression;

%%

static int pushed = 0;

static void ungetch(int ch) 
{
  pushed = ch;
}

static int getch() {
  int ch;
  if (pushed) {
    ch = pushed;
    pushed = 0;
    return ch;
  }
  else {
    ch = getc(stdin);
    if (ch == '\n') {
      line++;
      column = 0;
    }
    else 
      column++;
    return ch;
  }      
}

static void ignore_line()
{
  while (1) {
    int ch = getch();
    if (ch == '\n')
      return;
    if (ch == EOF) {
      ungetch(ch);
      return;
    }
  }
}

static void string(int delim) 
{
  while (1) {
    int ch = getch();
    if (ch == delim)
      return;
    if (ch == EOF) {
      yyerror("End of file in a string");
      exit(1);
    }
    if (ch == '\\') {
      ch = getch();
      if (isdigit(ch)) {
	ch = getch();
	if (isdigit(ch)) {
	  ch = getch();
	  if (!isdigit(ch)) {
	    ungetch(ch);
	  }
	}
	else
	  ungetch(ch);
      }
    }
  }
}

static void big_string()
{
  while (1) {
    int ch = getch();
    if (ch == EOF) {
      yyerror("End of file in a string");
      exit(1);
    }
    if (ch == ']') {
      ch = getch();
      if (ch == EOF) {
	yyerror("End of file in a string");
	exit(1);
      }
      if (ch == ']')
	return;
    }
  }
}

#define MAX_NAME 256
static char name[MAX_NAME];

static int name2token(const char *name) {
  switch (name[0]) {
  case 'a':
    if (strcmp(name, "and") == 0)
      return AND;
    break;
  case 'b':
    if (strcmp(name, "break") == 0)
      return BREAK;
    break;
  case 'd':
    if (strcmp(name, "do") == 0)
      return DO;
    break;
  case 'e':
    if (strcmp(name, "end") == 0)
      return END;
    if (strcmp(name, "else") == 0)
      return ELSE;
    if (strcmp(name, "elseif") == 0)
      return ELSEIF;
    break;
  case 'f':    
    if (strcmp(name, "for") == 0)
      return FOR;
    if (strcmp(name, "function") == 0)
      return FUNCTION;
    break;
  case 'g':
    if (strcmp(name, "global") == 0)
      return GLOBAL;
    break;
  case 'i':
    if (strcmp(name, "if") == 0)
      return IF;
    if (strcmp(name, "in") == 0)
      return IN;
    break;
  case 'l':
    if (strcmp(name, "local") == 0)
      return LOCAL;
    break;
  case 'n':
    if (strcmp(name, "nil") == 0)
      return NIL;
    if (strcmp(name, "not") == 0)
      return NOT;
    break;
  case 'o':
    if (strcmp(name, "or") == 0)
      return OR;
    break;
  case 'r':
    if (strcmp(name, "return") == 0)
      return RETURN;
    if (strcmp(name, "repeat") == 0)
      return REPEAT;
    break;
  case 't':
    if (strcmp(name, "then") == 0)
      return THEN;
    break;
  case 'u':
    if (strcmp(name, "until") == 0)
      return UNTIL;
    break;
  case 'w':
    if (strcmp(name, "while") == 0)
      return WHILE;
    break;
  }
  return NAME;
}

int yylex() 
{
  while (1) {
    int ch = getch();
    if (ch == EOF) {
      fclose(stdin);
      return EOF;
    }
    switch (ch) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
      continue;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'e':
    case 'f':
    case 'g':
    case 'h':
    case 'i':
    case 'j':
    case 'k':
    case 'l':
    case 'm':
    case 'n':
    case 'o':
    case 'p':
    case 'q':
    case 'r':
    case 's':
    case 't':
    case 'u':
    case 'v':
    case 'w':
    case 'x':
    case 'y':
    case 'z':
    case 'A':
    case 'B':
    case 'C':
    case 'D':
    case 'E':
    case 'F':
    case 'G':
    case 'H':
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'O':
    case 'P':
    case 'Q':
    case 'R':
    case 'S':
    case 'T':
    case 'U':
    case 'V':
    case 'W':
    case 'X':
    case 'Y':
    case 'Z':
    case '_':
      {
	int i;
	name[0] = ch;
	for (i = 1; i < MAX_NAME - 1; i++) {
	  ch = getch();
	  if (!isalnum(ch) && ch != '_') {
	    ungetch(ch);
	    name[i] = 0;
	    return name2token(name);
	  }
	  name[i] = ch;
	}
	yyerror("Name too large");
	exit(1);
	return NAME;
      }
    case '-':
      ch = getch();
      if (ch == '-') {
	ignore_line();
	continue;
      }
      ungetch(ch);
      return MINUS;
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      /* Numbers must be integers or decimals for now. */
      while (1) {
	ch = getch();
	if (ch == '.') {
	  while (1) {
	    ch = getch();
	    if (!isdigit(ch)) {
	      ungetch(ch);
	      return NUMBER;
	    }
	  }
	}
	if (!isdigit(ch)) {
	  ungetch(ch);
	  return NUMBER;
	}
      }
    case '+':
      return PLUS;
    case '*':
      return ASTERISK;
    case '/':
      return SLASH;
    case '^':
      return CARET;
    case '%':
      return PERCENT;
    case '~':
      ch = getch();
      if (ch == '=')
	return NE;
      else {
	yyerror("Uncognized chararter on input");
	exit(1);
      }
    case '<':
    case '>':
    case '=':
      {
	int eq = getch();
	if (eq != '=')
	  ungetch(eq);
	switch (ch) {
	case '<':
	  return eq != '=' ? LT : LE;
	case '>':
	  return eq != '=' ? GT : GE;
	case '=':
	  return eq != '=' ? ASSIGN : EQ;
	}
      }
    case '(':
      return LPAR;
    case ')':
      return RPAR;
    case '[':
      ch = getch();
      if (ch == '[') {
	big_string();
	return LITERAL;
      }
      else
	ungetch(ch);
      return LBRACKET;
    case ']':
      return RBRACKET;
    case '{':
      return LBRACE;
    case '}':
      return RBRACE;
    case ';':
      return SEMICOLON;
    case ':':
      return COLON;
    case ',':
      return COMMA;
    case '.':
      ch = getch();
      if (ch != '.') {
	ungetch(ch);
	return PERIOD;
      }
      ch = getch();
      if (ch != '.') {
	ungetch(ch);
	return DOT_DOT;
      }
      else
	return ELIPSES;
    case '\'':
    case '"':
      string(ch);
      return LITERAL;
    default:
      yyerror("Uncognized character on input");
      exit(1);
      return NAME;
    }
    yyerror("Internal error");
    exit(1);
  }
}

static void strip() {
  int ch = getch();
  if (ch != '#') {
    ungetch(ch);
  }
  else {
    ignore_line();
  }
}

int main(int argc, char *argv[]) 
{
  int rc;
  if (argc > 1 && strcmp("-", argv[1])) {
    filename = argv[1];
    if (freopen(filename, "r", stdin) == NULL) {
      fprintf(stderr, "Error opening file %s\n", filename);
      return 1;
    }
  }
  strip();
#if YYDEBUG != 0
  yydebug = 1;
#endif
  rc = yyparse();
  if (rc) 
    fprintf(stderr, "Error during parse\n");
  return rc;
}