Lexical Analysis

lua-users home
wiki

Description

Some people have suggested that Lua's regular expressions are limited. The beauty of Lua is that it is so easy to add extra functionality. We don't need fancy regular expressions because we can easily add a lexical analyser with Peter Bumbulis' re2c [1].

Here is a lexical scanner that recognizes Lua 5.0's syntax and keywords. The function LexLua takes the string to be scanned and returns a function which does the scanning. The returned function is a closure that is bound to the C function scan and two upvalues: the string to be scanned and a userdata to keep track of the state. Every time it is called, it returns three values: the next token, its type ("<name>", "<number>", "<literal>", or nil for keywords and operators), and the white space that preceded it. When it reaches the end of the string, the token is nil.

C code

re2c replaces the regular expressions in the special comments with code for the scanner. Here is the input [2] and output [3] of re2c.


/*
==============================================================================
  LexLua.c
==============================================================================
*/

#include "lua.h"
#include "lauxlib.h"

/*
 * Token-type tags returned to Lua as the second result of the scanning
 * closure.  They are only used inside this file, so they get internal
 * linkage: exporting symbols called `name` and `number` from a shared
 * library invites clashes with unrelated code.  The pointers themselves
 * are const so they cannot be reassigned by accident.
 */
static const char *const name    = "<name>";
static const char *const number  = "<number>";
static const char *const literal = "<literal>";

/*
 * Interface macros for the code that re2c generates from the special
 * re2c comment blocks in scan().  Per the re2c documentation the generated code
 * refers to these names; here they are mapped onto the local variables
 * `cursor`, `marker` and `limit` declared in scan().
 */
#define YYCTYPE  char
#define YYCURSOR cursor
#define YYMARKER marker
#define YYLIMIT  limit
/* Expands to nothing: no refill step is performed when re2c asks for
   more input. */
#define YYFILL(n)

/*
 * save_state(i,s,c,m,l): store the scanner position into *s as offsets
 * relative to the start of the input.
 *
 *   i  pointer to the first character of the input
 *   s  pointer to the structure receiving the cursor/marker/limit offsets
 *   c  current cursor pointer
 *   m  current marker pointer
 *   l  current limit pointer
 *
 * The body is wrapped in do { ... } while (0) so the macro behaves like a
 * single statement (safe in an unbraced if/else).  The pointer
 * differences are narrowed to int explicitly because the offset fields
 * are ints.  Note that `i` and `s` are evaluated three times each, so
 * pass side-effect-free expressions.
 */
#define save_state(i,s,c,m,l) do {\
  (s)->cursor = (int)((c)-(i)); \
  (s)->marker = (int)((m)-(i)); \
  (s)->limit  = (int)((l)-(i)); \
} while (0)

/*
 * Scanning position kept between calls of the scanning closure.
 * Every field is an offset measured from the start of the input string.
 */
typedef struct Scanner {
  int cursor;  /* offset of the next character to be scanned */
  int marker;  /* offset saved for the generated scanner's marker */
  int limit;   /* offset of the end of the input */
} Scanner;

/*
 * Return the Scanner stored at stack position `index`.
 * Only the Lua type tag is verified (it must be a userdata); the block
 * is then reinterpreted as a Scanner without any further check.
 */
static Scanner *check_Scanner(lua_State *L, int index)
{
  void *ud;

  luaL_check_type(L, index, LUA_TUSERDATA);
  ud = lua_touserdata(L, index);
  return (Scanner*)ud;
}

/*
 * scan: the C function behind the closure returned by LexLua.
 *
 * Upvalue 1 is the string being scanned and upvalue 2 is the Scanner
 * userdata holding the current offsets.  Each call resumes at the saved
 * offsets, recognises one token, stores the new offsets with save_state
 * and returns three values to Lua:
 *
 *   token  the token text, or nil at the end of the input
 *   type   "<name>", "<number>" or "<literal>"; nil for keywords,
 *          operators, any other single character, and at the end
 *   ws     the text between the end of the previous token and the start
 *          of this one (white space, plus any comments that were skipped)
 *
 * Lua errors raised: "unfinished long string", "unfinished long comment",
 * "didn't reach end of input" and "impossible".
 *
 * The special re2c comment blocks are input for re2c, which replaces them
 * with the matching code; they must not be edited as if they were
 * ordinary comments.
 */
static int scan (lua_State *L)
{
  const char *input = luaL_check_string(L, lua_upvalueindex(1));
  Scanner    *state = check_Scanner(L,lua_upvalueindex(2));
  /* Rebuild the working pointers from the offsets saved by the last call. */
  char *cursor = (char*)input + state->cursor;
  char *marker = (char*)input + state->marker;
  char *limit  = (char*)input + state->limit;
  char *white_space, *token;
  const char *ret = 0;  /* token type tag; stays 0 (pushed as nil) unless a rule sets it */
  int nest_count = 0;   /* depth of nested [[ ]] pairs inside a long string/comment */

/*!re2c

  D        = [0-9] ;
  E        = [Ee] [+-]? D+ ;
  L        = [a-zA-Z_] ;

  NUMBER   = ( D+ | D* "." D+ | D+ "." D* ) E? ;

  WS       = [ \t\n\v\f]+ ;
  LF       = [\n] ;
  END      = [\000] ;
  ANY      = [\000-\377] \ END ;

  ESC      = [\\] ;
  SQ       = ['] ;
  DQ       = ["] ;

  STRING1  = SQ ( ANY \ SQ \ ESC | ESC ANY )* SQ ;
  STRING2  = DQ ( ANY \ DQ \ ESC | ESC ANY )* DQ ;

*/

Begin:

  white_space = cursor; /* start of white space */

/* Re-entered after white space or a comment has been skipped:
   white_space keeps its value, only the token start moves forward. */
Space:

  token = cursor;       /* start of token */

/*!re2c

  WS               { goto Space; }
  "--[["           { nest_count=0; goto LongComment; }
  "--" | "#"       { goto Comment; }
  "and"            { goto Return; }
  "break"          { goto Return; }
  "do"             { goto Return; }
  "else"           { goto Return; }
  "elseif"         { goto Return; }
  "end"            { goto Return; }
  "false"          { goto Return; }
  "for"            { goto Return; }
  "function"       { goto Return; }
  "global"         { goto Return; }
  "if"             { goto Return; }
  "in"             { goto Return; }
  "local"          { goto Return; }
  "nil"            { goto Return; }
  "not"            { goto Return; }
  "or"             { goto Return; }
  "repeat"         { goto Return; }
  "return"         { goto Return; }
  "then"           { goto Return; }
  "true"           { goto Return; }
  "until"          { goto Return; }
  "while"          { goto Return; }
  "..."            { goto Return; }
  ".."             { goto Return; }
  "=="             { goto Return; }
  ">="             { goto Return; }
  "<="             { goto Return; }
  "~="             { goto Return; }
  "[["             { nest_count=0; goto LongString; }

  L ( L | D )*     { ret = name;    goto Return; }
  NUMBER           { ret = number;  goto Return; }
  STRING1          { ret = literal; goto Return; }
  STRING2          { ret = literal; goto Return; }

  ANY              { goto Return; }
  END              { goto TheEnd; }

*/

/* Inside a [[ ... ]] long string.  Nested [[ ]] pairs are counted; the
   ]] seen at depth 0 ends the token, which is returned as a literal and
   still starts at the opening [[ recorded in `token`. */
LongString:

/*!re2c

  "[["             { nest_count++; goto LongString; }
  "]]"             { if( nest_count == 0 ) { ret = literal; goto Return; }
                     nest_count--; goto LongString; }

  ANY              { goto LongString; }
  END              { luaL_error(L,"unfinished long string"); }

*/

/* Inside a "--" or "#" comment: consume up to (not including) the next
   line feed, then continue scanning; the comment becomes part of `ws`. */
Comment:

/*!re2c

  ( ANY \ LF )*    { goto Space; }
  END              { goto TheEnd; }

*/

/* Inside a --[[ ... ]] comment; nesting is counted as for long strings.
   The ]] seen at depth 0 ends the comment and scanning continues. */
LongComment:

/*!re2c

  "[["             { nest_count++; goto LongComment; }
  "]]"             { if( nest_count == 0 ) goto Space;
                     nest_count--; goto LongComment; }

  ANY              { goto LongComment; }
  END              { luaL_error(L,"unfinished long comment"); }

*/

  /* Every rule above ends in a goto or in luaL_error (which is not
     expected to return), so control should never arrive here. */
  luaL_error(L,"impossible"); /* die */

TheEnd:

  /* The END rule consumed a NUL byte; step back over it.  A NUL found
     anywhere other than at the limit offset is reported as an error. */
  if( --cursor != limit ) luaL_error(L,"didn't reach end of input"); /* die */
  lua_pushnil(L);
  lua_pushnil(L);
  lua_pushlstring(L, white_space, token - white_space );
  save_state(input,state,cursor,marker,limit);
  return 3; /* nil, nil, ws */

Return:

  lua_pushlstring(L, token, cursor - token );
  if( ret ) lua_pushstring(L, ret );
  else lua_pushnil(L);
  lua_pushlstring(L, white_space, token - white_space );
  save_state(input,state,cursor,marker,limit);
  return 3; /* token, type, ws */
}

/*
 * scanner: implementation of the Lua function LexLua(str).
 *
 * Checks that argument 1 is a string, creates a Scanner userdata
 * positioned at the start of that string, and returns a closure over
 * scan() whose upvalue 1 is the string and upvalue 2 is the userdata.
 *
 * Raises a Lua error if argument 1 is not a string, or if the string is
 * too long for its length to be stored in the Scanner's int offsets.
 */
static int scanner (lua_State *L)
{
  Scanner *s;
  size_t len;   /* luaL_check_lstr stores the length through a size_t*;
                   an int here would be overrun where size_t is wider */
  int limit;

  luaL_check_lstr(L, 1, &len);

  /* Scanner keeps offsets as ints: refuse lengths that do not survive
     the round trip instead of silently truncating them. */
  limit = (int)len;
  if( limit < 0 || (size_t)limit != len ) luaL_error(L,"input too long");

  /* Drop any extra arguments so that the two values captured below are
     exactly the string (index 1) and the new userdata (index 2). */
  lua_settop(L, 1);

  s = (Scanner*)lua_newuserdata(L, sizeof(Scanner));
  s->cursor = 0;
  s->marker = 0;
  s->limit  = limit;
  lua_pushcclosure(L, scan, 2); /* string, userdata */
  return 1;
}

/*
 * Library entry point: registers the C function `scanner` under the
 * global Lua name "LexLua".  Leaves no values for the caller (returns 0).
 */
int openLexLua (lua_State *L)
{
  static const char *const global_name = "LexLua";

  lua_register(L, global_name, scanner);
  return 0;
}


Compiling the Code

This code can be compiled into a Unix shared library as follows:


# Run re2c over the annotated source to produce plain C in lex.c.
re2c -s LexLua.c > lex.c
# Compile lex.c as position-independent code with debug information.
gcc -fPIC -g -c lex.c -o lexlua.o
# Link the object into a shared library with soname liblexlua.so,
# against the Lua libraries found in /usr/local/lib.
gcc -g -shared -Wl,-soname,liblexlua.so -o liblexlua.so.1.0.0 lexlua.o -L/usr/local/lib/ -llua -llualib

# Switch user (presumably to root, to be able to write to /usr/local/lib).
su
# Install the library, add the unversioned symlink, and run ldconfig
# on the directory.
cp liblexlua.so.1.0.0 /usr/local/lib
cd /usr/local/lib
ln -s liblexlua.so.1.0.0 liblexlua.so
ldconfig -v /usr/local/lib

Lua Test Code

$ lua
Lua 5.0 (alpha)  Copyright (C) 1994-2002 Tecgraf, PUC-Rio
> assert(loadlib('/usr/local/lib/liblexlua.so','openLexLua'))()
> for tok, tt in LexLua[[ for i = 1,10 do print(i*2) end ]] do print(tok,tt) end
for     nil
i       <name>
=       nil
1       <number>
,       nil
10      <number>
do      nil
print   <name>
(       nil
i       <name>
*       nil
2       <number>
)       nil
end     nil
> 

For an example of how to add some colour to your Lua code, see [4] or LuaToHtml


RecentChanges · preferences
edit · history
Last edited August 6, 2008 9:41 am GMT (diff)