Lpeg Recipes

lua-users home
wiki

Lua Recipes for [LPeg] 'a New Pattern-Matching Library for Lua' by RobertoIerusalimschy

Place examples of Lua code using LPeg for parsing to help further the understanding of how to use "Parsing Expression Grammars".

C Comment Parser

-- Delimiters of a C block comment
local BEGIN_COMMENT = lpeg.P "/*"
local END_COMMENT = lpeg.P "*/"
-- Longest run of characters that contains no comment opener / no comment closer
local NOT_BEGIN = (lpeg.P(1) - BEGIN_COMMENT)^0
local NOT_END = (lpeg.P(1) - END_COMMENT)^0
-- One complete comment, delimiters included
local FULL_COMMENT_CONTENTS = BEGIN_COMMENT * NOT_END * END_COMMENT

-- Parser to find comments from a string:
-- skip everything up to the next opener, then capture the whole comment
local skipThenCapture = NOT_BEGIN * lpeg.C(FULL_COMMENT_CONTENTS)
local searchParser = skipThenCapture^0
-- Parser to find non-comments from a string:
-- capture the text in front of every comment, then the text after the last one
local textThenComment = lpeg.C(NOT_BEGIN) * FULL_COMMENT_CONTENTS
local filterParser = textThenComment^0 * lpeg.C(NOT_BEGIN)

-- Simpler version: at every position either capture a comment or skip one
-- character. Empirically it is slower.... (why?) ... any optimization
-- suggestions are desired as well as optimum integration w/ C++ comments and
-- other syntax elements
local commentOrAnyChar = lpeg.C(FULL_COMMENT_CONTENTS) + 1
local searchParser = commentOrAnyChar^0
-- Suggestion by Roberto to make the search faster
-- Works because it loops fast over all non-slashes, then it begins the slower
-- match phase
local nonSlashes = (lpeg.P(1) - lpeg.P "/")^0
local searchParser = (nonSlashes * commentOrAnyChar)^0

Evaluate Standard Roman Numerals

The numeral is given in the variable text.
do
-- Evaluate the Roman numeral held in the variable `text` (defined outside this
-- block) and print its value, or "?" when `text` is not a numeral.
--
-- NOTE(review): lpeg.Ca is the accumulator capture of early LPeg releases;
-- confirm it exists in the installed LPeg version.
local P, Ca, Cc = lpeg.P, lpeg.Ca, lpeg.Cc
-- Called by the accumulator capture with the running total and a symbol value
local add = function (x, y) return x + y end
local symbols = { I=1,V=5,X=10,L=50,C=100,D=500,M=1000,
   IV=4,IX=9,XL=40,CD=400,CM=900}
-- One pattern per symbol, keyed by the lower-case symbol name. They live in a
-- local table so that no globals named i, v, x, ... are created.
local sym = {}
for s, n in pairs(symbols) do sym[s:lower()] = P(s) * Cc(n) / add end
-- Thousands, hundreds, tens and units; each group may be absent
local MS = sym.m^0
local CS = (sym.d * sym.c^(-4) + sym.cd + sym.cm + sym.c^(-4))^(-1)
local XS = (sym.l * sym.x^(-4) + sym.xl + sym.x^(-4))^(-1)
local IS = (sym.v * sym.i^(-4) + sym.ix + sym.iv + sym.i^(-4))^(-1)
-- P(-1) only matches at the end of the subject. Without it every group being
-- optional makes the pattern succeed on any input, so "?" was never printed.
local p = Ca(Cc(0) * MS * CS * XS * IS * P(-1))
-- An empty string would match with value 0, so reject it explicitly
local result = text ~= "" and p:match(text:upper())
print(result or "?")
end

Lua Lexer

This is a Lua lexer in LPeg. The original author is PeterOdding, and the original link is broken ( http://xolox.ath.cx/lua/lexer.html ).

--[[

= ABOUT
This module uses Roberto Ierusalimschy's powerful new pattern matching library
LPeg[1] to tokenize Lua source-code into a table of tokens. I think it handles
all of Lua's syntax, but if you find anything missing I would appreciate a note at
xolox aatt home ddoott nl. This lexer is based on the BNF[2] from the Lua manual.

= USAGE
I've saved my copy of this module under [$LUA_PATH/lexers/lua.lua] which means
I can use it like in the following interactive prompt:

   Lua 5.1.1  Copyright (C) 1994-2006 Lua.org, PUC-Rio
   > require 'lexers.lua'
   > tokens = lexers.lua [=[
   >> 42 or 0
   >> -- some Lua source-code in a string]=]
   > = tokens
   table: 00422E40
   > lexers.lua.print(tokens)
   line 1, number: `42`
   line 1, whitespace: ` `
   line 1, keyword: `or`
   line 1, whitespace: ` `
   line 1, number: `0`
   line 1, whitespace: `
   `
   line 2, comment: `-- some Lua source-code in a string`
   total of 7 tokens, 2 lines

The returned table [tokens] looks like this:

{
   -- type       , text, line
   { 'number'    , '42', 1 },
   { 'whitespace', ' ' , 1 },
   { 'keyword'   , 'or', 1 },
   { 'whitespace', ' ' , 1 },
   { 'number'    , '0' , 1 },
   { 'whitespace', '\n', 1 },
   { 'comment'   , '-- some Lua source-code in a string', 2 },
}

= CREDITS
Written by Peter Odding, 2007/04/04

= THANKS TO
- the Lua authors for a wonderful language;
- Roberto for LPeg;
- caffeine for keeping me awake :)

= LICENSE
Shamelessly ripped from the SQLite[3] project:

   The author disclaims copyright to this source code.  In place of a legal
   notice, here is a blessing:

      May you do good and not evil.
      May you find forgiveness for yourself and forgive others.
      May you share freely, never taking more than you give.

[1] http://www.inf.puc-rio.br/~roberto/lpeg.html
[2] http://lua.org/manual/5.1/manual.html#8
[3] http://sqlite.org

--]]

-- since this module is intended to be loaded with require() we receive the
-- name used to load us in ... and pass it on to module()
-- NOTE(review): module() and package.seeall are Lua 5.1 facilities that later
-- Lua versions deprecate; the getfenv() call further down also needs Lua 5.1.
module(..., package.seeall)

-- written for LPeg .5, by the way
local lpeg = require 'lpeg'
-- short aliases for the LPeg constructors used by the patterns below
local P, R, S, C, Cc, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Ct

-- Wrap [patt] so that a successful match produces the table { id, text }:
-- Cc supplies the constant [id], C captures the input consumed by [patt] and
-- Ct collects both values into one table.
local function token(id, patt)
   local tag = Cc(id)
   local text = C(patt)
   return Ct(tag * text)
end

-- a single decimal digit
local digit = R('09')

-- range of valid characters after first character of identifier
-- (ASCII letters, bytes 127-255 and the underscore)
local idsafe = R('AZ', 'az', '\127\255') + P '_'

-- operators: multi-character operators are listed before the single-character
-- set because ordered choice takes the first alternative that matches
local operator = token('operator', P '==' + P '~=' + P '<=' + P '>=' + P '...'
                                          + P '..' + S '+-*/%^#=<>;:,.{}[]()')
-- identifiers: dotted names such as `a.b.c` are deliberately kept together as
-- one token. A dot is only accepted when an identifier character follows it;
-- otherwise `a..b` was swallowed whole as a single identifier and the `..`
-- operator was never reported.
local ident = token('identifier',
                    idsafe * (idsafe + digit + P '.' * #idsafe) ^ 0)

-- keywords: a reserved word that is not directly followed by another
-- identifier character (so `dot` or `end1` stay identifiers).
-- 'elseif' must be listed before 'else': ordered choice commits to the first
-- alternative that matches, so with 'else' first the boundary check failed on
-- the following 'i' and `elseif` ended up being lexed as an identifier.
local keyword = token('keyword', (P 'and' + P 'break' + P 'do' + P 'elseif' +
   P 'else' + P 'end' + P 'false' + P 'for' + P 'function' + P 'if' +
   P 'in' + P 'local' + P 'nil' + P 'not' + P 'or' + P 'repeat' + P 'return' +
   P 'then' + P 'true' + P 'until' + P 'while') * -(idsafe + digit))

-- numbers
-- optional sign, only used inside an exponent
local number_sign = S'+-'^-1
-- exponent part such as `e10` or `E-3`
local number_exponent = S'eE' * number_sign * digit^1
-- integer digits with an optional exponent; previously an exponent was only
-- accepted after a decimal point, so `1e5` was lexed as number `1` followed
-- by identifier `e5`
local number_decimal = digit ^ 1 * number_exponent ^ -1
local number_hexadecimal = P '0' * S 'xX' * R('09', 'AF', 'af') ^ 1
-- digits with a decimal point on either side, optionally with an exponent
local number_float = (digit^1 * P'.' * digit^0 + P'.' * digit^1) *
                     number_exponent^-1
-- hexadecimal and float are tried first so that number_decimal cannot stop
-- early at the `x` or the `.`
local number = token('number', number_hexadecimal +
                               number_float +
                               number_decimal)

-- [=[ long strings ]=]
-- ps. LPeg is for Lua what regex is for Perl, which makes me smile :)

-- lookahead for an opening long bracket: `[`, any number of `=`, `[`
-- (zero `=` signs covers the plain `[[` form)
local longstring_open = #(P '[' * P '=' ^ 0 * P '[')

-- Matching function handed to P(): read the bracket level at [index], search
-- for the closing bracket of the same level and return the position just
-- after it. Returns nothing (the match fails) when no closing bracket exists.
local function longstring_end(input, index)
   local level = input:match('^%[(=*)%[', index)
   if not level then return end
   local closer = ']' .. level .. ']'
   local _, stop = input:find(closer, index, true)
   if not stop then return end
   return stop + 1
end

local longstring = longstring_open * P(longstring_end)

-- strings
-- Build the pattern for a string delimited by [quote]: the body is any mix of
-- backslash escapes (backslash plus one character) and characters that are
-- neither the quote, a line-end, a form feed nor a backslash.
local function quoted_string(quote)
   local plain = 1 - S(quote .. '\r\n\f\\')
   local escaped = P '\\' * 1
   return P(quote) * (plain + escaped) ^ 0 * quote
end

local singlequoted_string = quoted_string "'"
local doublequoted_string = quoted_string '"'
-- note: this local shadows the standard `string` library for the rest of the
-- chunk, which is why the code further down goes through `_G.string`
local string = token('string', singlequoted_string +
                               doublequoted_string +
                               longstring)

-- comments: both kinds start with two dashes; the long-bracket form is tried
-- first and anything else runs to the end of the line
local comment_start = P '--'
local multiline_comment = comment_start * longstring
local singleline_comment = comment_start * (1 - S '\r\n\f') ^ 0
local comment = token('comment', multiline_comment + singleline_comment)

-- whitespace: one or more blanks, tabs, line-ends or form feeds
local whitespace = token('whitespace', S('\r\n\f\t ') ^ 1)

-- last-resort token which consumes a single character
local fallback = token('error', 1)

-- ordered choice of all tokens; the first alternative that matches wins
local any_token = whitespace + number + keyword + ident +
                  string + comment + operator + fallback

-- private interface: the whole input as one table of tokens
local table_of_tokens = Ct(any_token ^ 0)

-- increment [line] by the number of line-ends in [text]
-- A line-end is "\r\n" (counted once), or a single "\r", "\n" or "\f".
-- Returns the updated line number.
local function sync(line, text)
   local index, limit = 1, #text
   while index <= limit do
      -- Always locate the NEXT line-end character. Searching for "\r\n"
      -- anywhere ahead first would jump over earlier single line-ends.
      local start = text:find('[\r\n\f]', index)
      if not start then break end
      if text:sub(start, start + 1) == '\r\n' then
         index = start + 2   -- CR LF is one line-end
      else
         index = start + 1
      end
      line = line + 1
   end
   return line
end

-- we only need to synchronize the line-counter for these token types
local multiline_tokens = { comment = true, string = true, whitespace = true }

-- public interface: calling the module table with a string of Lua source
-- returns a table of tokens, each of the form { type, text, line }.
-- Raises an error when [input] is not a string.
getmetatable(getfenv(1)).__call = function(self, input)
   assert(type(input) == 'string', 'bad argument #1 (expected string)')
   local line = 1
   local tokens = lpeg.match(table_of_tokens, input)
   -- The line counter is only right when tokens are visited in source order,
   -- which ipairs guarantees and pairs does not.
   for _, tok in ipairs(tokens) do
      tok[3] = line
      if multiline_tokens[tok[1]] then line = sync(line, tok[2]) end
   end
   return tokens
end

-- if you really want to try it out before writing any code :P
-- Print one line per token of [tokens] (a table of { type, text, line }
-- entries) followed by a summary line. The line count in the summary is the
-- line of the last token, or 0 when there are no tokens.
function print(tokens)
   local print, format = _G.print, _G.string.format
   -- ipairs keeps the output in source order; pairs gives no such guarantee
   for _, tok in ipairs(tokens) do
      print(format('line %i, %s: `%s`', tok[3], tok[1], tok[2]))
   end
   -- an empty token table has no last token to read the line number from
   local last = tokens[#tokens]
   print(format('total of %i tokens, %i lines', #tokens, last and last[3] or 0))
end

-- vim: set enc=utf-8 ts=3 sw=3 :

Lua Parser

For a Lua 5.1 parser in LPeg, see LuaFish or Leg[1].

C Lexer

This lexes ANSI C. Improvements welcome. --DavidManura

-- Lua LPeg lexer for C.
-- Note:
--   Does not handle C preprocessing macros.
--   Not well tested.
-- 
-- David Manura, 2007, public domain.  Based on ANSI C Lex
--   specification in http://www.quut.com/c/ANSI-C-grammar-l-1998.html
--   (Jutta Degener, 2006; Tom Stockfisch, 1987, Jeff Lee, 1985)

local lpeg = require 'lpeg'

local P, R, S, C =
  lpeg.P, lpeg.R, lpeg.S, lpeg.C

-- One whitespace character. '\r' is included so that input with CR LF
-- line-ends does not stop the lexer at the first carriage return.
local whitespace = S' \t\v\r\n\f'

-- character classes and literal suffixes
local digit = R'09'
local letter = R('az', 'AZ') + P'_'
local alphanum = letter + digit
local hex = R('af', 'AF', '09')
local exp = S'eE' * S'+-'^-1 * digit^1   -- exponent part, e.g. e+10
local fs = S'fFlL'                       -- floating-point suffix
local is = S'uUlL'^0                     -- integer suffix letters

-- integer literals
local hexnum = P'0' * S'xX' * hex^1 * is^-1
local octnum = P'0' * digit^1 * is^-1
local decnum = digit^1 * is^-1
-- floating-point literals: digits with an exponent, or digits with a decimal
-- point on either side and an optional exponent
local floatnum = digit^1 * exp * fs^-1 +
                 digit^0 * P'.' * digit^1 * exp^-1 * fs^-1 +
                 digit^1 * P'.' * digit^0 * exp^-1 * fs^-1
-- floatnum is tried before octnum and decnum: ordered choice takes the first
-- alternative that matches, so with octnum first a literal such as `012.5`
-- was split into `012` and `.5`
local numlit = hexnum + floatnum + octnum + decnum

-- Build a quoted-literal pattern: optional L prefix, the quote character, at
-- least [min] body items (a backslash escape or any character other than a
-- backslash or the quote) and the closing quote.
local function quoted(quote, min)
  local escape = P'\\' * P(1)
  local plain = 1 - S('\\' .. quote)
  return P'L'^-1 * P(quote) * (escape + plain)^min * P(quote)
end

-- Build a function that prints [tag] followed by whatever it receives.
local function report(tag)
  return function(...) print(tag, ...) end
end

-- character literals need at least one body item, string literals may be empty
local charlit = quoted("'", 1)
local stringlit = quoted('"', 0)

-- block comments and comments that run to the end of the line
local ccomment = P'/*' * (1 - P'*/')^0 * P'*/'
local newcomment = P'//' * (1 - P'\n')^0
local comment = (ccomment + newcomment) / report('COMMENT')

local literal = (numlit + charlit + stringlit) / report('LITERAL')

-- C keywords; each match is captured and printed with the tag KEYWORD.
-- "double" must be listed before "do": ordered choice commits to the first
-- alternative that matches, so with "do" first the check
-- `keyword * (-alphanum)` used by the identifier pattern failed on the
-- following "u" and `double` was reported as an ID.
local keyword = C(
  P"auto" +
  P"_Bool" +
  P"break" +
  P"case" +
  P"char" +
  P"_Complex" +
  P"const" +
  P"continue" +
  P"default" +
  P"double" +
  P"do" +
  P"else" +
  P"enum" +
  P"extern" +
  P"float" +
  P"for" +
  P"goto" +
  P"if" +
  P"_Imaginary" +
  P"inline" +
  P"int" +
  P"long" +
  P"register" +
  P"restrict" +
  P"return" +
  P"short" +
  P"signed" +
  P"sizeof" +
  P"static" +
  P"struct" +
  P"switch" +
  P"typedef" +
  P"union" +
  P"unsigned" +
  P"void" +
  P"volatile" +
  P"while"
) / function(...) print('KEYWORD', ...) end

-- An identifier is a letter followed by letters and digits, provided the text
-- is not a keyword standing on its own (a keyword not followed by another
-- alphanumeric character). Each match is printed with the tag ID.
local whole_keyword = keyword * (-alphanum)
local name = letter * alphanum^0
local identifier = (-whole_keyword * name)
                 / function(...) print('ID', ...) end

-- C operators and punctuators; each match is captured and printed with the
-- tag OP. Ordered choice takes the first alternative that matches, so longer
-- operators must come before any shorter operator that is a prefix of them.
-- The digraphs are therefore listed with the other two-character operators:
-- with ":" ahead of ":>" the digraph ":>" could never match.
local op = C(
  -- three characters
  P"..." +
  P">>=" +
  P"<<=" +
  -- two characters
  P"+=" +
  P"-=" +
  P"*=" +
  P"/=" +
  P"%=" +
  P"&=" +
  P"^=" +
  P"|=" +
  P">>" +
  P"<<" +
  P"++" +
  P"--" +
  P"->" +
  P"&&" +
  P"||" +
  P"<=" +
  P">=" +
  P"==" +
  P"!=" +
  -- digraphs for { } [ ]
  P"<%" +
  P"%>" +
  P"<:" +
  P":>" +
  -- one character
  P";" +
  P"{" +
  P"}" +
  P"," +
  P":" +
  P"=" +
  P"(" +
  P")" +
  P"[" +
  P"]" +
  P"." +
  P"&" +
  P"!" +
  P"~" +
  P"-" +
  P"+" +
  P"*" +
  P"/" +
  P"%" +
  P"<" +
  P">" +
  P"^" +
  P"|" +
  P"?"
) / function(...) print('OP', ...) end

-- one token of any kind; the alternatives are tried in this order
local any_token = comment + identifier + keyword + literal + op + whitespace
-- the whole input is zero or more tokens
local tokens = any_token^0

-- frontend: lex the file named by the first command-line argument; the
-- token patterns print what they find and the result of the match is
-- printed last
local filename = arg[1]
local source_file = assert(io.open(filename))
local input = source_file:read('*a')
source_file:close()
print(lpeg.match(tokens, input))

~~ ThomasHarningJr : Suggestion for optimizing the 'op' matcher in the C lexer above. This should be faster because it uses character sets instead of many individual string comparisons, although I have not measured how much faster.

-- the two shift operators
local shiftOps = P">>" + P"<<"
-- digraph spellings of {, }, [, ]
local digraphs = P"<%" + P"%>" + P"<:" + P":>"
-- an operator followed by "=": compound assignments and comparisons
local withEquals = (shiftOps + S("+-*/%&^|<>=!")) * P"="
-- every single-character operator and punctuator as one set
local singleChar = S(";{},:=()[].&!~-+*/%<>^|?")
-- Multi-character operators are tried first, the single-character set last;
-- each match is captured and printed with the tag OP.
local op = C(
  P"..." +
  withEquals +
  shiftOps +
  P"++" +
  P"--" +
  P"&&" +
  P"||" +
  P"->" +
  digraphs +
  singleChar
) / function(...) print('OP', ...) end

SciTE Lexers

[SciTE Tools] supports LPeg lexers. A number of [examples] are included.

Other Projects that use LPeg


FindPage · RecentChanges · preferences
edit · history
Last edited March 29, 2008 4:00 am GMT (diff)