Lpeg Recipes |
|
Place examples of Lua code using LPeg for parsing to help further the understanding of how to use "Parsing Expression Grammars".
-- Recipes for locating C-style /* ... */ comments in a string.
-- Relies on `lpeg` already being in scope, exactly as the original snippet did.
local P, C = lpeg.P, lpeg.C
local any = P(1)

-- Comment delimiters, and "everything up to (not including) a delimiter".
local BEGIN_COMMENT = P("/*")
local END_COMMENT = P("*/")
local NOT_BEGIN = (any - BEGIN_COMMENT)^0
local NOT_END = (any - END_COMMENT)^0

-- One complete comment, delimiters included.
local FULL_COMMENT_CONTENTS = BEGIN_COMMENT * NOT_END * END_COMMENT

-- Parser to find comments from a string:
-- skip the text before each comment, capture the comment itself.
local searchParser = (NOT_BEGIN * C(FULL_COMMENT_CONTENTS))^0

-- Parser to find non-comments from a string:
-- capture the text between comments (and the tail), skip the comments.
local filterParser = (C(NOT_BEGIN) * FULL_COMMENT_CONTENTS)^0 * C(NOT_BEGIN)

-- Simpler version, although empirically it is slower.... (why?) ... any
-- optimization suggestions are desired as well as optimum integration w/ C++
-- comments and other syntax elements.
-- At every position: either capture a whole comment or step over one character.
local searchParser = (C(FULL_COMMENT_CONTENTS) + any)^0

-- Suggestion by Roberto to make the search faster.
-- Works because it loops fast over all non-slashes, then it begins the slower
-- match phase.
local NON_SLASHES = (any - P("/"))^0
local searchParser = (NON_SLASHES * (C(FULL_COMMENT_CONTENTS) + any))^0
text.
do
  -- Convert the Roman numeral held in `text` to a number and print it.
  -- Prints "?" when `text` is not a well-formed numeral.
  -- NOTE(review): `text` is not defined inside this snippet; it must be set by
  -- the surrounding code before this block runs.
  --
  -- The snippet originally used lpeg.Ca (no longer provided by LPeg) and
  -- getfenv/setfenv (Lua 5.1 only), and injected one global per symbol.  The
  -- values are now collected with lpeg.Ct and summed in plain Lua, and the
  -- symbol patterns live in a local table.
  local lpeg = require "lpeg"
  local P, Cc, Ct = lpeg.P, lpeg.Cc, lpeg.Ct

  local symbols = {
    I = 1, V = 5, X = 10, L = 50, C = 100, D = 500, M = 1000,
    IV = 4, IX = 9, XL = 40, CD = 400, CM = 900,
  }

  -- sym.<lowercase symbol> matches that symbol and produces its value.
  local sym = {}
  for s, n in pairs(symbols) do
    sym[s:lower()] = P(s) * Cc(n)
  end

  -- thousands, hundreds, tens and units, in that order
  local MS = sym.m^0
  local CS = (sym.d * sym.c^(-4) + sym.cd + sym.cm + sym.c^(-4))^(-1)
  local XS = (sym.l * sym.x^(-4) + sym.xl + sym.x^(-4))^(-1)
  local IS = (sym.v * sym.i^(-4) + sym.ix + sym.iv + sym.i^(-4))^(-1)

  -- `* -1` demands end of input.  Without it every part is optional, the match
  -- can never fail, and trailing garbage silently yields a partial sum.
  local numeral = Ct(MS * CS * XS * IS) * -1

  local values = numeral:match(text:upper())
  local result
  if values then
    result = 0
    for k = 1, #values do
      result = result + values[k]
    end
  end
  print(result or "?")
end
This is a Lua lexer in LPeg. The original author is PeterOdding, and the original link is broken ( http://xolox.ath.cx/lua/lexer.html ).
--[[

= ABOUT
This module uses Roberto Ierusalimschy's powerful new pattern matching library
LPeg[1] to tokenize Lua source-code in to a table of tokens. I think it handles
all of Lua's syntax, but if you find anything missing I would appreciate a mail
to xolox aatt home ddoott nl. This lexer is based on the BNF[2] from the Lua
manual.

= USAGE
I've saved my copy of this module under [$LUA_PATH/lexers/lua.lua] which means
I can use it like in the following interactive prompt:

   Lua 5.1.1  Copyright (C) 1994-2006 Lua.org, PUC-Rio
   > require 'lexers.lua'
   > tokens = lexers.lua [=[
   >> 42 or 0
   >> -- some Lua source-code in a string]=]
   > = tokens
   table: 00422E40
   > lexers.lua.print(tokens)
   line 1, number: `42`
   line 1, whitespace: ` `
   line 1, keyword: `or`
   line 1, whitespace: ` `
   line 1, number: `0`
   line 1, whitespace: `
   `
   line 2, comment: `-- some Lua source-code in a string`
   total of 7 tokens, 2 lines

The returned table [tokens] looks like this:

   {
      -- type      , text, line
      { 'number'    , '42', 1 },
      { 'whitespace', ' ' , 1 },
      { 'keyword'   , 'or', 1 },
      { 'whitespace', ' ' , 1 },
      { 'number'    , '0' , 1 },
      { 'whitespace', '\n', 1 },
      { 'comment'   , '-- some Lua source-code in a string', 2 },
   }

= CREDITS
Written by Peter Odding, 2007/04/04

= THANKS TO
- the Lua authors for a wonderful language;
- Roberto for LPeg;
- caffeine for keeping me awake :)

= LICENSE
Shamelessly ripped from the SQLite[3] project:

   The author disclaims copyright to this source code.  In place of a legal
   notice, here is a blessing:

      May you do good and not evil.
      May you find forgiveness for yourself and forgive others.
      May you share freely, never taking more than you give.

[1] http://www.inf.puc-rio.br/~roberto/lpeg.html
[2] http://lua.org/manual/5.1/manual.html#8
[3] http://sqlite.org

--]]

-- since this module is intended to be loaded with require() we receive the
-- name used to load us in ... and pass it on to module()
-- (module(), getfenv() and package.seeall exist in Lua 5.1 only; they are kept
-- because the documented `lexers.lua(...)` call interface depends on them)
module(..., package.seeall)

-- written for LPeg .5, by the way
local lpeg = require 'lpeg'
local P, R, S, C, Cc, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Ct

-- create a pattern which captures the lua value [id] and the input matching
-- [patt] in a table
local function token(id, patt) return Ct(Cc(id) * C(patt)) end

local digit = R('09')

-- range of valid characters after first character of identifier
local idsafe = R('AZ', 'az', '\127\255') + P '_'

-- operators
local operator = token('operator', P '==' + P '~=' + P '<=' + P '>=' + P '...'
                                 + P '..' + S '+-*/%^#=<>;:,.{}[]()')

-- identifiers; a dotted name such as [string.format] is one token, but a dot
-- is only absorbed when an identifier character follows it, so that the
-- concatenation in [a..b] is not swallowed into the identifier
local ident = token('identifier', idsafe * (idsafe + digit + P '.' * #idsafe) ^ 0)

-- keywords; ordered choice never retries a later alternative once an earlier
-- one has matched, so 'elseif' has to be listed before its prefix 'else'
local keyword = token('keyword', (P 'and' + P 'break' + P 'do' + P 'elseif' +
   P 'else' + P 'end' + P 'false' + P 'for' + P 'function' + P 'if' +
   P 'in' + P 'local' + P 'nil' + P 'not' + P 'or' + P 'repeat' + P 'return' +
   P 'then' + P 'true' + P 'until' + P 'while') * -(idsafe + digit))

-- numbers; the exponent is also accepted without a decimal point (1e5)
local number_sign = S'+-'^-1
local number_exponent = S'eE' * number_sign * digit^1
local number_decimal = digit ^ 1 * number_exponent^-1
local number_hexadecimal = P '0' * S 'xX' * R('09', 'AF', 'af') ^ 1
local number_float = (digit^1 * P'.' * digit^0 + P'.' * digit^1) *
                     number_exponent^-1
local number = token('number', number_hexadecimal +
                               number_float +
                               number_decimal)

-- callback for [=[ long strings ]=]
-- ps. LPeg is for Lua what regex is for Perl, which makes me smile :)
-- the lookahead rejects anything that does not start with an opening long
-- bracket before the (slower) Lua function gets called
local longstring_open = #(P '[[' + (P '[' * P '=' ^ 0 * P '['))
local longstring = longstring_open * P(function(input, index)
   local level = input:match('^%[(=*)%[', index)
   if level then
      -- plain search for the closing bracket of the same level
      local _, stop = input:find(']' .. level .. ']', index, true)
      if stop then return stop + 1 end
   end
end)

-- strings
local singlequoted_string = P "'" * ((1 - S "'\r\n\f\\") + (P '\\' * 1)) ^ 0 * "'"
local doublequoted_string = P '"' * ((1 - S '"\r\n\f\\') + (P '\\' * 1)) ^ 0 * '"'
local string = token('string', singlequoted_string +
                               doublequoted_string +
                               longstring)

-- comments
local singleline_comment = P '--' * (1 - S '\r\n\f') ^ 0
local multiline_comment = P '--' * longstring
local comment = token('comment', multiline_comment + singleline_comment)

-- whitespace (includes vertical tab, which would otherwise become an error token)
local whitespace = token('whitespace', S('\r\n\f\t\v ')^1)

-- ordered choice of all tokens and last-resort error which consumes one character
local any_token = whitespace + number + keyword + ident +
                  string + comment + operator + token('error', 1)

-- private interface
local table_of_tokens = Ct(any_token ^ 0)

-- increment [line] by the number of line-ends in [text]
-- a line-end is CR LF (counted once), or a single CR, LF or FF
local function sync(line, text)
   local index, limit = 1, #text
   while index <= limit do
      -- always take the nearest line-end; searching for CR LF first would
      -- skip over any single-character line-end in front of it
      local start, stop = text:find('[\r\n\f]', index)
      if not start then break end
      if text:sub(start, start + 1) == '\r\n' then stop = start + 1 end
      index = stop + 1
      line = line + 1
   end
   return line
end

-- we only need to synchronize the line-counter for these token types
local multiline_tokens = { comment = true, string = true, whitespace = true }

-- public interface: calling the module tokenizes [input] and returns an array
-- of { type, text, line } tables, where line is the line the token starts on
getmetatable(getfenv(1)).__call = function(self, input)
   assert(type(input) == 'string', 'bad argument #1 (expected string)')
   local line = 1
   local tokens = lpeg.match(table_of_tokens, input)
   -- ipairs, because the line numbers depend on visiting tokens in order
   for _, tok in ipairs(tokens) do
      tok[3] = line
      if multiline_tokens[tok[1]] then line = sync(line, tok[2]) end
   end
   return tokens
end

-- if you really want to try it out before writing any code :P
-- prints one line per token followed by a summary; an empty token list is
-- reported as 0 tokens, 0 lines
function print(tokens)
   local print, format = _G.print, _G.string.format
   for _, tok in ipairs(tokens) do
      print(format('line %i, %s: `%s`', tok[3], tok[1], tok[2]))
   end
   local last = tokens[#tokens]
   print(format('total of %i tokens, %i lines', #tokens, last and last[3] or 0))
end

-- vim: set enc=utf-8 ts=3 sw=3 :
For a Lua 5.1 parser in LPeg, see LuaFish or Leg[1].
This lexes ANSI C. Improvements welcome. --DavidManura
-- Lua LPeg lexer for C.
-- Note:
--   Does not handle C preprocessing macros.
--   Not well tested.
--
-- Usage: lua <this script> <file.c>
-- Prints one line per comment, identifier, keyword, literal and operator,
-- then the result of lpeg.match (the position at which lexing stopped).
--
-- David Manura, 2007, public domain.  Based on ANSI C Lex
--   specification in http://www.quut.com/c/ANSI-C-grammar-l-1998.html
--   (Jutta Degener, 2006; Tom Stockfisch, 1987, Jeff Lee, 1985)

local lpeg = require 'lpeg'

local P, R, S, C = lpeg.P, lpeg.R, lpeg.S, lpeg.C

-- '\r' is included so that files with CRLF line ends do not stop the lexer
local whitespace = S' \t\v\r\n\f'

local digit = R'09'
local letter = R('az', 'AZ') + P'_'
local alphanum = letter + digit
local hex = R('af', 'AF', '09')
local exp = S'eE' * S'+-'^-1 * digit^1
local fs = S'fFlL'
local is = S'uUlL'^0

local hexnum = P'0' * S'xX' * hex^1 * is^-1
local octnum = P'0' * digit^1 * is^-1
local decnum = digit^1 * is^-1
local floatnum =
  digit^1 * exp * fs^-1 +
  digit^0 * P'.' * digit^1 * exp^-1 * fs^-1 +
  digit^1 * P'.' * digit^0 * exp^-1 * fs^-1
-- floatnum is tried before octnum: otherwise a float with a leading zero such
-- as 01.5 would be split into the octal 01 followed by .5
local numlit = hexnum + floatnum + octnum + decnum

local charlit =
  P'L'^-1 * P"'" * (P'\\' * P(1) + (1 - S"\\'"))^1 * P"'"

local stringlit =
  P'L'^-1 * P'"' * (P'\\' * P(1) + (1 - S'\\"'))^0 * P'"'

local ccomment = P'/*' * (1 - P'*/')^0 * P'*/'
local newcomment = P'//' * (1 - P'\n')^0
local comment = (ccomment + newcomment)
              / function(...) print('COMMENT', ...) end

local literal = (numlit + charlit + stringlit)
              / function(...) print('LITERAL', ...) end

-- Ordered choice never retries a later alternative once an earlier one has
-- matched, so "double" has to be listed before its prefix "do"; otherwise the
-- keyword test in `identifier` fails on "double" and it is reported as an ID.
local keyword = C(
  P"auto" +
  P"_Bool" +
  P"break" +
  P"case" +
  P"char" +
  P"_Complex" +
  P"const" +
  P"continue" +
  P"default" +
  P"double" +
  P"do" +
  P"else" +
  P"enum" +
  P"extern" +
  P"float" +
  P"for" +
  P"goto" +
  P"if" +
  P"_Imaginary" +
  P"inline" +
  P"int" +
  P"long" +
  P"register" +
  P"restrict" +
  P"return" +
  P"short" +
  P"signed" +
  P"sizeof" +
  P"static" +
  P"struct" +
  P"switch" +
  P"typedef" +
  P"union" +
  P"unsigned" +
  P"void" +
  P"volatile" +
  P"while"
) / function(...) print('KEYWORD', ...) end

-- any word that is not exactly a keyword
local identifier = (letter * alphanum^0 - keyword * (-alphanum))
                 / function(...) print('ID',...) end

-- Longer operators come before their prefixes; in particular the digraph ":>"
-- has to be tried before ":" or it can never match.
local op = C(
  P"..." +
  P">>=" +
  P"<<=" +
  P"+=" +
  P"-=" +
  P"*=" +
  P"/=" +
  P"%=" +
  P"&=" +
  P"^=" +
  P"|=" +
  P">>" +
  P"<<" +
  P"++" +
  P"--" +
  P"->" +
  P"&&" +
  P"||" +
  P"<=" +
  P">=" +
  P"==" +
  P"!=" +
  P";" +
  P"{" + P"<%" +
  P"}" + P"%>" +
  P"," +
  P":>" +
  P":" +
  P"=" +
  P"(" +
  P")" +
  P"[" + P"<:" +
  P"]" +
  P"." +
  P"&" +
  P"!" +
  P"~" +
  P"-" +
  P"+" +
  P"*" +
  P"/" +
  P"%" +
  P"<" +
  P">" +
  P"^" +
  P"|" +
  P"?"
) / function(...) print('OP', ...) end

local tokens = (comment + identifier + keyword +
                literal + op + whitespace)^0

-- frontend
-- fail with a usage message instead of an opaque io.open argument error
local filename = assert(arg and arg[1], 'usage: lua <this script> <file.c>')
local fh = assert(io.open(filename))
local input = fh:read'*a'
fh:close()
print(lpeg.match(tokens, input))
~~ ThomasHarningJr : Suggestion for optimizing the 'op' matcher in the C lexer above. This should be faster because it uses character sets instead of many individual string comparisons, although I have not measured how much faster.
-- Set-based replacement for the `op` pattern of the C lexer.
-- Expects P, S and C (lpeg.P, lpeg.S, lpeg.C) to be in scope already.
local shiftOps = P">>" + P"<<"
local digraphs = P"<%" + P"%>" + P"<:" + P":>" -- {, }, [, ]

-- a shift or single-character operator followed by "=" (>>=, +=, ==, != ...)
local withEquals = (shiftOps + S("+-*/%&^|<>=!")) * P"="
-- the remaining two-character operators
local twoChar = P"++" + P"--" + P"&&" + P"||" + P"->"
-- everything that is a single character
local oneChar = S(";{},:=()[].&!~-+*/%<>^|?")

local function reportOp(...)
  print('OP', ...)
end

-- First match the multi-char items, then fall back to single characters.
local op = C(P"..." + withEquals + shiftOps + twoChar + digraphs + oneChar) / reportOp
[SciTE Tools] supports LPeg lexers. A number of [examples] are included.