Ex Pattern |
|
Sometimes, however, Lua patterns are almost sufficient, and we only want to extend their functionality a bit further without compiling in any additional C code. That's where this module comes in. This module builds extended pattern matching on top of Lua patterns. It takes a pattern expression written in a form similar to LPeg and translates it into a string of Lua code containing string.match calls, which is then compiled into a Lua function via loadstring. This is similar to the code you might write by hand, except that this module automates it through code generation. It should have similar efficiency, assuming you precompile any match objects that are used repeatedly.
Here is a simple example:
local M = require "xpattern" local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)' ):compile() local a,b,c = m('mmcccZzYybbZzYyddd') assert(a == 'bb' and b == nil and c == 'd')
which internally generates this Lua function:
local match = ...; -- string.match return function(s,pos1) local posa,c1,c2,c3; local pos2=pos1; local pos3=pos2; local pos4; posa,c1,pos4 = match(s, "()(b+)()", pos3); if not pos4 then c1 = nil posa,c2,pos4 = match(s, "()(c+)()", pos3); if not pos4 then c1,c2 = nil end end pos3=pos4; if pos3 then local pos4=pos3; while 1 do pos4 = match(s, "^[A-Z][a-z]()", pos4); if pos4 then pos3=pos4 else break end end end pos2=pos3; if pos2 then c3,pos2 = match(s, "^(.)()", pos2); end pos1=pos2; if pos1 then return c1,c2,c3 end end
See the test suite below for additional examples.
Warning: this is not fully developed or tested. Fix up the code if you want to use it in production.
-- xpattern.lua
-- Preliminary regular expression-like support in Lua
-- implemented in terms of Lua patterns with code generation.
--
-- It translates an expression into a snippet of Lua code
-- having a series of string.match calls, which is then
-- compiled (via loadstring, or load where loadstring is absent).
--
-- WARNING: development not fully complete. not much tested.
--
-- (c) 2008 David Manura. Licensed under the same terms as Lua (MIT license).
-- Please post patches.

local M = {}

local string = string
local format = string.format
local match = string.match
local assert = assert
local error = error
local ipairs = ipairs
-- loadstring does not exist in Lua 5.2+; load accepts a source string there.
local loadstring = loadstring or load
local setmetatable = setmetatable
local type = type
local print = print

-- Count the number of captures '()' in Lua pattern string pat.
-- Parentheses that are escaped ('%('), inside a '[set]', or used as the
-- delimiters of '%bxy' are not counted.
-- Raises error 'syntax' (reported at the caller's level) on a dangling
-- '%' or an unterminated '[set]'.
--   n = count_captures(pat)
local function count_captures(pat)
  local count = 0
  local pos = 1
  while pos <= #pat do
    -- run of characters that are none of '(', '%', '['
    local pos2 = pat:match('^[^%(%%%[]+()', pos)
    if pos2 then
      pos = pos2
    elseif pat:match('^%(', pos) then
      count = count + 1
      pos = pos + 1
    elseif pat:match('^%%b..', pos) then
      -- '%bxy' is four characters long; skip both delimiters so that
      -- neither is mistaken for a capture or the start of a set.
      pos = pos + 4
    elseif pat:match('^%%.', pos) then
      pos = pos + 2
    else
      -- must be a '[set]': skip up to the first ']' or '%'
      local pos2 = pat:match('^%[[^%]%%]*()', pos)
      if pos2 then
        pos = pos2
        while 1 do
          -- step over each '%x' escape and the plain text following it
          local pos2 = pat:match('^%%.[^%]%%]*()', pos)
          if pos2 then
            pos = pos2
          elseif pat:match('^%]', pos) then
            pos = pos + 1
            break
          else
            error('syntax', 2)
          end
        end
      else
        error('syntax', 2)
      end
    end
  end
  return count
end
M._count_captures = count_captures


-- Append '()' to Lua pattern string.
-- If the pattern ends in the '$' anchor, the '()' goes before the anchor.
-- A trailing '$' preceded by an odd number of '%' is an escaped literal
-- dollar sign, not an anchor, so '()' is appended after it.
--   pat = pat_append_pos(pat)
local function pat_append_pos(pat)
  local prefix, escapes = pat:match('^(.-)(%%*)%$$')
  if prefix and #escapes % 2 == 0 then
    return prefix .. escapes .. '()$'
  end
  return pat .. '()'
end

-- Prepend '()' to Lua pattern string.
-- If the pattern starts with the '^' anchor, the '()' goes after it.
--   pat = pat_prepend_pos(pat)
local function pat_prepend_pos(pat)
  local postfix = pat:match'^%^(.*)'
  pat = postfix and '^()' .. postfix or '()' .. pat
  return pat
end

-- Prepend '^' to Lua pattern string (unless it already starts with '^').
--   pat = pat_prepend_carrot(pat)
local function pat_prepend_carrot(pat)
  local postfix = pat:match'^%^(.*)'
  pat = postfix and pat or '^' .. pat
  return pat
end


-- Return string listing capture variables with indices
-- firstidx to lastidx, e.g. code_vars(2, 4) == 'c2,c3,c4'.
-- Returns '' when lastidx < firstidx.
--   code = code_vars(firstidx, lastidx)
local function code_vars(firstidx, lastidx)
  local code = ''
  for i=firstidx,lastidx do
    code = code .. (i == firstidx and '' or ',') .. 'c' .. i
  end
  return code
end


-- metatable for expression objects
local epat_mt = {}
epat_mt.__index = epat_mt


-- Builds an expression object from Lua string pattern pat.
--   epat = pattern(pat)
local function pattern(pat)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0, anchor)
    local ncaptures = count_captures(pat)
    local lvars =
      code_vars(totncaptures0+1, totncaptures0+ncaptures)
      .. (ncaptures == 0 and '' or ',') .. 'pos' .. destidx0
    -- Decorate a local copy. This closure runs once per use of the object
    -- in an expression and once per code_of/compile, so the stored pattern
    -- must never be modified.
    local cpat = pat_append_pos(pat)
    if anchor then
      cpat = pat_prepend_carrot(cpat)
    else
      -- unanchored: also record where the match started, in 'posa'
      cpat = pat_prepend_pos(cpat)
      lvars = 'posa,' .. lvars
    end
    local str = format('%q', cpat)
    local code = lvars .. ' = match(s, ' .. str .. ', pos' .. srcidx0 .. '); '
    return code, ncaptures
  end
  return epat
end
M.P = pattern


-- Generates code from pattern (either Lua pattern string or extended
-- pattern object).
--  code, ncaptures = gen(anypat, srcidx0, destidx0, totncaptures0, anchor)
--    anypat  - either Lua pattern string or extended pattern object
--    srcidx0 - index of variable holding position to start matching at
--    destidx0 - index of variable holding position to store subsequent
--               match position at.  stores nil if no match
--    totncaptures0 - number of captures prior to this match
--    anchor - boolean - whether to anchor match at starting position.
--
--    code - Lua code string
--    ncaptures - number of captures in pattern.
local function gen(anypat, srcidx0, destidx0, totncaptures0, anchor)
  if type(anypat) == 'string' then
    if anchor then anypat = pat_prepend_carrot(anypat) end
    anypat = pattern(anypat)
  end
  local code, ncaptures = anypat(srcidx0, destidx0, totncaptures0, anchor)
  return code, ncaptures
end


-- Creates new pattern object that is the concatenation of the
-- given list of pattern objects.
-- Every element after the first is matched anchored at the position
-- where the previous element stopped.
--   epat2 = seq(epat...)
local function seq(...)
  local epats = {...}
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0, anchor)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = ' local pos' .. destidx .. '=pos' .. srcidx0 .. '; '
    local code_end = ''
    for i,pat in ipairs(epats) do
      local pat_code, pat_ncaptures =
        gen(pat, destidx, destidx, totncaptures0+ncaptures, anchor or i~=1)
      ncaptures = ncaptures + pat_ncaptures
      code = code .. pat_code
      if i ~= #epats then
        -- only try the next element if this one matched
        code = code .. ' if pos' .. destidx .. ' then '
        code_end = ' end ' .. code_end
      end
    end
    code = code .. code_end ..
           ' pos' .. destidx0 .. '=pos' .. destidx .. '; '
    return code, ncaptures
  end
  return epat
end
M.seq = seq


-- Creates new pattern object that is the alternation of the
-- given list of pattern objects.
-- Alternatives are tried in order from the same start position; when one
-- fails, all capture variables assigned so far are reset to nil before
-- the next is tried.
--   epat2 = alt(epat...)
local function alt(...)
  local epats = {...}
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0, anchor)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = ' local pos' .. destidx .. '; '
    local code_end = ''
    for i,pat in ipairs(epats) do
      local pat_code, pat_ncaptures =
        gen(pat, srcidx0, destidx, totncaptures0+ncaptures, anchor)
      ncaptures = ncaptures + pat_ncaptures
      code = code .. pat_code
      local cvars = code_vars(totncaptures0 + 1, totncaptures0+ncaptures)
      code = code .. ' if not pos' .. destidx .. ' then '
      if cvars ~= '' then
        code = code .. ' ' .. cvars .. ' = nil '
      end
      code_end = code_end .. ' end '
    end
    code = code .. code_end ..
           ' pos' .. destidx0 .. '=pos' .. destidx .. '; '
    return code, ncaptures
  end
  return epat
end
M.alt = alt


-- Creates new pattern object that is zero or more repetitions of the
-- given pattern object.
-- NOTE(review): the generated 'while 1 do' loop only exits when the
-- sub-pattern fails, so a sub-pattern that can match the empty string
-- (e.g. 'a*') makes the compiled matcher loop forever.
--   e2 = zero_or_more(e)
local function zero_or_more(pat)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0, anchor)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = ' local pos' .. destidx .. '=pos' .. srcidx0 .. '; '
    if not anchor then
      code = code ..
           ' posa=pos' .. srcidx0 .. '; '
    end
    -- repetitions are always matched anchored at the current position
    local pat_code, pat_ncaptures =
      gen(pat, destidx, destidx, totncaptures0+ncaptures, true)
    ncaptures = ncaptures + pat_ncaptures
    code = code ..
      ' while 1 do ' ..
           pat_code ..
      '    if pos' .. destidx .. ' then ' ..
      '      pos' .. destidx0 .. '=pos' .. destidx ..
      '    else break end ' ..
      ' end '
    return code, ncaptures
  end
  return epat
end
M.zero_or_more = zero_or_more


-- Creates new pattern object that is zero or one of the
-- given pattern object.
--   epat2 = zero_or_one(epat)
local function zero_or_one(epat)
  local epat2 = setmetatable({}, epat_mt)
  epat2.call = function(srcidx0, destidx0, totncaptures0, anchor)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = ' local pos' .. destidx .. '=pos' .. srcidx0 .. '; '
    if not anchor then
      code = code ..
           ' posa=pos' .. srcidx0 .. '; '
    end
    -- the optional element is always matched anchored at the current position
    local epat_code, epat_ncaptures =
      gen(epat, destidx, destidx, totncaptures0+ncaptures, true)
    ncaptures = ncaptures + epat_ncaptures
    code = code ..
           epat_code ..
      '    if pos' .. destidx .. ' then ' ..
      '      pos' .. destidx0 .. '=pos' .. destidx ..
      '    end '
    return code, ncaptures
  end
  return epat2
end
M.zero_or_one = zero_or_one


-- Returns Lua core code string corresponding to pattern object.
-- The code expects 's', 'pos1' and 'match' to be in scope.  On success it
-- returns the captures, or the matched substring if the expression has
-- no captures.
--   code = basic_code_of(epat)
local function basic_code_of(epat)
  local code, ncaptures = epat(1, 1, 0, false)
  local lvars = code_vars(1, ncaptures)
  if lvars == '' then
    code =
      ' local posa; ' ..
        code ..
      ' if pos1 then return s:sub(posa or 1,pos1-1) end '
  else
    code =
      ' local posa,' .. lvars .. '; ' ..
        code ..
      ' if pos1 then return ' .. lvars .. ' end '
  end
  return code
end
M.basic_code_of = basic_code_of


-- Returns Lua complete code string corresponding to pattern object.
-- The resulting chunk takes string.match as its argument and returns
-- the matcher function(s, pos1).
--   code = code_of(epat)
local function code_of(epat)
  local code =
    ' local match = ...; return function(s,pos1) ' ..
      basic_code_of(epat) ..
    ' end '
  return code
end
M.code_of = code_of


-- Compiles pattern object to Lua function.
-- Prints the generated code first when M.debug is set.
--   f = compile(epat)
local function compile(epat)
  local code = code_of(epat)
  if M.debug then print('DEBUG:', code) end
  local f = assert(loadstring(code))(match)
  return f
end
M.compile = compile


-- operator for matching
function epat_mt.__call(epat, ...)
  return epat.call(...)
end


-- operator for alternation
function epat_mt.__add(a_epat, b_epat)
  return alt(a_epat, b_epat)
end


-- operator for concatenation
function epat_mt.__mul(a_epat, b_epat)
  return seq(a_epat, b_epat)
end


-- operator for repetition
-- (^0 is zero-or-more, ^-1 is zero-or-one; other exponents are unimplemented)
function epat_mt.__pow(epat, n)
  if n == 0 then
    return zero_or_more(epat)
  elseif n == -1 then
    return zero_or_one(epat)
  else
    error 'FIX - unimplemented'
  end
end


-- IMPROVE design?
epat_mt.compile       = compile
epat_mt.basic_code_of = basic_code_of
epat_mt.code_of       = code_of


return M
-- xpattern_test.lua - test suite for xpattern.lua

-- Joins every argument (nil values included) into a single
-- comma-separated string, so multiple return values can be compared
-- against one expected string.
local function str(...)
  local parts = {}
  for i = 1, select('#', ...) do
    parts[i] = tostring((select(i, ...)))
  end
  return table.concat(parts, ',')
end

-- Asserts that matcher(subject), rendered through str, equals wanted.
local function expect(matcher, subject, wanted)
  assert(str(matcher(subject)) == wanted)
end

local M = require "xpattern"
M.debug = true

-- capture counting: {pattern, expected number of captures}
local capture_cases = {
  {'', 0},
  {'a', 0},
  {'()', 1},
  {'%(%)', 0},          -- %(
  {'[()]', 0},          -- () inside []
  {'[%(%)]', 0},        -- %( inside []
  {'[%]()]', 0},        -- %] inside []
  {'[]()]', 1},
  {'%b()', 0},          -- () on %b..
  {'(()().())', 4},     -- nested
  {'.(.%))[(]%(()', 2}, -- more complex example
}
for _, case in ipairs(capture_cases) do
  assert(M._count_captures(case[1]) == case[2])
end
-- a dangling '%' is a syntax error
assert(not pcall(M._count_captures, '%'))

-- m("user@host", re.seq("^", re.opt("([a-z]+@)"), "([a-z]+)$")

local P = M.P

-- simple matching
expect(P'':compile(), '', '')
expect(P'':compile(), 'a', '')
expect(P'a':compile(), '', '')
expect(P'a':compile(), 'a', 'a')
expect(P'a':compile(), 'ba', 'a')
expect(P'a+':compile(), 'baa', 'aa')

-- simple anchors
expect(P'^a+':compile(), 'aa', 'aa')
expect(P'^a+':compile(), 'baab', '')
expect(P'a+$':compile(), 'baa', 'aa')
expect(P'a+$':compile(), 'baab', '')

-- simple captures
expect(P'(a+)(b+)':compile(), 'baab', 'aa,b')
expect(P'^(a+)(b+)':compile(), 'baab', '')

-- simple combinations
local m = ((P'(a+)' + P'b(b*)') * P'(c+)()'):compile()
expect(m, "aacccdd", 'aa,nil,ccc,6')
expect(m, "bbcccdd", 'nil,b,ccc,6')
expect(m, "bbdd", '')

-- simple replication (*)
local m = ( P'a'^0 ):compile()
expect(m, '', '')
expect(m, 'a', 'a')
expect(m, 'aab', 'aa')

-- replication (*)
local m = ( (P'a+' + P'b+')^0 ):compile()
expect(m, 'zabaabbc', '')
expect(m, 'abaabb', 'abaabb')
local m = ( (P'a+' * P'b+' + P'c+' * P'd+')^0 ):compile()
expect(m, 'aabbccddaa', 'aabbccdd')

-- simple replication (?)
local m = ( P'a'^-1 ):compile()
expect(m, '', '')
expect(m, 'a', 'a')
expect(m, 'aab', 'a')

-- replication (?) following a literal
local m = ( P'c' * (P'a+' + P'b+')^-1 ):compile()
expect(m, 'caabb', 'caa')


-- Some of these examples are from Mastering Regular Expressions (MRE),
-- 2nd Ed., Jeffrey Friedl.

-- MRE p.19
local m = ( P'^' * (P'From' + P'Subject' + P'Date') * P':%s*(.*)' ):compile()
expect(m, 'Subject: test', 'test')

-- MRE p.13
local m = ( (P'Geo' + P'Je') * P'ff' * (P're' + P'er') * P'y' ):compile()
expect(m, 'Jeffrey', 'Jeffrey')
expect(m, 'Jeffery', 'Jeffery')
expect(m, 'Geoffrey', 'Geoffrey')
expect(m, 'Geoffery', 'Geoffery')
expect(m, 'Jefery', '')
expect(m, 'Geofferi', '')

-- MRE p.24
--FIX-TODO:
local m = ( P'%$[0-9]+' * P'%.[0-9][0-9]'^-1 ):compile()
expect(m, '$20.00', '$20.00')
expect(m, '$20', '$20')
expect(m, '$20.00.00', '$20.00')

-- warning: matches b before c - ok?
local M = require "xpattern"
local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)' ):compile()
expect(m, 'mmcccZzYybbZzYyddd', 'bb,nil,d')
local a,b,c = m('mmcccZzYybbZzYyddd')
assert(a == 'bb' and b == nil and c == 'd')

print 'DONE'