Ex Pattern

lua-users home
wiki

xpattern is a Lua pattern matching library based on Lua patterns and in the style of LuaPeg.

Description

Lua patterns [1] are a limited form of regular expressions. Unlike regular expressions, patterns do not support grouping and quantification [2] operations on anything except single character sets. For full regular expression (or Perl-style regular expression) support, you may instead compile a regular expression module such as lrexlib [3]. For even greater flexibility, for parsing and lexical analysis applications or even for the simpler case, one can use parsing expression grammars (PEG) via the LuaPeg library [4] implemented in C.

Sometimes, however, Lua patterns are almost sufficient, as we may want to extend the functionality of patterns just a bit more without compiling in any additional C code. That's where this module comes in. This module builds extended pattern matching on top of Lua patterns. It takes a pattern expression written in a form somewhat similar to LPeg and translates it to a string of Lua code containing string.match calls, which is then compiled to a Lua function via loadstring. This is similar to the code you might write yourself, although this module automates it with code generation. It should have similar efficiency assuming you precompile any match objects that are used repeatedly.

Here is a simple example:

local M = require "xpattern"
local P = M.P
local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()
local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first
assert(a == nil and b == 'ccc' and c == 'b' and d == 11)

which internally generates this Lua function:

local match = ...  -- string.match
return function(s,pos)
  for pos1=(pos or 1),#s do
    local pos2
    local c1,c2,c3,c4
    local pos3 = pos1
    c1,pos3 = match(s, "^(b+)()", pos1)
    if not pos3 then
      c2,pos3 = match(s, "^(c+)()", pos1)
    end
    pos2 = pos3
    if pos2 then
      do
        local pos3=pos2
        pos2 = pos2
        while 1 do
          pos3 = match(s, "^[A-Z][a-z]()", pos3)
          if pos3 then
            pos2=pos3
          else break end
        end
      end
    end
    if pos2 then
      c3,c4,pos2 = match(s, "^(.)()()", pos2)
    end
    if pos2 then return c1,c2,c3,c4 end
  end
end

See the test suite below for additional examples.

Status

Warning: this code should be considered somewhat experimental. It is not fully developed or tested. Fix up the code if you want to use it in production.

Appendix: xpattern.lua

-- xpattern.lua
-- Preliminary regular expression-like support in Lua
-- Uses Lua patterns as the core building block.
--
-- Implemented in pure Lua with code generation technique.
-- It translates an expression into a snippet of Lua code
-- having a series of `string.match` calls, which is then
-- compiled (via `loadstring`).
--
-- Like lpeg, does not support backtracking.
--
-- WARNING: This is experimental code.  The design and implementation
-- has not been thoroughly tested.
--
-- Version v20091021.
-- (c) 2008-2009 David Manura. Licensed under the same terms as Lua (MIT license).
-- Please post patches.

local M = {}

local string = string
local format = string.format
local match  = string.match
local assert = assert
local error  = error
local ipairs = ipairs
local loadstring   = loadstring
local setmetatable = setmetatable
local type   = type
local print  = print


-- Adds whitespace to string `s`.
-- Whitespace string `ws` (default to '' if omitted) is prepended to each line
-- of `s`.  Also ensures `s` is is terminated by a newline.
local function add_whitespace(s, ws)
  ws = ws or ''
  s = s:gsub('[^\r\n]+', ws .. '%1')
  if s:match('[^\r\n]$') then
    s = s .. '\n'
  end
  return s
end

-- Counts the number `count` of captures '()' in Lua pattern string `pat`.
local function count_captures(pat)
  local count = 0
  local pos = 1
  while pos <= #pat do
    local pos2 = pat:match('^[^%(%%%[]+()', pos)
    if pos2 then
      pos = pos2
    elseif pat:match('^%(', pos) then
      count = count + 1
      pos = pos + 1
    elseif pat:match('^%%b..', pos) then
      pos = pos + 3
    elseif pat:match('^%%.', pos) then
      pos = pos + 2
    else
      local pos2 = pat:match('^%[[^%]%%]*()', pos)
      if pos2 then
        pos = pos2
        while 1 do
          local pos2 = pat:match('^%%.[^%]%%]*()', pos)
          if pos2 then
            pos = pos2
          elseif pat:match('^%]', pos) then
            pos = pos + 1
            break
          else
            error('syntax', 2)
          end
        end
      else
        error('syntax', 2)
      end
    end
  end
  return count
end
M._count_captures = count_captures


-- Appends '()' to Lua pattern string `pat`.
local function pat_append_pos(pat)
  local prefix = pat:match'^(.*)%$$'
  pat = prefix and prefix .. '()$' or pat .. '()'
  return pat
end

-- Prepends '()' to Lua pattern string `pat`.
local function pat_prepend_pos(pat)
  local postfix = pat:match'^%^(.*)'
  pat = postfix and '^()' .. postfix or '()' .. pat
  return pat
end


-- Prepends '^' to Lua pattern string `pat`.
local function pat_prepend_carrot(pat)
  local postfix = pat:match'^%^(.*)'
  pat = postfix and pat or '^' .. pat
  return pat
end


-- Return a string listing pattern capture variables with indices `firstidx`
-- to `lastidx`.
-- Ex: code_vars(1,2) --> 'c1,c2'
local function code_vars(firstidx, lastidx)
  local code = ''
  for i=firstidx,lastidx do
    code = code .. (i == firstidx and '' or ',') .. 'c' .. i
  end
  return code
end


-- Metatable for expression objects
local epat_mt = {}
epat_mt.__index = epat_mt


-- Builds an extended pattern object `epat` from Lua string pattern `pat`.
local function pattern(pat)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    local ncaptures = count_captures(pat)
    local lvars =
      code_vars(totncaptures0+1, totncaptures0+ncaptures)
      .. (ncaptures == 0 and '' or ',') .. 'pos' .. destidx0
    local pat = pat_append_pos(pat)

    pat = pat_prepend_carrot(pat)

    local str = format('%q', pat)
    local code = lvars .. ' = match(s, ' .. str .. ', pos' .. srcidx0 .. ')\n'
    return code, ncaptures
  end
  epat.anchored = pat:sub(1,1) == '^'
  return epat
end


-- Generates code from pattern `anypat` (either Lua pattern string or extended
-- pattern object).
--  `anypat`    - either Lua pattern string or extended pattern object
--  `srcidx0`   - index of variable holding position to start matching at
--  `destidx0`  - index of variable holding position to store subsequent
--                match position at.  stores nil if no match
--  `totncaptures0` - number of captures prior to this match
--  `code`      - Lua code string (code) and number of
--  `ncaptures` - number of captures in pattern.
local function gen(anypat, srcidx0, destidx0, totncaptures0)
  if type(anypat) == 'string' then
    anypat = pat_prepend_carrot(anypat)
    anypat = pattern(anypat)
  end
  local code, ncaptures = anypat(srcidx0, destidx0, totncaptures0)
  return code, ncaptures
end


-- Creates a new extended pattern object `epat` that is the concatenation of
-- the given list (of size >= 0) of pattern objects.
-- Specify a single string argument to convert a Lua pattern to an extended
-- pattern object.
local function seq(...) -- epats...
  -- Ensure args are extended pattern objects.
  local epats = {...}
  for i=1,#epats do
    if type(epats[i]) == 'string' then
      epats[i] = pattern(epats[i])
    end
  end

  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0, ws)
    if #epats == 0 then
      return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0
    elseif #epats == 1 then
      return epats[1](srcidx0, destidx0, totncaptures0, ws)
    else
      local ncaptures = 0
      local pat_code, pat_ncaptures =
          gen(epats[1], srcidx0, destidx0, totncaptures0+ncaptures, true)
      ncaptures = ncaptures + pat_ncaptures
      local code = add_whitespace(pat_code, '')

      for i=2,#epats do
        local pat_code, pat_ncaptures =
            gen(epats[i], destidx0, destidx0, totncaptures0+ncaptures, true)
        ncaptures = ncaptures + pat_ncaptures
        code =
          code ..
          'if pos' .. destidx0 .. ' then\n' ..
            add_whitespace(pat_code, '  ') ..
          'end\n'
      end
      return code, ncaptures
    end
  end
  if epats[1] and epats[1].anchored then
    epat.anchored = true
  end
  return epat
end
M.P = seq


-- Creates new extended pattern object `epat` that is the alternation of the
-- given list of pattern objects `epats...`.
local function alt(...) -- epats...
  -- Ensure args are extended pattern objects.
  local epats = {...}
  for i=1,#epats do
    if type(epats[i]) == 'string' then
      epats[i] = pattern(epats[i])
    end
  end

  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    if #epats == 0 then
      return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0
    elseif #epats == 1 then
      return epats[1](srcidx0, destidx0, totncaptures0)
    else
      local ncaptures = 0
      local pat_code, pat_ncaptures =
          gen(epats[1], srcidx0, destidx0+1, totncaptures0+ncaptures, true)
      ncaptures = ncaptures + pat_ncaptures
      local code = 'local pos' .. destidx0+1 .. ' = pos' .. srcidx0 .. '\n' ..
                   add_whitespace(pat_code, '')

      for i=2,#epats do
        local pat_code, pat_ncaptures =
            gen(epats[i], srcidx0, destidx0+1, totncaptures0+ncaptures, true)
        ncaptures = ncaptures + pat_ncaptures
        code =
          code ..
          'if not pos' .. destidx0+1 .. ' then\n' ..
            add_whitespace(pat_code, '  ') ..
          'end\n'
      end
      code = code .. 'pos' .. destidx0 .. ' = pos' .. destidx0+1 .. '\n'
      return code, ncaptures
    end
  end
  return epat
end
M.alt = alt


-- Creates new extended pattern object `epat` that is zero or more repetitions
-- of the given pattern object `pat` (if one evaluates to false) or one or more
-- (if one evaluates to true).
local function star(pat, one)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = 'do\n' ..
                 '  local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'
    local pat_code, pat_ncaptures =
        gen(pat, destidx, destidx, totncaptures0+ncaptures, true)
    ncaptures = ncaptures + pat_ncaptures
    code = code ..
      (one and ('  pos' .. destidx0 .. ' = nil\n')
           or  ('  pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n')) ..
      '  while 1 do\n' ..
           add_whitespace(pat_code, '    ') ..
      '    if pos' .. destidx .. ' then\n' ..
      '      pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..
      '    else break end\n' ..
      '  end\n' ..
      'end\n'
    return code, ncaptures
  end
  return epat
end
M.star = star


-- Creates new extended pattern object `epat` that is zero or one of the
-- given pattern object `epat0`.
local function zero_or_one(epat0)
  local epat = setmetatable({}, epat_mt)
  epat.call = function(srcidx0, destidx0, totncaptures0)
    local ncaptures = 0
    local destidx = destidx0 + 1
    local code = 'do\n' ..
                 '  local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'
    local epat0_code, epat0_ncaptures =
        gen(epat0, destidx, destidx, totncaptures0+ncaptures, true)
    ncaptures = ncaptures + epat0_ncaptures
    code = code ..
      add_whitespace(epat0_code) ..
      '  if pos' .. destidx .. ' then\n' ..
      '    pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..
      '  else\n' ..
      '    pos' .. destidx0 .. '=pos' .. srcidx0 .. '\n' ..
      '  end\n' ..
      'end\n'
    return code, ncaptures
  end
  return epat
end
M.zero_or_one = zero_or_one


-- Gets Lua core code string `code` corresponding to pattern object `epat`
local function basic_code_of(epat)
  local pat_code, ncaptures = epat(1, 2, 0, true)
  local lvars = code_vars(1, ncaptures)

  if epat.anchored then
    local code =
      'local pos1=pos or 1\n' ..
      'local pos2\n' ..
      (lvars ~= '' and '  local ' .. lvars .. '\n' or '') ..
      add_whitespace(pat_code) .. '\n' ..
      'if pos2 then return ' .. (lvars ~= '' and lvars or 's:sub(pos1,pos2-1)') .. ' end\n'
    return code
  else
    local code =
        'for pos1=(pos or 1),#s do\n' ..
        '  local pos2\n'
    if lvars == '' then
      code =
        code ..
           add_whitespace(pat_code, '  ') ..
        '  if pos2 then return s:sub(pos1,pos2-1) end\n'
    else
      code =
        code ..
        '  local ' .. lvars .. '\n' ..
           add_whitespace(pat_code, '  ') ..
        '  if pos2 then return ' .. lvars .. ' end\n'
    end
    code =
        code ..
        'end\n'
    return code
  end
end
M.basic_code_of = basic_code_of


-- Gets Lua complete code string `code` corresponding to pattern object `epat`.
local function code_of(epat)
  local code =
    'local match = ...\n' ..
    'return function(s,pos)\n' ..
    add_whitespace(basic_code_of(epat), '  ') ..
    'end\n'
  return code
end
M.code_of = code_of


-- Compiles pattern object `epat` to Lua function `f`.
local function compile(epat)
  local code = code_of(epat)
  if M.debug then print('DEBUG:\n' .. code) end
  local f = assert(loadstring(code))(match)
  return f
end
M.compile = compile


-- operator for pattern matching
function epat_mt.__call(epat, ...)
  return epat.call(...)
end


-- operator for pattern alternation
function epat_mt.__add(a_epat, b_epat)
  return alt(a_epat, b_epat)
end


-- operator for pattern concatenation
function epat_mt.__mul(a_epat, b_epat)
  return seq(a_epat, b_epat)
end


-- operator for pattern repetition
function epat_mt.__pow(epat, n)
  if n == 0 then
    return star(epat)
  elseif n == 1 then
    return star(epat, true)
  elseif n == -1 then
    return zero_or_one(epat)
  else
    error 'FIX - unimplemented'
  end
end


-- IMPROVE design?
epat_mt.compile       = compile
epat_mt.basic_code_of = basic_code_of
epat_mt.code_of       = code_of


return M

Appendix: xpattern_test.lua

-- xpattern_test.lua - test suite for xpattern.lua

-- utility function: convert list of values to string.
local function str(...)
  local n = select('#', ...)
  local t = {...}
  for i=1,n do t[i] = tostring(t[i]) end
  return table.concat(t, ',')
end

local M = require "xpattern"
local P = M.P
M.debug = true

-- internal: _count_captures
assert(M._count_captures'' == 0)
assert(M._count_captures'a' == 0)
assert(not pcall(function() M._count_captures'%' end))
assert(M._count_captures'()' == 1)
assert(M._count_captures'%(%)' == 0)    -- %(
assert(M._count_captures'[()]' == 0)    -- () inside []
assert(M._count_captures'[%(%)]' == 0)  -- %( inside []
assert(M._count_captures'[%]()]' == 0)  -- %] inside []
assert(M._count_captures'[]()]' == 1)
assert(M._count_captures'%b()' == 0)    -- () on %b..
assert(M._count_captures'(()().())' == 4)   -- nested
-- more complex example
assert(M._count_captures'.(.%))[(]%(()' == 2)


-- simple matching
assert(str(P'':compile()('')) == '')
assert(str(P'':compile()('a')) == '')
assert(str(P'a':compile()('')) == '')
assert(str(P'a':compile()('a')) == 'a')
assert(str(P'a':compile()('ba')) == 'a')
assert(str(P'a+':compile()('baa')) == 'aa')

-- simple anchors
assert(str(P'^a+':compile()('aa')) == 'aa')
assert(str(P'^a+':compile()('baab')) == '') -- $ fail
assert(str(P'a+$':compile()('baa')) == 'aa')
assert(str(P'a+$':compile()('baab')) == '') -- $ fail

-- simple captures
assert(str(P'(a+)(b+)':compile()('baab')) == 'aa,b')
assert(str(P'^(a+)(b+)':compile()('baab')) == '')

-- simple sequences
local m = P():compile()
assert(str( m('') ) == '')
assert(str( m('a') ) == '')
local m = P(''):compile()
assert(str( m('') ) == '')
assert(str( m('a') ) == '')
local m = P('a', 'b', 'c'):compile()
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = (P'a' * P'b' * P'c'):compile() -- identical
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = P(P'a', 'b', P'c'):compile() -- identical
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = P(P'a+', 'b+', P'c+'):compile()
assert(str( m('.abaabcc.') ) == 'aabcc')

-- simple alternation
local m = (P'aa+' + P'bb+'):compile()
assert(str( m('abbaa') ) == 'bb')
local m = (P'aa+' + P'bb+' + P'cc+'):compile()
assert(str( m('abccaa') ) == 'cc')

-- simple combinations
local m = ((P'(a+)' + P'b(b*)') * P'(c+)()'):compile()
assert(str( m("aacccdd")) == 'aa,nil,ccc,6')
assert(str( m("bbcccdd")) == 'nil,b,ccc,6')
assert(str( m("bbdd")) == '')
print('?'..str( m("aabbcc")))
assert(str( m("aabbcc")) == 'nil,b,cc,7') -- alternative

-- simple replication (*)
local m = ( P'a'^0 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'aa')

-- replication (*)
local m = ( (P'a+' + P'b+')^0 ):compile()
assert(str(m'zabaabbc') == '')
assert(str(m'abaabb') == 'abaabb')
local m = ( (P'a+' * P'b+' + P'c+' * P'd+')^0 ):compile()
assert(str(m'aabbccddaa') == 'aabbccdd')
local m = ( P'aa'^0 * P'bb' * P'aa'^0 ):compile()
assert(str(m'aaccaaaabbaa') == 'aaaabbaa')

-- simple replication (+)
local m = ( P'a'^1 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'aa')

-- replacation (+)
local m = ( P'b' * P'a'^1 ):compile()
print(m'b')
assert(str(m'b') == '')
assert(str(m'ba') == 'ba')
assert(str(m'baab') == 'baa')

-- simple replication (?)
local m = ( P'a'^-1 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'a')

-- replication (?)
local m = ( P'c' * (P'a+' + P'b+')^-1 ):compile()
assert(str(m'caabb') == 'caa')


-- Some of these examples from Mastering Regular Expressions (MRE),
-- 2nd Ed. Jeffrey .Friedl.

-- MRE p.19
local m = ( P'^' * (P'From' + P'Subject' + P'Date') * P':%s*(.*)' ):compile()
assert(str(m('Subject: test')) == 'test')

-- MRE p.13
local m = ( (P'Geo' + P'Je') * P'ff' * (P're' + P'er') * P'y' ):compile()
assert(str(m'Jeffrey') == 'Jeffrey')
assert(str(m'Jeffery') == 'Jeffery')
assert(str(m'Geoffrey') == 'Geoffrey')
assert(str(m'Geoffery') == 'Geoffery')
assert(str(m'Jefery') == '')
assert(str(m'Geofferi') == '')
assert(str(m'GeoffrezGeoffery') == 'Geoffery') -- skips
assert(str(m'JefferzGeoffery') == 'Geoffery') -- skips
assert(str(m'GeoffJeffery') == 'Jeffery') -- skips

-- MRE p.24
local m = ( P'%$[0-9]+' * P'%.[0-9][0-9]'^-1 ):compile()
assert(str(m'$20.00') == '$20.00')
assert(str(m'$20') == '$20')
assert(str(m'$20.00.00') == '$20.00')

-- example
print 'example'
local M = require "xpattern"
local P = M.P
local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()
local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first
assert(a == nil and b == 'ccc' and c == 'b' and d == 11)

-- example
local m = P('foo', P'bar'+P'baz', 'qux'):compile()
assert(str(m'afoobazfoobarquxbar', 'foobarqux'))
local m = P('^foo', P'bar'+P'baz', 'qux'):compile() -- anchored
assert(str(m'afoobazfoobarquxbar', ''))
assert(str(m'foobarquxbar', ''))

-- http://lua-users.org/lists/lua-l/2009-10/msg00752.html
local m = (
  P'^' * ( ( P'ceil'+P'abs' +P'floor'+P'mod' +P'exp'+P'log'+P'pow'+
             P'sqrt'+P'acos'+P'asin' +P'atan'+P'cos'+P'sin'+P'tan'+
             P'deg' +P'rad' +P'random'
           ) * P'%('
           + P'[0-9%(%)%-%+%*%/%.%,]' + P'pi'
          )^1 * P'$'
):compile()
assert(m'cos(1+pi)' == 'cos(1+pi)')
assert(m'cos(1+p)' == nil) -- 'p'
assert(m'cos(12.3/2)+mod(2,3)' == 'cos(12.3/2)+mod(2,3)')
assert(m'cos(12.3/2)+mod(2,3) ' == nil) -- ' '
assert(m' cos(12.3/2)+mod(2,3)' == nil) -- ' '
assert(m'cos(12.3/2)+mod+2' == nil) -- no '('

print 'DONE'

Changes:

Author

DavidManura

See Also


RecentChanges · preferences
edit · history
Last edited January 7, 2010 2:27 am GMT (diff)