lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


> I am tempted to ask (but won't) why there isn't any lpeg.L(lpat) that
> returns an lpeg pattern that matches the same strings, and returns
> the same captures, that the Lua pattern lpat would.
> 
> Instead, I'm asking whether a question similar to this has burgeoned
> in the brain of a list member for long enough that some work in this
> direction has been done.

We have been working on it. Note that the task is not as easy as it may
sound, because the semantics of LPeg is different from the semantics of
Lua patterns. For instance, repetition in Lua is greedy, but in LPeg
it is possessive. The translation from one to the other is not difficult,
but the translation with captures is.

I have attached some code I wrote related to this translation. It
implements an equivalent to the 'string.find' function. It passes most
(all?) tests in the standard Lua test suite that do not use captures.

-- Roberto
local m = require'lpeg'
local re = require're'
-- require 'pt'

local any = m.P(1)


local allchars = ""
for i = 1, 255 do allchars = allchars..string.char(i) end

-- return the set of all chars of a given string that match a given class
local function classsub (s, p)
  p = m.Cs( ((any - p) / "" + p)^0 )
  return p:match(s)
end

-- return the set of all chars that match a given class
local function class2chars (p)
  return classsub(allchars, p)
end


-- return the intersection of two sets of chars, represented
-- as strings
local function intersection (c1, c2)
  return classsub(c1, m.S(c2))
end


-- return the set of all chars in a given category
local function charscat (c)
  return class2chars(re.compile(c))
end


local function charsfromto (c1, c2)
  local s = ""
  for i = string.byte(c1), string.byte(c2) do
    s = s .. string.char(i)
  end
  return s
end


local function concat (s1, s2)
  return s1 .. s2
end

local g = m.P{ "pattern",

  pattern = m.Ct(m.Cg("^", "init")^0
               * m.V"item"^0
               * (m.Cg("$" * m.P(-1), "fin"))^0) * -1,

  setelem = "%" * m.R("AZ", "az") / charscat
          + "%" * m.C(any)
          + (m.C(any) * "-" * m.C(any - "]")) / charsfromto
          +  m.C(any - "]"),

  set = "["
      * m.Cg(m.Cf(m.C"^"^-1 * m.C"]"^-1 * m.V"setelem"^0, concat), "set")
      * "]",

  esc = "%" * m.Cg(m.S"acdglpsuwxzACDGLPSUWXZ", "escape")
      + "%" * m.C(any),

  class = m.C(any - m.S"[].$()%%.*+-?" + ("$" * #m.P(1)))
        + m.Cg(".", "all")
        + m.V"esc"
        + m.V"set",

  item = "%" * m.Ct(m.Cg(m.R"09", "backref"))
       + "%b" * m.Ct(m.Cg(2, "balanced"))
       + "%f" * m.V"set"
       + m.Ct(m.V"class" * m.Cg(m.S"*+-?", "rep")^0)
       + m.Ct(m.Cg("()", "position"))
       + "(" * m.Cg(m.Ct(m.V"item"^1), "capture") * ")"

}

local function balanced (s)
  local a, b = s:sub(1, 1), s:sub(2, 2)
  return m.P{ a * ( (1 - m.S(s))^1 + m.V(1) )^0 * b }
end


local function class (e)
  if e.escape then return re.compile("%"..e.escape)
  elseif e.set then
    if (string.sub(e.set, 1, 1) == "^") then
      return any - m.S(string.sub(e.set, 2, -1))
    else
      return m.S(e.set)
    end
  elseif e.all then return any
  else return m.P(e[1])
  end
end


local count = 0
local function nextvar ()
  count = count + 1
  return "v" .. count
end


local function rep (e, k, r)
  local c = class(e)
  local fc = class2chars(c)
  local inter = (intersection(fc, k.first) ~= "")
  local union = fc .. k.first
  if r == "?" then
    if inter then
      k[1] = c * k[1] + k[1]
    else
      k[1] = c^-1 * k[1]
    end
  elseif r == "*" then
    local v = nextvar()
    if inter then
      k[v] = c * m.V(v) + k[1]
    else
      k[v] = c^0 * k[1]
    end
    k[1] = m.V(v)
  elseif r == "-" then
    local v = nextvar()
    if inter or k.empty then
      k[v] = k[1] + c * m.V(v)
    else
      k[v] = c^0 * k[1]
    end
    k[1] = m.V(v)
  elseif r == "+" then
    local k = rep(e, k, "*")
    k[1] = c * k[1]
    union = fc   -- must start with a char from class 'c'
    k.empty = false
  end
  k.first = union
  return k
end


local function traduz (p, i)
  if i > #p then
    return {p.fin and m.P(-1) or m.P(""); first = ""; empty = true}
  end
  local k = traduz(p, i + 1)
  local e = p[i]
  local et, first, empty
  if e.position then et = m.Cp(); first = k.first
  elseif e.balanced then
    et = balanced(e.balanced)
    first = string.sub(e.balanced, 1, 1)
    k.empty = false
  elseif e.rep then return rep(e, k, e.rep)
  else et = class(e); first = class2chars(et); k.empty = false
  end
  k[1] = et * k[1]
  k.first = first
  return k
end


local pg = m.P(g)

if arg[1] then
  local p = pg:match(arg[1])
  print(pt(p))
  p = traduz(p, 1)
  print(p.first, p.empty)
  p.first, p.empty = nil
  p = m.P(p); p:print()

  if arg[2] then
    print(p:match(arg[2]))
  end
  return
end


local function compile (p)
  local tree = pg:match(p)
  if not tree then error("invalid pattern " .. p) end
  local g = traduz(tree, 1); g.first, g.empty = nil
  local pp = m.P(g)
  pp = m.Cp() * pp * m.Cp()
  if tree.init then return pp
  else return m.P{ pp + 1 * m.V(1) }
  end
end


function find (s, p, i)
  local pp = compile(p)
  local a, b = pp:match(s, i)
  if a then return a, b - 1 end
end


--------------------------------------------------------------------------------

local function f(s, p)
  local i,e = find(s, p)
  if i then return string.sub(s, i, e) end
end

a,b = find('', '')    -- empty patterns are tricky
assert(a == 1 and b == 0);
a,b = find('alo', '')
assert(a == 1 and b == 0)
a,b = find('a\0o a\0o a\0o', 'a', 1)   -- first position
assert(a == 1 and b == 1)
a,b = find('a\0o a\0o a\0o', 'a\0o', 2)   -- starts in the midle
assert(a == 5 and b == 7)
a,b = find('a\0o a\0o a\0o', 'a\0o', 9)   -- starts in the midle
assert(a == 9 and b == 11)
a,b = find('a\0a\0a\0a\0\0ab', '\0ab', 2);  -- finds at the end
assert(a == 9 and b == 11);
a,b = find('a\0a\0a\0a\0\0ab', 'b')    -- last position
assert(a == 11 and b == 11)
assert(find('a\0a\0a\0a\0\0ab', 'b\0') == nil)   -- check ending
assert(find('', '\0') == nil)
assert(find('alo123alo', '12') == 4)
assert(find('alo123alo', '^12') == nil)

assert(f('aloALO', '%l*') == 'alo')
assert(f('aLo_ALO', '%a*') == 'aLo')

assert(f("  \n\r*&\n\r   xuxu  \n\n", "%g%g%g+") == "xuxu")

assert(f('aaab', 'a*') == 'aaa');
assert(f('aaa', '^.*$') == 'aaa');
assert(f('aaa', 'b*') == '');
assert(f('aaa', 'ab*a') == 'aa')
assert(f('aba', 'ab*a') == 'aba')
assert(f('aaab', 'a+') == 'aaa')
assert(f('aaa', '^.+$') == 'aaa')
assert(f('aaa', 'b+') == nil)
assert(f('aaa', 'ab+a') == nil)
assert(f('aba', 'ab+a') == 'aba')
assert(f('a$a', '.$') == 'a')
assert(f('a$a', '.%$') == 'a$')
assert(f('a$a', '.$.') == 'a$a')
assert(f('a$a', '$$') == nil)
assert(f('a$b', 'a$') == nil)
assert(f('a$a', '$') == '')
assert(f('', 'b*') == '')
assert(f('aaa', 'bb*') == nil)
assert(f('aaab', 'a-') == '')
assert(f('aaa', '^.-$') == 'aaa')
assert(f('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab')
assert(f('aabaaabaaabaaaba', 'b.-b') == 'baaab')
assert(f('alo xo', '.o$') == 'xo')
assert(f(' \n isto é assim', '%S%S*') == 'isto')
assert(f(' \n isto é assim', '%S*$') == 'assim')
assert(f(' \n isto é assim', '[a-z]*$') == 'assim')
assert(f('um caracter ? extra', '[^%sa-z]') == '?')
assert(f('', 'a?') == '')
assert(f('x', 'x?') == 'x')
assert(f('xbl', 'x?b?l?') == 'xbl')
assert(f('  xbl', 'x?b?l?') == '')
assert(f('aa', '^aa?a?a') == 'aa')
assert(f(']]]xb', '[^]]') == 'x')
assert(f("0alo alo", "%x*") == "0a")
assert(f("alo alo", "%C+") == "alo alo")
print('+')

assert(f("alo <xuxu> alo", "%b<>") == "<xuxu>")
assert(f("alo (xuxu>batata() and)) alo", "%b()") == "(xuxu>batata() and)")
assert(f("((( (xuxu>batata() and) alo", "%b()") == "(xuxu>batata() and)")

assert(f("alo alo 123 chegou", ".*123") == "alo alo 123")
assert(f("alo alo 123 chegou", ".*%d+") == "alo alo 123")
assert(f("alo alo 123 chegou", "^.*%d+") == "alo alo 123")
assert(not find("alo alo chegou", ".*%d+"))

print 'ok'