lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Tuomo Valkonen wrote:
> I'm looking for a simple XML parser/printer for Lua, that
> would let me manipulate an (incomplete) XHTML document 
> with little effort. I was thinking of lua-expat/etree,
> but they make dealing with character entities pain.
> The lua-expat parser seems to require DTDs and stuff
> to parse the character entities; and etree doesn't
> include support for converting the whole range back.
>   
May as well show what I have, if it is any use to anyone.

Two files.

One is the simple XML parser written by, I think, Luis, slightly
modified to accept a wider selection of XHTML.

Do not feed illegal XML into this stuff, such as HTML, which is a very
different beast. Note also that many supposed XML web pages are broken.
Best stick to XHTML Strict.

The other is what I created. It calls the above to turn XML into a table,
then turns the table back into XML.

The thing to understand is that this is actually a parser and the table
is the intermediate representation.

The routine walks the table **and can do things to table entities as it
does so; it doesn't have to produce output as the demo does.** In other
words, it can be decorated with actions, as I have done in versions I am
not showing.
In particular, testing for tag names works well, so you can then act on
particular entities. If you are not sure what is happening, add some
print statements.
Also note that the output actions can be removed, for example if you are
only altering the table entities. The core is just a few lines.

Aside: you can also use table-to-JSON and JSON-to-table; it all works
nicely together.

======= put in file called perhaps table2xml.lua =======
require"xml" -- only used by test code at end

-- generate an xhtml page from the Lua table made by xml.lua
-- ok with xhtml 1 strict, your mileage may vary
-- this in effect is also a parser; it can be decorated with detection
-- and actions

--- Turn a table in the shape built by xmltotable back into xhtml text.
--
-- A node is a table whose fields are read as follows:
--   label  tag name; when absent the node is only a container
--   xarg   table of attribute name -> value, written inside the tag head
--   empty  marker set by the parser; never written out
--   [1..n] content in document order: strings are written verbatim,
--          tables are child nodes and are serialised recursively
-- Any other field is still written (after the sequence part), but in an
-- unspecified order.
--
-- Attribute values and text are written as-is; nothing is escaped here.
--
-- This is in effect a tree walker: the places that emit output are the
-- places where per-tag actions can be hooked in.
--
-- @param x  root node (normally the value returned by xmltotable)
-- @return   one string containing the xhtml, with newlines between tags
function tabletoxml(x)
    -- Tags written in self-closing form such as <br />. This is correct
    -- for xhtml; html is different.
    local selfclosing = {meta=true, input=true, img=true, br=true, hr=true}

    -- Output pieces are collected here and joined once at the end, which
    -- avoids rebuilding one ever-growing string.
    local buf = {}

    -- True right after a tag has been written. The next tag then starts
    -- on a new line, which keeps the xhtml human readable; text content
    -- clears the flag so inline text is never split. The flag is local to
    -- one call, so repeated calls cannot influence each other.
    local pending_newline = false

    local function emit(s)
        buf[#buf + 1] = s
    end

    local function break_line()
        if pending_newline then
            emit("\n")
            pending_newline = false
        end
    end

    local walk

    -- Write one field of a node: recurse into tables, copy anything else.
    local function visit(key, value)
        if type(value) == "table" then
            if key ~= "xarg" then -- xarg was already written in the head
                walk(value)
            end
        elseif key ~= "label" and key ~= "empty" then
            pending_newline = false
            emit(value) -- plain text output
        end
    end

    walk = function(node)
        local label = node.label

        if label then
            break_line()
            emit("<" .. label) -- open tag such as <div

            local xarg = node.xarg
            if xarg then
                -- Sort the names so the output is reproducible; pairs()
                -- gives no ordering guarantee.
                local names = {}
                for name in pairs(xarg) do
                    names[#names + 1] = name
                end
                table.sort(names, function(a, b)
                    return tostring(a) < tostring(b)
                end)
                for _, name in ipairs(names) do
                    -- eg. style="mystyle"
                    emit(" " .. name .. "=\"" .. xarg[name] .. "\"")
                end
            end

            if selfclosing[label] then
                emit(" /") -- note the space first, for bugged browsers
            end
            -- The head is closed whether or not the node has an xarg
            -- table, so a hand-built node without xarg is well formed.
            emit(">")
            pending_newline = true
        end

        -- Sequence part first, in index order, so document order is kept.
        local n = #node
        for i = 1, n do
            visit(i, node[i])
        end
        -- Then whatever else the table holds (order unspecified).
        for key, value in pairs(node) do
            local in_sequence = type(key) == "number"
                and key >= 1 and key <= n and math.floor(key) == key
            if not in_sequence then
                visit(key, value)
            end
        end

        -- Close the named tag. This runs after the children have been
        -- written, so nesting is correct.
        if label and not selfclosing[label] then
            break_line()
            emit("</" .. label .. ">") -- such as </div>
            pending_newline = true
        end
    end

    walk(x)
    return table.concat(buf)
end
   
-- Demo / test code: read afile.html, parse it with xmltotable, serialise
-- it again with tabletoxml, write the result to demo1.html and print it.
-- assert() turns a failed io.open (missing file, no permission) into a
-- readable error message instead of an "attempt to index nil" later on.
local infile = assert(io.open("afile.html", "r"))
local pagetable = xmltotable(infile:read("*a")) -- get xml into table
infile:close()

local out = tabletoxml(pagetable)

local outfile = assert(io.open("demo1.html", "w"))
outfile:write(out)
outfile:close()

print(out)

======
now the matching xml to table routine from the Lua archive
====== xml 2 table =========
--- Collect the attributes found in the head of a tag.
--
-- Recognises name="value" and name='value'. The name may contain
-- letters, digits, ':', '_' and '-', the same characters xmltotable
-- accepts in tag names, so namespaced attributes such as xml:lang keep
-- their full name. Whitespace around '=' is allowed. The value is stored
-- exactly as written; entities are not decoded.
--
-- @param s  text between the tag name and the closing '>' of a tag
-- @return   table mapping attribute name -> value (empty when none found)
function parseargs(s)
  local arg = {}
  -- The second capture is the opening quote; %2 requires the same quote
  -- character to close the value.
  for name, _, value in string.gmatch(s, "([%w%:_%-]+)%s*=%s*([\"'])(.-)%2") do
    arg[name] = value
  end
  return arg
end
   
--- Parse a string of well-formed XML / xhtml into a nested Lua table.
--
-- The result is a container table whose sequence part holds the top
-- level content. Every element becomes a table with
--   label  the tag name
--   xarg   the attribute table returned by parseargs
--   empty  1 for an empty-element tag such as <br/>
--   [1..n] children in document order: strings for text, tables for
--          nested elements
-- Text consisting only of whitespace is dropped. Only tags whose name
-- starts right after '<' or '</' are recognised, so anything else (for
-- example a <!DOCTYPE ...> line) is kept as ordinary text.
--
-- @param s  the XML text
-- @return   the container table described above
-- Raises an error for a closing tag with nothing open, a closing tag that
-- does not match the open element, or an element left unclosed at the end.
function xmltotable(s)
  local stack = {}
  local top = {}
  table.insert(stack, top)
  local ni, c, label, xarg, empty
  local i, j = 1, 1
  while true do
    -- Captures: c = "/" for a closing tag, the tag name, the raw
    -- attribute text, empty = "/" for an empty-element tag.
    ni, j, c, label, xarg, empty = string.find(s,
      "<(%/?)([%w%:_%-]+)(.-)(%/?)>", i)
    if not ni then break end
    -- Text between the previous tag and this one.
    local text = string.sub(s, i, ni - 1)
    if not string.find(text, "^%s*$") then
      table.insert(top, text)
    end
    if empty == "/" then  -- empty element tag
      table.insert(top, {label=label, xarg=parseargs(xarg), empty=1})
    elseif c == "" then   -- start tag
      top = {label=label, xarg=parseargs(xarg)}
      table.insert(stack, top)   -- new level
    else  -- end tag
      -- stack[1] is the document container and must never be popped.
      if #stack < 2 then
        error("nothing to close with "..label)
      end
      local toclose = table.remove(stack)  -- remove top
      top = stack[#stack]
      if toclose.label ~= label then
        error("trying to close "..toclose.label.." with "..label)
      end
      table.insert(top, toclose)
    end
    i = j + 1
  end
  -- Text after the last tag goes to the innermost open level. The length
  -- operator is used here like everywhere else in this function; a
  -- stack.n field is never set, so it cannot be used as an index.
  local text = string.sub(s, i)
  if not string.find(text, "^%s*$") then
    table.insert(stack[#stack], text)
  end
  if #stack > 1 then
    error("unclosed "..stack[#stack].label)
  end
  return stack[1]
end
=========