[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: Simple XHTML (XML) parser/printer
- From: Tim Channon <tc@...>
- Date: Wed, 25 Mar 2009 03:48:50 +0000
Tuomo Valkonen wrote:
> I'm looking for a simple XML parser/printer for Lua, that
> would let me manipulate an (incomplete) XHTML document
> with little effort. I was thinking of lua-expat/etree,
> but they make dealing with character entities pain.
> The lua-expat parser seems to require DTDs and stuff
> to parse the character entities; and etree doesn't
> include support for converting the whole range back.
>
May as well show what I have, if it is any use to anyone.
Two files.
One is the simple XML parser written by (I think) Luis, slightly
modified to accept a wider selection of XHTML.
Do not feed illegal XML into this stuff such as HTML, a very different
beast. Note also that many supposed XML web pages are broken. Best stick
to xhtml strict.
The other is what I created. Calls the above to turn xml into a table,
then turns the table back into xml.
The thing to understand is that this is actually a parser and the table
is the intermediate representation.
The routine walks the table **and can do things to table entities as it
does so, it doesn't have to output as the demo shows.** In other words
it can be decorated with actions as I have done with versions I am not
showing.
In particular, testing for tag names works fine; you can then act on particular
entities. If you are not sure what is happening, add some print statements.
Also note the output actions can be removed, maybe if you are altering
the table entities. The core is just a few lines.
Aside: you can also use table to json and json to table, all works
nicely together
======= put in file called perhaps table2xml.lua =======
require"xml" -- only used by test code at end
-- generate an xhtml page from Lua table made by xml.lua
-- ok with xhtml 1 strict, your mileage may vary
-- this in effect is also a parser, can be decorated with detection and
-- actions
-- Tags written in self-closing form (e.g. <br />), which is correct for
-- XHTML; HTML is different.
local SELFCLOSING_TAGS = { meta = true, input = true, img = true, br = true, hr = true }

-- Append the attributes in `xarg` to `buf` as  name="value"  pairs.
-- Names are written in sorted order because pairs() order is unspecified
-- and the output should be reproducible.
local function emit_attributes(xarg, buf)
  local names = {}
  for name in pairs(xarg) do
    names[#names + 1] = name
  end
  table.sort(names, function(a, b) return tostring(a) < tostring(b) end)
  for _, name in ipairs(names) do
    -- Values are always written inside double quotes, so a literal double
    -- quote (possible when the source used single quotes) must be escaped.
    local value = string.gsub(tostring(xarg[name]), '"', "&quot;")
    buf[#buf + 1] = " " .. tostring(name) .. '="' .. value .. '"'
  end
end

-- Append the serialisation of `node` and its children to `buf`.
-- `state.after_tag` is true when the last thing written was the ">" of a
-- tag; a newline is then inserted before the next tag to keep the output
-- human readable. Text content clears the flag so text is never reflowed.
local function emit_node(node, buf, state)
  local label = node.label
  local selfclosing = label ~= nil and SELFCLOSING_TAGS[label] == true

  if label then
    if state.after_tag then
      buf[#buf + 1] = "\n"
    end
    buf[#buf + 1] = "<" .. label -- open tag such as <div
    -- any parameters inside the tag head are in the xarg list,
    -- such as <div id="fred">
    if type(node.xarg) == "table" then
      emit_attributes(node.xarg, buf)
    end
    if selfclosing then
      buf[#buf + 1] = " /" -- note the space first, for bugged browsers
    end
    -- The tag head is closed even when the node carries no xarg table.
    buf[#buf + 1] = ">"
    state.after_tag = true
  end

  -- Children live under numeric keys. Visit them in ascending key order so
  -- document order is kept even if entries were removed (holes) while the
  -- table was being edited. Non-numeric keys (label, xarg, empty, or any
  -- caller-added fields) are not content and are skipped.
  local positions = {}
  for key in pairs(node) do
    if type(key) == "number" then
      positions[#positions + 1] = key
    end
  end
  table.sort(positions)
  for _, pos in ipairs(positions) do
    local child = node[pos]
    if type(child) == "table" then
      emit_node(child, buf, state) -- nested element, recurse
    else
      state.after_tag = false
      buf[#buf + 1] = tostring(child) -- plain text is copied through as is
    end
  end

  -- Close the tag after the children have been written, so nesting is
  -- correct. Self-closing tags were already finished in the tag head.
  if label and not selfclosing then
    if state.after_tag then
      buf[#buf + 1] = "\n"
    end
    buf[#buf + 1] = "</" .. label .. ">" -- such as </div>
    state.after_tag = true
  end
end

-- Turn a table of the shape built by xmltotable back into an XHTML string.
-- x: a node table. `label` is the tag name, `xarg` the attribute table, and
--    the numeric entries are the children (strings for text, tables for
--    nested elements). A table without `label` (such as the root returned
--    by xmltotable) contributes only its children.
-- Returns one string, with newlines inserted between adjacent tags.
-- Each call starts with fresh formatting state and sets no globals.
function tabletoxml(x)
  local buf = {}
  emit_node(x, buf, { after_tag = false })
  return table.concat(buf)
end
-- Demo: read afile.html, parse it into a table, serialise the table back
-- to XHTML, write the result to demo1.html and echo it to stdout.
-- assert() turns a failed io.open into a readable error message instead of
-- an "attempt to index a nil value" on the next line.
local infile = assert(io.open("afile.html", "r"))
local pagetable = xmltotable(infile:read("*all")) -- get xml into table
infile:close()
local out = tabletoxml(pagetable)
local outfile = assert(io.open("demo1.html", "w"))
outfile:write(out)
outfile:close()
print(out)
======
now the matching xml to table routine from the Lua archive
====== xml 2 table =========
-- Extract the attributes from the inside of a tag head.
-- s: the text between the tag name and the closing ">", for example
--    ' id="fred" class="x"'.
-- Returns a table mapping attribute name to its raw (undecoded) value.
-- Values may be quoted with either " or '; the closing quote must match
-- the opening one. Attribute names use the same character set as tag names
-- in xmltotable, so namespaced names such as xml:lang keep their prefix
-- instead of being recorded as plain "lang". Whitespace around "=" is
-- accepted.
function parseargs(s)
  local attrs = {}
  for name, _, value in string.gmatch(s, "([%w%:_%-]+)%s*=%s*([\"'])(.-)%2") do
    attrs[name] = value
  end
  return attrs
end
-- Parse the XML text `s` into a nested Lua table.
-- Each element becomes a table with `label` (tag name), `xarg` (attribute
-- table from parseargs) and its children as array entries: strings for
-- text, tables for nested elements. Empty-element tags (<br />) also get
-- empty=1. Whitespace-only text between tags is dropped. Anything the tag
-- pattern does not match is kept as text.
-- Returns the root table, which has no label and holds the top-level items.
-- Raises an error on a closing tag that does not match the open element,
-- on a closing tag with nothing open, and on elements left unclosed.
function xmltotable(s)
  local stack = {}
  local top = {}
  table.insert(stack, top)
  local i = 1
  while true do
    local ni, j, c, label, xarg, empty = string.find(s,
      "<(%/?)([%w%:_%-]+)(.-)(%/?)>", i)
    if not ni then break end
    -- text between the previous tag and this one
    local text = string.sub(s, i, ni - 1)
    if not string.find(text, "^%s*$") then
      table.insert(top, text)
    end
    if empty == "/" then -- empty element tag
      table.insert(top, { label = label, xarg = parseargs(xarg), empty = 1 })
    elseif c == "" then -- start tag
      top = { label = label, xarg = parseargs(xarg) }
      table.insert(stack, top) -- new level
    else -- end tag
      local toclose = table.remove(stack) -- remove top
      top = stack[#stack]
      if #stack < 1 then
        error("nothing to close with " .. label)
      end
      if toclose.label ~= label then
        error("trying to close " .. toclose.label .. " with " .. label)
      end
      table.insert(top, toclose)
    end
    i = j + 1
  end
  -- Text after the last tag goes to the innermost open level. The stack is
  -- a plain sequence, so its top is stack[#stack]; there is no `n` field.
  local text = string.sub(s, i)
  if not string.find(text, "^%s*$") then
    table.insert(stack[#stack], text)
  end
  if #stack > 1 then
    error("unclosed " .. stack[#stack].label)
  end
  return stack[1]
end
=========