[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: Simple XML Parser for Lua?
- From: Edgar Toernig <froese@...>
- Date: Fri, 3 Jun 2005 14:24:55 +0200
Premshree Pillai wrote:
>
> Is there a simple XML parser (preferably DOM) for Lua?
>
> My requirements are very simple:
> - I need to be able to get node values for elements
> (docroot['some_ele'][2].text)
> - I need to be able to get attribute values
> (docroot['some_ele'].attribute('foo'))
>
> Any help would be appreciated. Thanks!
I wrote one for Sol (my hacked version of Lua) to parse EPG
(electronic program guide) data. It's very simple and will
break when the XML isn't well-formed but it does the job.
You get nearly exactly the tree you want. Your two examples
translated:
docroot.SOME_ELE[2].data
docroot.SOME_ELE[2].attr.FOO
The lexer is implemented in C and hooked into the read function
as read"*xml". The tree building is done in Sol.
Conversion to Lua shouldn't be difficult.
Here's the Sol part:
----------------------------------------------------------------------------
-- Unscramble, unzip and parse file returning a parse tree
--
-- NOTE: this is Sol (a Lua dialect), not stock Lua: named functions inside
-- a table constructor, "fmt":with(...) and tbl:append(...) are Sol features.
--
-- fname   path of the scrambled, gzipped XML file
-- returns the root node.  In every node:
--           node.data     concatenated character data seen in that element
--           node[NAME]    list of child nodes for each child element NAME
--           child.attr    third value returned by read"*xml" for that tag
-- Raises (via assert) when the pipe cannot be opened, when parsing stopped
-- on an end tag instead of end of input, or when root.EXPORT is missing.
local function parse(fname)
    local f, tok, name, val     -- shared with the handlers in 'map'
    local node                  -- forward declaration (TAG recurses into it)

    -- One handler per token type returned by f:read"*xml".  A handler
    -- that returns a true value ends the loop of the current node().
    local map = {
        function EOF(ctxt)
            return 1
        end

        function ENDTAG(ctxt)
            return 1
        end

        function PI(ctxt)
            -- ignore
        end

        function DECL(ctxt)
            -- ignore
        end

        function BRDECL(ctxt)
            -- ignore
        end

        function COMMENT(ctxt)
            -- ignore
        end

        function SPACE(ctxt)
            -- ignore
        end

        function DATA(ctxt)
            -- for DATA tokens the second read result holds the text
            ctxt.data = (ctxt.data or "") .. name
        end

        function TAG(ctxt)
            if not ctxt[name] then
                ctxt[name] = {}
            end
            -- parse the element's content into a fresh child node
            ctxt[name]:append(node{ attr = val })
        end

        function EMPTYTAG(ctxt)
            if not ctxt[name] then
                ctxt[name] = {}
            end
            -- no content to parse, the child only carries its attributes
            ctxt[name]:append{ attr = val }
        end
    }

    -- Read tokens into ctxt until a handler asks to stop.
    function node(ctxt)
        while 1 do
            tok, name, val = f:read"*xml"
            -- read returns no token at end of input; dispatch that as EOF
            -- (the closing quote of 'EOF' was missing in the posted listing)
            if map[tok or 'EOF'](ctxt) then
                break
            end
        end
        return ctxt
    end

    -- NOTE(review): fname is inserted into the shell command unquoted;
    -- only pass trusted file names.
    f = assert(File.popen("./unscramble <%s | gunzip":with(fname), "r"))
    local root = node{}
    f:close()

    -- tok is still set if the top-level loop was ended by an end tag
    assert(tok == nil, fname..": malformed tvm file (bad end tag)")
    assert(root.EXPORT, fname..": malformed tvm file (no <Export>)")
    return root
end
----------------------------------------------------------------------------
And this is the C part from iolib.c:
#ifdef WITH_XML_LEXER
/*
* XML lexer
*/
/*
 * Skip whitespace/control characters.
 *
 * Starting with the already-read character c, keep reading from f while
 * the current character is <= 0x20.  Returns the first character above
 * 0x20, or EOF.
 */
static int
xml_spaces(FILE *f, int c)
{
    for (;;)
    {
        if (c == EOF || c > 0x20)
            return c;
        c = fgetc(f);
    }
}
/*
 * Return 1 if c may appear in a name token, 0 otherwise.
 * Accepted: ASCII letters, ASCII digits and the characters '_', '-',
 * '.' and ':'.
 */
static int
xml_istoken(int c)
{
    if (c >= 'a' && c <= 'z')
        return 1;
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= '0' && c <= '9')
        return 1;

    switch (c)
    {
        case '_':
        case '-':
        case '.':
        case ':':
            return 1;
        default:
            return 0;
    }
}
/*
 * Collect a name token into buffer B.
 *
 * Starting with the already-read character c, every character accepted
 * by xml_istoken() is added to B, with ASCII lower-case letters folded to
 * upper case.  The buffer is then finished with solL_pushresult() and any
 * following whitespace is skipped.  Returns the first character after
 * that whitespace (or EOF).
 */
static int
xml_token(solL_Buffer *B, FILE *f, int c)
{
    for (; xml_istoken(c); c = fgetc(f))
    {
        int folded = c;

        if (c >= 'a' && c <= 'z')
            folded = c - 'a' + 'A';
        solL_putchar(B, folded);
    }
    solL_pushresult(B);
    return xml_spaces(f, c);
}
/*
 * Append one data character to B, normalising line ends.
 *
 * c        character to store; values outside 0..255 (EOF, or the 999
 *          placeholder) are not stored
 * cnext    the character following c
 * ws_only  if non-NULL, *ws_only is cleared when a character above 0x20
 *          is stored
 *
 * A CR is stored as LF.  If that CR is followed by LF, the LF is replaced
 * by the placeholder 999 so that a CR LF pair yields a single LF.
 * Returns cnext (possibly replaced by 999).
 */
static int
xml_data_put(solL_Buffer *B, int c, int cnext, int *ws_only)
{
    if (c < 0 || c >= 256)
        return cnext;           /* nothing to store */

    if (c == 0x0d)
    {
        c = 0x0a;
        if (cnext == 0x0a)
            cnext = 999;        /* swallow the LF of a CR LF pair */
    }

    solL_putchar(B, c);

    if (ws_only && c > 0x20)
        *ws_only = 0;

    return cnext;
}
/*
 * Collect data into B up to a delimiter of one to three characters.
 *
 * c3          first (already read) character of the data
 * d1,d2,d3    the delimiter; d3 is its last character, d2 and d1 the
 *             characters before it.  d2 == 0 means a one-character
 *             delimiter (d1 is then ignored), d1 == 0 with d2 != 0 a
 *             two-character one.
 * ws_only     passed through to xml_data_put()
 *
 * Characters travel through a three-stage queue prev2 <- prev1 <- c3 and
 * are only stored when they leave it, so the characters of the delimiter
 * never reach the buffer.  The buffer is finished with solL_pushresult().
 * Returns the character the scan stopped on: d3 when the delimiter was
 * found, EOF otherwise.
 */
static int
xml_data(solL_Buffer *B, FILE *f, int c3, int d1, int d2, int d3, int *ws_only)
{
    int prev2 = 999;    /* oldest queued character; 999 = empty slot */
    int prev1 = 999;    /* character read just before c3 */

    for (; c3 != EOF; c3 = fgetc(f))
    {
        int found = (c3 == d3);

        if (found && d2)
        {
            found = (prev1 == d2);
            if (found && d1)
                found = (prev2 == d1);
        }
        if (found)
            break;

        /* shift the queue, storing the character that drops out */
        prev2 = xml_data_put(B, prev2, prev1, ws_only);
        prev1 = c3;
    }

    /* flush the queued characters that are not part of the delimiter */
    if (!d1)
        prev1 = xml_data_put(B, prev2, prev1, ws_only);
    if (!d2)
        xml_data_put(B, prev1, c3, ws_only);

    solL_pushresult(B);
    return c3;
}
/*
 * Collect an attribute value into B.
 *
 * c is the first (already read) character of the value.  If it is a
 * single or double quote, everything up to the matching quote is stored
 * and the closing quote is consumed.  Otherwise characters are stored
 * until EOF, '>' or a character <= 0x20 is met.  Line ends are normalised
 * by xml_data_put().  The buffer is finished with solL_pushresult() and
 * trailing whitespace is skipped.  Returns the first character after
 * that whitespace (or EOF).
 */
static int
xml_string(solL_Buffer *B, FILE *f, int c)
{
    if (c != '"' && c != '\'')
    {
        /* unquoted value */
        while (c != EOF && c != '>' && c > 0x20)
            c = xml_data_put(B, c, fgetc(f), NULL);
    }
    else
    {
        const int quote = c;

        for (c = fgetc(f); c != EOF && c != quote; )
            c = xml_data_put(B, c, fgetc(f), NULL);
        if (c == quote)
            c = fgetc(f);       /* step over the closing quote */
    }

    solL_pushresult(B);
    return xml_spaces(f, c);
}
/*
 * Read one lexical item from f and leave its description on L's stack.
 *
 * Values left on the stack, in order, per item type:
 *   "BRDECL",   name, data     "<![" name "[" data "]]>"
 *   "COMMENT",  data           "<!--" data "-->"
 *   "DECL",     name, data     "<!" name data ">"
 *   "PI",       name, data     "<?" name data "?>"
 *   "ENDTAG",   name           "</" name
 *   "TAG",      name, attr     "<" name attributes
 *   "EMPTYTAG", name, attr     same, with a '/' after the attributes
 *   "SPACE",    data           text with no character above 0x20
 *   "DATA",     data           any other text up to the next '<'
 * attr is a table of attribute values keyed by attribute name, or nil
 * when the tag has no attributes.  Names are upper-cased by xml_token().
 *
 * Returns 0 at end of file, otherwise 2 or 3, matching the number of
 * values listed above.
 *
 * NOTE(review): the stack layout described here assumes that
 * solL_pushresult() pushes exactly one string and that sol_setvalue(L, i)
 * stores the top value at index i and pops it -- confirm against the
 * Sol API.
 */
static int
read_xml(sol_State *L, FILE *f)
{
solL_Buffer B[1];
int c;
solL_buffinit(L, B);
switch (c = fgetc(f))
{
case EOF:
return 0;
case '<':
/* markup: the character after '<' selects the kind */
switch (c = fgetc(f))
{
case '!':
switch (c = fgetc(f))
{
case '[':
/* "<![" name, optional '[', then data up to "]]>" */
sol_pushliteral(L, "BRDECL");
c = xml_token(B, f, fgetc(f));
if (c == '[')
c = fgetc(f);
xml_data(B, f, c, ']',']','>', NULL);
return 3;
case '-':
if ((c = fgetc(f)) == '-')
{
/* "<!--": data up to "-->" */
sol_pushliteral(L, "COMMENT");
xml_data(B, f, fgetc(f), '-','-','>', NULL);
return 2;
}
/* only one '-': put the second character back and treat the */
/* '-' as the first character of a declaration name */
ungetc(c, f);
c = '-';
/* fall through */
default:
/* "<!" name, then data up to '>' */
sol_pushliteral(L, "DECL");
c = xml_token(B, f, c);
xml_data(B, f, c, 0,0,'>', NULL);
return 3;
}
case '?':
/* "<?" name, then data up to "?>" */
sol_pushliteral(L, "PI");
c = xml_token(B, f, fgetc(f));
xml_data(B, f, c, 0,'?','>', NULL);
return 3;
case '/':
/* "</" name; the character following the name has already */
/* been read by xml_token() and is dropped here */
sol_pushliteral(L, "ENDTAG");
c = xml_token(B, f, fgetc(f));
return 2;
default:
/* start tag: the nil reserves the slot that receives the */
/* type string once it is known (see sol_setvalue below) */
sol_pushnil(L);
c = xml_token(B, f, c);
if (xml_istoken(c))
{
/* attributes present: collect them into a new table */
sol_newtable(L, 1);
do
{
c = xml_token(B, f, c);
if (c == '=')
c = xml_string(B, f, fgetc(f));
else
/* attribute without a value is stored as the number 1 */
sol_pushnumber(L, 1);
sol_rawset(L, -3);
} while (xml_istoken(c));
}
else
sol_pushnil(L);
if (c == '/')
{
sol_pushliteral(L, "EMPTYTAG");
/* read the character after '/' */
c = fgetc(f);
}
else
sol_pushliteral(L, "TAG");
/* move the type string into the slot reserved above */
sol_setvalue(L, -4);
return 3;
}
default:
{
/* character data up to (not including) the next '<' */
int ws_only = 1;
sol_pushnil(L);
c = xml_data(B, f, c, 0,0,'<', &ws_only);
/* leave the '<' in the stream for the next call */
if (c != EOF)
ungetc(c, f);
ws_only ? sol_pushliteral(L, "SPACE") : sol_pushliteral(L, "DATA");
sol_setvalue(L, -3);
return 2;
}
}
}
/***** end XML *****/
#endif
/*
 * Excerpt of io_read; the parts marked "[...]" are elided in this
 * listing.  Shown is the 'x' format case, compiled only when
 * WITH_XML_LEXER is defined: it hands the file to read_xml() and stores
 * the return value in success.
 */
static int
io_read(sol_State *L)
{
[...]
#ifdef WITH_XML_LEXER
case 'x': /* xml */
success = read_xml(L, f);
break;
#endif
[...]
}
Happy Hacking, ET.