lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Premshree Pillai wrote:
>
> Is there a simple XML parser (preferably DOM) for Lua?
> 
> My requirements are very simple:
> - I need to be able to get node values for elements
> (docroot['some_ele'][2].text)
> - I need to be able to get attribute values
> (docroot['some_ele'].attribute('foo'))
> 
> Any help would be appreciated. Thanks!

I wrote one for Sol (my hacked version of Lua) to parse EPG
(electronic program guide) data.  It's very simple and will
break when the XML isn't well-formed but it does the job.

You get nearly exactly the tree you want.  Your two examples
translated:

	docroot.SOME_ELE[2].data
	docroot.SOME_ELE[2].attr.FOO

The lexer is implemented in C and hooked into the read function
as read"*xml".  The tree building is done in Sol.

Conversion to Lua shouldn't be difficult.

Here's the Sol part:

----------------------------------------------------------------------------
-- Unscramble, unzip and parse file returning a parse tree

-- Parse the scrambled, gzipped XML file `fname` into a tree of nested tables.
--
-- The file is piped through "./unscramble" and "gunzip" and tokenized with
-- f:read"*xml", which yields (tok, name, val) triples.  The resulting tree:
--   * child elements are stored under their tag name as a list:
--	 parent.NAME[i]
--   * attributes of an element are stored in its `attr` field (the `val`
--	 returned by the lexer for TAG/EMPTYTAG tokens)
--   * character data of an element is concatenated into its `data` field
--
-- Raises (via assert) if the pipe cannot be opened, if parsing stopped on
-- an end tag instead of end of input, or if the root has no EXPORT element.
--
-- NOTE(review): `fname` is interpolated unquoted into a shell command; do
-- not pass untrusted file names without quoting/validating them first.
local function parse(fname)
    local f, tok, name, val
    local node

    -- Token handlers, indexed by token name.  A handler returning a true
    -- value ends the node currently being built; the others return nothing
    -- and the read loop continues.
    local map = {
	function EOF(ctxt)
	    return 1
	end
	function ENDTAG(ctxt)
	    return 1
	end
	function PI(ctxt)
	    -- ignore
	end
	function DECL(ctxt)
	    -- ignore
	end
	function BRDECL(ctxt)
	    -- ignore
	end
	function COMMENT(ctxt)
	    -- ignore
	end
	function SPACE(ctxt)
	    -- ignore
	end
	function DATA(ctxt)
	    -- for DATA tokens `name` carries the text itself
	    ctxt.data = (ctxt.data or "") .. name
	end
	function TAG(ctxt)
	    if not ctxt[name] then
		ctxt[name] = {}
	    end
	    -- recurse: node() consumes tokens up to the matching end tag
	    ctxt[name]:append(node{ attr = val })
	end
	function EMPTYTAG(ctxt)
	    if not ctxt[name] then
		ctxt[name] = {}
	    end
	    -- no content to read, so no recursion
	    ctxt[name]:append{ attr = val }
	end
    }

    -- Read tokens into `ctxt` until a handler signals the end of this node.
    function node(ctxt)
	while 1 do
	    tok, name, val = f:read"*xml"
	    -- a nil token (nothing read) is dispatched as EOF
	    if map[tok or 'EOF'](ctxt) then
		break
	    end
	end
	return ctxt
    end

    f = assert(File.popen("./unscramble <%s | gunzip":with(fname), "r"))
    local root = node{}
    f:close()
    -- the top level must end on EOF (tok == nil), not on a stray end tag
    assert(tok == nil, fname..": malformed tvm file (bad end tag)")
    assert(root.EXPORT, fname..": malformed tvm file (no <Export>)")
    return root
end

----------------------------------------------------------------------------

And this is the C part from iolib.c:

#ifdef WITH_XML_LEXER
/*
 * XML lexer
 */

/*
 * Skip whitespace/control characters.
 * Starting with the already-read character c, keep reading from f while
 * the current character is <= 0x20.  Returns the first character above
 * 0x20, or EOF if the stream ends first.
 */
static int
xml_spaces(FILE *f, int c)
{
    for (;;)
    {
	if (c == EOF || c > 0x20)
	    return c;
	c = fgetc(f);
    }
}

/*
 * Return 1 if c may appear in a token (name): an ASCII letter, an ASCII
 * digit, or one of '_', '-', '.', ':'.  Return 0 otherwise.
 */
static int
xml_istoken(int c)
{
    if (c >= 'a' && c <= 'z')
	return 1;
    if (c >= 'A' && c <= 'Z')
	return 1;
    if (c >= '0' && c <= '9')
	return 1;

    switch (c)
    {
	case '_':
	case '-':
	case '.':
	case ':':
	    return 1;
	default:
	    return 0;
    }
}

/*
 * Collect a token into B, starting with the already-read character c.
 * ASCII lowercase letters are stored as uppercase; everything else that
 * xml_istoken() accepts is stored unchanged.  The buffer result is pushed
 * (possibly empty when c is not a token character), then trailing
 * whitespace is skipped.
 * Returns the first character after the token and any whitespace.
 */
static int
xml_token(solL_Buffer *B, FILE *f, int c)
{
    for (; xml_istoken(c); c = fgetc(f))
    {
	int ch = c;

	if (c >= 'a' && c <= 'z')
	    ch = c - 'a' + 'A';
	solL_putchar(B, ch);
    }
    solL_pushresult(B);
    return xml_spaces(f, c);
}

/*
 * Append one data character to B, normalizing line ends.
 *
 * c       character to emit; values outside 0..255 (EOF, or the 999
 *         placeholder) are silently skipped
 * cnext   the character following c in the input
 * ws_only optional flag (may be NULL); cleared when a character above
 *         0x20 is emitted
 *
 * A CR is emitted as LF; if the following character is an LF, it is
 * replaced by the 999 placeholder so that CR LF yields a single LF.
 * Returns cnext (possibly replaced by 999).
 */
static int
xml_data_put(solL_Buffer *B, int c, int cnext, int *ws_only)
{
    if (c < 0 || c >= 256)
	return cnext;

    if (c == 0x0d)
    {
	c = 0x0a;
	if (cnext == 0x0a)
	    cnext = 999;
    }
    solL_putchar(B, c);
    if (ws_only && c > 0x20)
	*ws_only = 0;
    return cnext;
}

/*
 * Copy characters from f into B until a terminator sequence or EOF is
 * reached, then push the buffer result.
 *
 * c3          first character of the data (already read)
 * d1, d2, d3  terminator sequence; d3 is its last character, d2 and d1
 *             the ones before it.  A value of 0 for d2 or d1 means "not
 *             used", giving a 2- or 1-character terminator.
 * ws_only     passed through to xml_data_put(); may be NULL
 *
 * Returns the last character read: d3 when the terminator was found,
 * EOF otherwise.
 */
static int
xml_data(solL_Buffer *B, FILE *f, int c3, int d1, int d2, int d3, int *ws_only)
{
    int c1 = 999, c2 = 999;	/* 'c1<-c2<-c3' are a 3 char fifo */

    while (c3 != EOF)
    {
	/* stop once the newest chars in the fifo match the terminator */
	if (d3==c3 && (!d2 || (d2==c2 && (!d1 || d1==c1))))
	    break;
	/* emit the oldest char; the return value shifts c2 into c1 */
	c1 = xml_data_put(B, c1, c2, ws_only);
	c2 = c3;
	c3 = fgetc(f);
    }
    /*
     * Flush the fifo slots that are not reserved for terminator chars.
     * With a non-zero d1/d2 the pending c1/c2 are not emitted, also
     * when the loop ended on EOF.
     */
    if (!d1)
	c2 = xml_data_put(B, c1, c2, ws_only);
    if (!d2)
	xml_data_put(B, c2, c3, ws_only);
    solL_pushresult(B);
    return c3;
}


/*
 * Read an attribute value into B, starting with the already-read
 * character c.
 *
 * If c is a single or double quote, everything up to the matching quote
 * (or EOF) is collected and the closing quote is consumed.  Otherwise the
 * value is unquoted and runs until EOF, '>' or a character <= 0x20.
 * The buffer result is pushed, then trailing whitespace is skipped.
 * Returns the first character after the value and any whitespace.
 */
static int
xml_string(solL_Buffer *B, FILE *f, int c)
{
    if (c != '"' && c != '\'')
    {
	/* unquoted value */
	while (c != EOF && c != '>' && c > 0x20)
	    c = xml_data_put(B, c, fgetc(f), NULL);
    }
    else
    {
	const int quote = c;

	for (c = fgetc(f); c != EOF && c != quote; )
	    c = xml_data_put(B, c, fgetc(f), NULL);
	if (c == quote)
	    c = fgetc(f);
    }

    solL_pushresult(B);
    return xml_spaces(f, c);
}


/*
 * Read one XML token from f and push its description on the stack.
 *
 * Returns the number of values pushed (0 at EOF):
 *   "BRDECL",   name, data     <![name[ ... ]]>
 *   "COMMENT",  data           <!-- ... -->
 *   "DECL",     name, data     <!name ... >
 *   "PI",       name, data     <?name ... ?>
 *   "ENDTAG",   name           </name>
 *   "TAG",      name, attr     <name ...>
 *   "EMPTYTAG", name, attr     <name ... />
 *   "SPACE",    data           character data with nothing above 0x20
 *   "DATA",     data           any other character data
 * attr is a table mapping attribute names to values (1 for attributes
 * without '='), or nil when the tag has no attributes.  Names are pushed
 * by xml_token(), which stores lowercase ASCII letters as uppercase.
 *
 * NOTE(review): presumably the return value is used by io_read as the
 * number of results of read"*xml" -- confirm against the caller.
 */
static int
read_xml(sol_State *L, FILE *f)
{
    solL_Buffer B[1];
    int c;

    solL_buffinit(L, B);
    switch (c = fgetc(f))
    {
	case EOF:
	    return 0;

	case '<':
	    switch (c = fgetc(f))
	    {
		case '!':
		    switch (c = fgetc(f))
		    {
			case '[':
			    sol_pushliteral(L, "BRDECL");
			    c = xml_token(B, f, fgetc(f));
			    if (c == '[')
				c = fgetc(f);
			    xml_data(B, f, c, ']',']','>', NULL);
			    return 3;

			case '-':
			    if ((c = fgetc(f)) == '-')
			    {
				sol_pushliteral(L, "COMMENT");
				xml_data(B, f, fgetc(f), '-','-','>', NULL);
				return 2;
			    }
			    /* "<!-" not followed by '-': handle as DECL */
			    ungetc(c, f);
			    c = '-';
			    /* fall through */
			default:
			    sol_pushliteral(L, "DECL");
			    c = xml_token(B, f, c);
			    xml_data(B, f, c, 0,0,'>', NULL);
			    return 3;
		    }
		    /* every branch of the switch above returns */
		case '?':
		    sol_pushliteral(L, "PI");
		    c = xml_token(B, f, fgetc(f));
		    xml_data(B, f, c, 0,'?','>', NULL);
		    return 3;

		case '/':
		    sol_pushliteral(L, "ENDTAG");
		    c = xml_token(B, f, fgetc(f));
		    return 2;

		default:
		    /* placeholder for the token type, filled in below */
		    sol_pushnil(L);
		    c = xml_token(B, f, c);
		    if (xml_istoken(c))
		    {
			/* attribute list: build a name -> value table */
			sol_newtable(L, 1);
			do
			{
			    c = xml_token(B, f, c);
			    if (c == '=')
				c = xml_string(B, f, fgetc(f));
			    else
				sol_pushnumber(L, 1);
			    sol_rawset(L, -3);
			} while (xml_istoken(c));
		    }
		    else
			sol_pushnil(L);
		    if (c == '/')
		    {
			sol_pushliteral(L, "EMPTYTAG");
			c = fgetc(f);
		    }
		    else
			sol_pushliteral(L, "TAG");
		    /*
		     * NOTE(review): presumably moves the literal into the
		     * placeholder slot pushed first -- confirm sol_setvalue
		     * semantics.
		     */
		    sol_setvalue(L, -4);
		    return 3;
	    }

	default:
	{
	    /* character data up to (not including) the next '<' */
	    int ws_only = 1;

	    /* placeholder for the token type, filled in below */
	    sol_pushnil(L);
	    c = xml_data(B, f, c, 0,0,'<', &ws_only);
	    /* leave the '<' in the stream for the next call */
	    if (c != EOF)
		ungetc(c, f);
	    ws_only ? sol_pushliteral(L, "SPACE") : sol_pushliteral(L, "DATA");
	    sol_setvalue(L, -3);
	    return 2;
	}
    }
}

/***** end XML *****/
#endif

static int
io_read(sol_State *L)
{
    [...]
#ifdef WITH_XML_LEXER
		case 'x':	/* xml */
		    success = read_xml(L, f);
		    break;
#endif
    [...]
}


Happy Hacking, ET.