lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


Yet Another Pure Lua XML Parser

At work, I needed a pure-lua XML parser that produced results similar to the LOM parser included with LuaExpat. I wrote one, stealing ideas from LOM and the pure-lua parser seen on the Wiki. What makes mine special?

* Strips out comments and processing directives
* Unescapes XML entities (&amp;, &lt;, &gt;, &quot;, &apos;) in text and attributes
* (optionally) Strips leading/trailing whitespace from text elements
* (optionally) Creates an 'innertext' property that holds the concatenation of all child text for each element.
* (optionally) Allows access to child elements by name:
	* mylomnode.elementname -- gives the first child element with that name
	* mylomnode.elements.elementname -- gives an array of all children with that name
* Sort of handles CDATA sections (doesn't strip whitespace from them, but DOES incorrectly translate entities inside them)
* Provides a method for giving single-element string representations

Simple usage:
local theRoot = AKLOM.parse( '<root><message string="Hello World"/></root>' )
print( theRoot.message.attr.string )

local theRoot = AKLOM.parse( '<root><message>Hello <null/> World</message></root>' )
print( theRoot[1][1] )
print( theRoot.message.innertext )


I include the code below for your review and comments.

-- package.seeall lets code in this module fall back to the standard globals
-- (string, table, assert, error, pairs). Without it, every global lookup made
-- after module() resolves inside the new, empty AKLOM table and yields nil.
module( 'AKLOM', package.seeall )

-- Module-level options; parse() reads them each time it is called.

-- Creates the 'elements' collection and named access to the first child element
useElementCollectionFlag = true

-- Creates an 'innertext' property that is the sum of all text objects
useInnerTextFlag = true

-- Strips all leading/trailing whitespace between nodes and text
stripWhitespaceFlag = true

-- Local aliases for the library functions used by the functions below
local sub, gsub, find, push, pop = string.sub, string.gsub, string.find, table.insert, table.remove

--- Replaces the five predefined XML entities in a string with the characters
-- they represent.
-- '&amp;' is translated last so that an escaped ampersand such as '&amp;lt;'
-- produces the literal text '&lt;' instead of being unescaped twice into '<'.
-- @param inString the text or attribute value to unescape
-- @return the unescaped string (exactly one value)
function unescape( inString )
	inString = gsub( inString, '&lt;', '<' )
	inString = gsub( inString, '&gt;', '>' )
	inString = gsub( inString, '&quot;', '"' )
	inString = gsub( inString, '&apos;', "'" )
	-- The parentheses drop gsub's second result (the substitution count) so
	-- that it cannot leak into a caller's argument list.
	return ( gsub( inString, '&amp;', '&' ) )
end

--- Parses an XML string into a LOM-style tree of tables.
-- Each element is a table with 'name', 'attr' (attribute name -> unescaped
-- value) and its children (strings for text, tables for elements) stored at
-- the integer keys 1..n. Depending on the module flags an element may also
-- carry 'elements' (child name -> array of children with that name), a named
-- reference to the first child of each name, and 'innertext'.
-- Comments and processing directives are removed before parsing. CDATA
-- markers are simply deleted, so CDATA content is parsed like ordinary text.
-- Text found outside the first top-level element (e.g. a DOCTYPE) is ignored,
-- and parsing stops once the first top-level parent element has been closed.
-- @param inXMLString the XML document as a string
-- @return the first top-level element, or nil if no element was found
-- Raises an error on a mismatched or unexpected close tag, and when the input
-- ends while an element is still open.
function parse( inXMLString )
	-- Throw out SGML comments and processing directives
	inXMLString = gsub( inXMLString, '<!%-%-.-%-%->', '' )
	inXMLString = gsub( inXMLString, '<%?.-%?>', '' )

	if stripWhitespaceFlag then
		-- Throw out leading and trailing whitespace in text blocks
		inXMLString = gsub( inXMLString, '>%s+', '>' )
		inXMLString = gsub( inXMLString, '%s+<', '<' )
	end

	-- Drop the CDATA markers but keep what they enclose
	inXMLString = gsub( inXMLString, '<!%[CDATA%[', '' )
	inXMLString = gsub( inXMLString, '%]%]>', '' )

	local theDoc = useElementCollectionFlag and { elements={} } or { }
	local theCurrentElement = theDoc

	-- Open elements, innermost last. The depth is tracked explicitly because
	-- table.insert/table.remove do not maintain an 'n' field on stock Lua 5.1.
	local theStack = {}
	local theDepth = 0

	-- Index of the last character consumed so far (0 = nothing consumed yet)
	local thePos = 0
	local theStart, theEnd, theClose, theName, theAttr, theEmpty
	local theLeadingText
	while true do
		-- Tag names accept the same characters as the attribute names below
		theStart, theEnd, theClose, theName, theAttr, theEmpty = find( inXMLString, '<(%/?)([%a_:][%w._:-]*)(.-)(%/?)>', thePos + 1 )
		if not theStart then break end

		local theIsParentFlag = ( theEmpty == '' )

		-- Text between the previous tag and this one belongs to the open
		-- element; at document level there is no element to attach it to.
		if theCurrentElement ~= theDoc then
			theLeadingText = unescape( sub( inXMLString, thePos + 1, theStart - 1 ) )
			if theLeadingText ~= '' then
				push( theCurrentElement, theLeadingText )
				if useInnerTextFlag then
					theCurrentElement.innertext = ( theCurrentElement.innertext or '' ) .. theLeadingText
				end
			end
		end

		thePos = theEnd

		if theClose ~= '' then
			if theDepth == 0 then
				error( "Found close element '"..theName.."' but no element is open" )
			end

			-- An element that collected no text gets no 'innertext' at all
			if useInnerTextFlag and theCurrentElement.innertext == '' then
				theCurrentElement.innertext = nil
			end

			-- Pop the innermost open element and check it matches the tag
			local theClosedElement = theStack[ theDepth ]
			theStack[ theDepth ] = nil
			theDepth = theDepth - 1
			assert( theName == theClosedElement.name, "Found close element '"..theName.."', expected '"..theClosedElement.name.."'" )

			-- Resume in the parent; closing the outermost element ends parsing
			theCurrentElement = theStack[ theDepth ]
			if not theCurrentElement then break end
		else
			local theElement = {
				name      = theName,
				attr      = {},
				elements  = useElementCollectionFlag and {} or nil,
				innertext = ( useInnerTextFlag and theIsParentFlag ) and '' or nil
			}

			-- Parse the attribute string; gsub is used only to visit each match
			gsub(
				theAttr,
				'([%a_:][%w._:-]*)%s*=%s*([\'"])(.-)%2',
				function( inAttName, _, inAttValue )
					theElement.attr[ inAttName ] = unescape( inAttValue )
				end
			)

			-- Add the element to the parent
			push( theCurrentElement, theElement )
			if useElementCollectionFlag then
				-- Named access refers to the first child with that name only
				if not theCurrentElement[ theName ] then
					theCurrentElement[ theName ] = theElement
				end
				if not theCurrentElement.elements[ theName ] then
					theCurrentElement.elements[ theName ] = {}
				end
				push( theCurrentElement.elements[ theName ], theElement )
			end

			-- Only elements that are not self-closing can receive children
			if theIsParentFlag then
				theDepth = theDepth + 1
				theStack[ theDepth ] = theElement
				theCurrentElement = theElement
			end
		end
	end

	if theDepth > 0 then
		error( "AKLOM parsing ended early; I was still inside the '".. (theStack[theDepth].name).."' element." )
	end

	return theDoc[ 1 ]
end

--- Builds a one-line description of a single LOM element: its name, the
-- number of direct children (text and element nodes stored at 1..n) and its
-- attributes. Children are counted, not printed.
-- Attribute order follows pairs() and is therefore unspecified; attribute
-- values are written as stored, without re-escaping.
-- @param inLOM an element table with 'name' and 'attr' fields
-- @return a string such as '<msg (2 children) id="7">'
function lomstring( inLOM )
	-- '#' replaces table.getn, which is deprecated in Lua 5.1 and gone in 5.2
	local theParts = { "<" .. inLOM.name .. " (" .. #inLOM .. " children)" }
	for k,v in pairs( inLOM.attr ) do
		theParts[ #theParts + 1 ] = ' ' .. k .. '="' .. v .. '"'
	end
	theParts[ #theParts + 1 ] = '>'
	return table.concat( theParts )
end