lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


On 21-Aug-16 16:45, Marc Balmer wrote:
I am working on code to easily create XML trees from Lua tables.  What would,
in your opinion, be a comfortable Lua table notation of XML trees, expressed
as Lua tables?

Some years ago I wrote a minimalist, gmatch-based XML parser, ~50 lines apart from testing code, to read .ods (Open Document Spreadsheet) files.

I never published it because it had limitations and I did not have the time to finish the job properly beyond my immediate needs, but I attach it here in case its approach to table organization (line 61) could have something useful for your design... perhaps just to see what is best avoided ;-)

The parser both builds an XML tree and calls an external handler (if any) when tags are encountered.

('+++' comments mean 'work in progress')

P.S. I put this code (from 2008) under the Lua license.

--
  Enrico
-- Demo driver: reads a sample XML file, parses it while echoing every
-- start/end/data event to stdout, then waits for a line on stdin and
-- dumps the resulting element tree.
-- Raises an error (with the OS message) if the sample file cannot be opened.
function Test()
    local fname = 'spazi.xml' -- +++ test +++
    -- fname = 'content.xml'

    -- io.open returns nil plus a message on failure; assert turns that into
    -- a readable error instead of "attempt to index a nil value"
    local f = assert(io.open(fname, 'r'))
    local txt = f:read('*a')
    f:close()

    -- echo the element name and all of its attributes at every start/empty tag
    local startf = function(element)
        print('start: ' .. element.name)
        for attr, val in pairs(element.attrib) do
            print('  ' .. attr .. ' = ' .. val)
        end
    end

    -- echo the element name at every end/empty tag
    local endf = function(element)
        print('end: ' .. element.name)
    end

    -- echo the character data chunk just stored in the element
    local dataf = function(element)
        print('data: ' .. element.data)
    end

    local root = Parse(txt, startf, endf, dataf)

    -- pause until the user presses Enter, then show the tree
    io.read()
    PrintTree(root) -- +++
end


---------------------------------------------------------------------------

-- todo: +++
--   handle CDATA (put away in table, get them later?)
--   note: space in character data is preserved (not really standard)
--   note: currently only latest data chunk is preserved in tree
--   numeric chars for escape currently not supported
--   handle comments?
--   numeric escape (e.g. &#160;) not supported because of UTF-8,
--    can be handled at content level
--   Print (use write)
--   test/demo files


-- +++ data --> text
-- +++ multiple text: use array for children & data, select on type

-- +++ children/data order is not preserved, use handlers +++
-- +++ is this a design problem?


---------------------------------------------------------------------------
-- parse xml text, build element tree,
-- call startTagHandler(element), if any, at every start or empty tag,
-- call endTagHandler(element), if any, at every end or empty tag,
-- call dataHandler(element), if any, at every character data chunk,
-- return root element of the created element tree:

--
-- element = {
--     parent = parent (nil for root)
--     name = name
--     data = character data (as string)
--     attrib = { attr=val, attr=val, ... }
--     [1] = child element
--     [2] = child element
--     [n] = child element
-- }

-- Parse XML text into a tree of element tables, optionally reporting events.
--   xmlText          string holding the XML document
--   startTagHandler  optional function(element), called at every start or empty tag
--   endTagHandler    optional function(element), called at every end or empty tag
--   dataHandler      optional function(element), called after the character data
--                    following a tag has been stored in element.data
-- Returns the synthetic root element (name 'root'); the parsed top-level
-- elements are its array children.
-- Raises (via assert) on mismatched start/end tags.
function Parse(xmlText, startTagHandler, endTagHandler, dataHandler)
    -- unescape function (numeric escape currently not supported)
    local escapeTable = {
        ['&amp;']='&', ['&lt;']='<', ['&gt;']='>', ['&apos;']="'", ['&quot;']='"'
    }
    local function unescape(s)
        -- unknown entities are left untouched (table lookup yields nil);
        -- the extra parentheses drop gsub's second result (the match count)
        return (string.gsub(s, '(%&%a+%;)', escapeTable))
    end

    -- start with root element
    local root = { parent=nil, name='root', data=nil, attrib={} }
    -- cursor into the tree; kept local so parsing does not leak a global
    local currentElement = root

    -- get (<startTag attributes> | <emptyTag/> | </endTag>) and following data (if any)
    local namePattern = '%a[%w%.%-%_%:%&%;]*' -- (primitive but enough for basic parsing)
    local tagPattern = '%<([%/]?)(' .. namePattern .. ')(.-)([%/]?)%>([^%<]*)' -- (5 captures)
    local attributePattern = '(' .. namePattern .. ')%s*%=%s*([\'\"])(.-)%2'
    for endTagChar, name, attributes, emptyTagChar, data in string.gmatch(xmlText, tagPattern) do
        local isEndTag = (endTagChar ~= '')
        local isEmptyTag = (emptyTagChar ~= '')
        assert(not (isEndTag and isEmptyTag), 'endTag-emptyTag conflict in: ' .. name)

        -- handle start tag or empty tag
        if (not isEndTag) then
            -- create a new child element, add it to current element
            local newElement = { parent=currentElement, name=name, data=nil, attrib={} }
            currentElement[#currentElement + 1] = newElement
            -- move to the new element
            currentElement = newElement
            -- add attributes and their values to current element
            if attributes ~= '' then
                local attribTable = currentElement.attrib
                -- captures: attribute name, quote character (unused), raw value
                for attr, _, val in string.gmatch(attributes, attributePattern) do
                    attribTable[attr] = unescape(val)
                end
            end
            -- call start tag handler, if any
            if startTagHandler then
                startTagHandler(currentElement)
            end
        end

        -- handle close tag or empty tag
        if (isEndTag or isEmptyTag) then
            -- the synthetic root can never be closed by the document: without
            -- this check a stray end tag named like it would set the cursor to nil
            assert((currentElement ~= root) and (name == currentElement.name),
                'inconsistent startTag/endTag in :' .. name)
            -- call end tag handler, if any
            if endTagHandler then
                endTagHandler(currentElement)
            end
            -- return to parent element
            currentElement = currentElement.parent
        end

        -- if any character data, store it and call data handler, if any
        -- (note: only last data chunk is stored, use handlers to get full data)
        if data and (data ~= '') then
            currentElement.data = unescape(data)
            if dataHandler then
                dataHandler(currentElement)
            end
        end
    end

    return root
end

---------------------------------------------------------------------------
-- recursively print element tree from given element
-- with optional indent step (default 2) and initial indent (default 0)

-- Recursively write an element tree to standard output via io.write.
--   element     element table with fields name, attrib, optional data,
--               and child elements in its array part
--   indentStep  optional number of spaces added per nesting level (default 2)
--   indent      optional indentation of this element (default 0)
-- Output per element: open tag (or empty tag), one '-attr=val' line per
-- attribute, the (latest) data chunk, the children, then the close tag.
function PrintTree(element, indentStep, indent)
    indentStep = indentStep or 2
    indent = indent or 0
    local spc = string.rep(' ', indent)

    -- print as empty tag only if there are no attributes, no children and no
    -- data; otherwise the data line would appear after a self-closed tag
    local hasData = not not element.data
    local emptyTag = (next(element.attrib) == nil) and (#element == 0) and (not hasData)

    -- show open tag or empty tag
    if emptyTag then
        io.write(spc, '<', element.name, '/>\n')
    else
        io.write(spc, '<', element.name, '>\n')
    end

    -- show attribute-value pairs, sorted by name because pairs() order is
    -- unspecified and would make the output vary between runs
    local attrNames = {}
    for attr in pairs(element.attrib) do
        attrNames[#attrNames + 1] = attr
    end
    table.sort(attrNames, function(a, b) return tostring(a) < tostring(b) end)
    for _, attr in ipairs(attrNames) do
        io.write(spc, '-', attr, '=', element.attrib[attr], '\n')
    end

    -- show (latest) data
    if hasData then
        io.write(spc, element.data, '\n')
    end

    -- show children
    for _, child in ipairs(element) do
        PrintTree(child, indentStep, indent + indentStep)
    end

    -- show close tag unless the element was printed as an empty tag
    if not emptyTag then
        io.write(spc, '</', element.name, '>\n')
    end
end

---------------------------------------------------------------------------

-- run the demo as soon as this file is executed
Test()