lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]

On Fri, Apr 8, 2011 at 16:35, Alexander Gladysh <> wrote:
> Hi, list!
> I'm looking for a Lua module to scrape some data from a (possibly
> broken) HTML page.
> Any usable ones out there?

Here is the "html.lua" [1] module, which I adapted from Roberto's XML parser by
allowing empty elements like <br>, <base>, etc. I am using it for HTML
data scraping and it is reasonably robust.

Hope it helps!

[1]  attached file: html.lua
--! Purpose: parse HTML code.
-- Based upon Roberto Ierusalimschy's simple XML parser
-- Adapted to parse HTML by Leo Razoumov
-- Send bug reports to Leo Razoumov <>

local error,assert,type= error,assert,type
local      find,       gsub,       sub,      concat,      insert,      remove=
    string.find,string.gsub,string.sub,table.concat,table.insert,table.remove

-- wrap up module.
-- module(...) is called with the chunk's varargs. Everything the code below
-- needs from the standard library is captured in locals before this call.
-- NOTE(review): module() is the Lua 5.1 module idiom and is deprecated in
-- later Lua versions; confirm the target Lua version before porting.
module(...) --no old global space below this point

-- Set of HTML empty (void) elements, keyed by tag name. collect() consults
-- this table to decide that a tag has no content and needs no end tag.
-- "META" is kept next to "meta" for callers that look tags up case-sensitively.
-- NOTE(review): "option" is not a void element in the HTML standard; it is
-- presumably listed because its end tag is often omitted -- confirm.
local empty_elements= { --lists all HTML empty (void) elements
	br      = true,
	img     = true,
	meta    = true,
	META    = true,
	frame   = true,
	area    = true,
	hr      = true,
	base    = true,
	col     = true,
	link    = true,
	input   = true,
	option  = true,
	param   = true,
	-- remaining void elements of the HTML standard
	embed   = true,
	source  = true,
	track   = true,
	wbr     = true,
}

--[[parseargs: extracts quoted attributes from the inside of a tag
    @s - attribute portion of a tag, e.g. ' href="x.html" title="y"'
    Returns a table mapping attribute name -> attribute value.
    Only name="value" and name='value' pairs are recognized; the name must be
    made of alphanumeric characters and the value ends at the first matching
    quote. Anything else in @s is ignored.
]]
local function parseargs(s)
    local arg = {}
    -- %2 refers back to the opening quote, so a value ends at the same kind
    -- of quote that started it.
    gsub(s, "(%w+)=([\"'])(.-)%2", function (w, _, a) arg[w] = a end)
    return arg
end

--[[collect: parses HTML code and returns corresponding tree as a Lua table
    @s - HTML code as a string
    Returns the root node, a table whose array part holds the top-level
    content in document order. Each item is either a string (a chunk of text
    that is not whitespace-only) or an element node:
        {element=<tag name>, xarg=<attribute table>, [empty=1], ...children}
    Empty (void or self-closed) elements carry empty=1 and have no children.
    Raises an error when an end tag has nothing to close, when an end tag
    does not match the innermost open element, or when an element is left
    unclosed at the end of input.
]]
local function collect(s)
    local stack = {}
    local top = {}
    insert(stack, top)
    local ni,c,element,xarg, empty
    local i, j = 1, 1
    while true do
        -- captures: c = "/" for an end tag, element = tag name,
        -- xarg = raw attribute text, empty = "/" for a self-closed tag
        ni,j,c,element,xarg, empty = find(s, "<(%/?)([%w:]+)(.-)(%/?)>", i)
        if not ni then break end
        -- HTML tag names are case-insensitive, so look the name up lower-cased
        local is_void = empty_elements[element:lower()]
        local text = sub(s, i, ni-1)
        if not find(text, "^%s*$") then
            insert(top, text)
        end
        if c == "/" and is_void then
            -- end tag of a void element (e.g. </option>, </br>): the element
            -- was already emitted by its start tag, so add nothing here
        elseif is_void or empty == "/" then    -- empty element tag
            insert(top, {element=element, xarg=parseargs(xarg), empty=1})
        elseif c == "" then     -- start tag
            top = {element=element, xarg=parseargs(xarg)}
            insert(stack, top)  -- new level
        else  -- end tag
            local toclose = remove(stack)  -- remove top
            top = stack[#stack]
            if #stack < 1 then
                error("nothing to close with "..element)
            end
            if toclose.element ~= element then
                error("trying to close "..toclose.element.." with "..element)
            end
            insert(top, toclose)
        end
        i = j+1
    end
    -- trailing text after the last tag
    local text = sub(s, i)
    if not find(text, "^%s*$") then
        insert(stack[#stack], text)
    end
    if #stack > 1 then
        error("unclosed "..stack[#stack].element)
    end
    return stack[1]
end
-- Publish collect as a field of _M.
-- NOTE(review): _M is presumably the module table set up by module(...) -- confirm.
_M.collect = collect --export module function

--|> EMACS customization section.
-- mode:        Lua
-- coding:      utf-8
-- default-input-method: cyrillic-translit
-- fill-column: 88
-- tab-width:   4
-- END: