[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: Any html scraping libraries?
- From: Leo Razoumov <slonik.az@...>
- Date: Sun, 10 Apr 2011 08:37:10 -0400
On Fri, Apr 8, 2011 at 16:35, Alexander Gladysh <agladysh@gmail.com> wrote:
> Hi, list!
>
> I'm looking for a Lua module to scrape some data from a (possibly
> broken) HTML page.
>
> Any usable ones out there?
Here is the "html.lua" [1] module I adapted from Roberto's XML parser by
allowing empty elements like <br>, <base>, etc. I am using it for HTML
data scraping and it is reasonably robust.
Hope it helps!
--Leo--
[1] attached file: html.lua
--! Purpose: parse HTML code.
-- Based upon Roberto Ierusalimschy simple XML parser
-- Adapted to parse HTML by Leo Razoumov
-- Send bug reports to Leo Razoumov <slonik.az@gmail.com>
-- Capture every global and library function used by this file as a local
-- BEFORE module(...) runs: per the note on that call, the old global space
-- is not reachable below it, so anything not captured here is unavailable.
local error,assert,type=
error,assert,type
local find, gsub, sub, concat, insert, remove=
string.find,string.gsub,string.sub,table.concat,table.insert,table.remove
-- wrap up module.
-- module(...) is the Lua 5.1 module function (deprecated in later Lua
-- versions); `...` is whatever the loader passed to this chunk (presumably
-- the module name when loaded through require -- confirm against callers).
module(...) --no old global space below this point
--------------------------------------------------------------------------------
-- Set of tag names that collect() treats as empty (void) elements, i.e. tags
-- that never open a nested level. Keys are matched exactly as spelled here.
local empty_elements = {}
do
  local void_names = {
    "area", "base", "br", "col", "frame", "hr", "img",
    "input", "link", "meta", "META", "option", "param",
  }
  for idx = 1, #void_names do
    empty_elements[void_names[idx]] = true
  end
end
--[[parseargs: extracts quoted attributes from the inside of a tag.
@s - the text between the tag name and the closing '>' of a tag
Returns a table mapping attribute name -> attribute value (quotes stripped).
Only attributes written as name="value" or name='value' are recognised;
unquoted values and bare (valueless) attributes are ignored.
--]]
local function parseargs(s)
  local arg = {}
  -- The name class includes '-', '_' and ':' so that names such as
  -- http-equiv, data-id or xml:lang are kept whole instead of being cut
  -- down to their last alphanumeric run. %2 forces the closing quote to be
  -- the same character as the opening one.
  gsub(s, "([%w_:%-]+)=([\"'])(.-)%2", function (w, _, a) arg[w] = a end)
  return arg
end
--[[collect: parses HTML code and returns the corresponding tree as a Lua table.
@s - HTML code as a string
Returns the root node: an array-like table whose entries are, in document
order, either text strings (whitespace-only text is dropped) or element nodes
of the form {element=<tag name>, xarg=<attribute table>, ...children...}.
Nodes created for empty (void) or self-closed tags also carry empty=1.
Tag names are compared case-insensitively, both when checking whether a tag
is a void element and when pairing an end tag with its start tag; each node
still stores the tag name exactly as it was written in the input.
Raises an error for an end tag with nothing open, for an end tag that does
not match the innermost open element, and for elements still open at the end.
--]]
local function collect(s)
  local stack = {}
  local top = {}
  insert(stack, top)
  local ni, c, element, xarg, empty
  local i, j = 1, 1
  while true do
    -- captures: c = "/" for an end tag, element = tag name,
    -- xarg = raw attribute text, empty = "/" for a self-closed tag
    ni, j, c, element, xarg, empty = find(s, "<(%/?)([%w:]+)(.-)(%/?)>", i)
    if not ni then break end
    -- HTML tag names are case-insensitive, so <BR> is as void as <br>.
    -- The exact-spelling lookup is tried first so any key of the table
    -- keeps working as written.
    if empty_elements[element] or empty_elements[element:lower()] then
      empty = '/'
    end
    -- text between the previous tag and this one
    local text = sub(s, i, ni-1)
    if not find(text, "^%s*$") then
      insert(top, text)
    end
    if empty == "/" then -- empty element tag
      -- NOTE(review): an end tag that names a void element (e.g. </br>)
      -- also takes this branch and is recorded as one more empty node.
      insert(top, {element=element, xarg=parseargs(xarg), empty=1})
    elseif c == "" then -- start tag
      top = {element=element, xarg=parseargs(xarg)}
      insert(stack, top) -- new level
    else -- end tag
      local toclose = remove(stack) -- remove top
      top = stack[#stack]
      if #stack < 1 then
        error("nothing to close with "..element)
      end
      -- accept <DIV>...</div>: only a genuinely different name is an error
      if toclose.element ~= element
         and toclose.element:lower() ~= element:lower() then
        error("trying to close "..toclose.element.." with "..element)
      end
      insert(top, toclose)
    end
    i = j+1
  end
  -- trailing text after the last tag
  local text = sub(s, i)
  if not find(text, "^%s*$") then
    insert(stack[#stack], text)
  end
  if #stack > 1 then
    error("unclosed "..stack[#stack].element)
  end
  return stack[1]
end
-- Export: publish the local collect function as a field of _M (the module
-- table that the Lua 5.1 module() call makes available under that name).
_M.collect = collect --export module function
--------------------------------------------------------------------------------
--|> EMACS customization section.
-- LOCAL VARIABLES:
-- mode: Lua
-- coding: utf-8
-- default-input-method: cyrillic-translit
-- fill-column: 88
-- tab-width: 4
-- END: