Lua Project List To Xml |
|
WARNING: The program will read and interpret HTML. Future changes in the format of the list *WILL* break the parser.
Good coding!
-----BEGIN LUA CODE-----
#!/usr/bin/env lua
-- Lua Project List To XML - a demonstration of use of Lua for more
-- complex text parsing.
--
-- (c) 2004 Alexandre Erwin Ittner <aittner AT netuno.com.br>
--
--
-- This program reads the list of projects using Lua from the
-- Lua website and outputs the list as a valid XML document. This program
-- requires GNU wget to be installed on the system (it is available
-- for major Unix and Windows systems, and it is installed by default
-- in most Linux distributions).
--
-- WARNING: The program will read and interpret HTML. Future changes
-- in the format of the list *WILL* break the parser.
--
-- Download the project list with wget into a local file and load the whole
-- file into `s` for the parsing steps that follow.
--
-- NOTE: the result of os.execute is discarded, so a failing wget is only
-- noticed if the output file cannot be opened afterwards.
local fname = "uses.html"
os.execute("wget -q -O " .. fname .. " http://www.lua.org/uses.html")
local fp = io.open(fname, "r")
if fp == nil then
  -- Report on stderr so the message can never end up inside the XML that
  -- is written to stdout, and use os.exit so the shell really sees a
  -- non-zero status (a value returned from the main chunk is ignored by
  -- the standalone interpreter).
  io.stderr:write("Error opening file '" .. fname .. "'.\n")
  os.exit(1)
end
-- Read the entire page at once; the later steps treat it as one string.
local s = fp:read("*a")
fp:close()
-- Bring the raw HTML held in `s` into a predictable shape before parsing.
-- Each rule is a { pattern, replacement } pair handed to string.gsub; the
-- rules are applied strictly in the order listed because later rules rely
-- on the output of earlier ones.
local rewrite_rules = {
  -- Whitespace: join all lines, then remove optional spaces around the
  -- tag delimiters and between adjacent tags.
  { "\n", " " },
  { " *< *", " <" },
  { " *> *", "> " },
  { "> *<", "><" },
  -- Lowercase the text from "<" up to the first space or ">".
  { "(<[^ >]+)", string.lower },
  -- Drop images and whole script elements.
  { "<img[^>]*>", "" },
  { "<script[^>]*>.-</script>", "" },
  -- Links: lowercase everything up to an upper-case HREF, then discard
  -- whatever precedes the href attribute so every link starts "<a href=".
  { "(<a[^>]*HREF *=)", string.lower },
  { "<a[^>]*href *= *", "<a href=" },
}

for _, rule in ipairs(rewrite_rules) do
  s = string.gsub(s, rule[1], rule[2])
end
-- Walk every "<h3> ... <hr>" entry of the normalized page held in `s` and
-- print it as one <use> element of a <luauses> XML document on stdout.

-- string.gfind (Lua 5.0) was renamed string.gmatch in Lua 5.1 and is absent
-- from later versions; use whichever one this interpreter provides.
local gmatch = string.gmatch or string.gfind

-- Return only the captures (at most two) of the first match of `pattern`
-- in `text`; both results are nil when the pattern does not match.
local function find_captures(text, pattern)
  local _, _, first, second = string.find(text, pattern)
  return first, second
end

-- Escape the characters that an XML parser would read as markup so that
-- `text` can be placed safely inside an element. Text that already holds
-- HTML entities or tags is emitted as literal source text.
local function xml_escape(text)
  -- "&" has to be handled first, otherwise the ampersands introduced by
  -- the next two replacements would be escaped a second time.
  text = string.gsub(text, "&", "&amp;")
  text = string.gsub(text, "<", "&lt;")
  text = string.gsub(text, ">", "&gt;")
  return text
end

print("<?xml version=\"1.0\" encoding=\"iso-8859-1\" ?>")
print("<luauses>")
for entry in gmatch(s, "<h3>.-<hr>") do
  -- Current data format (without spaces and line-breaks):
  -- <h3>
  -- <a NAME="1" HREF="APPURL">APPNAM</a>
  -- <br><small><em>USER</em></small>
  -- </h3>
  -- DESCR [can have html here]
  -- <p> Contact: <a HREF="EMAIL">CONTACT</a>
  -- <hr>

  -- Every field is local and starts empty, so an entry with a missing
  -- part can neither inherit data from the previous entry nor feed nil
  -- into the string concatenations below.
  local appnam, appurl = "", ""
  local name, email = "", ""

  -- Application name and URL come from the link inside the heading.
  local app = find_captures(entry, "<h3>(.-)</h3>")
  if app then
    app = string.gsub(app, "</?em>", "")
    app = string.gsub(app, "<br>", "")
    local url, title = find_captures(app, "<a href=\"([^\"> ]*)\"[^>]*>([^<]*)<")
    if url == nil then
      -- The anchor has no href: keep its text and leave the URL empty.
      title = find_captures(app, "<a[^>]*>([^<]*)</a>")
      url = ""
    end
    appurl = url
    appnam = title or ""
  end

  -- User: content of <small>, with any nested tags removed.
  local user = find_captures(entry, "<small>(.-)</small>")
  if user then
    user = string.gsub(user, "</?.->", "")
  else
    user = ""
  end

  -- Description: everything between the heading and the rule, minus the
  -- trailing "Contact:" paragraph, which is split into name and e-mail.
  local desc = find_captures(entry, "</h3>(.-)<hr>")
  if desc then
    local cont = find_captures(desc, "<p> *Contact: *(.*)")
    if cont then
      desc = string.gsub(desc, "<p> *Contact:(.*)", "")
      cont = string.gsub(cont, "<p> *Contact: *", "")
      local address, person = find_captures(cont, "<a href=\"([^ \"]+)\"[^>]*>([^<]+)<")
      if person == nil then
        -- No link in the contact paragraph: use its text as the name.
        person = cont
        address = ""
      end
      name = person
      email = string.gsub(address, "mailto:/?/?", "")
    end
  else
    desc = ""
  end

  print(" <use>")
  print(" <app>" .. xml_escape(appnam) .. "</app>")
  print(" <url>" .. xml_escape(appurl) .. "</url>")
  print(" <user>" .. xml_escape(user) .. "</user>")
  print(" <desc>" .. xml_escape(desc) .. "</desc>")
  print(" <contact>" .. xml_escape(name) .. "</contact>")
  print(" <email>" .. xml_escape(email) .. "</email>")
  print(" </use>")
end
print("</luauses>")
-----END LUA CODE-----