Lua Project List To Xml

lua-users home
wiki

Showing revision 2
This sample program reads the list of projects using Lua from the Lua website and outputs the list as a valid XML document. It requires GNU wget to be installed on the system (wget is available for major Unix and Windows systems, and is installed by default in most Linux distributions).

WARNING: The program will read and interpret HTML. Future changes in the format of the list *WILL* break the parser.

Good coding!

-- AlexandreErwinIttner

-----BEGIN LUA CODE-----


#!/usr/bin/env lua

--  Lua Project List To XML - a demonstration of use of Lua for more
--  complex text parsing.
--
--  (c) 2004 Alexandre Erwin Ittner  <aittner AT netuno.com.br>
--
--
--  This program reads the list of projects using Lua from the
--  Lua website and outputs the list as a valid XML document. It
--  requires GNU wget to be installed on the system (wget is available
--  for major Unix and Windows systems, and is installed by default
--  in most Linux distributions).
--
--  WARNING: The program will read and interpret HTML. Future changes
--  in the format of the list *WILL* break the parser.
--

-- Download the "uses" page with wget into a local file, then load the
-- whole file into the global `s` for the parsing steps that follow.
fname = "uses.html"
os.execute("wget -q -O " .. fname .. " http://www.lua.org/uses.html")

fp = io.open(fname, "r")
if fp == nil then
  -- Report on stderr so the message never ends up inside the XML written to
  -- stdout, and exit explicitly: the standalone interpreter does not turn a
  -- value returned from the main chunk into a process exit status.
  io.stderr:write("Error opening file '" .. fname .. "'.\n")
  os.exit(1)
end

s = fp:read("*a")
fp:close()

-- An empty file most likely means the download failed; stop here rather
-- than emit an XML document that silently contains no entries.
if s == nil or s == "" then
  io.stderr:write("Error: file '" .. fname .. "' is empty (download failed?).\n")
  os.exit(1)
end

-- Normalise the raw HTML held in the global `s` so the simple patterns used
-- later can match it reliably.

-- Turn every line break (LF or CR) and tab into a plain space, then remove
-- the optional spaces around tag delimiters. The patterns below only skip
-- literal spaces, so a stray CR or tab would otherwise stop them matching.
s = string.gsub(s, "[\r\n\t]", " ")
s = string.gsub(s, " *< *", " <")
s = string.gsub(s, " *> *", "> ")
s = string.gsub(s, "> *<", "><")

-- Put all the tag names in lowercase ("<H3" -> "<h3", "</A" -> "</a").
s = string.gsub(s, "(<[^ >]+)", string.lower)

-- Remove images, scripts, etc.
s = string.gsub(s, "<img[^>]*>", "")
s = string.gsub(s, "<script[^>]*>.-</script>", "")

-- "Normalize" links for future use: everything between "<a" and the value
-- of its href attribute (e.g. a preceding NAME attribute) is dropped, so
-- every link starts with exactly `<a href=`. The attribute name is matched
-- case-insensitively so that "HREF", "href" and "Href" are all recognised.
s = string.gsub(s, "<a[^>]*[Hh][Rr][Ee][Ff] *= *", "<a href=")


-- Emit the XML document on stdout: one <use> element for every project
-- entry found in the normalised HTML held in the global `s`.

-- string.gfind was renamed to string.gmatch in Lua 5.1 and is absent from
-- later versions; use whichever iterator this interpreter provides.
local gmatch = string.gmatch or string.gfind

-- Escape the characters that are markup-significant in XML character data.
-- "&" must be handled first so the entities produced for "<" and ">" are
-- not escaped a second time.
local function xmlescape(text)
  text = string.gsub(text, "&", "&amp;")
  text = string.gsub(text, "<", "&lt;")
  text = string.gsub(text, ">", "&gt;")
  return text
end

print("<?xml version=\"1.0\" encoding=\"iso-8859-1\" ?>")
print("<luauses>")

for tmp in gmatch(s, "<h3>.-<hr>") do

  -- Current data format (without spaces and line-breaks):
  --     <h3>
  --       <a NAME="1" HREF="APPURL">APPNAM</a>
  --       <br><small><em>USER</em></small>
  --     </h3>
  --       DESCR [can have html here]
  --       <p> Contact: <a HREF="EMAIL">CONTACT</a>
  --     <hr>

  -- Every field is local to the entry and starts out empty, so a pattern
  -- that fails to match can neither leak a value from the previous entry
  -- nor leave a nil that would break the concatenations below.
  local appnam, appurl = "", ""
  local user, desc = "", ""
  local name, email = "", ""
  local i, f  -- match positions returned by string.find; never used

  -- Application name and URL come from the link inside the heading.
  local app
  i, f, app = string.find(tmp, "<h3>(.-)</h3>")
  if app then
    app = string.gsub(app, "</?em>", "")
    app = string.gsub(app, "<br>", "")
    local url, nam
    i, f, url, nam = string.find(app, "<a href=\"([^\"> ]*)\"[^>]*>([^<]*)<")
    if url == nil then
      -- The anchor carries no href: keep its text and leave the URL empty.
      i, f, nam = string.find(app, "<a[^>]*>([^<]*)</a>")
      url = ""
    end
    appnam = nam or ""
    appurl = url
  end

  -- User/organisation: the text of the <small> element with tags stripped.
  local rawuser
  i, f, rawuser = string.find(tmp, "<small>(.-)</small>")
  if rawuser then
    user = string.gsub(rawuser, "</?.->", "")
  end

  -- Description is everything between the heading and the rule; an
  -- optional trailing "Contact:" paragraph is split off into name/email.
  local body
  i, f, body = string.find(tmp, "</h3>(.-)<hr>")
  if body then
    local cont
    i, f, cont = string.find(body, "<p> *Contact: *(.*)")
    if cont then
      body = string.gsub(body, "<p> *Contact:(.*)", "")
      -- Drop any further "Contact:" marker left inside the captured text.
      cont = string.gsub(cont, "<p> *Contact: *", "")
      i, f, email, name = string.find(cont, "<a href=\"([^ \"]+)\"[^>]*>([^<]+)<")
      if name == nil then
        -- No link in the contact paragraph: use its text as the name.
        name = cont
        email = ""
      else
        email = string.gsub(email, "mailto:/?/?", "")
      end
    end
    desc = body
  end

  -- All values are escaped on output so the document stays well-formed
  -- even when a name, URL or contact contains "&", "<" or ">".
  print(" <use>")
  print("  <app>" .. xmlescape(appnam) .. "</app>")
  print("  <url>" .. xmlescape(appurl) .. "</url>")
  print("  <user>" .. xmlescape(user) .. "</user>")
  print("  <desc>" .. xmlescape(desc) .. "</desc>")
  print("  <contact>" .. xmlescape(name) .. "</contact>")
  print("  <email>" .. xmlescape(email) .. "</email>")
  print(" </use>")
end

print("</luauses>")

-----END LUA CODE-----


RecentChanges · preferences
edit · history · current revision
Edited January 18, 2005 2:24 am GMT (diff)