Lua Project List To Xml

lua-users home
wiki

This sample program reads the list of projects using Lua from the Lua website and outputs the list as a valid XML document. It requires GNU wget to be installed on the system (it is available for all major Unix and Windows systems, and is installed by default in most Linux distributions).

WARNING: The program will read and interpret HTML. Future changes in the format of the list *WILL* break the parser.

#!/usr/bin/env lua

-- Download the "uses" page with wget into a local file, then load the whole
-- file into the string `s` for the parsing steps that follow.
local fname = "uses.html"
local status = os.execute("wget -q -O " .. fname .. " http://www.lua.org/uses.html")

-- os.execute reports success as 0 in Lua 5.0/5.1 and as true in Lua 5.2+.
-- A failed download is reported on stderr (stdout is reserved for the XML
-- output) but is not fatal: a previously saved copy of the file may exist.
if status ~= 0 and status ~= true then
  io.stderr:write("Warning: wget did not complete successfully; '" .. fname .. "' may be missing or incomplete.\n")
end

local fp = io.open(fname, "r")
if fp == nil then
  -- Diagnostics go to stderr so they never get mixed into the XML document.
  io.stderr:write("Error opening file '" .. fname .. "'.\n")
  os.exit(1)
end

-- Read the complete file contents in one go.
local s = fp:read("*a")
fp:close()

-- Normalisation rules, applied to the page text strictly in the order listed.
-- Each rule is { pattern, replacement }; the replacement is either a string
-- or a function that receives the captured text.
local rules = {
  -- Collapse line breaks and the optional spaces around tag delimiters.
  { "\n", " " },
  { " *< *", " <" },
  { " *> *", "> " },
  { "> *<", "><" },

  -- Lowercase the tag names.
  { "(<[^ >]+)", string.lower },

  -- Drop images and scripts.
  { "<img[^>]*>", "" },
  { "<script[^>]*>.-</script>", "" },

  -- "Normalize" links: lowercase everything up to an upper-case HREF, then
  -- strip any attributes in front of href so anchors start with `<a href=`.
  { "(<a[^>]*HREF *=)", string.lower },
  { "<a[^>]*href *= *", "<a href=" },
}

for _, rule in ipairs(rules) do
  s = string.gsub(s, rule[1], rule[2])
end


-- Escape the characters that are special in XML character data.
-- The extracted values are HTML source fragments; escaping them (including
-- any "&" that starts an HTML entity) keeps the generated document
-- well-formed regardless of what the page contains.
local function xml_escape(text)
  text = string.gsub(text, "&", "&amp;")
  text = string.gsub(text, "<", "&lt;")
  text = string.gsub(text, ">", "&gt;")
  return text
end

-- string.gfind is the Lua 5.0 name; it was renamed to string.gmatch in
-- Lua 5.1 and removed in 5.2. Use whichever one this interpreter provides.
local gmatch = string.gmatch or string.gfind

print("<?xml version=\"1.0\" encoding=\"iso-8859-1\" ?>")
print("<luauses>")

for tmp in gmatch(s, "<h3>.-<hr>") do

  -- Current data format (without spaces and line-breaks):
  --     <h3>
  --       <a NAME="1" HREF="APPURL">APPNAM</a>
  --       <br><small><em>USER</em></small>
  --     </h3>
  --       DESCR [can have html here]
  --       <p> Contact: <a HREF="EMAIL">CONTACT</a>
  --     <hr>

  -- All per-entry fields are locals that are re-created on every iteration,
  -- so a field missing from one entry can never inherit the previous
  -- entry's value.
  local appurl, appnam

  -- Application name and URL come from the anchor inside the heading.
  local _, _, app = string.find(tmp, "<h3>(.-)</h3>")
  if app then
    app = string.gsub(app, "</?em>", "")
    app = string.gsub(app, "<br>", "")
    _, _, appurl, appnam = string.find(app, "<a href=\"([^\"> ]*)\"[^>]*>([^<]*)<")
    if appurl == nil then
      -- Anchor without a usable href: keep the name, leave the URL empty.
      _, _, appnam = string.find(app, "<a[^>]*>([^<]*)</a>")
      appurl = ""
    end
  end
  -- Fall back to empty strings so the concatenations below cannot fail.
  appnam = appnam or ""
  appurl = appurl or ""

  -- User: the text of the <small> element with any nested tags removed.
  local _, _, user = string.find(tmp, "<small>(.-)</small>")
  if user then
    user = string.gsub(user, "</?.->", "")
  else
    user = ""
  end

  -- Description: everything between the heading and the closing <hr>.
  -- A trailing "Contact:" paragraph, when present, is split off into the
  -- contact name and e-mail address.
  local name, email = "", ""
  local _, _, desc = string.find(tmp, "</h3>(.-)<hr>")
  if desc then
    local _, _, cont = string.find(desc, "<p> *Contact: *(.*)")
    if cont then
      desc = string.gsub(desc, "<p> *Contact:(.*)", "")
      cont = string.gsub(cont, "<p> *Contact: *", "")
      _, _, email, name = string.find(cont, "<a href=\"([^ \"]+)\"[^>]*>([^<]+)<")
      if name == nil then
        -- No link in the contact paragraph: use its raw text as the name.
        name = cont
        email = ""
      end
      -- Strip the URL scheme so only the address itself remains.
      email = string.gsub(email, "mailto:/?/?", "")
    end
  else
    desc = ""
  end

  print(" <use>")
  print("  <app>" .. xml_escape(appnam) .. "</app>")
  print("  <url>" .. xml_escape(appurl) .. "</url>")
  print("  <user>" .. xml_escape(user) .. "</user>")
  print("  <desc>" .. xml_escape(desc) .. "</desc>")
  print("  <contact>" .. xml_escape(name) .. "</contact>")
  print("  <email>" .. xml_escape(email) .. "</email>")
  print(" </use>")
end

print("</luauses>")

-- AlexandreErwinIttner


RecentChanges · preferences
edit · history
Last edited May 28, 2007 10:29 pm GMT (diff)