Lua Project List To Xml

lua-users home
wiki

Difference (from prior major revision) (minor diff, author diff)

Changed: 1c1
This sample program reads the list of projects using Lua from the Lua website and outputs the list as a valid XML document. It requires GNU wget to be installed on the system (it is available for major Unix and Windows systems, and is installed by default in most Linux distributions).
This sample program reads the list of projects using Lua from the Lua website and outputs the list as a valid XML document. It requires GNU wget to be installed on the system (it is available for major Unix and Windows systems, and is installed by default in most Linux distributions).

Changed: 5,15c5
Good coding!

-- AlexandreErwinIttner




-----BEGIN LUA CODE-----

{{{

{{{!Lua

Changed: 18,34c8
-- Lua Project List To XML - a demonstration of use of Lua for more
-- complex text parsing.
--
-- (c) 2004 Alexandre Erwin Ittner <aittner AT netuno.com.br>
--
--
-- This program will read the list of projects using Lua from the
-- Lua website and outputs the list as a valid XML document. This program
-- requires GNU wget installed in the system (it's available
-- for major Unix and Windows systems, and it's are installed as default
-- in most Linux distributions.
--
-- WARNING: The program will read and interpret HTML. Future changes
-- in the format of the list *WILL* break the parser.
--

fname = "uses.html"
local fname = "uses.html"

Changed: 37c11
fp = io.open(fname, "r")
local fp = io.open(fname, "r")

Changed: 40c14
return 1;
os.exit(1)

Changed: 43c17
s = fp:read("*a")
local s = fp:read("*a")

Changed: 78c52
i, f, app = string.find(tmp, "<h3>(.-)</h3>")
local i, f, app = string.find(tmp, "<h3>(.-)</h3>")

Changed: 137c111
-----END LUA CODE-----
-- AlexandreErwinIttner

This sample program reads the list of projects using Lua from the Lua website and outputs the list as a valid XML document. It requires GNU wget to be installed on the system (it is available for major Unix and Windows systems, and is installed by default in most Linux distributions).

WARNING: The program will read and interpret HTML. Future changes in the format of the list *WILL* break the parser.

#!/usr/bin/env lua

-- Download the "uses" page with wget into a local file, then read the whole
-- file into the string `s` for parsing.
local fname = "uses.html"
local status = os.execute("wget -q -O " .. fname .. " http://www.lua.org/uses.html")

-- os.execute returns the numeric exit status (0 on success) in Lua 5.0/5.1
-- and `true` on success in Lua 5.2+. Anything else means wget failed, and
-- "-O" may still have left an empty or partial file behind, so stop here
-- rather than emit an empty document.
if status ~= 0 and status ~= true then
  io.stderr:write("Error downloading 'http://www.lua.org/uses.html' with wget.\n")
  os.exit(1)
end

local fp = io.open(fname, "r")
if fp == nil then
  -- Diagnostics go to stderr: stdout carries the generated XML document.
  io.stderr:write("Error opening file '" .. fname .. "'.\n")
  os.exit(1)
end

local s = fp:read("*a")
fp:close()

-- Normalise the downloaded markup so the entry parser can rely on a single
-- spelling of each tag. The rewrites are applied strictly in the order
-- listed; each entry is { pattern, replacement }.
local rewrites = {
  -- Fold line breaks and drop the optional spaces around tag delimiters.
  { "\n", " " },
  { " *< *", " <" },
  { " *> *", "> " },
  { "> *<", "><" },

  -- Lowercase every tag name (the text from "<" up to the first space or ">").
  { "(<[^ >]+)", string.lower },

  -- Strip images and scripts.
  { "<img[^>]*>", "" },
  { "<script[^>]*>.-</script>", "" },

  -- Reduce anchors to the canonical form <a href=...: lowercase the part up
  -- to an uppercase HREF, then discard whatever precedes the href attribute.
  { "(<a[^>]*HREF *=)", string.lower },
  { "<a[^>]*href *= *", "<a href=" },
}

for _, rule in ipairs(rewrites) do
  s = string.gsub(s, rule[1], rule[2])
end


-- Escape the characters that are markup-significant in XML character data.
-- "&" must be replaced first so the entities produced for "<" and ">" are
-- not escaped a second time.
local function xml_escape(text)
  text = string.gsub(text, "&", "&amp;")
  text = string.gsub(text, "<", "&lt;")
  text = string.gsub(text, ">", "&gt;")
  return text
end

-- string.gfind is the Lua 5.0 name; it was renamed to string.gmatch in 5.1
-- and removed in 5.2. Use whichever one this interpreter provides.
local gmatch = string.gmatch or string.gfind

print("<?xml version=\"1.0\" encoding=\"iso-8859-1\" ?>")
print("<luauses>")

for tmp in gmatch(s, "<h3>.-<hr>") do

  -- Current data format (without spaces and line-breaks):
  --     <h3>
  --       <a NAME="1" HREF="APPURL">APPNAM</a>
  --       <br><small><em>USER</em></small>
  --     </h3>
  --       DESCR [can have html here]
  --       <p> Contact: <a HREF="EMAIL">CONTACT</a>
  --     <hr>

  -- Per-entry fields. They are local and start empty on every iteration so
  -- that an entry with a missing part neither inherits the previous entry's
  -- value nor reaches the output as nil.
  local appnam, appurl = "", ""
  local user, desc = "", ""
  local name, email = "", ""

  -- Application name and URL, taken from the heading.
  local _, _, app = string.find(tmp, "<h3>(.-)</h3>")
  if app then
    app = string.gsub(app, "</?em>", "")
    app = string.gsub(app, "<br>", "")
    local _, _, url, nam = string.find(app, "<a href=\"([^\"> ]*)\"[^>]*>([^<]*)<")
    if url == nil then
      -- No usable href: fall back to the text of the anchor alone.
      _, _, nam = string.find(app, "<a[^>]*>([^<]*)</a>")
      url = ""
    end
    appurl = url
    appnam = nam or ""
  end

  -- User: the contents of <small>, with any nested tags removed.
  local _, _, rawuser = string.find(tmp, "<small>(.-)</small>")
  if rawuser then
    user = string.gsub(rawuser, "</?.->", "")
  end

  -- Description: everything between the heading and the rule. A trailing
  -- "Contact:" paragraph is split off into contact name and e-mail.
  local _, _, rawdesc = string.find(tmp, "</h3>(.-)<hr>")
  if rawdesc then
    local _, _, cont = string.find(rawdesc, "<p> *Contact: *(.*)")
    if cont then
      rawdesc = string.gsub(rawdesc, "<p> *Contact:(.*)", "")
      cont = string.gsub(cont, "<p> *Contact: *", "")
      local _, _, addr, who = string.find(cont, "<a href=\"([^ \"]+)\"[^>]*>([^<]+)<")
      if who == nil then
        -- Contact given without a link: keep the text as the name.
        who = cont
        addr = ""
      end
      name = who
      email = string.gsub(addr, "mailto:/?/?", "")
    end
    desc = rawdesc
  end

  -- Every field is escaped on output so the document stays well-formed
  -- even when a name, URL or contact contains "&", "<" or ">".
  print(" <use>")
  print("  <app>" .. xml_escape(appnam) .. "</app>")
  print("  <url>" .. xml_escape(appurl) .. "</url>")
  print("  <user>" .. xml_escape(user) .. "</user>")
  print("  <desc>" .. xml_escape(desc) .. "</desc>")
  print("  <contact>" .. xml_escape(name) .. "</contact>")
  print("  <email>" .. xml_escape(email) .. "</email>")
  print(" </use>")
end

print("</luauses>")

-- AlexandreErwinIttner


RecentChanges · preferences
edit · history
Last edited May 28, 2007 9:29 pm GMT (diff)