lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]



On 19-Dec-06, at 10:00 AM, dcharno wrote:

You're right. I rarely split on anything other than commas or semicolons. Here is a version that seems to handle the perl examples I could find and was a little shorter than other samples I saw:

   function strsplit(s, delim)
     local t = {}
     local pat = "(.-)" .. delim .. "(.+)"
     local c1, c2
     repeat
       c1, c2 = string.match(s, pat)
       t[#t+1] = c1 or s
       s = c2
     until c1 == nil
     return t
     end

Don't use that in production code. It's behaviour is quadratic:

for i = 5000, 50000, 5000 do
  io.write(("%5d repetitions"):format(i))
  time(strsplit, ("a,"):rep(i), ",")
end

 5000 repetitions  0.227 seconds
10000 repetitions  0.938 seconds
15000 repetitions  2.773 seconds
20000 repetitions  5.336 seconds
25000 repetitions  8.633 seconds
30000 repetitions 12.758 seconds
35000 repetitions 18.141 seconds
40000 repetitions 24.516 seconds
45000 repetitions 31.914 seconds
50000 repetitions 41.047 seconds

It's even worse if the last segment is long -- you should never use '.-' in a pattern if you don't know that whatever follows it will match. In this time trial, the target string has no separators at all:

for i = 5000, 50000, 5000 do
  io.write(("%5d repetitions"):format(i))
  time(strsplit, ("a"):rep(i), ",")
end

 5000 repetitions  0.688 seconds
10000 repetitions  2.961 seconds
15000 repetitions  7.727 seconds
20000 repetitions 14.828 seconds
25000 repetitions 24.875 seconds
30000 repetitions 38.203 seconds
35000 repetitions 54.641 seconds
40000 repetitions 73.992 seconds
45000 repetitions 95.609 seconds
50000 repetitions120.781 seconds


Not that this whole thing was really about string splitting anyways ...

Not at all. But for what it's worth, here's my entry. It produces an iterator, which seems more Lua-like to me. However, to make a fair comparison, I added a collector function:

collect = {}
function collect.keys(f, o, s)
  local rv = {}
  for k in f, o, s do rv[#rv+1] = k end
  return rv
end
-- Another useful one is collect.vals

-- Here's the split function itself. The pattern can be
-- any Lua pattern without captures.
function string:split(pat)
  local st = 1
  local g = self:gmatch("()"..pat.."()")
  local function splitter(self)
    if st then
      local s, f = g()
      local rv = self:sub(st, (s or 0)-1)
      st = f
      return rv
    end
  end
  return splitter, self
end

-- And an adaptor:
function strsplit(s, sep) return collect.keys(s:split(sep)) end

-- Timings:
for i = 5000, 50000, 5000 do
  io.write(("%5d repetitions"):format(i))
  time(strsplit, ("a,"):rep(i), ",")
end
 5000 repetitions  0.008 seconds
10000 repetitions  0.008 seconds
15000 repetitions  0.023 seconds
20000 repetitions  0.023 seconds
25000 repetitions  0.031 seconds
30000 repetitions  0.039 seconds
35000 repetitions  0.047 seconds
40000 repetitions  0.047 seconds
45000 repetitions  0.055 seconds
50000 repetitions  0.062 seconds

-- No separators in long string:
for i = 5000, 50000, 5000
  do io.write(("%5d repetitions"):format(i))
  time(splitstr, ("a"):rep(i), ",")
end
 5000 repetitions  0.000 seconds
10000 repetitions  0.000 seconds
15000 repetitions  0.000 seconds
20000 repetitions  0.000 seconds
25000 repetitions  0.000 seconds
30000 repetitions  0.000 seconds
35000 repetitions  0.008 seconds
40000 repetitions  0.000 seconds
45000 repetitions  0.000 seconds
50000 repetitions  0.000 seconds

-- For completeness:

function time(fn, ...)
  collectgarbage"collect"
  local now = os.clock()
  fn(...)
  print(("%7.3f seconds"):format(os.clock() - now))
end