String Trim

lua-users home
wiki

There are many ways to implement the "trim" function [1] in Lua:

-- trim implementations

function trim1(s)
   return (s:gsub("^%s*(.-)%s*$", "%1"))
end
-- from PiL2 20.4

function trim2(s)
   return s:match "^%s*(.-)%s*$"
end
-- variant of trim1 (match)

function trim3(s)
   return s:gsub("^%s+", ""):gsub("%s+$", "")
end
-- two gsub's

function trim4(s)
   return s:match"^%s*(.*)":match"(.-)%s*$"
end
-- variant of trim3 (match)

function trim5(s)
   return s:match'^%s*(.*%S)' or ''
end
-- warning: has bad performance when s:match'^%s*$' and #s is large

function trim6(s)
   return s:match'^()%s*$' and '' or s:match'^%s*(.*%S)'
end
-- fixes performance problem in trim5.
-- note: the '()' avoids the overhead of default string capture.
-- This overhead is small, ~ 10% for successful whitespace match call
-- alone, and may not be noticeable in the overall benchmarks here,
-- but there's little harm either.  Instead replacing the first `match`
-- with a `find` has a similar effect, but that requires localizing
-- two functions in the trim7 variant below.

local match = string.match
function trim7(s)
   return match(s,'^()%s*$') and '' or match(s,'^%s*(.*%S)')
end
-- variant of trim6 (localize functions)

local find = string.find
local sub = string.sub
function trim8(s)
   local i1,i2 = find(s,'^%s*')
   if i2 >= i1 then
      s = sub(s,i2+1)
   end
   local i1,i2 = find(s,'%s*$')
   if i2 >= i1 then
      s = sub(s,1,i1-1)
   end
   return s
end
-- based on penlight 0.7.2

function trim9(s)
   local _, i1 = find(s,'^%s*')
   local i2 = find(s,'%s*$')
   return sub(s, i1 + 1, i2 - 1)
end
-- simplification of trim8

function trim10(s)
   local a = s:match('^%s*()')
   local b = s:match('()%s*$', a)
   return s:sub(a,b-1)
end
-- variant of trim9 (match)

function trim11(s)
   local n = s:find"%S"
   return n and s:match(".*%S", n) or ""
end
-- variant of trim6 (use n position)
-- http://lua-users.org/lists/lua-l/2009-12/msg00904.html

function trim12(s)
   local from = s:match"^%s*()"
   return from > #s and "" or s:match(".*%S", from)
end
-- variant of trim11 (performs better for all
-- whitespace string). See Roberto's comments
-- on ^%s*$" v.s. "%S" performance:
-- http://lua-users.org/lists/lua-l/2009-12/msg00921.html

do
   local lpeg = require("lpeg")
   local space = lpeg.S' \t\n\v\f\r'
   local nospace = 1 - space
   local ptrim = space^0 * lpeg.C((space^0 * nospace^1)^0)
   local match = lpeg.match
   function trim13(s)
      return match(ptrim, s)
   end
end
-- lpeg.  based on http://lua-users.org/lists/lua-l/2009-12/msg00921.html

do
   local lpeg = require("lpeg")
   local re = require("re")
   local ptrim = re.compile"%s* {(%s* %S+)*}"
   local match = lpeg.match
   function trim14(s)
      return match(ptrim, s)
   end
end
-- variant with re module.

require 'trim'
local trim15 = trim
-- C implementation (see separate trim.c file)


-- test utilities

local function trimtest(trim)
   assert(trim'' == '')
   assert(trim' ' == '')
   assert(trim'  ' == '')
   assert(trim'a' == 'a')
   assert(trim' a' == 'a')
   assert(trim'a ' == 'a')
   assert(trim' a ' == 'a')
   assert(trim'  a  ' == 'a')
   assert(trim'  ab cd  ' == 'ab cd')
   assert(trim' \t\r\n\f\va\000b \r\t\n\f\v' == 'a\000b')
end

local function perftest(f, s)
   local time = os.clock  -- os.time or os.clock
   local t1 = time()
   for i=1,100000 do
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
      f(s)
   end
   local dt = time() - t1
   io.stdout:write(("%4.1f"):format(dt) .. " ")
end

local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7,
               trim8, trim9, trim10, trim11, trim12, trim13, trim14, trim15}

-- correctness tests
for _,trim in ipairs(trims) do
   trimtest(trim)
end

-- performance tests
for j = 1, 3 do
   for i,trim in ipairs(trims) do
      io.stdout:write(string.format("%2d",i) .. ": ")
      perftest(trim,  "")
      perftest(trim,  "abcdef")
      perftest(trim,  "   abcdef   ")
      perftest(trim,  "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef")
      perftest(trim,  "  a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ")
      perftest(trim,  "                               a                            ")
      perftest(trim,  "                                                            ")
      print()
   end
end

Optional C module for test15 based on LuaList:2009-12/msg00951.html:

/* trim.c - based on http://lua-users.org/lists/lua-l/2009-12/msg00951.html
            from Sean Conner */
#include <stddef.h>
#include <ctype.h>
#include <lua.h>
#include <lauxlib.h>

int trim(lua_State* L) {
   const char* front;
   const char* end;
   size_t size;

   front = luaL_checklstring(L,1,&size);
   end = &front[size - 1];

   while (size && isspace((unsigned char)*front)) {
      size--;
      front++;
   }

   while (size && isspace((unsigned char)*end)) {
      size--;
      end--;
   }

   lua_pushlstring(L,front,(size_t)(end - front) + 1);
   return 1;
}

int luaopen_trim(lua_State* L) {
   lua_register(L, "trim", trim);
   return 0;
}

Test results (lower numbers = faster):

Lua 5.1.4/Cygwin1.7
 1:  0.9  1.9  2.1  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.2  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.1  1.2  1.3  2.8 77.6
 6:  0.6  1.2  1.6  1.6  1.8  4.1  1.7
 7:  0.6  1.1  1.5  1.5  1.6  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.7  3.0  2.4
 9:  1.2  2.0  2.7  8.0  9.3 21.2  3.4
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  1.9
 2:  0.7  1.6  1.8  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.4  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.2  2.8 78.2
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.5  7.5  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.5  9.8 10.8  2.8  2.5
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.4  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3
 1:  0.9  1.9  2.0  9.6 10.3  2.2  2.0
 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8
 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1
 4:  1.1  2.2  2.2 10.1 11.2  2.7  2.2
 5:  0.6  0.9  1.2  1.2  1.3  2.8 77.3
 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7
 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6
 8:  1.0  1.7  2.6  7.4  9.6  3.1  2.3
 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3
10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6
11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5
12:  0.7  1.3  1.6  1.6  1.8  3.0  1.8
13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0
15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3

LuaJIT 2.0.0-beta2/Cygwin1.7
 1:  0.6  1.5  1.5  7.7  8.3  1.3  1.2
 2:  0.4  1.2  1.2  7.1  7.8  1.1  1.0
 3:  0.6  1.0  1.2  3.1  4.9  1.7  1.3
 4:  0.8  1.6  1.8  8.4  9.0  1.9  1.3
 5:  0.4  0.6  0.8  1.2  1.2  2.3 99.2
 6:  0.4  0.9  1.1  1.5  1.5  3.2  0.9
 7:  0.3  0.8  1.1  1.4  1.5  3.0  0.8
 8:  0.6  1.2  1.6  5.3  6.8  1.7  1.3
 9:  0.7  1.2  1.8  5.6  6.7 14.4  1.7
10:  0.9  1.6  1.7  7.6  8.3  1.5  1.5
11:  0.3  0.8  1.1  1.4  1.5  2.9  2.1
12:  0.4  0.9  1.1  1.5  1.5  2.7  0.9
13:  0.6  0.7  0.7  1.0  1.4  0.8  0.7
14:  0.6  0.7  0.7  1.0  1.4  0.8  0.8
15:  0.1  0.1  0.1  0.3  0.3  0.2  0.2
...

The speed of trim5 is relatively favorable or competitive over the data set except it is quite poor in the case of a long string with only whitespace. The use of ".*" (rather than a non-greedy ".-") quickly plows to the end of a long string, and the "%S" then triggers backtracking to avoid the trailing whitespace, but the juxtaposition of "%s*" and ".*" imposes substantial backtracking (O(N^2)) if no "%S" match is found. trim6 handles the worst cast condition specially and behaves well over the entire data set. trim7 is a variant that localizes functions, which gives a bit larger code size and does not give a substantial speed improvement. (On further testing, each localization gives maybe 10% in best case involving empty string data and inlining the function call, but here there are two calls to match). trim11 is also a variant of trim6 with similar performance characteristics, perhaps slightly better, except speed is about half in the case of a long string with only whitespace (this is fixed in trim12). trim13 (or its variant trim14) is an lpeg (LuaPeg) implementation, which generally meets or exceeds the performance of the best pattern matching implementations, particularly exceeding in the case of lots of whitespace, but it is a bit slower in its worst case condition of alternating space and whitespace. The C implementation (trim15) is by far the fastest over the entire data set.

Reusing the same strings across all test iterations, as done above, might not properly gauge the effect of temporary string creation due to Lua's string interning. A quick test with local ss = {}; for i = 1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end as a string cache doesn't seem to affect the basic conclusions above.

--DavidManura

See Also


RecentChanges · preferences
edit · history
Last edited April 15, 2022 3:02 am GMT (diff)