Splay Ropes

lua-users home
wiki

Difference (from prior major revision) (minor diff)

Changed: 16,242c16
The rope library:

--------------rope.lua------------
--
-- Splay ropes
-- Implemented by Rici Lake; released into the public domain, April 2004.
--
-- The splay implementation was inspired by Chris Okasaki, and splay trees themselves
-- were invented by Daniel Sleator and Robert Tarjan.

-- I don't think the application to interval maps is original either,
-- but I thought it was interesting.

-- This is not an industrial strength implementation: splay is recursive and
-- the stack can overflow; moreover, a realistic implementation would probably
-- attempt to ensure a minimum node size, in part to avoid that problem.

local version = "0.2"

local slen, ssub = string.len, string.sub

return function()
local rope = {}

-- splay(rope, pivot)
-- returns r1, str, idx, r2 such that:
-- either pivot is in str (starting at pivot - idx - 1) (yes, that is negative)
-- or pivot > idx and r2 is nil
-- This function will fail without warning if idx < 1; it is not intended for
-- public consumption.
local function splay(r, pivot)
local a, s, x, b = r()
if pivot <= x - slen(s) then
local a1, t, y, a2 = a()
if pivot <= y - slen(t) then
local small, u, z, big = splay(a1, pivot)
return small, u, z, tuple(big, t, y - z, tuple(a2, s, x - y, b))
elseif pivot <= y then
return a1, t, y, tuple(a2, s, x - y, b)
else
local small, u, z, big = splay(a2, pivot - y)
return tuple(a1, t, y, small), u, y + z, tuple(big, s, x - y - z, b)
end
elseif pivot <= x or not b then return a, s, x, b
else
pivot = pivot - x
local b1, t, y, b2 = b()
if pivot <= y - slen(t) then
local small, u, z, big = splay(b1, pivot)
return tuple(a, s, x, small), u, x + z, tuple(big, t, y - z, b2)
elseif pivot <= y or not b2 then
return tuple(a, s, x, b1), t, x + y, b2
else
local small, u, z, big = splay(b2, pivot - y)
return tuple(tuple(a, s, x, b1), t, x + y, small), u, x + y + z, big
end
end
end

local function len(r)
local accum = 0
while r do
local _, _, x, b = r()
r, accum = b, accum + x
end
return accum
end

local function new(s)
if s ~= "" then return tuple(nil, s, slen(s), nil) end
end

local function rr(a, b)
if a and b then
local b1, s, x, b2 = splay(b, 1)
return tuple(a, s, x + len(a), b2)
else
return a or b
end
end

local function rsr(a, s, b)
if s == "" then return rr(a, b)
else return tuple(a, s, len(a) + slen(s), b)
end
end

local function upto(r, j)
if r and j then
if j <= 0 then return nil end
local a, s, x, b = splay(r, j)
if j <= x then r = tuple(a, ssub(s, 1, j - x - 1), j, nil) end
end
return r
end

local function from(r, i)
if r and i and i > 0 then
local a, s, x, b = splay(r, i)
r = i <= x and tuple(nil, ssub(s, i - x - 1), x - i + 1, b)
end
return r
end

local function sub(r, i, j)
if i and j then
if i <= j then return from(upto(r, j), i) end
else return upto(r, i)
end
end

local function replace(r, i, j, s)
r = r and tuple(splay(r, i or 1))
if type(s) == "string" then
return rsr(upto(r, i - 1), s, from(r, j + 1))
else
return rr(upto(r, i - 1), rr(s, from(r, j + 1)))
end
end

-- This is just like table.concat; the table is a table of strings,
-- and sep is also a string.
local function concat(t, sep, st, fi)
st, fi = st or 1, fi or table.getn(t)
if st <= fi then
local rv = new(t[fi])
if sep and sep ~= "" then
local lsep = slen(sep)
for i = fi - 1, st, -1 do
rv = tuple(new(t[i]), sep, lsep + slen(t[i]), rv)
end
else
for i = fi - 1, st, -1 do
rv = tuple(nil, t[i], slen(t[i]), rv)
end
end
return rv
end
end

-- This is just like string.rep; but it attempts to maximise node
-- sharing, demonstrating a curious feature of functional splays
local function rep(s, n)
local rv
if n > 0 then
local lens = slen(s)
local k, l, prev = 1, lens
while k < n do
prev = tuple(prev, s, l, prev)
l, k = 2 * l, 2 * k + 1
end
rv = prev
local lenr = (n + 1) * lens - l
while lenr > 0 do
repeat
local a, s, x, b = prev()
prev, l = a, x
until l <= lenr
lenr = lenr - l
rv = tuple(prev, s, l, rv)
end
end
return rv
end

local function nextseg(_, r)
if r then
local a, s, x, b = splay(r, 1)
return b or false, s
end
end

-- segs(r) returns an internal key and successive string segments of r;
-- the key return should be ignored (it will be false on the last segment,
-- but I am not going to guarantee that just yet.)

-- If you can arrange for the argument to not be stored in a local variable,
-- the iteration will not hold onto any reference to the rope, so gradual garbage
-- collection would be possible. However, the only way I know of to do this is
-- for the argument to segs to be returned from some function.
local function segs(r) return nextseg, nil, r end

-- Unfortunately, tuples don't have metamethods. But this will flatten the
-- rope. For outputting, for example, it is probably better to pass
-- successive segments to an output function.
local function tostring(r)
local t = {n=0}
for _, s in segs(r) do table.insert(t, s) end
return table.concat(t)
end

-- debug
local function view(r)
if r then
local a, s, x, b = r()
return string.format("(%s %q:%d %s)", view(a), s, x, view(b))
else
return "-"
end
end

local rope = {
__VERSION = version,
new = new,
len = len,
from = from,
upto = upto,
sub = sub,
tconcat = concat,
concat = function(...) return concat(arg) end,
rep = rep,
replace = replace,
segs = segs,
tostring = tostring,
-- debug
splay = splay,
upto = upto,
from = from,
view = view
}

return rope
end


And here is a little library that uses it:

If you don't want to recompile Lua, you can implement the tuple function as follows:

Changed: 244,247c18,20
---------kmp.lua-----------
--
-- A small example of using ropes
-- (and maybe not a very good one)
local tuplemeta = {__call = table.unpack}
function tuple(...) return setmetatable(arg, tuplemeta) end
}}}

Changed: 249c22
local slen, ssub = string.len, string.sub
It will be slower and use more memory, but it is pure Lua.

Changed: 251,270c24
return function()
-- returns a function which matches p; it accepts characters one
-- at a time, and returns the length of the match (i.e. string.len(p))
-- when it is fed the last matching character.

-- The algorithm was taken more or less straight out of Sedgewick,
-- adjusted to be 1-based and to be inline rather than a string scan.
-- There are ways to make this work with wildcards and even intervals,
-- but this is just an example.
local function kmp(p)
-- We compute the transition vector and other goodies, and then
-- return a custom matcher function. It is convenient to also
-- split the pattern into a table
local P, M = {}, slen(p); for i = 1, M do P[i] = ssub(p, i, i) end
local i, j, next = 1, 0, {0}
while i < M do
while j > 0 and P[i] ~= P[j] do j = next[j] end
i, j = i + 1, j + 1
next[i] = (P[i] == P[j] and next[j]) or j
end
The rope library really got too big to fit on a Wiki, so I put it in the files area, along with a little Knuth Morris Pratt search/replace function as an example of how to use ropes (although character by character searching in Lua is not fast.)

Changed: 272,278c26
j = 1 -- used as closure
return function(ch)
while j > 0 and ch ~= P[j] do j = next[j] end
j = j + 1
if j > M then j = 1; return M end
end
end
To load the libraries, use LTN-11 style: (local) rope = dofile"rope.lua"()

Changed: 280,298c28
-- Given a rope r, a pattern string p and a replacement string q,
-- returns a rope with all occurrences of p to q.
-- (p and q are strings, not ropes,
-- but that could be fixed easily enough.)
local function replaceAll(r, p, q)
local rv = r
local scanner = kmp(p)
local j = 0
for _, seg in rope.segs(r) do
for i = 1, slen(seg) do
local m = scanner(ssub(seg, i, i))
if m then
rv = rope.replace(rv, j + i - m + 1, j + i, q)
end
end
j = j + slen(seg)
end
return rv
end
Download the libraries here: Files:wiki_insecure/users/rici/rope.lua and here: Files:wiki_insecure/users/rici/ropesearch.lua

Changed: 300,307c30
local sr = {
kmp = kmp,
replaceAll = replaceAll
}

return sr
end
}}}
Ropes are particularly useful for large text strings where there are random changes at localised points; that would make them ideal to implement a text editor, for example. Functional splay trees also have the interesting feature that you can simply keep a reference to the rope at any time, and later "return" to it; consequently maintaining an undo history is trivial. A single localised change is not likely to create more than a few splay nodes.

Added: 308a32
The splay ropes are implemented using an interval map: each node in the tree contains a left and right pointer, a string, and the relative position of the end of the string (this could have been the relative position of the beginning of the string or the total size of the node plus all its descendants; the three formulations are computationally equivalent.) This means that a node can be in more than one rope, or even more than one place in the same rope -- an extreme example of that is shown by the implementation of rope.rep, which uses intensive node-sharing to allow it to create ropes of size much greater than RAM: rope.rep("a", 1E40) takes a few milliseconds and uses about 10 kb, but it really is the representation of a string with 10^40 characters. IMHO, this is the sort of datastructure you end up with when you embrace immutable objects and garbage collection, instead of trying to implement C datatypes in a functional garbage collected environment.

Changed: 310c34
To load the libraries, use LTN-11 style: (local) rope = dofile"rope.lua"()
Another interesting fact about interval maps is that two of them can be kept with synchronised indexes, without sharing the same topology. The hypothetical text editor could, for example, keep text style information in another interval map, simply by executing the same insert, delete and replace operations on a stylerun interval map, even if the stylerun interval map was consolidating consecutive identical styles.

Changed: 312c36
An interesting feature of the implementation of rope.rep is its intensive node-sharing, possible only because of the use of functional nodes and the existence of a garbage collector. rope.rep can create truly enormous strings: try a = rope.rep("a", 1E40) which occupies about 10k.
Splay trees are useful in this sort of application because they do not need to be rebalanced after arbitrary deletion or insertion operations. On the other hand, the fact that the splay tree is self-adjusting means that you get a new splay object even after a lookup, not just after a mutation, and the performance of the splay tree depends on using the new object for subsequent lookups. For this reasons, splay trees are usually considered to be inappropriate for functional programs, particularly those which depend on persistence (or time travel, if you prefer); however, I believe that the text editor application would demonstrate that this is not actually true.

Ropes are binary trees each of whose nodes contains a string; the actual string can be read off of the rope with an in-order traverse.

This implementation uses functional splay trees to keep the ropes balanced; the implementation of splay trees was partially inspired by Chris Okasaki's excellent book on pure functional datatypes.

The nodes are implemented here with tuples, implemented with a tiny addition to the Lua base library:

static int l_tuple(lua_State *L) {
   lua_pushcclosure(L, lua_pushupvalues, lua_gettop(L));
   return 1;
}

/* also add {"tuple", l_tuple} to luaL_reg base_funcs[] */

If you don't want to recompile Lua, you can implement the tuple function as follows:

local tuplemeta = {__call = table.unpack}
function tuple(...) return setmetatable(arg, tuplemeta) end

It will be slower and use more memory, but it is pure Lua.

The rope library really got too big to fit on a Wiki, so I put it in the files area, along with a little Knuth Morris Pratt search/replace function as an example of how to use ropes (although character by character searching in Lua is not fast.)

To load the libraries, use LTN-11 style: (local) rope = dofile"rope.lua"()

Download the libraries here: Files:wiki_insecure/users/rici/rope.lua and here: Files:wiki_insecure/users/rici/ropesearch.lua

Ropes are particularly useful for large text strings where there are random changes at localised points; that would make them ideal to implement a text editor, for example. Functional splay trees also have the interesting feature that you can simply keep a reference to the rope at any time, and later "return" to it; consequently maintaining an undo history is trivial. A single localised change is not likely to create more than a few splay nodes.

The splay ropes are implemented using an interval map: each node in the tree contains a left and right pointer, a string, and the relative position of the end of the string (this could have been the relative position of the beginning of the string or the total size of the node plus all its descendants; the three formulations are computationally equivalent.) This means that a node can be in more than one rope, or even more than one place in the same rope -- an extreme example of that is shown by the implementation of rope.rep, which uses intensive node-sharing to allow it to create ropes of size much greater than RAM: rope.rep("a", 1E40) takes a few milliseconds and uses about 10 kb, but it really is the representation of a string with 10^40 characters. IMHO, this is the sort of datastructure you end up with when you embrace immutable objects and garbage collection, instead of trying to implement C datatypes in a functional garbage collected environment.

Another interesting fact about interval maps is that two of them can be kept with synchronised indexes, without sharing the same topology. The hypothetical text editor could, for example, keep text style information in another interval map, simply by executing the same insert, delete and replace operations on a stylerun interval map, even if the stylerun interval map was consolidating consecutive identical styles.

Splay trees are useful in this sort of application because they do not need to be rebalanced after arbitrary deletion or insertion operations. On the other hand, the fact that the splay tree is self-adjusting means that you get a new splay object even after a lookup, not just after a mutation, and the performance of the splay tree depends on using the new object for subsequent lookups. For this reasons, splay trees are usually considered to be inappropriate for functional programs, particularly those which depend on persistence (or time travel, if you prefer); however, I believe that the text editor application would demonstrate that this is not actually true.


RecentChanges · preferences
edit · history
Last edited April 13, 2004 10:16 pm GMT (diff)