[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: Trie?
- From: Philippe Lhoste <PhiLho@...>
- Date: Tue, 29 May 2007 10:35:23 +0200
On 28/05/2007 21:08, PA wrote:
Does anyone have an implementation of a trie [1] in Lua they would like
to share?
Yes! But it is not optimal for Lua, I made it as a prototype for an
algorithm to be implemented in C.
Well, it might give ideas, or just show what not to do... :-)
--
Philippe Lhoste
-- (near) Paris -- France
-- http://Phi.Lho.free.fr
-- -- -- -- -- -- -- -- -- -- -- -- -- --
#! Lua.exe
-- Just a prototype script to test a trie algorithm to be done in C.
-- Doing it first in Lua avoids some crashes, probably,
-- and might be faster to develop, at least without having a debugger handy...
dofile[[../Lua/DumpObject.lua]]
-- I pre-separate test words, low level stuff isn't necessary here... :-)
local words =
{
'bet', 'are', 'bit', 'cut', 'beer', 'ago', 'be', 'art', 'bee', 'a'--, 'artist'
}
-- Debug function
function DumpTrie(trie)
for i, t in ipairs(trie) do
offset = string.sub('00000' .. tostring(i), -5)
-- I don't dump the undefined state, as it is used only to build the trie
print(offset .. ": " .. t.character .. " (" ..
(t.state.terminal and 't ' or ' ') ..
(t.state.lastChild and 'l' or ' ') ..
") " .. tostring(t.firstChildOffset))
end
end
-- For something more compact than DumpObject...
function ExportTrie(trie, file)
local fh = assert(io.open(file, 'w'))
for i, t in ipairs(trie) do
fh:write(t.character) -- Assume it is printable... I should make it 8bit clean
if t.state.terminal then
if t.state.lastChild then
fh:write';' -- Both
else
fh:write'.' -- Terminal
end
else
if t.state.lastChild then
fh:write',' -- Last
else
fh:write' ' -- None
end
end
fh:write(tostring(t.firstChildOffset) .. "\n")
end
fh:close()
end
function ImportTrie(file)
local trie = {}
trie.bitfield = {}
local nodeCounter = 0
local bSettingBitfield = true
local fh = assert(io.open(file, 'r'))
for line in fh:lines(file) do
nodeCounter = nodeCounter + 1
local node = {}
node.character = string.sub(line, 1, 1)
node.state = {}
local state = string.sub(line, 2, 2)
if state == '.' or state == ';' then
node.state.terminal = true
end
if state == ',' or state == ';' then
node.state.lastChild = true
end
node.firstChildOffset = tonumber(string.sub(line, 3))
trie[nodeCounter] = node
if bSettingBitfield then
trie.bitfield[node.character] = true
if node.state.lastChild then
bSettingBitfield = false
end
end
end
fh:close()
return trie, nodeCounter
end
function FindWordInTrie(word, trie)
local currentOffset = 1
local lastGoodFound = 0
local lastGoodStringLen = 0
if not trie.bitfield[string.sub(word, 1, 1)] then
return 0, 0
end
-- Loop on searched string characters
for charIndex = 1, string.len(word) do
local char = string.sub(word, charIndex, charIndex)
-- Walk the list of siblings nodes at this level
local ni = currentOffset
local bFound = false
while true do
if trie[ni].character == char then
-- Found: mark the finding, in case we fail later
if trie[ni].state['terminal'] then
lastGoodFound = ni
lastGoodStringLen = charIndex
end
-- We can continue on next level, if any
currentOffset = ni + trie[ni].firstChildOffset
bFound = true
-- No need to search further
break
end
-- Not found, did we reached the end of the list of siblings?
if trie[ni].state['lastChild'] then
-- Yes, string not matched
break
end
-- Try other branch
ni = ni + 1 -- Next
end
if not bFound then
-- Current char hasn't been found, stop here
break
end
end
return lastGoodStringLen, lastGoodFound
end
function Print(s) print(s) end
--~ Print = function(s) end
function CreateTrie(words)
-- Simulation of bit field...
local bitfield = {}
-- The result
local trie = {}
-- The level (deepth) of each branch
-- Kept separate because once the trie is built, we no longer need it
local levels = {}
-- Initial setup: put the first character of each string in the trie
-- and create the bitfield.
local nodeCounter = 0
for i, word in ipairs(words) do
local firstChar = string.sub(word, 1, 1)
if bitfield[firstChar] == nil then
bitfield[firstChar] = true
nodeCounter = nodeCounter + 1
local node = {}
node.character = firstChar
node.state = {}
node.state.undefined = true
node.firstChildOffset = 0
trie[nodeCounter] = node
levels[nodeCounter] = 1
end
end
trie[nodeCounter].state.lastChild = true
-- Now, walk repeteadly the trie: for each node with undefined childs, look if any of the given
-- strings can reach this node. If so, add the next character (after the one reaching the node)
-- to the list of sub-nodes of this node, and make the node to point on the first one.
local bFinished
local lastGoodNode = 0
repeat
bFinished = true -- Unless we find there is still something to do...
io.write(nodeCounter .. " ")
-- Walk the list of nodes, searching if remains at least a node with undefined child
for nodeIndex = lastGoodNode + 1, nodeCounter do
if trie[nodeIndex].state['undefined'] then
lastGoodNode = nodeIndex -- This one will be good after that...
-- Found one, at 'level' deepth
local level = levels[nodeIndex]
-- Last assigned node
local currentNodeIndex = nodeCounter
-- The string that reaches this node
-- Not faster in Lua (than always doing the walk), perhaps even slower,
-- probably because of string creation (string.sub) overhead, to be tested in C
local nodeString = nil
-- Loop on all words, search which one reach this node
for wi, word in ipairs(words) do
if level <= string.len(word) then -- Grr, no continue...
local ni
if nodeString ~= nil and string.sub(word, 1, level) == nodeString then
ni = nodeIndex -- Right to the spot!
else
-- Walk the current trie up to the given level
local subNodeIndex = 1 -- Start at the beginning...
for charIndex = 1, level do
local char = string.sub(word, charIndex, charIndex)
ni = subNodeIndex
while true do
if trie[ni].character == char then
-- Found: we can continue on next level, if available
subNodeIndex = ni + trie[ni].firstChildOffset
break
end
-- I skip check against 'lastChild', as I am sure to find the char
ni = ni + 1 -- Next branch
end -- Loop on siblings
end -- Loop on branch
end -- Test if string reaching the node is known
if ni == nodeIndex then
-- We reached the right node
nodeString = string.sub(word, 1, level)
--~ Print("Reached " .. trie[nodeIndex].character .. " at " .. nodeIndex .. " with " .. word)
if level == string.len(word) then
trie[nodeIndex].state.terminal = true
--~ Print("Terminal")
if trie[nodeIndex].state['undefined'] then
-- Might be a leaf. This state can change later.
--~ trie[nodeIndex].firstChildOffset = 0
trie[nodeIndex].state.undefined = nil
end
else -- Not terminal, add a node
local nextSubChar = string.sub(word, level + 1, level + 1)
-- Not optimal in Lua, but mimics how I will do it in C...
local bNotAtThisLevel = true
-- See if this char has been already added in the list of children nodes
-- for this parent node
for i = nodeCounter + 1, currentNodeIndex do
if trie[i].character == nextSubChar then
bNotAtThisLevel = false
break
end
end
if bNotAtThisLevel then
-- Update the parent node on the first found children
if currentNodeIndex == nodeCounter then
trie[nodeIndex].firstChildOffset = currentNodeIndex + 1 - nodeIndex
trie[nodeIndex].state.undefined = nil
end
-- Add a new node with the next sub char
currentNodeIndex = currentNodeIndex + 1
local node = {}
node.character = nextSubChar
--~ Print("Add: " .. nextSubChar .. " at " .. currentNodeIndex)
node.state = {}
node.state.undefined = true
node.firstChildOffset = 0
trie[currentNodeIndex] = node
levels[currentNodeIndex] = level + 1
bFinished = false -- Another node to resolve...
end
end -- Test terminal or not
end -- Test if word reaches node
end -- Test on length
end -- Loop on words
nodeCounter = currentNodeIndex
trie[nodeCounter].state.lastChild = true
end
end
--~ DumpTrie(trie)
--~ print""
until bFinished
io.write(nodeCounter .. "\n")
trie.bitfield = bitfield
return trie, nodeCounter
end
--[[
DumpTrie(trie)
--~ print(DumpObject(trie))
for i, w in ipairs{ 'are', 'not in', 'belt', 'bind', 'agone', 'be', 'artist', 'outside' } do
print(w, string.len(w), FindWordInTrie(w, trie))
end
]]
local t1 = os.clock()
local swn = 500
local words, searchWords = {}, {}
local n = 1
local len, totLen, maxLen = 0, 0, 0
print"Reading file"
for line in io.lines([[C:\Program Files\UText\SciTE\PHP.api]]) do
local word = string.gsub(line, "^([$_%w>-]+).*", "%1")
words[n] = word
len = string.len(word)
totLen = totLen + len
if len > maxLen then maxLen = len end
searchWords[math.random(swn)] = word
--~ if n > 300 then break end
n = n + 1
end
local t2 = os.clock()
print("Found " .. n .. " strings, longest has " .. maxLen .. " chars, totalling " ..
totLen .. " chars, in " .. (t2 - t1) .. " seconds.")
local trie
if false then
ok, trie = pcall(dofile, "$DumpedTrie.lua")
else
ok, trie, n = pcall(ImportTrie, "$Dumped.trie")
end
if not ok then
print"Making trie"
t1 = os.clock()
trie, nodeCounter = CreateTrie(words)
t2 = os.clock()
print((t2 - t1) .. " seconds to build the trie (" .. nodeCounter .. " nodes).")
--~ DumpTrie(trie)
DumpObjectToFile(trie, "$DumpedTrie.lua", '', ' ')
ExportTrie(trie, "$Dumped.trie")
else
print(n .. " nodes read in trie.")
end
print"Searching words"
t1 = os.clock()
for i, w in ipairs{
'$SCRIPT_NAME', 'not in', 'DomDocument->create_text_node', 'bind',
'JDToFrench', 'be', 'aggregate_properties_by_list', 'outside' ,
'openssl_error_string', 're_replace_callback',
'preg_replace_callback', 'stream_socket_server',
'foobar', 'usort', 'unsort', 'zip_read'
} do
print(w, string.len(w), FindWordInTrie(w, trie))
end
t2 = os.clock()
print((t2 - t1) .. " seconds.")
t1 = os.clock()
local n = 0
for i, w in pairs(searchWords) do
assert(FindWordInTrie(w, trie) > 0)
n = n + 1
end
t2 = os.clock()
print("Found " .. n .. " words in " .. (t2 - t1) .. " seconds.")