lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


On 28/05/2007 21:08, PA wrote:
Does anyone have an implementation of a trie [1] in Lua they would like to share?

Yes! But it is not optimal for Lua, I made it as a prototype for an algorithm to be implemented in C.
Well, it might give ideas, or just show what not to do... :-)

--
Philippe Lhoste
--  (near) Paris -- France
--  http://Phi.Lho.free.fr
--  --  --  --  --  --  --  --  --  --  --  --  --  --
#! Lua.exe

-- Just a prototype script to test a trie algorithm to be done in C.
-- Doing it first in Lua avoids some crashes, probably,
-- and might be faster to develop, at least without having a debugger handy...

dofile[[../Lua/DumpObject.lua]]

-- I pre-separate test words, low level stuff isn't necessary here... :-)
local words =
{
	'bet', 'are', 'bit', 'cut', 'beer', 'ago', 'be', 'art', 'bee', 'a'--, 'artist'
}

-- Debug function
function DumpTrie(trie)
	for i, t in ipairs(trie) do
		offset = string.sub('00000' .. tostring(i), -5)
		-- I don't dump the undefined state, as it is used only to build the trie
		print(offset .. ": " .. t.character .. " (" ..
				(t.state.terminal and 't ' or '  ') ..
				(t.state.lastChild and 'l' or ' ') ..
				") " .. tostring(t.firstChildOffset))
	end
end

-- For something more compact than DumpObject...
function ExportTrie(trie, file)
	local fh = assert(io.open(file, 'w'))
	for i, t in ipairs(trie) do
		fh:write(t.character)	-- Assume it is printable... I should make it 8bit clean
		if t.state.terminal then
			if t.state.lastChild then
				fh:write';'	-- Both
			else
				fh:write'.'	-- Terminal
			end
		else
			if t.state.lastChild then
				fh:write','	-- Last
			else
				fh:write' '	-- None
			end
		end
		fh:write(tostring(t.firstChildOffset) .. "\n")
	end
	fh:close()
end

function ImportTrie(file)
	local trie = {}
	trie.bitfield = {}
	local nodeCounter = 0
	local bSettingBitfield = true
	local fh = assert(io.open(file, 'r'))
	for line in fh:lines(file) do
		nodeCounter = nodeCounter + 1
		local node = {}
		node.character = string.sub(line, 1, 1)
		node.state = {}
		local state = string.sub(line, 2, 2)
		if state == '.' or state == ';' then
			node.state.terminal = true
		end
		if state == ',' or state == ';' then
			node.state.lastChild = true
		end
		node.firstChildOffset = tonumber(string.sub(line, 3))
		trie[nodeCounter] = node
		if bSettingBitfield then
			trie.bitfield[node.character] = true
			if node.state.lastChild then
				bSettingBitfield = false
			end
		end
	end
	fh:close()

	return trie, nodeCounter
end

function FindWordInTrie(word, trie)
	local currentOffset = 1
	local lastGoodFound = 0
	local lastGoodStringLen = 0

	if not trie.bitfield[string.sub(word, 1, 1)] then
		return 0, 0
	end

	-- Loop on searched string characters
	for charIndex = 1, string.len(word) do
		local char = string.sub(word, charIndex, charIndex)
		-- Walk the list of siblings nodes at this level
		local ni = currentOffset
		local bFound = false
		while true do
			if trie[ni].character == char then
				-- Found: mark the finding, in case we fail later
				if trie[ni].state['terminal'] then
					lastGoodFound = ni
					lastGoodStringLen = charIndex
				end
				-- We can continue on next level, if any
				currentOffset = ni + trie[ni].firstChildOffset
				bFound = true
				-- No need to search further
				break
			end
			-- Not found, did we reached the end of the list of siblings?
			if trie[ni].state['lastChild'] then
				-- Yes, string not matched
				break
			end
			-- Try other branch
			ni = ni + 1	-- Next
		end
		if not bFound then
			-- Current char hasn't been found, stop here
			break
		end
	end
	return lastGoodStringLen, lastGoodFound
end

function Print(s) print(s) end
--~ Print = function(s) end

function CreateTrie(words)
	-- Simulation of bit field...
	local bitfield = {}
	-- The result
	local trie = {}
	-- The level (deepth) of each branch
	-- Kept separate because once the trie is built, we no longer need it
	local levels = {}

	-- Initial setup: put the first character of each string in the trie
	-- and create the bitfield.
	local nodeCounter = 0
	for i, word in ipairs(words) do
		local firstChar = string.sub(word, 1, 1)
		if bitfield[firstChar] == nil then
			bitfield[firstChar] = true
			nodeCounter = nodeCounter + 1
			local node = {}
			node.character = firstChar
			node.state = {}
			node.state.undefined = true
			node.firstChildOffset = 0
			trie[nodeCounter] = node
			levels[nodeCounter] = 1
		end
	end
	trie[nodeCounter].state.lastChild = true

	-- Now, walk repeteadly the trie: for each node with undefined childs, look if any of the given
	-- strings can reach this node. If so, add the next character (after the one reaching the node)
	-- to the list of sub-nodes of this node, and make the node to point on the first one.
	local bFinished
	local lastGoodNode = 0
	repeat
		bFinished = true	-- Unless we find there is still something to do...
		io.write(nodeCounter .. " ")
		-- Walk the list of nodes, searching if remains at least a node with undefined child
		for nodeIndex = lastGoodNode + 1, nodeCounter do
			if trie[nodeIndex].state['undefined'] then
				lastGoodNode = nodeIndex -- This one will be good after that...
				-- Found one, at 'level' deepth
				local level = levels[nodeIndex]
				-- Last assigned node
				local currentNodeIndex = nodeCounter
				-- The string that reaches this node
				-- Not faster in Lua (than always doing the walk), perhaps even slower,
				-- probably because of string creation (string.sub) overhead, to be tested in C
				local nodeString = nil
				-- Loop on all words, search which one reach this node
				for wi, word in ipairs(words) do
					if level <= string.len(word) then -- Grr, no continue...
					local ni
					if nodeString ~= nil and string.sub(word, 1, level) == nodeString then
						ni = nodeIndex -- Right to the spot!
					else
						-- Walk the current trie up to the given level
						local subNodeIndex = 1 -- Start at the beginning...
						for charIndex = 1, level do
							local char = string.sub(word, charIndex, charIndex)
							ni = subNodeIndex
							while true do
								if trie[ni].character == char then
									-- Found: we can continue on next level, if available
									subNodeIndex = ni + trie[ni].firstChildOffset
									break
								end
								-- I skip check against 'lastChild', as I am sure to find the char
								ni = ni + 1	-- Next branch
							end -- Loop on siblings
						end -- Loop on branch
					end -- Test if string reaching the node is known
					if ni == nodeIndex then
						-- We reached the right node
						nodeString = string.sub(word, 1, level)
--~ 	 Print("Reached " .. trie[nodeIndex].character .. " at " .. nodeIndex .. " with " .. word)
						if level == string.len(word) then
							trie[nodeIndex].state.terminal = true
--~ 	 Print("Terminal")
							if trie[nodeIndex].state['undefined'] then
								-- Might be a leaf. This state can change later.
--~ 								trie[nodeIndex].firstChildOffset = 0
								trie[nodeIndex].state.undefined = nil
							end
						else -- Not terminal, add a node
							local nextSubChar = string.sub(word, level + 1, level + 1)
							-- Not optimal in Lua, but mimics how I will do it in C...
							local bNotAtThisLevel = true
							-- See if this char has been already added in the list of children nodes
							-- for this parent node
							for i = nodeCounter + 1, currentNodeIndex do
								if trie[i].character == nextSubChar then
									bNotAtThisLevel = false
									break
								end
							end
							if bNotAtThisLevel then
								-- Update the parent node on the first found children
								if currentNodeIndex == nodeCounter then
									trie[nodeIndex].firstChildOffset = currentNodeIndex + 1 - nodeIndex
									trie[nodeIndex].state.undefined = nil
								end
								-- Add a new node with the next sub char
								currentNodeIndex = currentNodeIndex + 1
								local node = {}
								node.character = nextSubChar
--~ 	 Print("Add: " .. nextSubChar .. " at " .. currentNodeIndex)
								node.state = {}
								node.state.undefined = true
								node.firstChildOffset = 0
								trie[currentNodeIndex] = node
								levels[currentNodeIndex] = level + 1
								bFinished = false	-- Another node to resolve...
							end
						end -- Test terminal or not
					end -- Test if word reaches node
					end -- Test on length
				end -- Loop on words
				nodeCounter = currentNodeIndex
				trie[nodeCounter].state.lastChild = true
			end
		end
--~ DumpTrie(trie)
--~ print""
	until bFinished
	io.write(nodeCounter .. "\n")

	trie.bitfield = bitfield
	return trie, nodeCounter
end

--[[
DumpTrie(trie)
--~ print(DumpObject(trie))

for i, w in ipairs{ 'are', 'not in', 'belt', 'bind', 'agone', 'be', 'artist', 'outside' } do
	print(w, string.len(w), FindWordInTrie(w, trie))
end
]]

local t1 = os.clock()
local swn = 500
local words, searchWords = {}, {}
local n = 1
local len, totLen, maxLen = 0, 0, 0
print"Reading file"
for line in io.lines([[C:\Program Files\UText\SciTE\PHP.api]]) do
	local word = string.gsub(line, "^([$_%w>-]+).*", "%1")
	words[n] = word
	len = string.len(word)
	totLen = totLen + len
	if len > maxLen then maxLen = len end
	searchWords[math.random(swn)] = word
--~ 	if n > 300 then break end
	n = n + 1
end
local t2 = os.clock()
print("Found " .. n .. " strings, longest has " .. maxLen .. " chars, totalling " ..
		totLen .. " chars, in " .. (t2 - t1) .. " seconds.")

local trie

if false then
	ok, trie = pcall(dofile, "$DumpedTrie.lua")
else
	ok, trie, n = pcall(ImportTrie, "$Dumped.trie")
end
if not ok then
	print"Making trie"
	t1 = os.clock()
	trie, nodeCounter = CreateTrie(words)
	t2 = os.clock()
	print((t2 - t1) .. " seconds to build the trie (" .. nodeCounter .. " nodes).")

	--~ DumpTrie(trie)
	DumpObjectToFile(trie, "$DumpedTrie.lua", '', ' ')
	ExportTrie(trie, "$Dumped.trie")
else
print(n .. " nodes read in trie.")
end

print"Searching words"
t1 = os.clock()
for i, w in ipairs{
		'$SCRIPT_NAME', 'not in', 'DomDocument->create_text_node', 'bind',
		'JDToFrench', 'be', 'aggregate_properties_by_list', 'outside' ,
		'openssl_error_string', 're_replace_callback',
		'preg_replace_callback', 'stream_socket_server',
		'foobar', 'usort', 'unsort', 'zip_read'
		} do
	print(w, string.len(w), FindWordInTrie(w, trie))
end
t2 = os.clock()
print((t2 - t1) .. " seconds.")

t1 = os.clock()
local n = 0
for i, w in pairs(searchWords) do
	assert(FindWordInTrie(w, trie) > 0)
	n = n + 1
end
t2 = os.clock()
print("Found " .. n .. " words in " .. (t2 - t1) .. " seconds.")