lua-users home
lua-l archive

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


With some help from the list, I wrote a script that is reads in various strings from a file containing various payloads and generates SVM data for further classification. The idea is that you specify how many grams should be used when breaking up the payload, then the script creates a table with all possible grams for the parameter specified based on ASCII characters 32-126. The script then updates the hash value in the table holding all possible grams with the number of times that specific gram appears in the payload file. The script works fine when I specify to use two grams, but when I move to three grams the script uses up all the CPU on my machine and runs out of memory as well. The script is listed below. My question is how can I improve the script to use less CPU/memory and still be able to track the number of grams a payload has when the number of grams is greater than two?

Thanks,
Chris
==============================================================================================
#!/usr/bin/lua

PayloadFile = arg[1]
SVMOutput = arg[2]
NNGrams = arg[3]
HamFlag = arg[4]
	
-- Use ASCII characters 32-126
ASCII=' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~'

GramTable = {};
GramCounter = {};
GramID = {};
GramIDR = {};
GramResults = {};
Counter = 1

-- Ensure that we received the expected parameters
-- ToDo add input validation
if (PayloadFile == nil or SVMOutput == nil or NNGrams == nil or HamFlag == nil) then
	print("Usage: NGramsCreator.lua <Payload File> <SVMOutput File> <Number of NGrams> <Ham Flag>")
	return
end

-- Open output file
--io.output(io.open(SVMOutput,"a"));

function GenerateNGrams(Symbols, Length, Part)
	-- Recursively generate the grams based on the given length (NNGrams)
	Part = Part or ""
	if Length == 0 then 
		--print(Part);
		GramID[Part] = Counter;
		--print(GramID[Part],Part)
		GramIDR[Counter] = Part;
		GramCounter[Part] = 0;
		Counter = Counter + 1
		return
	end
	for Counter = 1, #Symbols do
		GenerateNGrams(Symbols, Length - 1, Part .. Symbols:sub(Counter,Counter))
	end
end

function CreateSVMData(Payload, SVMOutput, HamFlag)
	GenerateNGrams(ASCII, NNGrams)
	-- for j, k in ipairs(GramTable) do print(j,k) end
	PayloadLength = #Payload - 1 
	
	for PayloadCounter = 1, PayloadLength do
		PayloadEnd = PayloadCounter + NNGrams -1
		Gram = Payload:sub(PayloadCounter, PayloadEnd)

		-- Check if we've reached the end	
		if #Gram ~= tonumber(NNGrams) then 
			break 
		end	

		--print("[" .. Gram .. "]",GramCounter[Gram])
		if GramCounter[Gram] == nil then
			GramCounter[Gram] = 0
		end
		GramCounter[Gram] = GramCounter[Gram] + 1
	end
	return
end

function PrintResults()
	io.output(io.open(SVMOutput,"a"));
	-- Show final result
	for i,v in pairs(GramCounter) do 
		if v ~= nil then 
			--print(GramID[i],i,v)
			table.insert(GramResults,GramID[i])
		end 
	end

	table.sort(GramResults)
	io.write(HamFlag, " ")
	for _,a in pairs(GramResults) do
		--print(a .. ":" .. GramCounter[GramIDR[a]])
		io.write(a,":",GramCounter[GramIDR[a]]," ")
	end
	io.write("\n")
	io.close()
	return
end

function main()
	for line in io.lines(PayloadFile) do 
		--print(line) 
		CreateSVMData(line, SVMOutput, HamFlag)
	end
	PrintResults()
end
main()