Read Defined Chunks |
|
|
With this function you can read defined chunks (until a given delimiter is found) from an file or from io.stdin Its a complete rewrite of the first version: now its fast, doesn't concatenate strings unnecessary, needs less memory and is flexible. |
With this function you can read defined chunks (until a given delimiter is found) from a file or from io.stdin. It's a complete rewrite of a prior version: now it's fast, doesn't concatenate strings unnecessarily, needs less memory and is flexible. |
|
Please note a simple non Lua standard enhancement: I use the number variable lua.maxread to set the chunksizes used for io.reads(chunk of bytes) at a central point. Replace this variable by your preferred chunksize e.g. 2^13 for 8KB |
Please note a simple enhancement that is not part of standard Lua: I use the number variable lua.maxread to set, at a central point, the chunk size used for io.read(number of bytes). Replace this variable with your preferred chunk size, e.g. 2^13 for 8 KB. |
|
{{{ |
|
{{{!Lua -- a simple example without using all the specials local Handle=io.open('File','r') local ReadUntil?=io.readuntil(Handle) repeat Line=coroutine.resume(ReadUntil?,'\n',true) if Line then |
|
function io.readuntil(Filehandle,Delimiter,Collect,Limit) |
|
end until Line==nil }}}
Code: {{{!Lua function io.readuntil(Filehandle, Delimiter, Collect, Limit) |
|
-- Delimiter (string, optional); max. length is lua.maxread; optional because coroutine.resume() also accepts <delimiter> -- Collect (boolean, optional) = true (default); read until <delimiter> is found or end of file or <limit> is reached and return string at once -- = false; return string also before <delimiter> is found or end of file or <limit> is reached -- Limit (number, optional); number of bytes to read from <filehandle>; default is unlimited |
|
-- Delimiter (string, optional); max. length is lua.maxread; -- optional because coroutine.resume() also accepts <delimiter> -- Collect (boolean, optional) = true (default); read until -- <delimiter> is found or end of file or <limit> is reached -- and return string at once = false; return string also before -- <delimiter> is found or end of file or <limit> is reached -- Limit (number, optional); number of bytes to read from -- <filehandle>; default is unlimited |
|
-- <function>=cooroutine.resume(Function,Delimiter,Collect) |
|
-- <function> = cooroutine.resume(Function, Delimiter, Collect) |
|
-- = false; an error occured and the second argument returned is the errormessage |
|
-- = false; an error occured and the second -- argument returned is the errormessage |
|
-- = true the end of file is reached and the next coroutine.resume returns true,nil(,nil) if type(Delimiter)=='boolean' then Collect,Delimiter = Delimiter,Collect end if type(Delimiter)=='number' then Limit,Delimiter = Delimiter,nil end if type(Collect)=='number' then Limit,Collect = Collect,nil end |
|
-- = true the end of file is reached and the next -- coroutine.resume returns true,nil(,nil) |
|
return coroutine.create(function(NewDelimiter?,NewCollect?) |
|
if type(Delimiter) == 'boolean' then Collect,Delimiter = Delimiter,Collect end if type(Delimiter) == 'number' then Limit,Delimiter = Delimiter,nil end if type(Collect) == 'number' then Limit,Collect = Collect,nil end |
|
local Next=function(NewDelimiter?,NewCollect?) |
|
return coroutine.create(function(NewDelimiter?,NewCollect?) |
|
if type(NewDelimiter?)=='boolean' then |
|
local Next = function(NewDelimiter?,NewCollect?) if type(NewDelimiter?) == 'boolean' then |
|
end return NewDelimiter? or Delimiter,NewCollect? or Collect end Delimiter,Collect = Next(NewDelimiter?,NewCollect?) local Chunksize,Chunk,Length,First,Second,SearchFrom?,GetFrom?,FoundFrom?,FoundTo? = lua.maxread,{},0,1,2,1,1 if Limit and Length+Chunksize>Limit then Chunk[First]=Limit-Length>0 and Filehandle:read(Limit-Length) else Chunk[First]=Filehandle:read(Chunksize) end |
|
end return NewDelimiter? or Delimiter,NewCollect? or Collect end Delimiter,Collect = Next(NewDelimiter?,NewCollect?) local Chunksize,Chunk,Length,First,Second,SearchFrom?, GetFrom?,FoundFrom?,FoundTo? = lua.maxread,{},0,1,2,1,1 if Limit and Length+Chunksize>Limit then Chunk[First] = Limit-Length>0 and Filehandle:read(Limit-Length) else Chunk[First] = Filehandle:read(Chunksize) end |
|
if Chunk[First] then Length=Length+string.len(Chunk[First]) |
|
if Chunk[First] then Length = Length + string.len(Chunk[First]) |
|
while true do |
|
while true do if string.len(Delimiter)>Chunksize then error('io.readuntil: delimiter to long') end |
|
if string.len(Delimiter)>Chunksize then error('io.readuntil: delimiter to long') |
|
FoundFrom?,FoundTo? = string.find( Chunk[First],Delimiter,SearchFrom?,true) if FoundFrom? then -- delimiter found in first chunk Delimiter,Collect = Next(coroutine.yield( string.sub(Chunk[First],GetFrom?,FoundFrom?-1),true)) SearchFrom?,GetFrom? = FoundTo?+1,FoundTo?+1 else if Limit and Length+Chunksize > Limit then Chunk[Second] = Limit-Length>0 and Filehandle:read(Limit-Length) else Chunk[Second] = Filehandle:read(Chunksize) |
|
FoundFrom?,FoundTo? = string.find(Chunk[First],Delimiter,SearchFrom?,true) if FoundFrom? then -- delimiter found in first chunk Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom?,FoundFrom?-1),true)) SearchFrom?,GetFrom? = FoundTo?+1,FoundTo?+1 else |
|
if Chunk[Second] then Length = Length + string.len(Chunk[Second]) |
|
if Limit and Length+Chunksize>Limit then Chunk[Second]=Limit-Length>0 and Filehandle:read(Limit-Length) |
|
-- concatenate end of first chunk with start of -- second chunk so that a possible splitted delimiter -- must be found FoundFrom?,FoundTo? = string.find( string.sub(Chunk[First], string.len(Chunk[First])-string.len(Delimiter)+2) .. string.sub(Chunk[Second],1,string.len(Delimiter)-1), Delimiter,1,true) if FoundFrom? then -- delimiter is splitted between first and second chunk Delimiter,Collect = Next(coroutine.yield( string.sub(Chunk[First],GetFrom?,string.len(Chunk[First])- string.len(Delimiter)+FoundFrom?), true )) First,Second = Second,First SearchFrom?,GetFrom? = FoundFrom?+1,FoundFrom?+1 |
|
Chunk[Second]=Filehandle:read(Chunksize) end if Chunk[Second] then Length=Length+string.len(Chunk[Second]) -- concatenate end of first chunk with start of second chunk so that a possible splitted delimiter must be found FoundFrom?,FoundTo? = string.find(string.sub(Chunk[First],string.len(Chunk[First])-string.len(Delimiter)+2)..string.sub(Chunk[Second],1,string.len(Delimiter)-1),Delimiter,1,true) if FoundFrom? then -- delimiter is splitted between first and second chunk Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom?,string.len(Chunk[First])-string.len(Delimiter)+FoundFrom?),true)) First,Second = Second,First SearchFrom?,GetFrom? = FoundFrom?+1,FoundFrom?+1 |
|
-- delimiter isn't splitted between first and second chunk if Collect then SearchFrom? = string.len(Chunk[First])+1 Chunk[First] = Chunk[First]..Chunk[Second] |
|
-- delimiter isn't splitted between first and second chunk if Collect then SearchFrom?=string.len(Chunk[First])+1 Chunk[First]=Chunk[First]..Chunk[Second] else if string.len(Chunk[First])>=GetFrom? then Delimiter,Collect = Next(coroutine.yield(string.sub(Chunk[First],GetFrom?),false)) end First,Second = Second,First SearchFrom?,GetFrom? = 1,1 |
|
if string.len(Chunk[First]) >= GetFrom? then Delimiter,Collect = Next(coroutine.yield( string.sub(Chunk[First],GetFrom?),false)) |
|
First,Second = Second,First SearchFrom?,GetFrom? = 1,1 |
|
else -- no delimiter found and no further input break |
|
else -- no delimiter found and no further input break |
|
if string.len(Chunk[First])>=GetFrom? then -- return rest of first chunk coroutine.yield(string.sub(Chunk[First],GetFrom?),false) end |
|
end) -- return (thread); a coroutine |
|
if string.len(Chunk[First]) >= GetFrom? then -- return rest of first chunk coroutine.yield(string.sub(Chunk[First],GetFrom?),false) end end end) -- return (thread); a coroutine |
|
-- MarkusHuber |
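With this function you can read defined chunks (until a given delimiter is found) from a file or from io.stdin.
It's a complete rewrite of a prior version: now it's fast, doesn't concatenate strings unnecessarily, needs less memory and is flexible.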
The purpose is to process a) files that are many megabytes in size and b) mixed-format input, e.g. MIME multipart messages, which are a mix of lines terminated by \r\n and binary data.
Please note a simple enhancement that is not part of standard Lua: I use the number variable lua.maxread to set, at a central point, the chunk size used for io.read(number of bytes). Replace this variable with your preferred chunk size, e.g. 2^13 for 8 KB.
-- A simple example without using all the specials: read a file line by line.
local Handle = assert(io.open('File', 'r'))
local ReadUntil = io.readuntil(Handle)
-- Declared outside the loop so the `until` condition can see them in every
-- Lua version.
local Ok, Line
repeat
  -- coroutine.resume returns the status first; the yielded string comes second.
  Ok, Line = coroutine.resume(ReadUntil, '\n', true)
  if not Ok then
    -- On failure the second value is the error message.
    error(Line)
  end
  if Line then
    -- process Line here
  end
until Line == nil
Handle:close()
-- Another example: extract the text between two marker strings.
local Handle = assert(io.open('File', 'r'))
local ReadUntil = io.readuntil(Handle)
local Ok, Chunk, Found
repeat
  -- Skip forward without collecting until the first marker is found.
  Ok, Chunk, Found = coroutine.resume(ReadUntil,
    'search this string in a huge file', false)
  -- On failure the second value is the error message; stop instead of
  -- looping forever on a dead coroutine.
  if not Ok then error(Chunk) end
  if Found then
    -- Collect everything up to the second marker in one string.
    Ok, Chunk, Found = coroutine.resume(ReadUntil,
      'search another string in the same file', true)
    if not Ok then error(Chunk) end
    if Found then break end
  end
until Chunk == nil
-- Now if Chunk ~= nil, then Chunk is the stuff between
-- 'search this string in a huge file' and 'search another
-- string in the same file'. Yes, it's possible to do the same
-- very simply, but the advantage here is that the large file
-- isn't loaded at once into memory.
-- Close Handle once you are done reading from it.
Code:
-- Create a coroutine that reads delimiter-separated pieces from a file handle
-- without loading the whole input into memory.
--
-- io.readuntil(Filehandle, Delimiter, Collect, Limit)
--   Filehandle                    any object with a :read(bytes) method, e.g.
--                                 the result of io.open() or io.stdin
--   Delimiter (string, optional)  literal text to search for (plain search, no
--                                 patterns); must not be longer than the chunk
--                                 size; optional here because it can also be
--                                 passed to coroutine.resume()
--   Collect (boolean, optional)   true (default): keep reading until the
--                                 delimiter, the end of input or Limit is
--                                 reached and hand back the string at once;
--                                 false: also hand back partial strings before
--                                 the delimiter is found
--   Limit (number, optional)      maximum number of bytes to read from
--                                 Filehandle; default is unlimited
--   The optional arguments are recognised by type, so a boolean or number may
--   be passed in place of Delimiter, and a number in place of Collect.
--   returns (thread)              the coroutine
--
-- Ok, Piece, Found = coroutine.resume(Thread, Delimiter, Collect)
--   Delimiter, Collect            optional; replace the current settings and
--                                 stay in effect for later resumes
--   Ok (boolean)                  false if an error occurred; Piece is then
--                                 the error message
--   Piece (string or nil)         the text read; nil at the end of input
--   Found (boolean)               true if Piece ended at a delimiter
--   If Found is false while Collect is true, the end of input (or Limit) has
--   been reached and the next resume returns true without a piece. With
--   Collect false, a Found of false does not have to mean the end of input.
--
-- The chunk size is taken from the global lua.maxread when that is defined,
-- otherwise 8192 bytes are read at a time.
function io.readuntil(Filehandle, Delimiter, Collect, Limit)
  -- Sort the optional arguments by type.
  if type(Delimiter) == 'boolean' then Collect, Delimiter = Delimiter, Collect end
  if type(Delimiter) == 'number' then Limit, Delimiter = Delimiter, nil end
  if type(Collect) == 'number' then Limit, Collect = Collect, nil end
  -- Apply the documented default; only nil means "not given".
  if Collect == nil then Collect = true end

  return coroutine.create(function(NewDelimiter, NewCollect)
    -- Merge the arguments of a resume into the current settings. An explicit
    -- false for Collect must win, so it is compared against nil rather than
    -- combined with "or".
    local function Next(GivenDelimiter, GivenCollect)
      if type(GivenDelimiter) == 'boolean' then
        GivenCollect, GivenDelimiter = GivenDelimiter, nil
      end
      if GivenCollect == nil then GivenCollect = Collect end
      return GivenDelimiter or Delimiter, GivenCollect
    end

    Delimiter, Collect = Next(NewDelimiter, NewCollect)

    -- lua.maxread is a non-standard setting; fall back to 8 KB without it.
    local Settings = rawget(_G, 'lua')
    local Chunksize = type(Settings) == 'table' and Settings.maxread or 8192

    local Length = 0 -- bytes read so far, compared against Limit

    -- Read the next chunk, never reading past Limit. Returns nil when the
    -- limit or the end of input is reached.
    local function ReadChunk()
      if Limit and Length + Chunksize > Limit then
        if Limit - Length > 0 then
          return Filehandle:read(Limit - Length)
        end
        return nil
      end
      return Filehandle:read(Chunksize)
    end

    -- Two chunk slots; First is the one being searched, Second the lookahead.
    local Chunk, First, Second = {}, 1, 2
    -- SearchFrom: where the next search in Chunk[First] starts.
    -- GetFrom: first byte of Chunk[First] not handed to the caller yet.
    local SearchFrom, GetFrom = 1, 1
    local FoundFrom, FoundTo

    Chunk[First] = ReadChunk()
    if not Chunk[First] then return end
    Length = Length + string.len(Chunk[First])

    while true do
      -- Checked on every round because a resume may change the delimiter.
      if Delimiter == nil or Delimiter == '' then
        error('io.readuntil: delimiter missing or empty')
      end
      if string.len(Delimiter) > Chunksize then
        error('io.readuntil: delimiter to long')
      end

      FoundFrom, FoundTo = string.find(Chunk[First], Delimiter, SearchFrom, true)
      if FoundFrom then
        -- delimiter found in first chunk
        Delimiter, Collect = Next(coroutine.yield(
          string.sub(Chunk[First], GetFrom, FoundFrom - 1), true))
        SearchFrom, GetFrom = FoundTo + 1, FoundTo + 1
      else
        Chunk[Second] = ReadChunk()
        if not Chunk[Second] then
          -- no delimiter found and no further input
          break
        end
        Length = Length + string.len(Chunk[Second])

        local FirstLength = string.len(Chunk[First])
        local DelimiterLength = string.len(Delimiter)
        -- Join the end of the first chunk with the start of the second so a
        -- delimiter split across the boundary is found. The tail never starts
        -- before SearchFrom: earlier bytes may belong to a delimiter that was
        -- already consumed, and the clamp also keeps the index positive.
        local TailFrom = math.max(SearchFrom, FirstLength - DelimiterLength + 2)
        local Tail = string.sub(Chunk[First], TailFrom)
        FoundFrom, FoundTo = string.find(
          Tail .. string.sub(Chunk[Second], 1, DelimiterLength - 1),
          Delimiter, 1, true)

        if FoundFrom then
          -- delimiter is split between first and second chunk
          Delimiter, Collect = Next(coroutine.yield(
            string.sub(Chunk[First], GetFrom, TailFrom + FoundFrom - 2), true))
          First, Second = Second, First
          -- Continue right after the part of the delimiter that lies in the
          -- (now first) second chunk.
          SearchFrom = FoundTo - string.len(Tail) + 1
          GetFrom = SearchFrom
        elseif Collect then
          -- Keep everything; only the newly appended part needs searching.
          SearchFrom = FirstLength + 1
          Chunk[First] = Chunk[First] .. Chunk[Second]
        else
          -- Hand out what is left of the first chunk, then move on.
          if FirstLength >= GetFrom then
            Delimiter, Collect = Next(coroutine.yield(
              string.sub(Chunk[First], GetFrom), false))
          end
          First, Second = Second, First
          SearchFrom, GetFrom = 1, 1
        end
      end
    end

    if string.len(Chunk[First]) >= GetFrom then
      -- return rest of first chunk
      coroutine.yield(string.sub(Chunk[First], GetFrom), false)
    end
  end)
end
-- MarkusHuber