[Date Prev][Date Next][Thread Prev][Thread Next]
[Date Index]
[Thread Index]
- Subject: Re: LuaJIT2 performance for number crunching
- From: T T <t34www@...>
- Date: Thu, 17 Feb 2011 14:02:51 +0000
On 17 February 2011 10:52, steve donovan <steve.j.donovan@gmail.com> wrote:
> On Thu, Feb 17, 2011 at 12:26 PM, T T <t34www@googlemail.com> wrote:
>> I can't understand why so many people here are so fixated on this
>> templating idea without exploring simpler possibilities. Have you
>> missed my example with unrolling?
>
> Sorry, I did look at it now. So the idea is that there are a few
> predefined dimensions, which are 'conditionally' unrolled.
>
> So the dim >= 2 etc tests are pretty efficient? That is cool...
>From what I understood, the problem really comes down to small loops,
which are not jitted at all. Since, conditionals will get jitted, you
won't get that much of a performance hit with them.
> Naturally, I think in terms of program textual transformation, as one
> way to bridge the gap between desired code style and the current needs
> of the optimizer. That manual unrolling seems like a good candidate
> for automatic transformation. As Agent Smith once said, don't send in
> a human to do a machine's job.
My thoughts exactly, but maybe it's not that simple. Anyway, I didn't
do this unrolling completely manually. I used luapp.lua (
http://lua-users.org/wiki/SlightlyLessSimpleLuaPreprocessor ) with a
template (attached, if anyone wants to play with it further; I think
it makes for a nice test case).
> OK, it could be argued that writing good numerical code is hard, but
> only needs to be written once, and then put into a library... so I do
> see the point. Text templates with [[...]] are not ideal, you lose
> syntax highlighting and compile-time checking.
Not to mention that it takes more brain power to develop such a
templated code (it took me way more time for this simple example than
I expected) and, once you hand over your code, others have to follow
all these idiosyncrasies.
Cheers,
Tomek
# MAXUNROLL = tonumber(os.getenv('MAXUNROLL')) or 4
#
# LPP = require('luapp').preprocess
#
# function loop_unroll(CODE)
# local prefix = string.match(CODE, "%s+") or ""
# local _L = {MAXUNROLL=MAXUNROLL}
# CODE = string.gsub(CODE, "%[i%]", "[$(i)]")
# _L.i = 'i'
# _put(
# prefix, "", LPP{ input="if dim > $(MAXUNROLL) then", output='string', lookup=_L }, "\n",
# prefix, " ", LPP{ input= "for $(i) = 1,dim do", output='string', lookup=_L }, "\n",
# prefix, " ", LPP{ input=CODE, output='string', lookup=_L }, "\n",
# prefix, " end", "\n",
# prefix, "else", "\n")
# for i = 1,MAXUNROLL do
# _L.i = i
# _put(
# prefix, " ", LPP{ input="if dim >= $(i) then", output='string', lookup=_L }, "\n",
# prefix, " ", LPP{ input=CODE, output='string', lookup=_L }, "\n",
# prefix, " end", "\n")
# end
# _put(
# prefix, "end", "\n")
# end
use = 'Lua'
if use == 'FFI' then
ffi = require 'ffi'
darray = ffi.typeof("double[?]")
elseif use == 'GSL' then
darray = function(n) return new(n, 1) end
else
darray = function(n) local t = {}; for k = 1,n do t[k] = 0; end; return t end
end
rk4 = {}
function rk4.new(n)
local s = {
k= darray(n+1),
k1= darray(n+1),
y0= darray(n+1),
ytmp= darray(n+1),
y_onestep= darray(n+1),
dim = n
}
return s
end
function rk4.step(y, state, h, t, sys)
-- Makes a Runge-Kutta 4th order advance with step size h.
local dim = state.dim
local f = sys.f
-- initial values of variables y.
local y0 = state.y0
-- work space
local ytmp = state.ytmp
-- Runge-Kutta coefficients. Contains values of coefficient k1
-- in the beginning
local k = state.k
-- k1 step
# loop_unroll( " y[i] = y[i] + h / 6 * k[i]; ytmp[i] = y0[i] + 0.5 * h * k[i]" )
-- k2 step
f(t + 0.5 * h, ytmp, k)
# loop_unroll( " y[i] = y[i] + h / 3 * k[i]; ytmp[i] = y0[i] + 0.5 * h * k[i]" )
-- k3 step
f(t + 0.5 * h, ytmp, k)
# loop_unroll( " y[i] = y[i] + h / 3 * k[i]; ytmp[i] = y0[i] + 0.5 * h * k[i]" )
-- k4 step
f(t + h, ytmp, k)
# loop_unroll( " y[i] = y[i] + h / 6 * k[i]" )
end
function rk4.apply(state, t, h, y, yerr, dydt_in, dydt_out, sys)
local f, dim = sys.f, state.dim
local k, k1, y0, y_onestep = state.k, state.k1, state.y0, state.y_onestep
# loop_unroll( " y0[i] = y[i]" )
if dydt_in then
# loop_unroll( " k[i] = dydt_in[i]" )
else
f(t, y0, k)
end
-- Error estimation is done by step doubling procedure
-- Save first point derivatives
# loop_unroll( " k1[i] = k[i]" )
-- First traverse h with one step (save to y_onestep)
# loop_unroll( " y_onestep[i] = y[i]" )
rk4.step (y_onestep, state, h, t, sys)
-- Then with two steps with half step length (save to y)
# loop_unroll( " k[i] = k1[i]" )
rk4.step(y, state, h/2, t, sys)
-- Update before second step
f(t + h/2, y, k)
-- Save original y0 to k1 for possible failures
# loop_unroll( " k1[i] = y0[i]" )
-- Update y0 for second step
# loop_unroll( " y0[i] = y[i]" )
rk4.step(y, state, h/2, t + h/2, sys)
-- Derivatives at output
if dydt_out then f(t + h, y, dydt_out) end
-- Error estimation
--
-- yerr = C * 0.5 * | y(onestep) - y(twosteps) | / (2^order - 1)
--
-- constant C is approximately 8.0 to ensure 90% of samples lie within
-- the error (assuming a gaussian distribution with prior p(sigma)=1/sigma.)
# loop_unroll( " yerr[i] = 4 * (y[i] - y_onestep[i]) / 15" )
end
function f_ode1(t, y, dydt)
local p, q = y[1], y[2]
dydt[1] = - q - p^2
dydt[2] = 2*p - q^3
end
t0, t1, h0 = 0, 200, 0.001
function do_rk(p0, q0, sample, dim)
-- local dim = tonumber(os.getenv('dim') or 2)
local state = rk4.new(dim)
local y, dydt, yerr = darray(dim+1), darray(dim+1), darray(dim+1)
local sys = {f = f_ode1}
y[1], y[2] = p0, q0
local t = t0
local tsamp = t0
rk4.apply(state, t, h0, y, yerr, nil, dydt, sys)
t = t + h0
while t < t1 do
rk4.apply(state, t, h0, y, yerr, dydt, dydt, sys)
t = t + h0
if sample and t - tsamp > sample then
print(t, y[1], y[2])
tsamp = t
end
end
print(t, y[1], y[2])
end
for k=1, 10 do
local th = math.pi/4 -- *(k-1)/5
local p0, q0 = math.cos(th), math.sin(th)
local dim = tonumber(os.getenv('dim') or 2)
do_rk(p0, q0, sample, dim)
end
- References:
- Re: LuaJIT2 performance for number crunching, Florian Weimer
- Re: LuaJIT2 performance for number crunching, Francesco Abbate
- Re: LuaJIT2 performance for number crunching, Leo Razoumov
- Re: LuaJIT2 performance for number crunching, Francesco Abbate
- Re: LuaJIT2 performance for number crunching, Leo Razoumov
- Re: LuaJIT2 performance for number crunching, Francesco Abbate
- Re: LuaJIT2 performance for number crunching, Leo Razoumov
- Re: LuaJIT2 performance for number crunching, Francesco Abbate
- Re: LuaJIT2 performance for number crunching, T T
- Re: LuaJIT2 performance for number crunching, Daurnimator
- Re: LuaJIT2 performance for number crunching, Mike Pall
- Re: LuaJIT2 performance for number crunching, Philippe Lhoste
- Re: LuaJIT2 performance for number crunching, Alexander Gladysh
- Re: LuaJIT2 performance for number crunching, Miles Bader
- Re: LuaJIT2 performance for number crunching, T T
- Re: LuaJIT2 performance for number crunching, steve donovan
- Re: LuaJIT2 performance for number crunching, T T
- Re: LuaJIT2 performance for number crunching, steve donovan