• Subject: Re: LuaJIT2 performance for number crunching
• From: Mike Pall <mikelu-1102@...>
• Date: Sat, 19 Feb 2011 17:11:51 +0100

Francesco Abbate wrote:
> For the other size it seems that there is a small problem because the
> performance of LuaJIT2 are in this case below my expectations.

Fixed implementation attached. Much faster.

--Mike
# order = 5

\$(include 'ode-defs.lua.in')

# ah = { 1.0/4.0, 3.0/8.0, 12.0/13.0, 1.0, 1.0/2.0 }
# b3 = { 3.0/32.0, 9.0/32.0 }
# b4 = { 1932.0/2197.0, -7200.0/2197.0, 7296.0/2197.0}
# b5 = { 8341.0/4104.0, -32832.0/4104.0, 29440.0/4104.0, -845.0/4104.0}
# b6 = { -6080.0/20520.0, 41040.0/20520.0, -28352.0/20520.0, 9295.0/20520.0, -5643.0/20520.0}

# c1 = 902880.0/7618050.0
# c3 = 3953664.0/7618050.0
# c4 = 3855735.0/7618050.0
# c5 = -1371249.0/7618050.0
# c6 = 277020.0/7618050.0

-- These are the differences of fifth and fourth order coefficients
-- for error estimation */

# ec = { 0.0, 1.0 / 360.0, 0.0, -128.0 / 4275.0, -2197.0 / 75240.0, 1.0 / 50.0, 2.0 / 55.0 }

# y_err_only = (a_dydt == 0)

local function rkf45_evolve(s, f, t1)
local t, h = s.t, s.h

local \$(VL'y')
local \$(VL'k1') = \$(AL's.dydt')

if t + h > t1 then h = t1 - t end

while h > 0 do
\$(VL'y') = \$(AL's.y')
local rmax = 0

do
#     for i = 0, N-1 do
local ytmp_\$(i) = y_\$(i) + \$(ah[1]) * h * k1_\$(i)
#     end

-- k2 step
local \$(VL'k2') = f(t + \$(ah[1]) * h, \$(VL'ytmp'))

#     for i = 0, N-1 do
ytmp_\$(i) = y_\$(i) + h * (\$(b3[1]) * k1_\$(i) + \$(b3[2]) * k2_\$(i))
#     end

-- k3 step
local \$(VL'k3') = f(t + \$(ah[2]) * h, \$(VL'ytmp'))

#     for i = 0, N-1 do
ytmp_\$(i) = y_\$(i) + h * (\$(b4[1]) * k1_\$(i) + \$(b4[2]) * k2_\$(i) + \$(b4[3]) * k3_\$(i))
#     end

-- k4 step
local \$(VL'k4') = f(t + \$(ah[3]) * h, \$(VL'ytmp'))

#     for i = 0, N-1 do
ytmp_\$(i) = y_\$(i) + h * (\$(b5[1]) * k1_\$(i) + \$(b5[2]) * k2_\$(i) + \$(b5[3]) * k3_\$(i) + \$(b5[4]) * k4_\$(i))
#     end

-- k5 step
local \$(VL'k5') = f(t + \$(ah[4]) * h, \$(VL'ytmp'))

#     for i = 0, N-1 do
ytmp_\$(i) = y_\$(i) + h * (\$(b6[1]) * k1_\$(i) + \$(b6[2]) * k2_\$(i) + \$(b6[3]) * k3_\$(i) + \$(b6[4]) * k4_\$(i) + \$(b6[5]) * k5_\$(i))
#     end

-- k6 step and final sum
-- since k2 is no more used we can use k2 to store k6
local \$(VL'k6') = f(t + \$(ah[5]) * h, \$(VL'ytmp'))

local di
#     for i = 0, N-1 do
di = \$(c1) * k1_\$(i) + \$(c3) * k3_\$(i) + \$(c4) * k4_\$(i) + \$(c5) * k5_\$(i) + \$(c6) * k6_\$(i)
y_\$(i) = y_\$(i) + h * di
#     end

#     if not y_err_only then
local \$(VL'dydt') = f(t + h, \$(VL'y'))
#     for i = 0, N-1 do
s.dydt[\$(i)] = dydt_\$(i)
#     end
#     end

local yerr, r, d0

#     for i = 0, N-1 do
yerr = h * (\$(ec[2]) * k1_\$(i) + \$(ec[4]) * k3_\$(i) + \$(ec[5]) * k4_\$(i) + \$(ec[6]) * k5_\$(i) + \$(ec[7]) * k6_\$(i))
#     if y_err_only then
d0 = \$(eps_rel) * (\$(a_y) * abs(y_\$(i))) + \$(eps_abs)
#     else
d0 = \$(eps_rel) * (\$(a_y) * abs(y_\$(i)) + \$(a_dydt) * abs(h * dydt_\$(i))) + \$(eps_abs)
#     end
r = abs(yerr) / abs(d0)
rmax = max(r, rmax)
#     end
end

if inc >= 0 then break end

end

#  if y_err_only then
local \$(VL'dydt') = f(t + h, \$(VL'y'))
#     for i = 0, N-1 do
s.dydt[\$(i)] = dydt_\$(i)
#     end
#  end

#     for i = 0, N-1 do
s.y[\$(i)] = y_\$(i)
#     end
s.t = t + h