The main idea is to let g++ allocate the VFPU registers, so that you can write computations without having to struggle with assembly code. The drawback is the way you have to write an algorithm using the VFPU, which will look very weird to most people.
For example, take this code:
Code: Select all
#include "vfpu.h"
// Linearly interpolates between two 2D vectors:
//
//     vector_result = vector_source * (1 - alpha) + vector_target * alpha
//
// where both weights come from the "saturated one's complement" step below,
// i.e. they are clamped to [0..1].
//
// The computation is written as a chain of typedefs over the vfpu.h templates.
// Every `typedef T name, stepN;` declares two aliases of the same type: `name`
// is used later to refer to the operation (and its result), while `stepN` is
// handed to the next template as its trailing ("clobbered") parameter so that
// g++ can remember which VFPU registers have already been allocated.
//
// NOTE(review): vfpu.h is not visible here. Presumably the actual VFPU
// instructions are emitted when objects of these types are constructed in the
// "execute" section at the bottom -- confirm against the header.
// NOTE(review): vector_source, vector_target and vector_result are presumably
// expected to point to at least two floats each (2d load/store) -- confirm.
// NOTE(review): alpha is taken by reference, presumably so the scalar load can
// read it straight from memory -- confirm.
extern "C" void vector2d_sample_linear(float *vector_result, float *vector_source, float *vector_target, float &alpha)
{
// load scalar alpha in a new register
// (first link of the chain: vfpu_scalar_new_reg is given no previous step)
typedef vfpu_scalar_load< vfpu_scalar_new_reg < > >
load_alpha, step1;
// load 2d vector source in a new register
// (step1 is passed along so the new register is chosen after alpha's)
typedef vfpu_vector_2d_load< vfpu_vector_new_reg < step1 > >
load_vector_source, step2;
// load 2d vector target in a new register
typedef vfpu_vector_2d_load< vfpu_vector_new_reg < step2 > >
load_vector_target, step3;
// saturated one's complement : { 1-alpha, alpha } = { (1.0 - alpha)[0..1], alpha[0..1] }
typedef vfpu_vector_2d_socp_result< load_alpha, step3 >
compute_alpha_and_one_minus_alpha, step4;
// scale vector source with 1-alpha (component 0 of the pair computed above)
typedef vfpu_vector_2d_scl_result< load_vector_source, vfpu_vector_2d_component< compute_alpha_and_one_minus_alpha, 0 >, step4 >
scale_source_vector_with_one_minus_alpha, step5;
// scale vector target with alpha (component 1 of the pair computed above)
typedef vfpu_vector_2d_scl_result< load_vector_target, vfpu_vector_2d_component< compute_alpha_and_one_minus_alpha, 1 >, step5 >
scale_target_vector_with_alpha, step6;
// Vs * (1.0 - alpha)[0..1] + Vd * alpha[0..1]
typedef vfpu_vector_2d_add_result< scale_source_vector_with_one_minus_alpha, scale_target_vector_with_alpha, step6 >
add_vectors_source_and_target, step7;
// and store to vector result
typedef vfpu_vector_2d_store< add_vectors_source_and_target, step7 >
store_vector_result;
// execute
// The statements below instantiate the types declared above, in the same order
// as the typedef chain. The loads and the store are named objects constructed
// from a reference to the memory they read or write; the intermediate
// operations are constructed as unnamed temporaries with no arguments.
load_alpha _1(alpha);
load_vector_source _2(*vector_source);
load_vector_target _3(*vector_target);
compute_alpha_and_one_minus_alpha();
scale_source_vector_with_one_minus_alpha();
scale_target_vector_with_alpha();
add_vectors_source_and_target();
store_vector_result _(*vector_result);
}
Code: Select all
00000018 <vector2d_sample_linear>:
// load scalar alpha in a new register
18: c8e00000 lv.s S000.s,0(a3)
// load 2d vector source in a new register
1c: c8a10000 lv.s S010.s,0(a1)
20: c8a10005 lv.s S011.s,4(a1)
// load 2d vector target in a new register
24: c8c20000 lv.s S020.s,0(a2)
28: c8c20005 lv.s S021.s,4(a2)
// saturated one's complement : { 1-alpha, alpha } = { (1.0 - alpha)[0..1], alpha[0..1] }
2c: d0450103 vsocp.s C030.p,S000.s
// scale vector source with 1-alpha
30: 65030184 vscl.p C100.p,C010.p,S030.s
// scale vector target with alpha
34: 65230285 vscl.p C110.p,C020.p,S031.s
// add vectors source and target to vector result
38: 60050486 vadd.p C120.p,C100.p,C110.p
// store vector result
3c: e8860000 sv.s S120.s,0(a0)
40: e8860005 sv.s S121.s,4(a0)
// exit function
44: 03e00008 jr ra
48: 00000000 nop
I don't know whether anybody would be interested in this library.
Of course, there is still a lot of work to do.
NOTE:
Every vfpu_vector_2d_XXX template class has an optional template parameter named "clobbered" (the ones named "stepN" in my example), which allows g++ to remember which VFPU registers have already been "allocated". This is the main reason why we need so many typedefs and this weird syntax.