The main idea is to let g++ allocate the VFPU registers, so that you can write computations without struggling with assembly code. The drawback is the way you have to write an algorithm that uses the VFPU: it will look very weird to most people.
Consider this code:
Code: Select all
#include "vfpu.h"
/**
 * Linearly interpolates between two 2D vectors on the VFPU:
 *
 *     result = source * (1 - alpha) + target * alpha
 *
 * where both weights are saturated to [0..1] by the "saturated one's
 * complement" step.
 *
 * The computation is first described as a chain of template types and only
 * then executed. Every typedef declares two aliases for the same type: a
 * descriptive name for the operation, and a stepN name that is handed to the
 * next operation as its "clobbered" argument (per the library author's note,
 * this is how g++ is made to remember which VFPU registers are already
 * allocated).
 *
 * @param vector_result  destination 2D vector (two floats are stored).
 * @param vector_source  2D vector weighted by (1 - alpha) (two floats are loaded).
 * @param vector_target  2D vector weighted by alpha (two floats are loaded).
 * @param alpha          interpolation factor, passed by reference so it can be
 *                       loaded directly from memory.
 */
extern "C" void vector2d_sample_linear(float *vector_result, float *vector_source, float *vector_target, float &alpha)
{
	// load scalar alpha in a new register
	// (first link of the chain: nothing is clobbered yet, hence the empty argument list)
	typedef vfpu_scalar_load< vfpu_scalar_new_reg < > >
		load_alpha, step1;
	
	// load 2d vector source in a new register
	typedef vfpu_vector_2d_load< vfpu_vector_new_reg < step1 > >
		load_vector_source, step2;
	
	// load 2d vector target in a new register
	typedef vfpu_vector_2d_load< vfpu_vector_new_reg < step2 > >
		load_vector_target, step3;
	// saturated one's complement : { 1-alpha, alpha } = { (1.0 - alpha)[0..1], alpha[0..1] }
	typedef vfpu_vector_2d_socp_result< load_alpha, step3 >
		compute_alpha_and_one_minus_alpha, step4;
	// scale vector source with 1-alpha (component 0 of the pair computed above)
	typedef vfpu_vector_2d_scl_result< load_vector_source, vfpu_vector_2d_component< compute_alpha_and_one_minus_alpha, 0 >, step4 >
		scale_source_vector_with_one_minus_alpha, step5;
	// scale vector target with alpha (component 1 of the pair computed above)
	typedef vfpu_vector_2d_scl_result< load_vector_target, vfpu_vector_2d_component< compute_alpha_and_one_minus_alpha, 1 >, step5 >
		scale_target_vector_with_alpha, step6;
	// Vs * (1.0 - alpha)[0..1] + Vd * alpha[0..1]
	typedef vfpu_vector_2d_add_result< scale_source_vector_with_one_minus_alpha, scale_target_vector_with_alpha, step6 >
		add_vectors_source_and_target, step7;
	// and store to vector result
	typedef vfpu_vector_2d_store< add_vectors_source_and_target, step7 >
		store_vector_result;
	// execute: the statements below run the steps in the order described above
	// NOTE(review): presumably constructing each object is what emits the matching
	// VFPU instruction, so the statement order matters - confirm against vfpu.h
	// loads: named objects constructed from the memory operands
	load_alpha         _1(alpha);
	load_vector_source _2(*vector_source);
	load_vector_target _3(*vector_target);
	
	// register-only computations: unnamed temporaries, no memory operand
	compute_alpha_and_one_minus_alpha();
	scale_source_vector_with_one_minus_alpha();
	scale_target_vector_with_alpha();
	add_vectors_source_and_target();
	// store: writes the interpolated vector to *vector_result
	store_vector_result _(*vector_result);
}Code: Select all
00000018 <vector2d_sample_linear>:
// load scalar alpha in a new register
  18:   c8e00000        lv.s    S000.s,0(a3)
// load 2d vector source in a new register
  1c:   c8a10000        lv.s    S010.s,0(a1)
  20:   c8a10005        lv.s    S011.s,4(a1)
// load 2d vector target in a new register
  24:   c8c20000        lv.s    S020.s,0(a2)
  28:   c8c20005        lv.s    S021.s,4(a2)
// saturated one's complement : { 1-alpha, alpha } = { (1.0 - alpha)[0..1], alpha[0..1] }
  2c:   d0450103        vsocp.s C030.p,S000.s
// scale vector source with 1-alpha
  30:   65030184        vscl.p  C100.p,C010.p,S030.s
// scale vector target with alpha
  34:   65230285        vscl.p  C110.p,C020.p,S031.s
// add vectors source and target to vector result
  38:   60050486        vadd.p  C120.p,C100.p,C110.p
// store vector result
  3c:   e8860000        sv.s    S120.s,0(a0)
  40:   e8860005        sv.s    S121.s,4(a0)
// exit function
  44:   03e00008        jr      ra
  48:   00000000        nop
I don't know whether anybody would be interested in this library.
Of course, there is still a lot of work to do.
NOTE:
Every vfpu_vector_2d_XXX template class has an optional template parameter named "clobbered" (the ones named "stepN" in my example), which allows g++ to remember which VFPU registers have been "allocated". This is the main reason why we need so many typedefs and this weird syntax.