Is it possible to use VFPU to optimize AVC IDCT job?

happycoding · Post by **happycoding** » Fri Jan 27, 2006 1:16 pm

Hi,

I am working on optimizing "PMP Mod 1.01" for the AVC decoder, and I noticed that the following functions [1] in PMP Mod 1.01\libavcodec\h264idct.c are CPU-intensive. The only optimization I can find so far is to unroll loops (e.g. expand "for(i=1;i<8;i++) {foo(i)}" to "foo(1);foo(2);...;foo(7)" and hope gcc can pipeline them better), which *may* gain about +1fps in decoding. I also noticed there is an MMX-optimized version [2] in the original ffmpeg\libavcodec\i386\h264dsp_mmx.c.

My question is: Is it possible to use the VFPU to optimize the AVC IDCT as well as MMX does on the i386 platform? I ask because I cannot find a detailed VFPU instruction manual or samples. :(

Thank you!

Reference:
1. PMP Mod 1.01\libavcodec\h264idct.c

Code: Select all

#include "dsputil.h"

static always_inline void idct_internal&#40;uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add&#41;&#123;
    int i;
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    block&#91;0&#93; += 1<<&#40;shift-1&#41;;

    for&#40;i=0; i<4; i++&#41;&#123;
        const int z0=  block&#91;0 + block_stride*i&#93;     +  block&#91;2 + block_stride*i&#93;;
        const int z1=  block&#91;0 + block_stride*i&#93;     -  block&#91;2 + block_stride*i&#93;;
        const int z2= &#40;block&#91;1 + block_stride*i&#93;>>1&#41; -  block&#91;3 + block_stride*i&#93;;
        const int z3=  block&#91;1 + block_stride*i&#93;     + &#40;block&#91;3 + block_stride*i&#93;>>1&#41;;

        block&#91;0 + block_stride*i&#93;= z0 + z3;
        block&#91;1 + block_stride*i&#93;= z1 + z2;
        block&#91;2 + block_stride*i&#93;= z1 - z2;
        block&#91;3 + block_stride*i&#93;= z0 - z3;
    &#125;

    for&#40;i=0; i<4; i++&#41;&#123;
        const int z0=  block&#91;i + block_stride*0&#93;     +  block&#91;i + block_stride*2&#93;;
        const int z1=  block&#91;i + block_stride*0&#93;     -  block&#91;i + block_stride*2&#93;;
        const int z2= &#40;block&#91;i + block_stride*1&#93;>>1&#41; -  block&#91;i + block_stride*3&#93;;
        const int z3=  block&#91;i + block_stride*1&#93;     + &#40;block&#91;i + block_stride*3&#93;>>1&#41;;

        dst&#91;i + 0*stride&#93;= cm&#91; add*dst&#91;i + 0*stride&#93; + &#40;&#40;z0 + z3&#41; >> shift&#41; &#93;;
        dst&#91;i + 1*stride&#93;= cm&#91; add*dst&#91;i + 1*stride&#93; + &#40;&#40;z1 + z2&#41; >> shift&#41; &#93;;
        dst&#91;i + 2*stride&#93;= cm&#91; add*dst&#91;i + 2*stride&#93; + &#40;&#40;z1 - z2&#41; >> shift&#41; &#93;;
        dst&#91;i + 3*stride&#93;= cm&#91; add*dst&#91;i + 3*stride&#93; + &#40;&#40;z0 - z3&#41; >> shift&#41; &#93;;
    &#125;
&#125;

void ff_h264_idct_add_c&#40;uint8_t *dst, DCTELEM *block, int stride&#41;&#123;
    idct_internal&#40;dst, block, stride, 4, 6, 1&#41;;
&#125;

void ff_h264_lowres_idct_add_c&#40;uint8_t *dst, int stride, DCTELEM *block&#41;&#123;
    idct_internal&#40;dst, block, stride, 8, 3, 1&#41;;
&#125;

void ff_h264_lowres_idct_put_c&#40;uint8_t *dst, int stride, DCTELEM *block&#41;&#123;
    idct_internal&#40;dst, block, stride, 8, 3, 0&#41;;
&#125;

void ff_h264_idct8_add_c&#40;uint8_t *dst, DCTELEM *block, int stride&#41;&#123;
    int i;
    DCTELEM &#40;*src&#41;&#91;8&#93; = &#40;DCTELEM&#40;*&#41;&#91;8&#93;&#41;block;
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    block&#91;0&#93; += 32;

    for&#40; i = 0; i < 8; i++ &#41;
    &#123;
        const int a0 =  src&#91;i&#93;&#91;0&#93; + src&#91;i&#93;&#91;4&#93;;
        const int a2 =  src&#91;i&#93;&#91;0&#93; - src&#91;i&#93;&#91;4&#93;;
        const int a4 = &#40;src&#91;i&#93;&#91;2&#93;>>1&#41; - src&#91;i&#93;&#91;6&#93;;
        const int a6 = &#40;src&#91;i&#93;&#91;6&#93;>>1&#41; + src&#91;i&#93;&#91;2&#93;;

        const int b0 = a0 + a6;
        const int b2 = a2 + a4;
        const int b4 = a2 - a4;
        const int b6 = a0 - a6;

        const int a1 = -src&#91;i&#93;&#91;3&#93; + src&#91;i&#93;&#91;5&#93; - src&#91;i&#93;&#91;7&#93; - &#40;src&#91;i&#93;&#91;7&#93;>>1&#41;;
        const int a3 =  src&#91;i&#93;&#91;1&#93; + src&#91;i&#93;&#91;7&#93; - src&#91;i&#93;&#91;3&#93; - &#40;src&#91;i&#93;&#91;3&#93;>>1&#41;;
        const int a5 = -src&#91;i&#93;&#91;1&#93; + src&#91;i&#93;&#91;7&#93; + src&#91;i&#93;&#91;5&#93; + &#40;src&#91;i&#93;&#91;5&#93;>>1&#41;;
        const int a7 =  src&#91;i&#93;&#91;3&#93; + src&#91;i&#93;&#91;5&#93; + src&#91;i&#93;&#91;1&#93; + &#40;src&#91;i&#93;&#91;1&#93;>>1&#41;;

        const int b1 = &#40;a7>>2&#41; + a1;
        const int b3 =  a3 + &#40;a5>>2&#41;;
        const int b5 = &#40;a3>>2&#41; - a5;
        const int b7 =  a7 - &#40;a1>>2&#41;;

        src&#91;i&#93;&#91;0&#93; = b0 + b7;
        src&#91;i&#93;&#91;7&#93; = b0 - b7;
        src&#91;i&#93;&#91;1&#93; = b2 + b5;
        src&#91;i&#93;&#91;6&#93; = b2 - b5;
        src&#91;i&#93;&#91;2&#93; = b4 + b3;
        src&#91;i&#93;&#91;5&#93; = b4 - b3;
        src&#91;i&#93;&#91;3&#93; = b6 + b1;
        src&#91;i&#93;&#91;4&#93; = b6 - b1;
    &#125;
    for&#40; i = 0; i < 8; i++ &#41;
    &#123;
        const int a0 =  src&#91;0&#93;&#91;i&#93; + src&#91;4&#93;&#91;i&#93;;
        const int a2 =  src&#91;0&#93;&#91;i&#93; - src&#91;4&#93;&#91;i&#93;;
        const int a4 = &#40;src&#91;2&#93;&#91;i&#93;>>1&#41; - src&#91;6&#93;&#91;i&#93;;
        const int a6 = &#40;src&#91;6&#93;&#91;i&#93;>>1&#41; + src&#91;2&#93;&#91;i&#93;;

        const int b0 = a0 + a6;
        const int b2 = a2 + a4;
        const int b4 = a2 - a4;
        const int b6 = a0 - a6;

        const int a1 = -src&#91;3&#93;&#91;i&#93; + src&#91;5&#93;&#91;i&#93; - src&#91;7&#93;&#91;i&#93; - &#40;src&#91;7&#93;&#91;i&#93;>>1&#41;;
        const int a3 =  src&#91;1&#93;&#91;i&#93; + src&#91;7&#93;&#91;i&#93; - src&#91;3&#93;&#91;i&#93; - &#40;src&#91;3&#93;&#91;i&#93;>>1&#41;;
        const int a5 = -src&#91;1&#93;&#91;i&#93; + src&#91;7&#93;&#91;i&#93; + src&#91;5&#93;&#91;i&#93; + &#40;src&#91;5&#93;&#91;i&#93;>>1&#41;;
        const int a7 =  src&#91;3&#93;&#91;i&#93; + src&#91;5&#93;&#91;i&#93; + src&#91;1&#93;&#91;i&#93; + &#40;src&#91;1&#93;&#91;i&#93;>>1&#41;;

        const int b1 = &#40;a7>>2&#41; + a1;
        const int b3 =  a3 + &#40;a5>>2&#41;;
        const int b5 = &#40;a3>>2&#41; - a5;
        const int b7 =  a7 - &#40;a1>>2&#41;;

        dst&#91;i + 0*stride&#93; = cm&#91; dst&#91;i + 0*stride&#93; + &#40;&#40;b0 + b7&#41; >> 6&#41; &#93;;
        dst&#91;i + 1*stride&#93; = cm&#91; dst&#91;i + 1*stride&#93; + &#40;&#40;b2 + b5&#41; >> 6&#41; &#93;;
        dst&#91;i + 2*stride&#93; = cm&#91; dst&#91;i + 2*stride&#93; + &#40;&#40;b4 + b3&#41; >> 6&#41; &#93;;
        dst&#91;i + 3*stride&#93; = cm&#91; dst&#91;i + 3*stride&#93; + &#40;&#40;b6 + b1&#41; >> 6&#41; &#93;;
        dst&#91;i + 4*stride&#93; = cm&#91; dst&#91;i + 4*stride&#93; + &#40;&#40;b6 - b1&#41; >> 6&#41; &#93;;
        dst&#91;i + 5*stride&#93; = cm&#91; dst&#91;i + 5*stride&#93; + &#40;&#40;b4 - b3&#41; >> 6&#41; &#93;;
        dst&#91;i + 6*stride&#93; = cm&#91; dst&#91;i + 6*stride&#93; + &#40;&#40;b2 - b5&#41; >> 6&#41; &#93;;
        dst&#91;i + 7*stride&#93; = cm&#91; dst&#91;i + 7*stride&#93; + &#40;&#40;b0 - b7&#41; >> 6&#41; &#93;;
    &#125;
&#125;

2. original ffmpeg\libavcodec\i386\h264dsp_mmx.c.

Code: Select all

void ff_h264_idct_add_mmx2&#40;uint8_t *dst, int16_t *block, int stride&#41;
&#123;
    /* Load dct coeffs */
    asm volatile&#40;
        "movq   &#40;%0&#41;, %%mm0 \n\t"
        "movq  8&#40;%0&#41;, %%mm1 \n\t"
        "movq 16&#40;%0&#41;, %%mm2 \n\t"
        "movq 24&#40;%0&#41;, %%mm3 \n\t"
    &#58;&#58; "r"&#40;block&#41; &#41;;

    asm volatile&#40;
        /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
        IDCT4_1D&#40; %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 &#41;

        "movq      %0,    %%mm6 \n\t"
        /* in&#58; 1,4,0,2  out&#58; 1,2,3,0 */
        TRANSPOSE4&#40; %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 &#41;

        "paddw     %%mm6, %%mm3 \n\t"

        /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
        IDCT4_1D&#40; %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 &#41;

        "pxor %%mm7, %%mm7    \n\t"
    &#58;&#58; "m"&#40;ff_pw_32&#41;&#41;;

    asm volatile&#40;
    STORE_DIFF_4P&#40; %%mm0, %%mm1, %%mm7&#41;
        "add %1, %0             \n\t"
    STORE_DIFF_4P&#40; %%mm2, %%mm1, %%mm7&#41;
        "add %1, %0             \n\t"
    STORE_DIFF_4P&#40; %%mm3, %%mm1, %%mm7&#41;
        "add %1, %0             \n\t"
    STORE_DIFF_4P&#40; %%mm4, %%mm1, %%mm7&#41;
        &#58; "+r"&#40;dst&#41;
        &#58; "r" &#40;&#40;long&#41;stride&#41;
    &#41;;
&#125;

[/code]

jsgf · Post by **jsgf** » Fri Jan 27, 2006 4:44 pm

The VFPU could probably help. The big difference is that the VFPU works all in FP rather than integer, but it has instructions for converting int->float and float->int. All the operations look like they could be vectorized into batches of 4 parallel operations, and the various negate/swizzle/constant prefixes look like they'd be helpful too. Also, the VFPU has *masses* of registers, so you could probably completely unroll those loops and interleave them in the VFPU, making the code nicely pipelined.

There isn't much documentation on the VFPU. The resources which exist are:

the binutils patch in the toolchain, which lists all the opcode names, and what arguments they take
pspgl/pspgl_codegen.h is the first attempt to document the VFPU opcodes. It isn't complete, but what's there is useful
pspgl/glRotate/Scale/Translate/gluLookAt are my VFPU routines for doing 3D transforms; you may find them useful as examples
pspsdk/src/gum/pspgum_vfpu.c are similar
http://bradburn.net/mr.mr/vfpu.html documents how the VFPU registers work
various forum posts, in particular these threads: VFPU playground, code generation for gas-unsupported opcodes, VFPU yummy goodness: instruction prefixes and rotation, libvfpu: simple VFPU context switching
Do some experiments, post the results

Raphael · Post by **Raphael** » Sat Jan 28, 2006 3:41 am

Actually, the H.264 IDCT is an orthogonal matrix transformation, which is done by two matrix multiplications, so it could be done entirely by the VFPU.
The dequantized (4x4) input matrix X is put into the equation:

Y = Ct * X * C, thus returning the matrix of coefficients, which only needs to be rescaled afterwards, where

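Ct = $\begin{pmatrix} 1 & 1 & 1 & 1/2 \\ 1 & 1/2 & -1 & -1 \\ 1 & -1/2 & -1 & 1 \\ 1 & -1 & 1 & -1/2 \end{pmatrix}$

(transposed of C)
C = $\begin{pmatrix} 1 & 1 & 1 & 1 \\ 1 & 1/2 & -1/2 & -1 \\ 1 & -1 & -1 & 1 \\ 1/2 & -1 & 1 & -1/2 \end{pmatrix}$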

This transform would be done on the VFPU with just two 4x4 matrix multiplies, rather than the unrolled C/MMX version, and thus should be pretty fast, I suppose.
But that's just the theoretical maths behind the IDCT, so you'd have to figure out how it is implemented here with the strides, the add, and so on.
Go give it a try.

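PS: The 8x8 IDCT can be split up into eight 4x4 matrix multiplications and four matrix additions. If A,B,C,D,E,F,G,H are each 4x4 matrices with

X = $\begin{pmatrix} A & B \\ C & D \end{pmatrix}$

Y = $\begin{pmatrix} E & F \\ G & H \end{pmatrix}$

then X*Y = $\begin{pmatrix} AE + BG & AF + BH \\ CE + DG & CF + DH \end{pmatrix}$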

So you'd just need the correct C and Ct for the 8x8 IDCT. I'll probably find them somewhere too. Yet the question is whether 8 matrix muls and 4 matrix adds are faster than an unrolled attempt.

jsgf · Post by **jsgf** » Sat Jan 28, 2006 8:58 am

OK, so that makes it more or less trivial. Here's a first cut:

Code: Select all

int idct&#40;int *block, int block_stride&#41;
&#123;
        asm volatile&#40;
                /* Load the input matrix.  Assumes row-major order in
                   memory, and each row is 16-byte aligned.  Use ulv.q for
                   unaligned loads. */
                "lv.q R100, %0\n"
                "lv.q R101, %1\n"
                "lv.q R102, %2\n"
                "lv.q R103, %3\n"

                /* Convert int to float */
                "vi2f.q R100, R100, 0\n"
                "vi2f.q R101, R101, 0\n"
                "vi2f.q R102, R102, 0\n"
                "vi2f.q R103, R103, 0\n"

                /* Load up the C matrix */
                "vone.q R000\n"
                "vmov.q R001, R001&#91;  1,1/2,1/2, -1&#93;\n"
                "vmov.q R002, R002&#91;  1, -1, -1,  1&#93;\n"
                "vmov.q R003, R003&#91;1/2, -1,  1,1/2&#93;\n"

                /* M200 = X * C */
                "vmmul.q M200, M100, M000\n"
                /* M100 = Ct * &#40;X * C&#41; &#40;E000 = transpose&#40;M000&#41;&#41; */
                "vmmul.q M100, E000, M200\n"

                /* Convert float to int &#40;truncated&#41; */
                "vf2iz.q R100, R100, 0\n"
                "vf2iz.q R101, R101, 0\n"
                "vf2iz.q R102, R102, 0\n"
                "vf2iz.q R103, R103, 0\n"

                /* Store result; use usv.q for unaligned */
                "sv.q R100, %0\n"
                "sv.q R101, %1\n"
                "sv.q R102, %2\n"
                "sv.q R103, %3\n"
        &#58; "+m" &#40;block&#91;block_stride*0&#93;&#41;,
          "+m" &#40;block&#91;block_stride*1&#93;&#41;,
          "+m" &#40;block&#91;block_stride*2&#93;&#41;,
          "+m" &#40;block&#91;block_stride*3&#93;&#41;&#41;;
&#125;

This is completely untested, of course... If you need to apply scaling or an offset to the values, the VFPU can easily do that too: vmscl.q will multiply a 4x4 matrix by a scalar.

I'm not sure what type DCTELEM is. If it's bytes rather than 32-bit ints, then there would need to be some way of doing the conversion. The VFPU has a vi2c instruction, which I think is int to (signed) char, so you could use that. I'm not sure exactly how it behaves, though. There doesn't seem to be a corresponding vc2i, however.

You could do the 2x2 multiplies making up an 8x8 in a similar way (vmmul.p does 2x2).

happycoding · Post by **happycoding** » Sun Jan 29, 2006 3:10 am

Thank you very very much for your warm-hearted help!

happycoding · Post by **happycoding** » Fri Feb 03, 2006 12:19 pm

@Raphael
Thank you very much for your direction and I've found the paper "Low-Complexity Transform and Quantization in H.264/AVC"[1] for this issue.

@jsgf
Thank you very much for your code.
Your code assumes block is (int *) type, but really it is
(short *) type ("typedef short DCTELEM;"). Maybe I need to copy a "short" array to a "float" array and save the vi2f.q instruction. Sorry I am a VFPU newbie and maybe it is stupid.

Anyway, I am working now and hope ...

Reference
1.http://research.microsoft.com/~malvar/p ... July03.pdf

Raphael · Post by **Raphael** » Fri Feb 03, 2006 2:31 pm

DCTELEM is of type short, so it's 16bit. So is there a method for using 16bit ints with the VFPU? Something like vs2f instead of vi2f? If not, then it's unavoidable to do an ugly conversion of the input block to 32bit int.
The conversion of the output block to uint8 is unavoidable by itself (don't know how much the vi2c helps there, as it converts to signed char but we need unsigned).

jsgf · Post by **jsgf** » Fri Feb 03, 2006 3:43 pm

There are vus2i, vi2us, vs2i, vi2s instructions, which I'm presuming convert unsigned and signed shorts to/from int. I don't know how they would work though. The questions are:

does i2us/s use the 16 MSB or LSB of the int?
does it expect the shorts to be packed in registers (ie, two adjacent in a single 32-bit register), or does it want one per 32-bit?
if the latter, is there any way to load them from memory in the appropriate way?

It seems to me that you might need to do the conversion out of the VFPU code, and at least turn the values into 32-bit ints before passing them into the asm.

There are both vi2uc and vi2c, so both signed and unsigned can be dealt with.

Raphael · Post by **Raphael** » Fri Feb 03, 2006 7:28 pm

That's great then.
So it's just a matter of trying out how the vi2s/vs2i and vi2uc works. If it would be perfect, we could load the shorts into the lower row bytes of the matrix and unpack it to int and then to float, do the math and convert back to int and then to unsigned char, writing the lower row bytes of the result matrix into dst.
If the conversion works as I suspect, by doing int|int|int|int -> 3x0|char|3x0|char|3x0|char|3x0|char then it would be hard to read out the single chars I suppose?
If it doesn't work like this and is an ugly mess, it really would be better to do a 'dirty' conversion of the blocks by creating new blocks of the wanted length.

happycoding · Post by **happycoding** » Fri Feb 03, 2006 8:52 pm

Thank you!

Following is my code but it hangs up the PSP. :-(

Code: Select all

void ff_h264_idct_add_c&#40;uint8_t *dst, DCTELEM *block, int stride&#41;&#123;
	uint8_t *cm = cropTbl + MAX_NEG_CROP;

	block&#91;0&#93; += 1<<&#40;6-1&#41;;
	float block32&#91;16&#93;;
	int i;
	for&#40;i = 0; i < 16; i++&#41;
	&#123;
		block32&#91;i&#93; = block&#91;i&#93;;
	&#125;
	asm volatile&#40; 
                /* Load the input matrix.  Assumes row-major order in 
                   memory, and each row is 16-byte aligned.  Use ulv.q for 
                   unaligned loads. */ 
                "lv.q R100, %0\n" 
                "lv.q R101, %1\n" 
                "lv.q R102, %2\n" 
                "lv.q R103, %3\n" 

                /* Load up the C matrix */ 
                "vone.q R000\n" 
                "vmov.q R001, R001&#91;  1,1/2,-1/2, -1&#93;\n" 
                "vmov.q R002, R002&#91;  1, -1, -1,  1&#93;\n" 
                "vmov.q R003, R003&#91;1/2, -1,  1,-1/2&#93;\n"

                /* M200 = X * C */ 
                "vmmul.q M200, M100, M000\n" 
                /* M100 = Ct * &#40;X * C&#41; &#40;E000 = transpose&#40;M000&#41;&#41; */ 
                "vmmul.q M100, E000, M200\n"  

                /* Store result; use usv.q for unaligned */ 
                "sv.q R100, %0\n" 
                "sv.q R101, %1\n" 
                "sv.q R102, %2\n" 
                "sv.q R103, %3\n" 

				&#58; "+m" &#40;block32&#91;4*0&#93;&#41;, 
				  "+m" &#40;block32&#91;4*1&#93;&#41;, 
				  "+m" &#40;block32&#91;4*2&#93;&#41;, 
				  "+m" &#40;block32&#91;4*3&#93;&#41;&#41;;
	for&#40;i = 0; i < 16; i++&#41;
	&#123;
		block&#91;i&#93; = block32&#91;i&#93;;
	&#125;
	
	for&#40;i=0; i<4; i++&#41;&#123;
        dst&#91;i + 0*stride&#93;= cm&#91; dst&#91;i + 0*stride&#93; + &#40;block&#91;i + 4*0&#93;>>6&#41; &#93;;
        dst&#91;i + 1*stride&#93;= cm&#91; dst&#91;i + 1*stride&#93; + &#40;block&#91;i + 4*1&#93;>>6&#41; &#93;;
        dst&#91;i + 2*stride&#93;= cm&#91; dst&#91;i + 2*stride&#93; + &#40;block&#91;i + 4*2&#93;>>6&#41; &#93;;
        dst&#91;i + 3*stride&#93;= cm&#91; dst&#91;i + 3*stride&#93; + &#40;block&#91;i + 4*3&#93;>>6&#41; &#93;;
    &#125;
&#125;

jsgf · Post by **jsgf** » Fri Feb 03, 2006 9:11 pm

One interesting extra feature is that vf2i and vi2f have a scaling factor as their 3rd parameter, which is expressed as a power of 2 - in other words, it's a shift. I think the appropriate sequence for getting the floats out as unsigned bytes is:

Code: Select all

vf2iz.q r000,r000,24 // convert floats to ints, truncated, and shift 24 bits
vi2uc.q s001,r000 // convert 8MSB of ints into unsigned chars, packed into a 32-bit scalar
sv.s s001,%0 // write result

happycoding · Post by **happycoding** » Sat Feb 04, 2006 12:40 pm

Is there a VFPU instruction for ">>1" ? Now I am using "vscl.q" to scale "0.5". If I want to ">>6", I can not "vscl.q" to scale "1/64.0" because of precision problem. (Maybe I should use "vdiv.q" to div "64" to keep precision).

I followed AVC's Spec and following is my code. It's strange that it hangs up the PSP ( I mean when I run the code, PSP will be shut down automatically). Maybe the VFPU code can not be run twice? Or I should initialize the VFPU (and its context)?

Code: Select all

void ff_h264_idct_add_c&#40;uint8_t *dst, DCTELEM *block, int stride&#41;&#123;
	uint8_t *cm = cropTbl + MAX_NEG_CROP;
	block&#91;0&#93; += 1<<&#40;6-1&#41;;
	float block32&#91;16&#93;;
	float shift&#91;4&#93;;
	int i;
	for&#40;i = 0; i < 16; i++&#41;
	&#123;
		block32&#91;i&#93; = block&#91;i&#93;;
	&#125;
	shift&#91;0&#93;=0.5;
	asm volatile&#40; 
                /* Load the input matrix.  Assumes row-major order in 
                   memory, and each row is 16-byte aligned.  Use ulv.q for 
                   unaligned loads. */ 
                "lv.q R100, %0\n" 
                "lv.q R101, %1\n" 
                "lv.q R102, %2\n" 
                "lv.q R103, %3\n" 

				/* S000 = 0.5 */
				"lv.s S000, %4\n"

				/* First, each &#40;vertical&#41; column of scaled transform coeffients is transformed using a one-dimentional inverse transform. */
				/* z0 = w0 + w2 */
				"vadd.q C200, C100, C120\n"

				/* z1 = w0 - w2 */
				"vsub.q C210, C100, C120\n"

				/* w1 >> 1 */
				"vscl.q C010, C110, S000\n"
				
				/* z2 = w1>>1 - w3 */
				"vsub.q C220, C010, C130\n"

				/* w3 >> 1 */
				"vscl.q C010, C130, S000\n"

				/* z3 = w1 + w3>>1 */
				"vadd.q C230, C110, C010\n"

				/* x0 = z0 + z3 */
				"vadd.q C100, C200, C230\n"

				/* x1 = z1 + z2 */
				"vadd.q C110, C210, C220\n"

				/* x2 = z1 - z2 */
				"vsub.q C120, C210, C220\n"

				/* x3 = z0 - z3 */
				"vsub.q C130, C200, C230\n"

				/* Then, each &#40;horizontal&#41; row of the resulting matrix is transformed using the same one-dimentional inverse transform.*/
				/* z0 = w0 + w2 */
				"vadd.q R200, R100, R102\n"

				/* z1 = w0 - w2 */
				"vsub.q R201, R100, R102\n"

				/* w1 >> 1 */
				"vscl.q R001, R101, S000\n"
				
				/* z2 = w1>>1 - w3 */
				"vsub.q R202, R001, R103\n"

				/* w3 >> 1 */
				"vscl.q R001, R103, S000\n"

				/* z3 = w1 + w3>>1 */
				"vadd.q R203, R101, R001\n"

				/* x0 = z0 + z3 */
				"vadd.q R100, R200, R203\n"

				/* x1 = z1 + z2 */
				"vadd.q R101, R201, R202\n"

				/* x2 = z1 - z2 */
				"vsub.q R102, R201, R202\n"

				/* x3 = z0 - z3 */
				"vsub.q R103, R200, R203\n"

                /* Store result; use usv.q for unaligned */ 
                "sv.q R100, %0\n" 
                "sv.q R101, %1\n" 
                "sv.q R102, %2\n" 
                "sv.q R103, %3\n" 

				&#58; "+m" &#40;block32&#91;4*0&#93;&#41;, 
				  "+m" &#40;block32&#91;4*1&#93;&#41;, 
				  "+m" &#40;block32&#91;4*2&#93;&#41;, 
				  "+m" &#40;block32&#91;4*3&#93;&#41;,
				  "+m" &#40;shift&#91;0&#93;&#41;&#41;;
	for&#40;i = 0; i < 16; i++&#41;
	&#123;
		block&#91;i&#93; = block32&#91;i&#93;;
	&#125;
	
	for&#40;i=0; i<4; i++&#41;&#123;
        dst&#91;i + 0*stride&#93;= cm&#91; dst&#91;i + 0*stride&#93; + &#40;block&#91;i + 4*0&#93;>>6&#41; &#93;;
        dst&#91;i + 1*stride&#93;= cm&#91; dst&#91;i + 1*stride&#93; + &#40;block&#91;i + 4*1&#93;>>6&#41; &#93;;
        dst&#91;i + 2*stride&#93;= cm&#91; dst&#91;i + 2*stride&#93; + &#40;block&#91;i + 4*2&#93;>>6&#41; &#93;;
        dst&#91;i + 3*stride&#93;= cm&#91; dst&#91;i + 3*stride&#93; + &#40;block&#91;i + 4*3&#93;>>6&#41; &#93;;
    &#125;
&#125;

jsgf · Post by **jsgf** » Sat Feb 04, 2006 7:22 pm

happycoding wrote:Is there a VFPU instruction for ">>1" ?

No, but /2 should be exactly equivalent. 1/2 is one of the special constants you can substitute with a prefix, so you don't need to do anything special.

Now I am using "vscl.q" to scale "0.5". If I want to ">>6", I can not "vscl.q" to scale "1/64.0" because of precision problem. (Maybe I should use "vdiv.q" to div "64" to keep precision).

What precision problem? That should be totally accurate in FP.

I followed AVC's Spec and following is my code. It's strange that it hangs up the PSP ( I mean when I run the code, PSP will be shut down automatically). Maybe the VFPU code can not be run twice? Or I should initialize the VFPU (and its context)?

Make sure you're setting the VFPU attribute on the thread. Better still, use libvfpu in the SDK. In general, the PSP turning off is the symptom of a crash, and doesn't tell you much without further investigation.

Isn't all this code just a matrix multiply?

Code: Select all

...
             /* Load the input matrix.  Assumes row-major order in 
                   memory, and each row is 16-byte aligned.  Use ulv.q for 
                   unaligned loads. */ 
                "lv.q R100, %0\n" 
                "lv.q R101, %1\n" 
                "lv.q R102, %2\n" 
                "lv.q R103, %3\n" 

				/* S000 = 0.5 */
				"lv.s S000, %4\n"

Just use

Code: Select all

vmov.s S000, S000&#91;1/2&#93;

happycoding · Post by **happycoding** » Sat Feb 04, 2006 10:09 pm

@jsgf
First of all, thank you very much for your reply!!!

(1) I mean "* 1/64" is different with ">>6" and following is a sample.

Code: Select all

short a = -7136;
float f = &#40;float&#41;a/64.0; // f is "-115.500"
int   i1 =  f;   // i1 is "-111"
int   i2 = a>>6; // i2 is "-112"

(2) Thank you for your introduction for "vmov.s S000, S000[1/2]". :-)
(3) I've set VFPU attribute to the thread and following is my code. Is it enough?

Code: Select all

PSP_MODULE_INFO&#40;"pmpmod", 0x1000, 1, 1&#41;;
PSP_MAIN_THREAD_ATTR&#40;THREAD_ATTR_VFPU&#41;;

I've checked your libvfpu in the SDK and have several questions about it:
(1) I can not find a sample source code which calls libvfpu. I mean in [1], you mentioned "pspvfpu_initcontext(VMAT4 | VMAT5)", but in pspsdk svn revision 1779 pspsdk/src/vfpu/pspvfpu.c Line 153, it is "struct pspvfpu_context *pspvfpu_initcontext(void)". Now I am referencing pspsdk/src/gum/pspgum_vfpu.c Line 20, and it is "gum_vfpucontext = pspvfpu_initcontext();".
(2) Have the latest psptoolchain integrated your libvfpu? In another word, is it stable now? If Yes, I will spend several hours to update my pspsdk by psptoolchain.
(3) There is only one VFPU for R4000. I mean ME has no VFPU. Right? Because Jonny's PMP Mod 1.02 used ME and I wonder whether there will be any conflict between VFPU and ME. (I am thinking why my PSP crashes when it runs VFPU code...)
(4) Should I call pspvfpu_use_matrices(...) every time before I call VFPU asm code? I mean the reason why I choose VFPU is to save CPU time and I don't know the pspvfpu_use_matrices funtion's cost. Is it worth?

Reference
1.http://forums.ps2dev.org/viewtopic.php?t=4767

EDIT. Good & Bad news.
I applied libvfpu and it works!!!! Thanks jsgf!!!
But it seems the AVC IDCT VFPU optimization does not improve the AVC decoder's speed. I mean it is still 480*272 600kbps 10fps. :(

jsgf · Post by **jsgf** » Sun Feb 05, 2006 3:57 am

happycoding wrote:(1) I mean "* 1/64" is different with ">>6" and following is a sample.
Code: Select all
short a = -7136;
float f = &#40;float&#41;a/64.0; // f is "-115.500"
int   i1 =  f;   // i1 is "-111"
int   i2 = a>>6; // i2 is "-112"

I presume you mean 'f is "-111.5"'. The difference is in how the result gets rounded to integer, rather than any difference in the divide itself. There are lots of different ways to round FP to int; simply casting FP to int (like i1) is the same as i1 = trunc(f), whereas the shift is the same as i2 = floor(a / 64.0). So it comes down to whether you really need to round the fractions for all the intermediate calculations, or only at the end when the final result is converted to an integer.

(2) Thank you for your introduction for "vmov.s S000, S000[1/2]". :-)

Note that "1/2" is one of the special values the VFPU supports in this context. To get 1/64, you're going to need something like "vfim.s s000, 0.015625 // 1./64"

I've checked your libvfpu in the SDK and have several questions about it:
(1) I can not find a sample source code which calls libvfpu. I mean in [1], you mentioned "pspvfpu_initcontext(VMAT4 | VMAT5)", but in pspsdk svn revision 1779 pspsdk/src/vfpu/pspvfpu.c Line 153, it is "struct pspvfpu_context *pspvfpu_initcontext(void)". Now I am referencing pspsdk/src/gum/pspgum_vfpu.c Line 20, and it is "gum_vfpucontext = pspvfpu_initcontext();".

I changed the API a few times since the first attempt, but it should be stable now. The only two users I know of are pspsdk/src/gum/pspgum_vfpu.c and in PSPGL. Also the pspvfpu.h header should document the API pretty well. It's very simple.

(2) Have the latest psptoolchain integrated your libvfpu? In another word, is it stable now? If Yes, I will spend several hours to update my pspsdk by psptoolchain.

It's in the SVN PSPSDK, so running the toolchain script will pick it up. But you don't need to do that; you can just check out the PSPSDK source and install it directly.

(3) There is only one VFPU for R4000. I mean ME has no VFPU. Right? Because Jonny's PMP Mod 1.02 used ME and I wonder whether there will be any conflict between VFPU and ME. (I am thinking why my PSP crashes when it runs VFPU code...)

As I understand it, the ME is a completely separate Mips core, which doesn't have a VFPU, but does have the VME. I don't know much beyond that though.

(4) Should I call pspvfpu_use_matrices(...) every time before I call VFPU asm code? I mean the reason why I choose VFPU is to save CPU time and I don't know the pspvfpu_use_matrices funtion's cost. Is it worth?

No, you should only call it if you think some other VFPU-using code has run since you last used the VFPU. Unless you go off and call some random physics or 3D library in the middle of decoding a frame, you should only need to call pspvfpu_use_matrices() once per frame. Even if you call it redundantly, it will return very quickly if there's nothing to be done (after only a few instructions).

I applied libvfpu and it works!!!! Thanks jsgf!!!
But, it seems the AVC IDCT VFPU optimization can not improve AVC decoder speed. I mean it is still 480*272 600kbps 10fps. :(

So you've got decoding working with the VFPU? That's great! But... you've only just started. Assuming the code you converted to the VFPU was actually the bottleneck (are you sure it was?), there's still plenty of scope to improve things:

Profile the code. Are you sure you're optimising the right piece?
Can you make better use of the VFPU? You still have a lot of instructions in your inner loop. Could you use more matrix operations?
Have you scheduled your code to avoid stalls? The VFPU will stall if you have one instruction which writes a register and the following one reads it. You should put several unrelated instructions in between to let the first one get through the pipeline. Unfortunately I don't know of any documentation on this, though korskarn gave some hints in http://forums.ps2dev.org/viewtopic.php?t=4758
Can you do more in the VFPU code? You've still got a little block after the main VFPU code which does stuff that the VFPU should be able to do pretty efficiently. Also out-of-line conversion of the block from/to short seems pretty expensive.
Are you making good use of the cache? If this is a streaming operation (i.e., the results are not used before they're evicted from the cache), then you should write them bypassing the cache (use "sv.q ..., wt" or "wb" for writethrough or writeback). Hm, it seems that without putting anything, writethrough is the default...
One idea is to do two or more blocks at a time, and interleave their calculations in the VFPU, assuming they're independent. This gives you lots of independent instructions to interleave to avoid pipeline stalls.

jonny · Post by **jonny** » Sun Feb 05, 2006 6:40 am

@happycoding:
my advice is also to start testing with clips encoded with a Baseline Profile (it's easy to get this with VirtualDub and the x264 codec, basically unchecking all the options in the codec configuration :)
this should give a base of ~20fps (iirc)
this is because some h264 options add a lot of complexity to the decoding
testing the simplest situation will be a lot easier
(some advanced options would probably require specific optimizations in order to be activated)

happycoding · Post by **happycoding** » Sun Feb 05, 2006 1:05 pm

@jonny
Thank you for your advice. I created Baseline Profile AVC clips, and your PMP Mod 1.02 can decode "480*272 @400kbps" at 15 fps smoothly, but decodes "480*272 @400kbps" at 20 fps with stutter. Anyway, my goal is to get the stuttering 20fps clip decoded smoothly by optimizing the PMP Mod 1.02 AVC decoder.

BTW, I tried to parallelize the AVC decoder's decode_slice function in h264.c (you successfully parallelized the MPEG4 decoder's decode_slice function in h263dec.c between the R4000 and the ME), but failed because I cannot divide the original decode_slice into two *UNRELATED* parts: one for the R4000 and another for the ME. :-( I know the PS3 uses the CELL chip, which has 8 SPEs. (I think CELL's SPE is similar to the PSP's ME, although one is PowerPC architecture and the other is MIPS architecture.) My point is: how to parallelize across multiple cores will be very important in the future, and unfortunately compilers do little to support this so far.

EDIT.
@jsgf
Yes, you are right. The h264idct.c is not the right place for optimization. I mean I set all ff_h264_xxx functions to null and the PMP Mod 1.02 decodes "480*272 @400kbps" 20 fps with stutter, while 15fps works. (There is only audio, no video picture because I set all ff_h264_xxx functions to null).

Code: Select all

void ff_h264_idct_add_c&#40;uint8_t *dst, DCTELEM *block, int stride&#41;&#123;
&#125;

void ff_h264_lowres_idct_add_c&#40;uint8_t *dst, int stride, DCTELEM *block&#41;&#123;
&#125;

void ff_h264_lowres_idct_put_c&#40;uint8_t *dst, int stride, DCTELEM *block&#41;&#123;
&#125;

void ff_h264_idct8_add_c&#40;uint8_t *dst, DCTELEM *block, int stride&#41;&#123;
&#125;

I am very disappointed with the result! The IDCT stuff is not the bottleneck for the AVC decoder!!!

happycoding · Post by **happycoding** » Tue Feb 07, 2006 4:02 pm

Thanks to Jonny's help, I found ffmpeg's profile for the AVC baseline decoder without MMX optimization. I think the profile on the PSP is similar.
Following is profile:

Code: Select all

Flat profile&#58;
Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 15.07      0.33     0.33  1692014     0.00     0.00  ff_h264_idct_add_c
 10.96      0.57     0.24   417922     0.00     0.00  put_h264_chroma_mc8_c
 10.05      0.79     0.22   384684     0.00     0.00  put_h264_qpel8_v_lowpass
  8.68      0.98     0.19  1659908     0.00     0.00  decode_residual
  5.02      1.09     0.11    97975     0.00     0.00  put_h264_qpel16_h_lowpass
  4.57      1.19     0.10   305490     0.00     0.00  decode_mb_cavlc
  4.11      1.28     0.09   305490     0.00     0.00  hl_decode_mb
  3.65      1.36     0.08   191206     0.00     0.00  avg_h264_chroma_mc8_c
  3.20      1.43     0.07   305490     0.00     0.00  fill_caches
  1.83      1.47     0.04   878264     0.00     0.00  fill_rectangle
  1.83      1.51     0.04    99486     0.00     0.00  put_pixels16_l2
  1.83      1.55     0.04    76480     0.00     0.00  put_h264_qpel8_hv_lowpass
  1.83      1.59     0.04        2    20.00    20.00  MPV_decode_defaults

I've learnt a lot from this forum, and I am going on... ;P