This codelet allows you to emit asm instructions at compile time or at run time, even if the GNU assembler does not yet support them. Thus we can easily play with the VFPU instruction set. The example below demonstrates this by initializing the VFPU vector register set to zero and then loading an identity matrix in a single instruction.
This raw example contains only some of the most basic instructions and register names (only GPRs, and vector and matrix quadword addressing). Nevertheless, you should get the idea of how to add new opcodes and play with them.
All opcode and instruction definitions live in codegen.h, where they can also be documented. To try it, create a directory pspgl/test-vfpu/ and copy the following files into this folder:
Makefile:
Code: Select all
# Build rules for the test-vfpu example.
# NOTE: every recipe line below must start with a literal TAB character;
# without it GNU make aborts with "missing separator".

# Toolchain prefix and tools.
ARCH = psp-
CC = $(ARCH)gcc
PSP_INSTALL = ../tools/psp-install
RM = rm -f

# PSPSDK location, queried from the installed psp-config helper.
PSPPATH := $(shell psp-config --pspsdk-path)

LIBS = -lpspdebug -lpspdisplay -lpspge -lpspsdk -lpspctrl -lm -lc -lpspuser -lpspkernel

# -MD makes the compiler write a .d dependency file next to each object;
# those files are pulled in by the -include line at the bottom.
CFLAGS = -g -Wall -O2 -MD -I$(PSPPATH)/include
LFLAGS = -g -Wall -O2 -L$(PSPPATH)/lib $(LIBS)

TARGET = test-vfpu
OBJS = main.o

# Timestamp embedded into the EBOOT title by the install rule.
BUILDDATE = $(shell date "+%Y/%m/%d %k:%M:%S")

# Same value as PSPPATH; not referenced by the rules in this Makefile.
PSPSDK=$(shell psp-config --pspsdk-path)

.PHONY: all install clean

all: $(TARGET)

.c.o:
	$(CC) $(CFLAGS) -c $<

$(TARGET): $(OBJS)
	$(CC) $(OBJS) $(LFLAGS) -o $@

install: all
	$(PSP_INSTALL) $(TARGET) --eboot-title="$(TARGET) $(BUILDDATE)"

clean:
	$(RM) $(TARGET) *.d *.o *.a *.elf *.sfo EBOOT.PBP

# "dummy" keeps the argument list non-empty when no .d file exists yet;
# the leading '-' makes a missing file a non-error.
-include $(wildcard *.d) dummy
codegen.h:
Code: Select all
#ifndef __codegen_h__
#define __codegen_h__
/*
 * GPR register set: symbolic names for the 32 general purpose registers,
 * mapped to the register numbers used in the instruction encodings below.
 *
 * The values are plain decimal literals on purpose: the encoding macros are
 * passed through cgen_stringify() and handed to the assembler as a ".word"
 * expression, so every name must expand to something the assembler can
 * evaluate.
 *
 * Registers 10 and 11 continue the argument-register run a0..a5 and are
 * therefore available as R_a6 / R_a7.  The historical spellings R_v6 / R_v7
 * are kept as aliases so existing users keep compiling.
 */
#define R_zero 0
#define R_at 1
#define R_v0 2
#define R_v1 3
#define R_a0 4
#define R_a1 5
#define R_a2 6
#define R_a3 7
#define R_a4 8
#define R_a5 9
#define R_a6 10
#define R_a7 11
#define R_v6 R_a6 /* deprecated alias, kept for backward compatibility */
#define R_v7 R_a7 /* deprecated alias, kept for backward compatibility */
#define R_t0 12
#define R_t1 13
#define R_t2 14
#define R_t3 15
#define R_s0 16
#define R_s1 17
#define R_s2 18
#define R_s3 19
#define R_s4 20
#define R_s5 21
#define R_s6 22
#define R_s7 23
#define R_t8 24
#define R_t9 25
#define R_k0 26
#define R_k1 27
#define R_gp 28
#define R_sp 29
#define R_s8 30
#define R_ra 31
/*
 * VFPU registers, Quadword addressing.
 *
 * Q_C<m><i>0 names have the values 0..31  (value = m * 4 + i).
 * Q_R<m>0<i> names have the values 32..63 (value = 32 + m * 4 + i).
 * <m> is the matrix number 0..7, <i> the quadword index 0..3 within it.
 *
 * The values are plain decimal literals because they end up inside
 * stringified ".word" expressions evaluated by the assembler.
 *
 * NOTE(review): the row/column wording in the two inline comments below is
 * taken over from the original author; confirm it against VFPU register
 * naming documentation.
 */
#define Q_C000 0 /* First digit specifies matrix, second the row */
#define Q_C010 1
#define Q_C020 2
#define Q_C030 3
#define Q_C100 4
#define Q_C110 5
#define Q_C120 6
#define Q_C130 7
#define Q_C200 8
#define Q_C210 9
#define Q_C220 10
#define Q_C230 11
#define Q_C300 12
#define Q_C310 13
#define Q_C320 14
#define Q_C330 15
#define Q_C400 16
#define Q_C410 17
#define Q_C420 18
#define Q_C430 19
#define Q_C500 20
#define Q_C510 21
#define Q_C520 22
#define Q_C530 23
#define Q_C600 24
#define Q_C610 25
#define Q_C620 26
#define Q_C630 27
#define Q_C700 28
#define Q_C710 29
#define Q_C720 30
#define Q_C730 31
#define Q_R000 32 /* First Digit specifies matrix, third the column */
#define Q_R001 33
#define Q_R002 34
#define Q_R003 35
#define Q_R100 36
#define Q_R101 37
#define Q_R102 38
#define Q_R103 39
#define Q_R200 40
#define Q_R201 41
#define Q_R202 42
#define Q_R203 43
#define Q_R300 44
#define Q_R301 45
#define Q_R302 46
#define Q_R303 47
#define Q_R400 48
#define Q_R401 49
#define Q_R402 50
#define Q_R403 51
#define Q_R500 52
#define Q_R501 53
#define Q_R502 54
#define Q_R503 55
#define Q_R600 56
#define Q_R601 57
#define Q_R602 58
#define Q_R603 59
#define Q_R700 60
#define Q_R701 61
#define Q_R702 62
#define Q_R703 63
/*
 * VFPU registers, 4x4 Matrix (Quad) addressing.
 *
 * Q_M<m>00 has the value m * 4 (0, 4, ..., 28), i.e. the same number as the
 * first quadword Q_C<m>00 of matrix <m>.
 * Q_E<m>00 has the value 32 + m * 4 (32, 36, ..., 60), i.e. the same number
 * as Q_R<m>00.
 *
 * NOTE(review): Q_E presumably addresses the same matrix in transposed
 * order -- confirm against VFPU documentation.
 */
#define Q_M000 0 /* First digit specifies matrix */
#define Q_M100 4
#define Q_M200 8
#define Q_M300 12
#define Q_M400 16
#define Q_M500 20
#define Q_M600 24
#define Q_M700 28
#define Q_E000 32
#define Q_E100 36
#define Q_E200 40
#define Q_E300 44
#define Q_E400 48
#define Q_E500 52
#define Q_E600 56
#define Q_E700 60
/*
+-------------+------------+---------+---------------------------------------+
|31         26|25        21|20     16|15                                    0 |
+-------------+------------+---------+---------------------------------------+
| opcode 0x8c | base[4-0]  | rt[4-0] | offset[15-0]                          |
+-------------+------------+---------+---------------------------------------+
LoadWord Relative to Address in General Purpose Register
lw %rt, offset(%base)
%rt: GPR Target Register (0...31)
%base: GPR, specifies Source Address Base
offset: signed Offset added to Source Address Base
%rt <- word_at_address (offset + %base)

"opcode 0x8c" is the top byte of the instruction word with all operand
fields zero (0x8c000000).
The macro expands to the 32-bit instruction word.  Only the offset is masked
(to 16 bits); rt and base are shifted into place unmasked, so the caller
must pass register numbers in the range 0..31.
*/
#define lw(rt,offset,base) \
(0x8c000000 | ((base) << 21) | ((rt) << 16) | ((offset) & 0xffff))
/*
+-------------+------------+---------+---------------------------------------+
|31         26|25        21|20     16|15                                    0 |
+-------------+------------+---------+---------------------------------------+
| opcode 0xac | base[4-0]  | rt[4-0] | offset[15-0]                          |
+-------------+------------+---------+---------------------------------------+
StoreWord Relative to Address in General Purpose Register
sw %rt, offset(%base)
%rt: GPR Register (0...31) whose value is stored
%base: GPR, specifies Destination Address Base
offset: signed Offset added to Destination Address Base
word_at_address (offset + %base) <- %rt

"opcode 0xac" is the top byte of the instruction word with all operand
fields zero (0xac000000).
The macro expands to the 32-bit instruction word.  Only the offset is masked
(to 16 bits); rt and base are shifted into place unmasked, so the caller
must pass register numbers in the range 0..31.
*/
#define sw(rt,offset,base) \
(0xac000000 | ((base) << 21) | ((rt) << 16) | ((offset) & 0xffff))
/*
+-------------+------------+---------+---------------------------------------+
|31         26|25        21|20     16|15                                    0 |
+-------------+------------+---------+---------------------------------------+
| opcode 0x24 | rs[4-0]    | rt[4-0] | immediate[15-0]                       |
+-------------+------------+---------+---------------------------------------+
Add Immediate Unsigned Word
addiu %rt, %rs, immediate
%rt: GPR Target Register (0...31)
%rs: GPR Source Register (0...31)
immediate: value added to Source Register
%rt <- %rs + sign_extended(immediate)

"opcode 0x24" is the top byte of the instruction word with all operand
fields zero (0x24000000).
The macro expands to the 32-bit instruction word.  Only the immediate is
masked (to 16 bits); rt and rs are shifted into place unmasked, so the
caller must pass register numbers in the range 0..31.
*/
#define addiu(rt,rs,immediate) \
(0x24000000 | ((rs) << 21) | ((rt) << 16) | ((immediate) & 0xffff))
/*
+-------------+-----------+---------+----------------------------+-----+-----+
|31         26|25       21|20     16|15                         2|  1  |  0  |
+-------------+-----------+---------+----------------------------+-----+-----+
| opcode 0xd8 | base[4-0] | vt[4-0] | offset[15-2]               |  0  |vt[5]|
+-------------+-----------+---------+----------------------------+-----+-----+
LoadVector.Quadword Relative to Address in General Purpose Register
lv.q %vfpu_rt, offset(%base)
%vfpu_rt: VFPU Vector Target Register (column 0-31 / row 32-63); bits 4..0
          go into the vt field, bit 5 goes into bit 0 of the word
%base: GPR (0...31), specifies Source Address Base
offset: signed Offset added to Source Address Base, counted in 32-bit
        words: the macro places it at bits 15..2, so offset N addresses
        byte N * 4.  It is masked to the 14 bits of the field, so negative
        offsets no longer spill into the base/vt/opcode fields.
cache_policy: kept for signature symmetry with sv_q; the diagram shows bit 1
        as 0 for lv.q, so pass 0.  Only bit 0 of the argument is used.
vfpu_rt <- vector_at_address (offset * 4 + %base)

NOTE(review): the original comment stated that the final address must be
64-byte aligned; a quadword is 16 bytes -- confirm the real alignment
requirement.
*/
#define lv_q(vfpu_rt,offset,base,cache_policy) \
	(0xd8000000 | \
	 ((base) << 21) | \
	 (((vfpu_rt) & 0x1f) << 16) | (((vfpu_rt) >> 5) & 1) | \
	 (((offset) & 0x3fff) << 2) | \
	 (((cache_policy) & 1) << 1))
/*
+-------------+-----------+---------+----------------------------+-----+-----+
|31         26|25       21|20     16|15                         2|  1  |  0  |
+-------------+-----------+---------+----------------------------+-----+-----+
| opcode 0xf8 | base[4-0] | vt[4-0] | offset[15-2]               | c_p |vt[5]|
+-------------+-----------+---------+----------------------------+-----+-----+
StoreVector.Quadword Relative to Address in General Purpose Register
sv.q %vfpu_rt, offset(%base), cache_policy
%vfpu_rt: VFPU Vector Register to store (column 0-31 / row 32-63); bits 4..0
          go into the vt field, bit 5 goes into bit 0 of the word
%base: GPR (0...31), specifies Destination Address Base
offset: signed Offset added to Destination Address Base, counted in 32-bit
        words: the macro places it at bits 15..2, so offset N addresses
        byte N * 4.  It is masked to the 14 bits of the field, so negative
        offsets no longer spill into the base/vt/opcode fields.
cache_policy: 0 = write-through, 1 = write-back (only bit 0 is used)
vector_at_address (offset * 4 + %base) <- vfpu_rt

NOTE(review): the original comment stated that the final address must be
64-byte aligned; a quadword is 16 bytes -- confirm the real alignment
requirement.
*/
#define sv_q(vfpu_rt,offset,base,cache_policy) \
	(0xf8000000 | \
	 ((base) << 21) | \
	 (((vfpu_rt) & 0x1f) << 16) | (((vfpu_rt) >> 5) & 1) | \
	 (((offset) & 0x3fff) << 2) | \
	 (((cache_policy) & 1) << 1))
/*
+-------------------------------------------------------------+--------------+
|31                                                          7| 6          0 |
+-------------------------------------------------------------+--------------+
| opcode 0xd0060000                                           | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+
SetVectorZero.Single/Pair/Triple/Quad
vzero.s %vfpu_rt ; Set 1 Vector Component to 0.0f
vzero.p %vfpu_rt ; Set 2 Vector Components to 0.0f
vzero.t %vfpu_rt ; Set 3 Vector Components to 0.0f
vzero.q %vfpu_rt ; Set 4 Vector Components to 0.0f
%vfpu_rt: VFPU Vector Target Register ([s|p|t|q]reg 0..127)
vfpu_regs[%vfpu_rt] <- 0.0f

The four variants differ only in bits 15 and 7 of the base word:
.s = neither set, .p = bit 7, .t = bit 15, .q = both.
vfpu_rt is OR-ed in unmasked, so it must stay within 0..127.
*/
#define vzero_s(vfpu_rt) (0xd0060000 | (vfpu_rt))
#define vzero_p(vfpu_rt) (0xd0060080 | (vfpu_rt))
#define vzero_t(vfpu_rt) (0xd0068000 | (vfpu_rt))
#define vzero_q(vfpu_rt) (0xd0068080 | (vfpu_rt))
/*
+-------------------------------------------------------------+--------------+
|31                                                          7| 6          0 |
+-------------------------------------------------------------+--------------+
| opcode 0xd0070000                                           | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+
SetVectorOne.Single/Pair/Triple/Quad
vone.s %vfpu_rt ; Set 1 Vector Component to 1.0f
vone.p %vfpu_rt ; Set 2 Vector Components to 1.0f
vone.t %vfpu_rt ; Set 3 Vector Components to 1.0f
vone.q %vfpu_rt ; Set 4 Vector Components to 1.0f
%vfpu_rt: VFPU Vector Target Register ([s|p|t|q]reg 0..127)
vfpu_regs[%vfpu_rt] <- 1.0f

The four variants differ only in bits 15 and 7 of the base word:
.s = neither set, .p = bit 7, .t = bit 15, .q = both.
vfpu_rt is OR-ed in unmasked, so it must stay within 0..127.
*/
#define vone_s(vfpu_rt) (0xd0070000 | (vfpu_rt))
#define vone_p(vfpu_rt) (0xd0070080 | (vfpu_rt))
#define vone_t(vfpu_rt) (0xd0078000 | (vfpu_rt))
#define vone_q(vfpu_rt) (0xd0078080 | (vfpu_rt))
/*
+-------------------------------------------------------------+--------------+
|31                                                          7| 6          0 |
+-------------------------------------------------------------+--------------+
| opcode 0xf3868080 (value shown is the .q variant)           | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+
SetMatrixZero.Pair/Triple/Quad
vmzero.p %vfpu_rt ; Set 2x2 Submatrix to 0.0f
vmzero.t %vfpu_rt ; Set 3x3 Submatrix to 0.0f
vmzero.q %vfpu_rt ; Set 4x4 Matrix to 0.0f
%vfpu_rt: VFPU Matrix Target Register ([p|t|q]reg 0..127)
vfpu_mtx[%vfpu_rt] <- 0.0f

The variants differ only in bits 15 and 7 of the base word:
.p = bit 7, .t = bit 15, .q = both.  No single-element variant is defined.
vfpu_rt is OR-ed in unmasked, so it must stay within 0..127.
*/
#define vmzero_p(vfpu_rt) (0xf3860080 | (vfpu_rt))
#define vmzero_t(vfpu_rt) (0xf3868000 | (vfpu_rt))
#define vmzero_q(vfpu_rt) (0xf3868080 | (vfpu_rt))
/*
+-------------------------------------------------------------+--------------+
|31                                                          7| 6          0 |
+-------------------------------------------------------------+--------------+
| opcode 0xf3838080 (value shown is the .q variant)           | vfpu_rt[6-0] |
+-------------------------------------------------------------+--------------+
SetMatrixIdentity.Pair/Triple/Quad
vmidt.p %vfpu_rt ; Set 2x2 Submatrix to Identity
vmidt.t %vfpu_rt ; Set 3x3 Submatrix to Identity
vmidt.q %vfpu_rt ; Set 4x4 Matrix to Identity
%vfpu_rt: VFPU Matrix Target Register ([p|t|q]reg 0..127)
vfpu_mtx[%vfpu_rt] <- identity matrix

The variants differ only in bits 15 and 7 of the base word:
.p = bit 7, .t = bit 15, .q = both.  No single-element variant is defined.
vfpu_rt is OR-ed in unmasked, so it must stay within 0..127.
*/
#define vmidt_p(vfpu_rt) (0xf3830080 | (vfpu_rt))
#define vmidt_t(vfpu_rt) (0xf3838000 | (vfpu_rt))
#define vmidt_q(vfpu_rt) (0xf3838080 | (vfpu_rt))
/*
 * helpers for direct __asm__ use:
 *
 * _cgen_stringify(x)  turns its argument into a string literal as written.
 * cgen_stringify(x)   adds one level of indirection so that macros inside x
 *                     (e.g. vmzero_q(Q_M000)) are expanded before being
 *                     turned into a string.
 * cgen_asm(x)         builds an assembler snippet for use inside __asm__():
 *                     a ".loc 1 <current source line> 0" directive followed
 *                     by ".word <expanded x>", so the assembler evaluates
 *                     the encoding expression and emits it as a raw 32-bit
 *                     word.  Because the expression is evaluated by the
 *                     assembler, x may only use syntax the assembler
 *                     understands.
 *
 * NOTE(review): ".loc 1 ..." presumably ties the emitted word to this C
 * source line in the debug info and assumes file number 1 is this file --
 * confirm.
 */
#define _cgen_stringify(x) #x
#define cgen_stringify(x) _cgen_stringify(x)
#define cgen_asm(x) ".loc 1 " cgen_stringify(__LINE__) " 0\n\t.word " cgen_stringify(x) "\n\t"
#endif
main.c:
Code: Select all
#include <pspkernel.h>
#include <pspdebug.h>
#include <pspctrl.h>
#include <pspdisplay.h>
#include "codegen.h"
/* XXX SDK BUG: In theory everything should work when main is running in userspace.
Unfortunately the PSP hangs if we register the exception handler in the _init constructor, so we need to
call pspDebugInstallErrorHandler() in main().
That is why PSP_THREAD_ATTR_USER is commented out below and only
PSP_THREAD_ATTR_VFPU is requested for the main thread.
*/
PSP_MAIN_THREAD_ATTR(/*PSP_THREAD_ATTR_USER |*/ PSP_THREAD_ATTR_VFPU);
/* NOTE(review): 0x1000 is presumably the module attribute (kernel mode?) and 1, 1 the version -- confirm against the PSPSDK documentation. */
PSP_MODULE_INFO("VFPU-test", 0x1000, 1, 1);
/*
 * Exit callback: leaves the program via sceKernelExitGame().
 * None of the callback arguments are used.  Always returns 0.
 */
static int exit_callback(int arg1, int arg2, void *common)
{
	(void)arg1;
	(void)arg2;
	(void)common;

	sceKernelExitGame();
	return 0;
}
/*
 * Thread entry point: creates the exit callback, registers it and then
 * sleeps while servicing callbacks.  The thread arguments are not used.
 * Returns 0 once the sleep call returns.
 */
static int callback_thread(SceSize args, void *argp)
{
	int exit_cb;

	(void)args;
	(void)argp;

	exit_cb = sceKernelCreateCallback("Exit Callback", exit_callback, NULL);
	sceKernelRegisterExitCallback(exit_cb);

	sceKernelSleepThreadCB();
	return 0;
}
/*
 * Creates and starts the callback thread.  Marked as a constructor, so it
 * runs before main().  Nothing is returned; if the thread cannot be created
 * the function silently does nothing.
 */
static void setup_callbacks (void) __attribute__((constructor));
static void setup_callbacks (void)
{
	int thread_id;

	thread_id = sceKernelCreateThread("update_thread", callback_thread,
					  0x11, 0xFA0, THREAD_ATTR_USER, 0);
	if (thread_id < 0) {
		return;
	}

	sceKernelStartThread(thread_id, 0, 0);
}
/*
 * Destructor hook: runs when the program is torn down and hands control
 * back via sceKernelExitGame().
 */
static void __attribute__((destructor)) back_to_kernel (void)
{
	sceKernelExitGame();
}
/*
 * Exception handler installed from main(): re-initialises the debug screen,
 * selects the colours for the report, clears the screen and dumps the
 * exception state.
 *
 * regs: register block describing the exception, passed straight on to
 *       pspDebugDumpException().
 */
static void exception_handler (PspDebugRegBlock *regs)
{
	const unsigned int back_color = 0x00FF0000;
	const unsigned int text_color = 0xFFFFFFFF;

	pspDebugScreenInit();

	pspDebugScreenSetBackColor(back_color);
	pspDebugScreenSetTextColor(text_color);
	pspDebugScreenClear();

	pspDebugScreenPrintf("Exception Details:\n");
	pspDebugDumpException(regs);
}
/*
 * Zeroes the VFPU register set by emitting eight hand-encoded vmzero.q
 * instructions, one for each matrix Q_M000 .. Q_M700.
 * The instructions are emitted as raw ".word" values via cgen_asm(), so
 * they do not depend on assembler support for VFPU mnemonics.
 */
void vfpu_init (void)
{
__asm__ volatile (
cgen_asm(vmzero_q(Q_M000)) /* access register array as matrices for speed */
cgen_asm(vmzero_q(Q_M100))
cgen_asm(vmzero_q(Q_M200))
cgen_asm(vmzero_q(Q_M300))
cgen_asm(vmzero_q(Q_M400))
cgen_asm(vmzero_q(Q_M500))
cgen_asm(vmzero_q(Q_M600))
cgen_asm(vmzero_q(Q_M700))
);
}
/*
 * Dumps VFPU quadword registers to memory with 32 hand-encoded sv.q
 * instructions, one per register index 0..31.
 *
 * vfpu_regs: destination buffer of 32 rows with 4 floats each; row i
 *            receives the quadword stored by the i-th sv.q.
 *
 * The stores hard-code R_a0 as base register, so the buffer pointer is held
 * in a local register variable bound to "a0".
 * The offset argument i * 4 is shifted left by two inside sv_q, giving a
 * byte offset of i * 16, i.e. one 4-float row per register.
 */
void vfpu_save_regs (float vfpu_regs [32][4])
{
/* Pin the pointer to a0: the encoded instructions below use R_a0 as base. */
register void *ptr __asm__ ("a0") = vfpu_regs;
__asm__ volatile (
cgen_asm(sv_q(0, 0 * 4, R_a0, 0))
cgen_asm(sv_q(1, 1 * 4, R_a0, 0))
cgen_asm(sv_q(2, 2 * 4, R_a0, 0))
cgen_asm(sv_q(3, 3 * 4, R_a0, 0))
cgen_asm(sv_q(4, 4 * 4, R_a0, 0))
cgen_asm(sv_q(5, 5 * 4, R_a0, 0))
cgen_asm(sv_q(6, 6 * 4, R_a0, 0))
cgen_asm(sv_q(7, 7 * 4, R_a0, 0))
cgen_asm(sv_q(8, 8 * 4, R_a0, 0))
cgen_asm(sv_q(9, 9 * 4, R_a0, 0))
cgen_asm(sv_q(10, 10 * 4, R_a0, 0))
cgen_asm(sv_q(11, 11 * 4, R_a0, 0))
cgen_asm(sv_q(12, 12 * 4, R_a0, 0))
cgen_asm(sv_q(13, 13 * 4, R_a0, 0))
cgen_asm(sv_q(14, 14 * 4, R_a0, 0))
cgen_asm(sv_q(15, 15 * 4, R_a0, 0))
cgen_asm(sv_q(16, 16 * 4, R_a0, 0))
cgen_asm(sv_q(17, 17 * 4, R_a0, 0))
cgen_asm(sv_q(18, 18 * 4, R_a0, 0))
cgen_asm(sv_q(19, 19 * 4, R_a0, 0))
cgen_asm(sv_q(20, 20 * 4, R_a0, 0))
cgen_asm(sv_q(21, 21 * 4, R_a0, 0))
cgen_asm(sv_q(22, 22 * 4, R_a0, 0))
cgen_asm(sv_q(23, 23 * 4, R_a0, 0))
cgen_asm(sv_q(24, 24 * 4, R_a0, 0))
cgen_asm(sv_q(25, 25 * 4, R_a0, 0))
cgen_asm(sv_q(26, 26 * 4, R_a0, 0))
cgen_asm(sv_q(27, 27 * 4, R_a0, 0))
cgen_asm(sv_q(28, 28 * 4, R_a0, 0))
cgen_asm(sv_q(29, 29 * 4, R_a0, 0))
cgen_asm(sv_q(30, 30 * 4, R_a0, 0))
cgen_asm(sv_q(31, 31 * 4, R_a0, 0))
/* ptr is listed as both output and input so it stays live in a0 across the
   block; the "memory" clobber tells the compiler the buffer is written. */
: "=r"(ptr) : "r"(ptr) : "memory");
}
/* Returns 1 if any of the four components of rows a and b differ, else 0. */
static int vfpu_row_differs (const float *a, const float *b)
{
	int k;

	for (k = 0; k < 4; k++) {
		if (a[k] != b[k]) {
			return 1;
		}
	}
	return 0;
}

/*
 * Prints a diff-style report of two register snapshots to the debug screen.
 *
 * r1: snapshot taken first ("before"), 32 rows of 4 floats
 * r2: snapshot taken second ("after"), 32 rows of 4 floats
 *
 * First every row that differs is printed with its r1 contents, prefixed
 * by "-"; afterwards the same rows are printed with their r2 contents,
 * prefixed by "+".  Rows that compare equal are not printed.
 */
void vfpu_diff (float r1 [32][4], float r2 [32][4])
{
	int row;

	/* Pass 1: old contents of every changed row. */
	for (row = 0; row < 32; row++) {
		if (!vfpu_row_differs(r1[row], r2[row])) {
			continue;
		}
		pspDebugScreenPrintf("- %i: % 5.5f % 5.5f % 5.5f % 5.5f\n",
				     row, r1[row][0], r1[row][1], r1[row][2], r1[row][3]);
	}

	/* Pass 2: new contents of every changed row. */
	for (row = 0; row < 32; row++) {
		if (!vfpu_row_differs(r1[row], r2[row])) {
			continue;
		}
		pspDebugScreenPrintf("+ %i: % 5.5f % 5.5f % 5.5f % 5.5f\n",
				     row, r2[row][0], r2[row][1], r2[row][2], r2[row][3]);
	}
}
/*
 * Snapshot buffers filled by vfpu_save_regs(): 32 rows of 4 floats each.
 * main() stores the state before the testcase in vfpu_regs0 and the state
 * after it in vfpu_regs1, then compares the two.
 * Both buffers are 64-byte aligned for the quadword stores.
 */
static float vfpu_regs0 [32][4] __attribute__((aligned(64)));
static float vfpu_regs1 [32][4] __attribute__((aligned(64)));
/**
 * ok... this function is the place to actually try the behaviour of some yet-unknown instructions.
 *
 * As shipped it emits a single hand-encoded vmidt.q on matrix Q_M100.
 * main() snapshots the registers before and after this call and prints the
 * rows that changed, so replace the instruction below with the one to
 * investigate.
 */
void vfpu_testcase (void)
{
__asm__(cgen_asm(vmidt_q(Q_M100)));
}
/*
 * Program entry point.
 *
 * Installs the exception handler, configures controller sampling, prints
 * the banner and zeroes the VFPU registers.  Then it polls the pad once per
 * vblank, forever:
 *   CIRCLE: snapshot registers, run vfpu_testcase(), snapshot again and
 *           print the rows that changed;
 *   CROSS:  execute a "break" instruction to exercise the exception handler.
 *
 * argc/argv are not used.  The return statement is never reached.
 */
int main (int argc, char **argv)
{
	pspDebugInstallErrorHandler(exception_handler);

	sceCtrlSetSamplingCycle(0);
	sceCtrlSetSamplingMode(PSP_CTRL_MODE_DIGITAL);

	pspDebugScreenInit();
	pspDebugScreenPrintf("VFPU test -- vfpu_regs0 = %p, vfpu_regs1 = %p\n\n", vfpu_regs0, vfpu_regs1);
	pspDebugScreenPrintf("press O to run VFPU testcase or X to trap into breakpoint\n\n");

	vfpu_init();

	for (;;) {
		SceCtrlData ctrl;

		sceCtrlReadBufferPositive(&ctrl, 1);

		if (ctrl.Buttons & PSP_CTRL_CIRCLE) {
			/* before / after snapshots around the instruction under test */
			vfpu_save_regs(vfpu_regs0);
			vfpu_testcase();
			vfpu_save_regs(vfpu_regs1);
			vfpu_diff(vfpu_regs0, vfpu_regs1);
		}

		if (ctrl.Buttons & PSP_CTRL_CROSS) {
			/* Cause a break exception, to check that the exception handler works... */
			asm("break\n");
		}

		sceDisplayWaitVblankStart();
	}

	return 0;
}