The weird art of optimization

In today’s update we discuss a recent find about PlayStation micro-optimizations and some unexpected results. Full news after the jump.

So, the other day I was testing some optimized code to render polygons with the old procedures I’ve been using since day 1 of programming with Squeeze Bomb. Let’s see in detail what it does, starting from macros:

[code language=”CPP”]// a few extra macros for faster code, needs testing
// these don’t abuse the stack to store GTE calculation results
/*
 * gte_stotz_m(r0): copy coprocessor-2 (GTE) data register 7 straight into
 * the C lvalue r0 with a single mfc2, instead of storing it to memory
 * through a pointer. Register 7 is the OTZ value, going by the macro name.
 *
 * r0 is an output operand ("=r"), so it must be an lvalue the compiler can
 * hold in a general-purpose register.
 *
 * NOTE(review): on the R3000 the value moved by mfc2 is not usable by the
 * very next instruction; this macro has no explicit nop, so it presumably
 * relies on the assembler padding the delay - confirm the build does not
 * assemble it under .set noreorder.
 */
#define gte_stotz_m( r0 ) __asm__ volatile ( \
"mfc2 %0, $7;" \
: "=r"( r0 ) \
: )

/*
 * gte_stflg_m(r0): copy the GTE FLAG register straight into the C lvalue r0,
 * without going through memory.
 *
 * FLAG is coprocessor-2 *control* register 31, so it has to be fetched with
 * cfc2. Using mfc2 with $31 reads *data* register 31 instead (LZCR, the
 * leading-zero count result), whose value never has the error bit set, so a
 * test such as (flg & GTEFLG_ERROR) would silently never reject anything.
 *
 * r0 is an output operand ("=r"), so it must be an lvalue the compiler can
 * hold in a general-purpose register.
 *
 * NOTE(review): like mfc2, cfc2 has a one-instruction delay before the
 * destination register is usable; no explicit nop is emitted here, so the
 * assembler is presumably relied upon to pad it - confirm the build does
 * not assemble this under .set noreorder.
 */
#define gte_stflg_m( r0 ) __asm__ volatile ( \
"cfc2 %0, $31;" \
: "=r"( r0 ) \
: )

/*
 * gte_stopz_m(r0): copy coprocessor-2 (GTE) data register 24 straight into
 * the C lvalue r0 with a single mfc2, instead of storing it to memory
 * through a pointer. The callers in this file read it right after
 * gte_nclip_b() and treat a value <= 0 as "skip this polygon".
 *
 * r0 is an output operand ("=r"), so it must be an lvalue the compiler can
 * hold in a general-purpose register.
 *
 * NOTE(review): no explicit nop follows the mfc2; presumably the assembler
 * is relied upon to pad the coprocessor-move delay - confirm.
 */
#define gte_stopz_m( r0 ) __asm__ volatile ( \
"mfc2 %0, $24;" \
: "=r"( r0 ) \
: )

// direct access to POLY_GT4.rgb3
//
// gte_strr3_gt4(r0): store coprocessor-2 (GTE) data register 22 with swc2 to
// the word at byte offset 40 from the pointer r0. r0 is an input operand and
// is expected to point at a POLY_GT4.
// NOTE(review): assumes offset 40 is the fourth vertex colour word of
// POLY_GT4 - confirm against the structure definition in use.
// The "memory" clobber tells the compiler that memory is written by the asm.
#define gte_strr3_gt4( r0 ) __asm__ volatile ( \
"swc2 $22, 40( %0 );" \
: \
: "r"( r0 ) \
: "memory" )

// direct access to POLY_GT4.xy3
//
// gte_stsxy_gt4_3(r0): store coprocessor-2 (GTE) data register 14 with swc2
// to the word at byte offset 0x2C (44) from the pointer r0. r0 is an input
// operand and is expected to point at a POLY_GT4.
// NOTE(review): assumes offset 0x2C is the fourth vertex x/y word of
// POLY_GT4 - confirm against the structure definition in use.
// The "memory" clobber tells the compiler that memory is written by the asm.
#define gte_stsxy_gt4_3( r0 ) __asm__ volatile (\
"swc2 $14, 0x2C( %0 )"\
: \
: "r"( r0 ) \
: "memory" )[/code]

The following is the actual rendering code for tris and quads:

[code language=”cpp”]void FastTG3L(void *ob, void *packet, CVECTOR *rgb, u32* ot)
{
/*
 * Transform, cull, light and sort every triangle of one triangle block into
 * POLY_GT3 primitives.
 *
 * ob     - triangle block; cast to MD1_TRIANGLES below. Supplies tri_offset,
 *          vertex_offset, normal_offset and tri_count.
 * packet - primitive buffer, treated as an array of POLY_GT3.
 * rgb    - colour loaded into the GTE; its cd field is rewritten in place.
 * ot     - array of u32 list heads, indexed with (depth >> 4).
 *
 * Build switch: with CRAZY non-zero the GTE results are read straight into
 * the local variable otz through the gte_*_m macros; otherwise they are
 * stored through pointers into the fields of a local IFO structure.
 *
 * NOTE(review): C statements are interleaved with the gte_* macros on
 * purpose, presumably to overlap CPU work with GTE latency - do not reorder
 * without checking the timing requirements of each GTE command.
 */
register u32 i, is, *tag;
#if !CRAZY
IFO ifo;
#else
register int otz;
#endif
register POLY_GT3 *sx;
const MD1_TRIANGLES *obj = (const MD1_TRIANGLES*)ob;
const MD1_TRIANGLE *t = (const MD1_TRIANGLE*)obj->tri_offset;
const SVECTOR *vp = (const SVECTOR*)obj->vertex_offset;
const SVECTOR *vn = (const SVECTOR*)obj->normal_offset;

/* keep only the two low bits of the caller's code byte and OR in CODE_PGT3 */
rgb->cd = (rgb->cd & 3) | CODE_PGT3;
gte_ldrgb(rgb);

sx = (POLY_GT3*)packet;

/* i is incremented inside the body; the for-increment only advances t */
for (i = 0, is = obj->tri_count; i < is; t++)
{
POLY_GT3 *si;
gte_ldv3(&vp[t->v0], &vp[t->v1], &vp[t->v2]); /* load model vertices */
i++;
/* si is the primitive written for this triangle; sx is the running cursor */
si = sx;
gte_rtpt_b(); /* perspective */

/*
 * Every path through one iteration advances sx by exactly two POLY_GT3
 * slots, including the rejection paths below.
 * NOTE(review): presumably the skipped slot belongs to another buffer -
 * confirm against the caller's packet layout.
 */
#if !CRAZY
gte_stflg(&ifo.flg); /* store flag */
if (ifo.flg & GTEFLG_ERROR) { sx += 2; continue; }
#else
/* otz is reused here as scratch for the flag value */
gte_stflg_m(otz);
if (otz & GTEFLG_ERROR) { sx += 2; continue; }
#endif
gte_nclip_b(); /* normal clipping */
#if !CRAZY
gte_stopz(&ifo.otz); /* return orientation */
if (ifo.otz <= 0) { sx += 2; continue; }
#else
gte_stopz_m(otz);
if (otz <= 0) { sx += 2; continue; } /* zero or negative orientation: skip */
#endif
gte_stsxy3_gt3(si); /* store transformed result */
sx += 2;
gte_nop();
gte_avsz3_b(); /* calculate depth */
#if !CRAZY
gte_stotz(&ifo.otz); /* get depth */
/* skip when (depth >> 6) is zero; only a lower bound is tested here */
if (!(ifo.otz >> 6)) continue;
#else
gte_stotz_m(otz);
/* skip when (depth >> 6) is zero; only a lower bound is tested here */
if (!(otz >> 6)) continue;
#endif

gte_ldv3(&vn[t->n0], &vn[t->n1], &vn[t->n2]); /* set lighting */
#if !CRAZY
tag = &ot[ifo.otz >> 4];
#else
tag = &ot[otz >> 4];
/*
 * CRAZY build: the tag is written here, before gte_ncct_b(), rather than
 * after the colour store. It takes the low 24 bits of the current list head
 * and puts 0x09 in the top byte.
 * NOTE(review): 0x09 is presumably the POLY_GT3 length in words excluding
 * the tag - confirm.
 */
si->tag = (*tag & 0x00FFFFFF) | 0x09000000;
#endif
gte_ncct_b(); /* calculate */
gte_strgb3_gt3(si); /* store rgb values */

// sort!!
#if !CRAZY
/* chain this primitive in front of the current list head (see 0x09 note above) */
si->tag = (*tag & 0x00FFFFFF) | 0x09000000;
#endif
/* the table entry now points at this primitive (low 24 bits of its address) */
*tag = (u32)si & 0x00FFFFFF;
}
}

void FastTG4L(void *ob, void *packet, CVECTOR *rgb, u32* ot)
{
/*
 * Transform, cull, light and sort every quad of one quad block into
 * POLY_GT4 primitives.
 *
 * ob     - quad block; cast to MD1_QUADS below. Supplies quad_offset,
 *          vertex_offset, normal_offset and quad_count.
 * packet - primitive buffer, treated as an array of POLY_GT4.
 * rgb    - colour loaded into the GTE; its cd field is rewritten in place.
 * ot     - array of u32 list heads, indexed with (depth >> 4).
 *
 * Build switch: with CRAZY non-zero the GTE results are read straight into
 * the locals otz / flg through the gte_*_m macros; otherwise they are stored
 * through pointers into the fields of a local IFO structure.
 *
 * The first three vertices go through one gte_rtpt_b() pass and the fourth
 * through a separate gte_rtps_b() pass; each pass has its own flag check.
 *
 * NOTE(review): C statements are interleaved with the gte_* macros on
 * purpose, presumably to overlap CPU work with GTE latency - do not reorder
 * without checking the timing requirements of each GTE command.
 */
register u32 i, is, *tag;
#if !CRAZY
IFO ifo;
#else
int otz, flg;
#endif
register POLY_GT4 *sx;
const MD1_QUADS *obj = (const MD1_QUADS*)ob;
const MD1_QUAD *q = (const MD1_QUAD*)obj->quad_offset;
const SVECTOR *vp = (const SVECTOR*)obj->vertex_offset;
const SVECTOR *vn = (const SVECTOR*)obj->normal_offset;

/* keep only the two low bits of the caller's code byte and OR in CODE_PGT4 */
rgb->cd = (rgb->cd & 3) | CODE_PGT4;
gte_ldrgb(rgb);

sx = (POLY_GT4*)packet;

/* i is incremented inside the body; the for-increment only advances q */
for (i = 0, is = obj->quad_count; i < is; q++)
{
POLY_GT4 *si;
gte_ldv3(&vp[q->v0], &vp[q->v1], &vp[q->v2]);
/* si is the primitive written for this quad; sx is the running cursor */
si = sx;
i++;
gte_rtpt_b(); /* RotTransPers3 */

/*
 * Until the "sx += 2" further down, a rejected quad must advance sx itself;
 * after that point the rejection paths use a bare continue. Either way each
 * iteration moves sx by exactly two POLY_GT4 slots.
 * NOTE(review): presumably the skipped slot belongs to another buffer -
 * confirm against the caller's packet layout.
 */
#if !CRAZY
gte_stflg(&ifo.flg0);
if (ifo.flg0 & GTEFLG_ERROR) { sx += 2; continue; }
gte_nclip_b(); /* NormalClip */
gte_stopz(&ifo.otz); /* back clip */
if (ifo.otz <= 0) { sx += 2; continue; } /* flipped, skip */
#else
gte_stflg_m(flg);
if (flg & GTEFLG_ERROR) { sx += 2; continue; }
gte_nclip_b(); /* NormalClip */
gte_stopz_m(otz); /* back clip */
if (otz <= 0) { sx += 2; continue; } /* flipped, skip */
#endif
/* store the first three screen coordinates, then load the fourth vertex */
gte_stsxy3_gt4((u_long *)si); gte_ldv0(&vp[q->v3]);
sx += 2;
gte_nop();
gte_rtps_b(); /* RotTransPers */
#if !CRAZY
gte_stflg(&ifo.flg);
if (ifo.flg & GTEFLG_ERROR) continue;
#else
gte_stflg_m(flg);
if (flg & GTEFLG_ERROR) continue;
#endif

/* store the fourth screen coordinate directly into the primitive */
gte_stsxy_gt4_3(si);
/* NOTE(review): plain gte_avsz4() here, while FastTG3L uses the _b form gte_avsz3_b() - confirm this is intentional */
gte_avsz4();
#if !CRAZY
gte_stotz(&ifo.otz);
// limit range: skip when (depth >> 6) is zero; only a lower bound is tested
if (!(ifo.otz >> 6)) continue;
#else
gte_stotz_m(otz);
/* skip when (depth >> 6) is zero; only a lower bound is tested */
if (!(otz >> 6)) continue;
#endif

/* normals of the first three vertices */
gte_ldv3(&vn[q->n0], &vn[q->n1], &vn[q->n2]);
#if !CRAZY
tag = &ot[ifo.otz >> 4];
#else
tag = &ot[otz >> 4];
#endif
gte_ncct_b();
gte_strgb3_gt4(si);

/* fourth normal; the tag write sits between the load and gte_nccs_b() */
gte_ldv0(&vn[q->n3]);
/*
 * Chain this primitive in front of the current list head: low 24 bits of the
 * old head, 0x0C in the top byte.
 * NOTE(review): 0x0C is presumably the POLY_GT4 length in words excluding
 * the tag - confirm.
 */
si->tag = (*tag & 0x00FFFFFF) | 0x0C000000;
gte_nccs_b();
/* store the fourth vertex colour directly into the primitive */
gte_strr3_gt4(si);

// sort!!
/* the table entry now points at this primitive (low 24 bits of its address) */
*tag = (u32)si & 0x00FFFFFF;
}
}[/code]

If you are familiar with inline assembly and how the stack works, you can probably spot two minor differences that can translate to better performance. The macros above change one very stupid behavior of Sony’s original tricks for retrieving GTE registers: the originals store the results in memory rather than in CPU registers. The code enabled with CRAZY = TRUE is the one that uses direct register copies, while the other case falls back to stack writes. It’s not exactly the biggest change ever, but it avoids unnecessary memory accesses, which carry a heavy penalty on the PlayStation.

At first I thought the code wouldn’t work because Sony made the macros work with memory as the only means to access GTE registers, but apparently there are no differences whatsoever in behavior when you use mfc2 (possibly cfc2 too) instead of swc2. I’m still not sure how much this improves general performance, but it could be enough to prevent any future lag. Similarly, the new macros that access the POLY_GT3 and POLY_GT4 diffuse attributes do a little more optimization, even though it’s not that great; all they do is perform straight accesses on the structures rather than creating temporary register values for each attribute.