In today’s update we talk a bit about a recent finding on PlayStation micro-optimizations and its unexpected results. Full news after the jump.
So, the other day I was testing some optimized code to render polygons with the old procedures I’ve been using since day 1 of programming with Squeeze Bomb. Let’s see in detail what it does, starting from macros:
/*
 * A few extra GTE macros for faster code (needs testing).
 *
 * The *_m variants copy a GTE (coprocessor 2) register straight into a C
 * variable held in a CPU register, instead of storing it to memory through a
 * pointer, so the caller does not need a stack slot for the result.
 *
 * r0 must be an lvalue of register size (e.g. an int); it is an output
 * operand ("=r").
 *
 * NOTE(review): on MIPS I a coprocessor move has a delay slot before the
 * destination register is usable; presumably the assembler pads this when it
 * is not in noreorder mode -- confirm against the generated code.
 */

/* Read GTE data register 7 (OTZ, the averaged depth) into r0. */
#define gte_stotz_m( r0 ) __asm__ volatile ( \
	"mfc2 %0, $7;" \
	: "=r"( r0 ) \
	: )

/*
 * Read the GTE FLAG register into r0.
 *
 * FLAG is *control* register 31, so it has to be fetched with cfc2.
 * mfc2 with $31 reads *data* register 31 (LZCR, the leading-zero count)
 * instead, which would make any error-bit test on the result meaningless.
 */
#define gte_stflg_m( r0 ) __asm__ volatile ( \
	"cfc2 %0, $31;" \
	: "=r"( r0 ) \
	: )

/* Read GTE data register 24 (MAC0, where NCLIP leaves its result) into r0. */
#define gte_stopz_m( r0 ) __asm__ volatile ( \
	"mfc2 %0, $24;" \
	: "=r"( r0 ) \
	: )

/*
 * Direct access to POLY_GT4.rgb3: store GTE data register 22 at byte
 * offset 40 from the packet pointer r0.
 */
#define gte_strr3_gt4( r0 ) __asm__ volatile ( \
	"swc2 $22, 40( %0 );" \
	: \
	: "r"( r0 ) \
	: "memory" )

/*
 * Direct access to POLY_GT4.xy3: store GTE data register 14 at byte
 * offset 0x2C from the packet pointer r0.
 */
#define gte_stsxy_gt4_3( r0 ) __asm__ volatile ( \
	"swc2 $14, 0x2C( %0 )" \
	: \
	: "r"( r0 ) \
	: "memory" )
The following is the actual rendering code for tris and quads:
/*
 * Render the triangles of a model as lit, textured gouraud POLY_GT3
 * primitives and link them into an ordering table.
 *
 * ob     - model data, read as MD1_TRIANGLES (tri_offset, vertex_offset,
 *          normal_offset and tri_count are used).
 * packet - primitive buffer, treated as an array of POLY_GT3. The cursor
 *          advances by two POLY_GT3 per triangle, drawn or not
 *          (presumably the second slot belongs to the other frame buffer --
 *          TODO confirm against the caller).
 * rgb    - base colour. NOTE: rgb->cd is overwritten in place with
 *          (cd & 3) | CODE_PGT3 before the colour is loaded into the GTE.
 * ot     - ordering table, indexed with (depth >> 4).
 *
 * With CRAZY set, GTE results are read straight into a local int through the
 * *_m macros; otherwise they are stored into an IFO struct in memory.
 *
 * The statement order interleaves CPU work with GTE commands on purpose;
 * do not reorder it casually.
 */
void FastTG3L(void *ob, void *packet, CVECTOR *rgb, u32* ot)
{
	register u32 i, is, *tag;
#if !CRAZY
	IFO ifo;
#else
	register int otz;	/* scratch: reused for flag, orientation and depth */
#endif
	register POLY_GT3 *sx;	/* packet cursor */
	const MD1_TRIANGLES *obj = (const MD1_TRIANGLES*)ob;
	const MD1_TRIANGLE *t = (const MD1_TRIANGLE*)obj->tri_offset;
	const SVECTOR *vp = (const SVECTOR*)obj->vertex_offset;
	const SVECTOR *vn = (const SVECTOR*)obj->normal_offset;

	/* keep the low two bits of the code byte, force the primitive code */
	rgb->cd = (rgb->cd & 3) | CODE_PGT3;
	gte_ldrgb(rgb);
	sx = (POLY_GT3*)packet;

	/* i is incremented inside the body; the for-step only advances t */
	for (i = 0, is = obj->tri_count; i < is; t++)
	{
		POLY_GT3 *si;	/* packet being filled for this triangle */

		gte_ldv3(&vp[t->v0], &vp[t->v1], &vp[t->v2]);	/* load model vertices */
		i++;
		si = sx;
		gte_rtpt_b();		/* perspective */
#if !CRAZY
		gte_stflg(&ifo.flg);	/* store flag */
		if (ifo.flg & GTEFLG_ERROR) { sx += 2; continue; }
#else
		gte_stflg_m(otz);
		if (otz & GTEFLG_ERROR) { sx += 2; continue; }
#endif
		gte_nclip_b();		/* normal clipping */
#if !CRAZY
		gte_stopz(&ifo.otz);	/* return orientation */
		if (ifo.otz <= 0) { sx += 2; continue; }
#else
		gte_stopz_m(otz);
		if (otz <= 0) { sx += 2; continue; }
#endif
		gte_stsxy3_gt3(si);	/* store transformed result */
		sx += 2;		/* packet slot is consumed from here on, even if skipped below */
		gte_nop();
		gte_avsz3_b();		/* calculate depth */
#if !CRAZY
		gte_stotz(&ifo.otz);	/* get depth */
		/* skip when (depth >> 6) is zero, i.e. the depth is too low;
		 * no upper bound is checked here */
		if (!(ifo.otz >> 6)) continue;
#else
		gte_stotz_m(otz);
		if (!(otz >> 6)) continue;
#endif
		gte_ldv3(&vn[t->n0], &vn[t->n1], &vn[t->n2]);	/* set lighting */
#if !CRAZY
		tag = &ot[ifo.otz >> 4];
#else
		tag = &ot[otz >> 4];
		/* link early: low 24 bits of the current OT entry, 0x09 in the top byte */
		si->tag = (*tag & 0x00FFFFFF) | 0x09000000;
#endif
		gte_ncct_b();		/* calculate */
		gte_strgb3_gt3(si);	/* store rgb values */
		/* sort: insert the packet at the head of the chosen OT entry */
#if !CRAZY
		si->tag = (*tag & 0x00FFFFFF) | 0x09000000;
#endif
		*tag = (u32)si & 0x00FFFFFF;
	}
}

/*
 * Render the quads of a model as lit, textured gouraud POLY_GT4 primitives
 * and link them into an ordering table.
 *
 * ob     - model data, read as MD1_QUADS (quad_offset, vertex_offset,
 *          normal_offset and quad_count are used).
 * packet - primitive buffer, treated as an array of POLY_GT4. The cursor
 *          advances by two POLY_GT4 per quad, drawn or not
 *          (presumably the second slot belongs to the other frame buffer --
 *          TODO confirm against the caller).
 * rgb    - base colour. NOTE: rgb->cd is overwritten in place with
 *          (cd & 3) | CODE_PGT4 before the colour is loaded into the GTE.
 * ot     - ordering table, indexed with (depth >> 4).
 *
 * The first three vertices are transformed together and tested for
 * orientation; the fourth is transformed on its own afterwards. Lighting
 * follows the same 3 + 1 split.
 *
 * With CRAZY set, GTE results are read straight into local ints through the
 * *_m macros; otherwise they are stored into an IFO struct in memory.
 */
void FastTG4L(void *ob, void *packet, CVECTOR *rgb, u32* ot)
{
	register u32 i, is, *tag;
#if !CRAZY
	IFO ifo;
#else
	int otz, flg;
#endif
	register POLY_GT4 *sx;	/* packet cursor */
	const MD1_QUADS *obj = (const MD1_QUADS*)ob;
	const MD1_QUAD *q = (const MD1_QUAD*)obj->quad_offset;
	const SVECTOR *vp = (const SVECTOR*)obj->vertex_offset;
	const SVECTOR *vn = (const SVECTOR*)obj->normal_offset;

	/* keep the low two bits of the code byte, force the primitive code */
	rgb->cd = (rgb->cd & 3) | CODE_PGT4;
	gte_ldrgb(rgb);
	sx = (POLY_GT4*)packet;

	/* i is incremented inside the body; the for-step only advances q */
	for (i = 0, is = obj->quad_count; i < is; q++)
	{
		POLY_GT4 *si;	/* packet being filled for this quad */

		gte_ldv3(&vp[q->v0], &vp[q->v1], &vp[q->v2]);
		si = sx;
		i++;
		gte_rtpt_b();		/* RotTransPers3 */
#if !CRAZY
		gte_stflg(&ifo.flg0);
		if (ifo.flg0 & GTEFLG_ERROR) { sx += 2; continue; }
		gte_nclip_b();		/* NormalClip */
		gte_stopz(&ifo.otz);	/* back clip */
		if (ifo.otz <= 0) { sx += 2; continue; }	/* flipped, skip */
#else
		gte_stflg_m(flg);
		if (flg & GTEFLG_ERROR) { sx += 2; continue; }
		gte_nclip_b();		/* NormalClip */
		gte_stopz_m(otz);	/* back clip */
		if (otz <= 0) { sx += 2; continue; }	/* flipped, skip */
#endif
		gte_stsxy3_gt4((u_long *)si);	/* store the first three screen coordinates */
		gte_ldv0(&vp[q->v3]);		/* fourth vertex */
		sx += 2;		/* packet slot is consumed from here on, even if skipped below */
		gte_nop();
		gte_rtps_b();		/* RotTransPers */
#if !CRAZY
		gte_stflg(&ifo.flg);
		if (ifo.flg & GTEFLG_ERROR) continue;
#else
		gte_stflg_m(flg);
		if (flg & GTEFLG_ERROR) continue;
#endif
		gte_stsxy_gt4_3(si);	/* store the fourth screen coordinate into xy3 */
		gte_avsz4();
#if !CRAZY
		gte_stotz(&ifo.otz);
		/* limit range: skip when (depth >> 6) is zero */
		if (!(ifo.otz >> 6)) continue;
#else
		gte_stotz_m(otz);
		if (!(otz >> 6)) continue;
#endif
		gte_ldv3(&vn[q->n0], &vn[q->n1], &vn[q->n2]);
#if !CRAZY
		tag = &ot[ifo.otz >> 4];
#else
		tag = &ot[otz >> 4];
#endif
		gte_ncct_b();
		gte_strgb3_gt4(si);	/* colours for the first three vertices */
		gte_ldv0(&vn[q->n3]);	/* fourth normal */
		/* link: low 24 bits of the current OT entry, 0x0C in the top byte */
		si->tag = (*tag & 0x00FFFFFF) | 0x0C000000;
		gte_nccs_b();
		gte_strr3_gt4(si);	/* colour for the fourth vertex into rgb3 */
		/* sort: insert the packet at the head of the chosen OT entry */
		*tag = (u32)si & 0x00FFFFFF;
	}
}
If you are familiar with inline assembly and how the stack works, you can probably notice two minor differences that can translate to better performance. The macros above change one very wasteful behavior of Sony’s original macros for retrieving GTE registers, which stored the results in memory rather than in CPU registers. The code enabled with CRAZY = TRUE is the one that copies directly into registers, while the other case defaults to stack writes. It’s not exactly the biggest change ever, but it avoids unnecessary memory accesses, which carry a heavy penalty on the PlayStation.
At first I thought the code wouldn’t work because Sony made the macros work with memory as the only means to access GTE registers, but apparently there are no differences whatsoever in behavior when you use mfc2 (possibly cfc2 too) instead of swc2. I’m still not sure how much this improves the general performance, but it could be enough to prevent any future lag. Similarly, the new macros to access the POLY_GT3 and POLY_GT4 diffuse attributes do a little more optimization, even though it’s not that great; all they do is perform straight accesses on the structures rather than creating temporary register values for each attribute.