/* m_amv_4dir.alpha: DEC alpha assembler version of m_amv_4dir.c */
/* A matrix is adjoint (complex conjugate and transpose) */

/*
 * mult_adj_su3_mat_vec_4dir(aptr, bptr, cptr)
 *
 * For each of 4 consecutive 3x3 complex matrices A starting at aptr,
 * multiply the adjoint of A by the one 3-component complex vector B at
 * bptr and store the resulting 3-component vector C; the 4 result
 * vectors are written consecutively starting at cptr:
 *
 *      C(i) = SUM_j conj(A(j,i)) * B(j),    i, j = 0,1,2
 *
 * NOTE(review): presumably the C prototype is the one in m_amv_4dir.c
 * (matrix array, source vector, destination vector array) -- confirm.
 *
 * Data layout, as accessed by this code (all values are loaded/stored
 * with the single-precision lds/sts instructions):
 *      A(j,i): real part at byte offset 24*j + 8*i, imaginary at +4
 *      successive matrices are 72 bytes apart
 *      B(j):   real part at byte offset 8*j, imaginary at +4
 *      C(i):   real part at byte offset 8*i, imaginary at +4
 *      successive destination vectors are 24 bytes apart
 *
 * Notation in the per-instruction comments: "A(j,i)*B(j)" stands for
 * conj(A(j,i))*B(j), i.e.
 *      Re = AR*BR + AI*BI      (adds)
 *      Im = AR*BI - AI*BR      (subs)
 *
 * Register usage:
 *      aptr and cptr (argument registers) are advanced by the routine.
 *      count and idone are integer temporaries.
 *      fs0 (alias ti7) is saved in a 16-byte stack frame and restored
 *      before returning; all other floating registers used are the
 *      fv, fa and ft aliases defined below.
 *
 * The loop is software pipelined: the first two products of each matrix
 * are started before the loop is entered (or at the bottom of the
 * previous iteration), and loads for upcoming elements are interleaved
 * with the arithmetic on the current ones.  The instruction order is
 * deliberate; do not reorder without re-checking register lifetimes.
 */

#include "asdef.alpha.h"

/* Create aliases for the registers */
#define ear fv0         /* real part of even element of A */
#define eai fv1         /* imaginary part of even element of A */
#define oar fa0         /* real part of odd element of A */
#define oai fa1         /* imaginary part of odd element of A */
#define b0r fa2         /* real part of B(0) */
#define b0i fa3         /* imaginary part of B(0) */
#define b1r fa4         /* real part of B(1) */
#define b1i fa5         /* imaginary part of B(1) */
#define b2r ft0         /* real part of B(2) */
#define b2i ft1         /* imaginary part of B(2) */
#define tr1 ft2         /* temporary register */
#define ti1 ft3         /* temporary register */
#define tr2 ft4         /* temporary register */
#define ti2 ft5         /* temporary register */
#define tr3 ft6         /* temporary register */
#define ti3 ft7         /* temporary register */
#define tr4 ft8         /* temporary register */
#define ti4 ft9         /* temporary register */
#define tr5 ft10        /* temporary register */
#define ti5 ft11        /* temporary register */
#define tr6 ft12        /* temporary register */
#define ti6 ft13        /* temporary register */
#define tr7 ft14        /* temporary register */
#define ti7 fs0         /* temporary register (saved/restored below) */

/* Arguments */
#define aptr a0         /* pointer to 1st adjoint matrix */
#define bptr a1         /* pointer to source vector */
#define cptr a2         /* pointer to 1st destination vector */

/* Scratch integer registers */
#define scratch t1      /* pure scratch (not used by the code below) */
#define count t2        /* loop counter: matrices completed so far */
#define idone t3        /* 1 while count < 4 (more to do), 0 when done */

        .globl mult_adj_su3_mat_vec_4dir
        .ent mult_adj_su3_mat_vec_4dir 2
mult_adj_su3_mat_vec_4dir:

/* Save register fs0, since linkage convention expects it.            */
/* Only 8 bytes are needed, but the frame is 16 bytes so that sp      */
/* stays octaword (16-byte) aligned as the calling standard requires. */
        lda     sp,-16(sp)
        stt     fs0,0(sp)

/* Prefetch all data required for this site (loads into fzero) */
        lds     fzero,0(aptr)           /* 1st complex value in A(0) */
        lds     fzero,32(aptr)          /* next cache line */
        lds     fzero,64(aptr)          /* etc */
        lds     fzero,96(aptr)
        lds     fzero,128(aptr)
        lds     fzero,160(aptr)
        lds     fzero,192(aptr)
        lds     fzero,224(aptr)
        lds     fzero,256(aptr)
        lds     fzero,280(aptr)         /* last complex value in A(3) */
        lds     fzero,0(bptr)
        lds     fzero,16(bptr)
/*      lds     fzero,0(cptr)      ; test whether this speeds up code */
/*      lds     fzero,32(cptr)  */
/*      lds     fzero,64(cptr)  */
/*      lds     fzero,88(cptr)  */

/* Load the B vector once for this call */
        lds     b0r,0(bptr)
        lds     b0i,4(bptr)
        lds     b1r,8(bptr)
        lds     b1i,12(bptr)
        lds     b2r,16(bptr)
        lds     b2i,20(bptr)

/* Load the first complex value of the A array */
/* Elements of the A array will be alternately loaded in the even (ear, eai), */
/* and odd (oar, oai) registers. */
        lds     ear,0(aptr)
        lds     eai,4(aptr)

/* Prime the loop over 4 matrices.  Subsequent iterations will fold the */
/* initial operations into the loop. */
        muls    ear,b0r,tr1             /* AR(0,0)*BR(0) -> TR1 */
        lds     oar,24(aptr)            /* fetch AR(1,0) */
        muls    eai,b0i,tr2             /* AI(0,0)*BI(0) -> TR2 */
        lds     oai,28(aptr)            /* fetch AI(1,0) */
        muls    ear,b0i,ti1             /* AR(0,0)*BI(0) -> TI1 */
        bis     zero,zero,count         /* initialize loop count */
        muls    eai,b0r,ti2             /* AI(0,0)*BR(0) -> TI2 */

        muls    oar,b1r,tr3             /* AR(1,0)*BR(1) -> TR3 */
        lds     ear,48(aptr)            /* fetch AR(2,0) */
        muls    oai,b1i,tr4             /* AI(1,0)*BI(1) -> TR4 */
        lds     eai,52(aptr)            /* fetch AI(2,0) */
        muls    oar,b1i,ti3             /* AR(1,0)*BI(1) -> TI3 */
        muls    oai,b1r,ti4             /* AI(1,0)*BR(1) -> TI4 */

        adds    tr1,tr2,tr1             /* Re{A(0,0)*B(0)} */
        subs    ti1,ti2,ti1             /* Im{A(0,0)*B(0)} */

LOOP:
/* ---- C(0): finish column 0 of A, start column 1 ---- */
        muls    ear,b2r,tr2             /* AR(2,0)*BR(2) -> TR2 */
        lds     oar,8(aptr)             /* fetch AR(0,1) */
        muls    eai,b2i,tr5             /* AI(2,0)*BI(2) -> TR5 */
        lds     oai,12(aptr)            /* fetch AI(0,1) */
        muls    ear,b2i,ti2             /* AR(2,0)*BI(2) -> TI2 */
        muls    eai,b2r,ti5             /* AI(2,0)*BR(2) -> TI5 */

        adds    tr3,tr4,tr3             /* Re{A(1,0)*B(1)} */
        subs    ti3,ti4,ti3             /* Im{A(1,0)*B(1)} */

        muls    oar,b0r,tr4             /* AR(0,1)*BR(0) -> TR4 */
        lds     ear,32(aptr)            /* fetch AR(1,1) */
        muls    oai,b0i,tr6             /* AI(0,1)*BI(0) -> TR6 */
        lds     eai,36(aptr)            /* fetch AI(1,1) */
        muls    oar,b0i,ti4             /* AR(0,1)*BI(0) -> TI4 */
        muls    oai,b0r,ti6             /* AI(0,1)*BR(0) -> TI6 */

        adds    tr1,tr3,tr1             /* Re{A(0,0)*B(0) + A(1,0)*B(1)} */
        adds    ti1,ti3,ti1             /* Im{A(0,0)*B(0) + A(1,0)*B(1)} */
        adds    tr2,tr5,tr2             /* Re{A(2,0)*B(2)} */
        subs    ti2,ti5,ti2             /* Im{A(2,0)*B(2)} */

        muls    ear,b1r,tr3             /* AR(1,1)*BR(1) -> TR3 */
        lds     oar,56(aptr)            /* fetch AR(2,1) */
        muls    eai,b1i,tr5             /* AI(1,1)*BI(1) -> TR5 */
        lds     oai,60(aptr)            /* fetch AI(2,1) */
        muls    ear,b1i,ti3             /* AR(1,1)*BI(1) -> TI3 */
        muls    eai,b1r,ti5             /* AI(1,1)*BR(1) -> TI5 */

        adds    tr1,tr2,tr1             /* SUM [Re{A(k,0)*B(k)}], k=0,1,2 = Re{C(0)} */
        adds    ti1,ti2,ti1             /* SUM [Im{A(k,0)*B(k)}], k=0,1,2 = Im{C(0)} */

/* ---- C(1): finish column 1 of A, start column 2 ---- */
        adds    tr4,tr6,tr4             /* Re{A(0,1)*B(0)} */
        subs    ti4,ti6,ti4             /* Im{A(0,1)*B(0)} */
        adds    tr3,tr5,tr3             /* Re{A(1,1)*B(1)} */
        subs    ti3,ti5,ti3             /* Im{A(1,1)*B(1)} */

        muls    oar,b2r,tr2             /* AR(2,1)*BR(2) -> TR2 */
        lds     ear,16(aptr)            /* fetch AR(0,2) */
        muls    oai,b2i,tr5             /* AI(2,1)*BI(2) -> TR5 */
        lds     eai,20(aptr)            /* fetch AI(0,2) */
        muls    oar,b2i,ti2             /* AR(2,1)*BI(2) -> TI2 */
        sts     tr1,0(cptr)             /* store Re{C(0)} */
        muls    oai,b2r,ti5             /* AI(2,1)*BR(2) -> TI5 */
        sts     ti1,4(cptr)             /* store Im{C(0)} */

        adds    tr3,tr4,tr3             /* Re{A(0,1)*B(0) + A(1,1)*B(1)} -> TR3 */
        adds    ti3,ti4,ti3             /* Im{A(0,1)*B(0) + A(1,1)*B(1)} -> TI3 */

        muls    ear,b0r,tr1             /* AR(0,2)*BR(0) -> TR1 */
        lds     oar,40(aptr)            /* fetch AR(1,2) */
        muls    eai,b0i,tr4             /* AI(0,2)*BI(0) -> TR4 */
        lds     oai,44(aptr)            /* fetch AI(1,2) */
        muls    ear,b0i,ti1             /* AR(0,2)*BI(0) -> TI1 */
        muls    eai,b0r,ti4             /* AI(0,2)*BR(0) -> TI4 */

        adds    tr2,tr5,tr2             /* Re{A(2,1)*B(2)} -> TR2 */
        subs    ti2,ti5,ti2             /* Im{A(2,1)*B(2)} -> TI2 */

/* ---- C(2): column 2 of A; also advance to the next matrix ---- */
        muls    oar,b1r,tr5             /* AR(1,2)*BR(1) -> TR5 */
        lds     ear,64(aptr)            /* fetch AR(2,2) */
        muls    oai,b1i,tr6             /* AI(1,2)*BI(1) -> TR6 */
        lds     eai,68(aptr)            /* fetch AI(2,2) */
        muls    oar,b1i,ti5             /* AR(1,2)*BI(1) -> TI5 */
        addq    aptr,72,aptr            /* update A pointer to next matrix */
        muls    oai,b1r,ti6             /* AI(1,2)*BR(1) -> TI6 */

        adds    tr1,tr4,tr1             /* Re{A(0,2)*B(0)} -> TR1 */
        subs    ti1,ti4,ti1             /* Im{A(0,2)*B(0)} -> TI1 */

        muls    ear,b2r,tr4             /* AR(2,2)*BR(2) -> TR4 */
        addq    count,1,count           /* increment loop count */
        muls    eai,b2i,tr7             /* AI(2,2)*BI(2) -> TR7 */
        cmplt   count,4,idone           /* idone = 1 if matrices remain */
        muls    ear,b2i,ti4             /* AR(2,2)*BI(2) -> TI4 */
        muls    eai,b2r,ti7             /* AI(2,2)*BR(2) -> TI7 */

        beq     idone,FINISH            /* special-case the last iteration */

        adds    tr5,tr6,tr5             /* Re{A(1,2)*B(1)} -> TR5 */
        lds     ear,0(aptr)             /* fetch AR(0,0) of next matrix */
        subs    ti5,ti6,ti5             /* Im{A(1,2)*B(1)} -> TI5 */
        lds     eai,4(aptr)             /* fetch AI(0,0) of next matrix */

        adds    tr2,tr3,tr6             /* SUM [Re{A(k,1)*B(k)}], k=0,1,2 -> TR6 */
        adds    ti2,ti3,ti6             /* SUM [Im{A(k,1)*B(k)}], k=0,1,2 -> TI6 */

        adds    tr4,tr7,tr7             /* Re{A(2,2)*B(2)} -> TR7 */
        subs    ti4,ti7,ti7             /* Im{A(2,2)*B(2)} -> TI7 */
        adds    tr1,tr5,tr5             /* Re{A(0,2)*B(0) + A(1,2)*B(1)} -> TR5 */
        adds    ti1,ti5,ti5             /* Im{A(0,2)*B(0) + A(1,2)*B(1)} -> TI5 */

/* prime next loop iteration (same sequence as before LOOP, with the */
/* stores of C(1) and C(2) for the current matrix interleaved)       */
        muls    ear,b0r,tr1             /* AR(0,0)*BR(0) -> TR1 */
        lds     oar,24(aptr)            /* fetch AR(1,0) */
        muls    eai,b0i,tr2             /* AI(0,0)*BI(0) -> TR2 */
        lds     oai,28(aptr)            /* fetch AI(1,0) */
        muls    ear,b0i,ti1             /* AR(0,0)*BI(0) -> TI1 */
        sts     tr6,8(cptr)             /* store Re{C(1)} */
        muls    eai,b0r,ti2             /* AI(0,0)*BR(0) -> TI2 */
        sts     ti6,12(cptr)            /* store Im{C(1)} */

        adds    tr5,tr7,tr5             /* SUM [Re{A(k,2)*B(k)}], k=0,1,2 -> TR5 */
        adds    ti5,ti7,ti5             /* SUM [Im{A(k,2)*B(k)}], k=0,1,2 -> TI5 */

        muls    oar,b1r,tr3             /* AR(1,0)*BR(1) -> TR3 */
        lds     ear,48(aptr)            /* fetch AR(2,0) */
        muls    oai,b1i,tr4             /* AI(1,0)*BI(1) -> TR4 */
        lds     eai,52(aptr)            /* fetch AI(2,0) */
        muls    oar,b1i,ti3             /* AR(1,0)*BI(1) -> TI3 */
        muls    oai,b1r,ti4             /* AI(1,0)*BR(1) -> TI4 */

        adds    tr1,tr2,tr1             /* Re{A(0,0)*B(0)} */
        sts     tr5,16(cptr)            /* store Re{C(2)} */
        subs    ti1,ti2,ti1             /* Im{A(0,0)*B(0)} */
        sts     ti5,20(cptr)            /* store Im{C(2)} */

/* jump back to loop */
        addq    cptr,24,cptr            /* advance cptr to next dest vector */
        br      zero,LOOP

/* finish last iteration without any fetch-ahead */
FINISH:
        adds    tr5,tr6,tr5             /* Re{A(1,2)*B(1)} -> TR5 */
        subs    ti5,ti6,ti5             /* Im{A(1,2)*B(1)} -> TI5 */

        adds    tr2,tr3,tr6             /* SUM [Re{A(k,1)*B(k)}], k=0,1,2 -> TR6 */
        adds    ti2,ti3,ti6             /* SUM [Im{A(k,1)*B(k)}], k=0,1,2 -> TI6 */

        adds    tr4,tr7,tr7             /* Re{A(2,2)*B(2)} -> TR7 */
        subs    ti4,ti7,ti7             /* Im{A(2,2)*B(2)} -> TI7 */
        adds    tr1,tr5,tr5             /* Re{A(0,2)*B(0) + A(1,2)*B(1)} -> TR5 */
        adds    ti1,ti5,ti5             /* Im{A(0,2)*B(0) + A(1,2)*B(1)} -> TI5 */

        sts     tr6,8(cptr)             /* store Re{C(1)} */
        sts     ti6,12(cptr)            /* store Im{C(1)} */

        adds    tr5,tr7,tr5             /* SUM [Re{A(k,2)*B(k)}], k=0,1,2 -> TR5 */
        adds    ti5,ti7,ti5             /* SUM [Im{A(k,2)*B(k)}], k=0,1,2 -> TI5 */

/* Restore fs0 (last use of ti7 is just above), then release the frame */
        ldt     fs0,0(sp)               /* same as ti7 */
        lda     sp,16(sp)               /* must match the 16 allocated at entry */

        sts     tr5,16(cptr)            /* store Re{C(2)} */
        sts     ti5,20(cptr)            /* store Im{C(2)} */

/* Return to caller */
        ret     zero,(ra)

        .end mult_adj_su3_mat_vec_4dir