#include <mpp/asdef.h>
; NOTE(review): the header name was missing from the #include line in the
; copy this was restored from; <mpp/asdef.h> is assumed to be the header
; that supplies CRI_REGISTER_NAMES -- confirm against the build tree.

;------------------------------------------------------------------------------
; mult_su3_mat_vec_sum_4dir
;
;   C = A[0]*B0 + A[1]*B1 + A[2]*B2 + A[3]*B3
;
; A[] is read as four consecutive 3x3 complex matrices (72 bytes each, row
; major, real part then imaginary part, 4 bytes per component).  B0..B3 and
; C are 3-component complex vectors in the same (re,im) 4-byte layout.
; Each complex product is formed as
;       Re = AR*BR - AI*BI          Im = AR*BI + AI*BR
; i.e. A is used as stored: no conjugation or transposition is applied.
;
; The loop is software pipelined: the first two products of row 0 are
; computed before entry to LOOP ("priming"), and every iteration except
; the last re-primes them for the next matrix while finishing row 2.
; The instruction order is deliberate -- do not reorder.
;
; fs0-fs2 are saved in the static area "savefp" and restored before return,
; so the routine is not reentrant.
;------------------------------------------------------------------------------

; Use the CRI definitions for the register names.  Define the macro LEA
; for getting the address of the scratch space.

        CRI_REGISTER_NAMES

        .macro  LEA     reg,name
        laum    reg, name(r31)
        sll     reg, 32, reg
        lalm    reg, name(reg)
        lal     reg, name(reg)
        .endm

;------------------------------------------------------------------------------
; Create aliases for the registers

        ear     <-      fv0     ; real part of even element of A
        eai     <-      fv1     ; imaginary part of even element of A
        oar     <-      fa0     ; real part of odd element of A
        oai     <-      fa1     ; imaginary part of odd element of A

        b0r     <-      fa2     ; real part of B(0)
        b0i     <-      fa3     ; imaginary part of B(0)
        b1r     <-      fa4     ; real part of B(1)
        b1i     <-      fa5     ; imaginary part of B(1)
        b2r     <-      ft0     ; real part of B(2)
        b2i     <-      ft1     ; imaginary part of B(2)

        tr1     <-      ft2     ; temporary register
        ti1     <-      ft3     ; temporary register
        tr2     <-      ft4     ; temporary register
        ti2     <-      ft5     ; temporary register
        tr3     <-      ft6     ; temporary register
        ti3     <-      ft7     ; temporary register
        tr4     <-      ft8     ; temporary register
        ti4     <-      fs0     ; temporary register (saved/restored)
        tr5     <-      fs1     ; temporary register (saved/restored)
        ti5     <-      fs2     ; temporary register (saved/restored)

        c0r     <-      ft9     ; result component 0 real
        c0i     <-      ft10    ; result component 0 imag
        c1r     <-      ft11    ; result component 1 real
        c1i     <-      ft12    ; result component 1 imag
        c2r     <-      ft13    ; result component 2 real
        c2i     <-      ft14    ; result component 2 imag

; Arguments

        aptr    <-      a0      ; pointer to the first of the 4 matrices
        b0ptr   <-      a1      ; pointer to source vector 0
        b1ptr   <-      a2      ; pointer to source vector 1
        b2ptr   <-      a3      ; pointer to source vector 2
        b3ptr   <-      a4      ; pointer to source vector 3
        cptr    <-      a5      ; pointer to destination vector

        bptr    <-      a1      ; pointer to current source vector
                                ; (same register as b0ptr)

; Scratch integer registers

        scratch <-      t1      ; address of savefp; must stay live until
                                ; the restore at the end of the routine
        count   <-      t2      ; loop counter
        idone   <-      t3      ; nonzero while count < 4 (more to do),
                                ; zero on the last iteration

;------------------------------------------------------------------------------

        .ident  sub$c

;------------------------------------------------------------------------------
; Declare some scratch space: one quadword for each of fs0, fs1 and fs2.
; The routine stores at offsets 0, 8 and 16, so three quadwords are
; required.  The initial contents are never read (each slot is written
; before it is loaded); the original filler value is kept.

        .psect  kernel@data,data,cache
savefp: .quad   7               ; slot for fs0
        .quad   7               ; slot for fs1
        .quad   7               ; slot for fs2
        .endp

; Subroutine code starts here

        .psect  kernel@code,code,cache

;       ENTER   mult_su3_mat_vec_sum_4dir,zero,user
mult_su3_mat_vec_sum_4dir::

; Save the registers that are restored before returning

        LEA     scratch,savefp
        stt     fs0,0(scratch)
        stt     fs1,8(scratch)
        stt     fs2,16(scratch)

; Prefetch all data required for this site (loads into fzero)

        lds     fzero,0(aptr)   ; 1st complex value in A(0)
        lds     fzero,32(aptr)  ; next cache line
        lds     fzero,64(aptr)  ; etc
        lds     fzero,96(aptr)  ;
        lds     fzero,128(aptr) ;
        lds     fzero,160(aptr) ;
        lds     fzero,192(aptr) ;
        lds     fzero,224(aptr) ;
        lds     fzero,256(aptr) ;
        lds     fzero,280(aptr) ; last complex value in A(3)

        lds     fzero,0(bptr)
        lds     fzero,16(bptr)
        ;lds    fzero,0(b1ptr)
        ;lds    fzero,16(b1ptr)
        ;lds    fzero,0(b2ptr)
        ;lds    fzero,16(b2ptr)
        ;lds    fzero,0(b3ptr)
        ;lds    fzero,16(b3ptr)

        lds     fzero,0(cptr)   ; test whether this speeds up code
        lds     fzero,16(cptr)

; Load the first B vector, set dest vector to zero

        cpys    fzero,fzero,c0r
        lds     b0r,0(bptr)
        cpys    fzero,fzero,c0i
        lds     b0i,4(bptr)
        cpys    fzero,fzero,c1r
        lds     b1r,8(bptr)
        cpys    fzero,fzero,c1i
        lds     b1i,12(bptr)
        cpys    fzero,fzero,c2r
        lds     b2r,16(bptr)
        cpys    fzero,fzero,c2i
        lds     b2i,20(bptr)

; Load the first complex value of the A array
; Elements of the A array will be alternately loaded in the even (ear, eai),
; and odd (oar, oai) registers.

        lds     ear,0(aptr)
        lds     eai,4(aptr)

; Prime the loop over 4 matrices.  Subsequent iterations will fold the
; initial operations into the loop.

        muls/d  ear,b0r,tr1     ; AR(0,0)*BR(0) -> TR1
        lds     oar,8(aptr)     ; AR(0,1)
        muls/d  eai,b0i,tr2     ; AI(0,0)*BI(0) -> TR2
        lds     oai,12(aptr)    ; AI(0,1)
        muls/d  ear,b0i,ti1     ; AR(0,0)*BI(0) -> TI1
        bis     zero,zero,count ; initialize loop count
        muls/d  eai,b0r,ti2     ; AI(0,0)*BR(0) -> TI2

        muls/d  oar,b1r,tr3     ; AR(0,1)*BR(1) -> TR3
        lds     ear,16(aptr)    ; AR(0,2)
        muls/d  oai,b1i,tr4     ; AI(0,1)*BI(1) -> TR4
        lds     eai,20(aptr)    ; AI(0,2)
        muls/d  oar,b1i,ti3     ; AR(0,1)*BI(1) -> TI3
        muls/d  oai,b1r,ti4     ; AI(0,1)*BR(1) -> TI4

        subs/d  tr1,tr2,tr1     ; Re{A(0,0)*B(0)} -> TR1
        adds/d  ti1,ti2,ti1     ; Im{A(0,0)*B(0)} -> TI1

LOOP:
        muls/d  ear,b2r,tr2     ; AR(0,2)*BR(2) -> TR2
        lds     oar,24(aptr)    ; AR(1,0)
        muls/d  eai,b2i,tr5     ; AI(0,2)*BI(2) -> TR5
        lds     oai,28(aptr)    ; AI(1,0)
        muls/d  ear,b2i,ti2     ; AR(0,2)*BI(2) -> TI2
        muls/d  eai,b2r,ti5     ; AI(0,2)*BR(2) -> TI5

        adds/d  c0r,tr1,c0r     ; C0R += Re{A(0,0)*B(0)}
        adds/d  c0i,ti1,c0i     ; C0I += Im{A(0,0)*B(0)}

        subs/d  tr3,tr4,tr3     ; Re{A(0,1)*B(1)} -> TR3
        adds/d  ti3,ti4,ti3     ; Im{A(0,1)*B(1)} -> TI3

        muls/d  oar,b0r,tr1     ; AR(1,0)*BR(0) -> TR1
        lds     ear,32(aptr)    ; AR(1,1)
        muls/d  oai,b0i,tr4     ; AI(1,0)*BI(0) -> TR4
        lds     eai,36(aptr)    ; AI(1,1)
        muls/d  oar,b0i,ti1     ; AR(1,0)*BI(0) -> TI1
        muls/d  oai,b0r,ti4     ; AI(1,0)*BR(0) -> TI4

        adds/d  c0r,tr3,c0r     ; C0R += Re{A(0,1)*B(1)}
        adds/d  c0i,ti3,c0i     ; C0I += Im{A(0,1)*B(1)}

        subs/d  tr2,tr5,tr2     ; Re{A(0,2)*B(2)} -> TR2
        adds/d  ti2,ti5,ti2     ; Im{A(0,2)*B(2)} -> TI2

        subs/d  tr1,tr4,tr1     ; Re{A(1,0)*B(0)} -> TR1
        adds/d  ti1,ti4,ti1     ; Im{A(1,0)*B(0)} -> TI1

        muls/d  ear,b1r,tr3     ; AR(1,1)*BR(1) -> TR3
        lds     oar,40(aptr)    ; AR(1,2)
        muls/d  eai,b1i,tr4     ; AI(1,1)*BI(1) -> TR4
        lds     oai,44(aptr)    ; AI(1,2)
        muls/d  ear,b1i,ti3     ; AR(1,1)*BI(1) -> TI3
        muls/d  eai,b1r,ti4     ; AI(1,1)*BR(1) -> TI4

        adds/d  c0r,tr2,c0r     ; C0R now holds SUM [Re{A(0,k)*B(k)}], k=0,1,2
        adds/d  c0i,ti2,c0i     ; C0I now holds SUM [Im{A(0,k)*B(k)}], k=0,1,2

        adds/d  c1r,tr1,c1r     ; C1R += Re{A(1,0)*B(0)}
        adds/d  c1i,ti1,c1i     ; C1I += Im{A(1,0)*B(0)}

        subs/d  tr3,tr4,tr3     ; Re{A(1,1)*B(1)} -> TR3
        adds/d  ti3,ti4,ti3     ; Im{A(1,1)*B(1)} -> TI3

        muls/d  oar,b2r,tr1     ; AR(1,2)*BR(2) -> TR1
        lds     ear,48(aptr)    ; AR(2,0)
        muls/d  oai,b2i,tr2     ; AI(1,2)*BI(2) -> TR2
        lds     eai,52(aptr)    ; AI(2,0)
        muls/d  oar,b2i,ti1     ; AR(1,2)*BI(2) -> TI1
        muls/d  oai,b2r,ti2     ; AI(1,2)*BR(2) -> TI2

        adds/d  c1r,tr3,c1r     ; C1R += Re{A(1,1)*B(1)}
        adds/d  c1i,ti3,c1i     ; C1I += Im{A(1,1)*B(1)}

        muls/d  ear,b0r,tr3     ; AR(2,0)*BR(0) -> TR3
        lds     oar,56(aptr)    ; AR(2,1)
        muls/d  eai,b0i,tr4     ; AI(2,0)*BI(0) -> TR4
        lds     oai,60(aptr)    ; AI(2,1)
        muls/d  ear,b0i,ti3     ; AR(2,0)*BI(0) -> TI3
        muls/d  eai,b0r,ti4     ; AI(2,0)*BR(0) -> TI4

        subs/d  tr1,tr2,tr1     ; Re{A(1,2)*B(2)} -> TR1
        adds/d  ti1,ti2,ti1     ; Im{A(1,2)*B(2)} -> TI1

        muls/d  oar,b1r,tr2     ; AR(2,1)*BR(1) -> TR2
        lds     ear,64(aptr)    ; AR(2,2)
        muls/d  oai,b1i,tr5     ; AI(2,1)*BI(1) -> TR5
        lds     eai,68(aptr)    ; AI(2,2)
        muls/d  oar,b1i,ti2     ; AR(2,1)*BI(1) -> TI2
        addq    aptr,72,aptr    ; update A pointer to next matrix
        muls/d  oai,b1r,ti5     ; AI(2,1)*BR(1) -> TI5
        bis     b1ptr,zero,bptr ; begin rotating pointers for next B

        subs/d  tr3,tr4,tr3     ; Re{A(2,0)*B(0)} -> TR3
        bis     b2ptr,zero,b1ptr
        adds/d  ti3,ti4,ti3     ; Im{A(2,0)*B(0)} -> TI3
        bis     b3ptr,zero,b2ptr

        adds/d  c1r,tr1,c1r     ; C1R now holds SUM [Re{A(1,k)*B(k)}], k=0,1,2
        addq    count,1,count   ; increment loop count
        adds/d  c1i,ti1,c1i     ; C1I now holds SUM [Im{A(1,k)*B(k)}], k=0,1,2
        cmplt   count,4,idone   ; check loop limit: idone = (count < 4)

        muls/d  ear,b2r,tr1     ; AR(2,2)*BR(2) -> TR1
        muls/d  eai,b2i,tr4     ; AI(2,2)*BI(2) -> TR4
        muls/d  ear,b2i,ti1     ; AR(2,2)*BI(2) -> TI1
        muls/d  eai,b2r,ti4     ; AI(2,2)*BR(2) -> TI4

        beq     idone,FINISH    ; special-case the last iteration

        subs/d  tr2,tr5,tr2     ; Re{A(2,1)*B(1)} -> TR2
        lds     b0r,0(bptr)     ; begin reading next B vector
        adds/d  ti2,ti5,ti2     ; Im{A(2,1)*B(1)} -> TI2
        lds     b0i,4(bptr)     ; read b first, because it probably isn't in cache?

        adds/d  c2r,tr3,c2r     ; C2R += Re{A(2,0)*B(0)}
        lds     ear,0(aptr)     ; AR(0,0) of the next matrix
        adds/d  c2i,ti3,c2i     ; C2I += Im{A(2,0)*B(0)}
        lds     eai,4(aptr)     ; AI(0,0) of the next matrix

        subs/d  tr1,tr4,tr4     ; Re{A(2,2)*B(2)} -> TR4
        lds     b1r,8(bptr)
        adds/d  ti1,ti4,ti4     ; Im{A(2,2)*B(2)} -> TI4
        lds     b1i,12(bptr)

        adds/d  c2r,tr2,c2r     ; C2R += Re{A(2,1)*B(1)}
        lds     b2r,16(bptr)
        adds/d  c2i,ti2,c2i     ; C2I += Im{A(2,1)*B(1)}
        lds     b2i,20(bptr)

; prime next loop iteration

        muls/d  ear,b0r,tr1     ; AR(0,0)*BR(0) -> TR1
        lds     oar,8(aptr)     ; AR(0,1)
        muls/d  eai,b0i,tr2     ; AI(0,0)*BI(0) -> TR2
        lds     oai,12(aptr)    ; AI(0,1)
        muls/d  ear,b0i,ti1     ; AR(0,0)*BI(0) -> TI1
        muls/d  eai,b0r,ti2     ; AI(0,0)*BR(0) -> TI2

        adds/d  c2r,tr4,c2r     ; C2R now holds SUM [Re{A(2,k)*B(k)}], k=0,1,2
        adds/d  c2i,ti4,c2i     ; C2I now holds SUM [Im{A(2,k)*B(k)}], k=0,1,2

        muls/d  oar,b1r,tr3     ; AR(0,1)*BR(1) -> TR3
        lds     ear,16(aptr)    ; AR(0,2)
        muls/d  oai,b1i,tr4     ; AI(0,1)*BI(1) -> TR4
        lds     eai,20(aptr)    ; AI(0,2)
        muls/d  oar,b1i,ti3     ; AR(0,1)*BI(1) -> TI3
        muls/d  oai,b1r,ti4     ; AI(0,1)*BR(1) -> TI4

        subs/d  tr1,tr2,tr1     ; Re{A(0,0)*B(0)} -> TR1
        adds/d  ti1,ti2,ti1     ; Im{A(0,0)*B(0)} -> TI1

; jump back to loop

        br      zero,LOOP

; finish last iteration without any fetch-ahead

FINISH:
        subs/d  tr2,tr5,tr2     ; Re{A(2,1)*B(1)} -> TR2
        adds/d  ti2,ti5,ti2     ; Im{A(2,1)*B(1)} -> TI2

        adds/d  c2r,tr3,c2r     ; C2R += Re{A(2,0)*B(0)}
        sts     c0r,0(cptr)     ; store Re{C(0)}
        adds/d  c2i,ti3,c2i     ; C2I += Im{A(2,0)*B(0)}
        sts     c0i,4(cptr)     ; store Im{C(0)}

        subs/d  tr1,tr4,tr4     ; Re{A(2,2)*B(2)} -> TR4
        adds/d  ti1,ti4,ti4     ; Im{A(2,2)*B(2)} -> TI4

        adds/d  c2r,tr2,c2r     ; C2R += Re{A(2,1)*B(1)}
        adds/d  c2i,ti2,c2i     ; C2I += Im{A(2,1)*B(1)}

        sts     c1r,8(cptr)     ; store Re{C(1)}
        sts     c1i,12(cptr)    ; store Im{C(1)}

        adds/d  c2r,tr4,c2r     ; C2R now holds SUM [Re{A(2,k)*B(k)}], k=0,1,2
        adds/d  c2i,ti4,c2i     ; C2I now holds SUM [Im{A(2,k)*B(k)}], k=0,1,2

; Restore the saved registers (fs0 aliases ti4, whose last use is the
; adds/d immediately above)

        ldt     fs0,0(scratch)
        ldt     fs1,8(scratch)
        ldt     fs2,16(scratch)

        sts     c2r,16(cptr)    ; store Re{C(2)}
        sts     c2i,20(cptr)    ; store Im{C(2)}

; Return to caller

        ret     zero,(ra)

        .endp
        .end