#***************** m_mat_nn.s (in su3.a) *****************************
# For IBM RS6000
# C. DeTar 02/05/95
#
#
# void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
# C <- A * B
#
# Purpose: 3x3 complex matrix product, fully unrolled, one column of C
# at a time.
#
# In:   GPR 3 = a, GPR 4 = b, GPR 5 = c (see the .set a/b/c symbols below).
# Out:  the 18 words addressed through c are overwritten with the product.
# Uses: floating point registers 0-12 only; no general purpose register is
#       modified and no stack is used.
#
# Each matrix is addressed as 18 consecutive 4-byte words: row-major
# elements, real part followed by imaginary part (see the eXYre/eXYim
# offsets below).
#
# This version is for single precision
# Intermediate results are computed in double precision
# and every result is passed through frsp ("Round to single precision")
# before it is stored with stfs.
# NOTE(review): this header previously read "WITHOUT ROUNDING TO SINGLE
# PRECISION: TRUNCATION only", which does not match the frsp instructions
# in the code below -- confirm which behavior is intended.
#
# NOTE(review): column j of c is stored before a is reloaded for column
# j+1, so the result is not a valid product if c overlaps a.
#
#
        .file "m_mat_nn.s"
# XCOFF table of contents entry for subroutine linkage
        .globl mult_su3_nn[ds]
        .csect mult_su3_nn[ds]
        .long .mult_su3_nn[PR]
        .long TOC[tc0]
        .long 0
        .toc
T.mult_su3_nn: .tc .mult_su3_nn[tc],mult_su3_nn[ds]
        .globl .mult_su3_nn[PR]
        .csect .mult_su3_nn[PR]
# General purpose registers
# Retained as called
        .set a,3
        .set b,4
        .set c,5
# Offsets for arrays and structures ...
# Real and imaginary parts of components of 3D complex vector
# (these c?re/c?im offsets are defined but not referenced in the code below)
        .set word,4 # Single precision
        .set c0re,0*word
        .set c0im,1*word
        .set c1re,2*word
        .set c1im,3*word
        .set c2re,4*word
        .set c2im,5*word
# Real and imaginary parts of components of SU(3) matrix
# eXYre / eXYim = byte offset of the real / imaginary part of e[X][Y]
        .set e00re,0*word
        .set e00im,1*word
        .set e01re,2*word
        .set e01im,3*word
        .set e02re,4*word
        .set e02im,5*word
        .set e10re,6*word
        .set e10im,7*word
        .set e11re,8*word
        .set e11im,9*word
        .set e12re,10*word
        .set e12im,11*word
        .set e20re,12*word
        .set e20im,13*word
        .set e21re,14*word
        .set e21im,15*word
        .set e22re,16*word
        .set e22im,17*word
# Floating point registers
# Linkage conventions require preserving registers 14-31
# (only registers 0-12 are written below, so nothing is saved or restored)
#
# Accumulators for the current column of c: registers 0-5
        .set c0r,0
        .set c0i,1
        .set c1r,2
        .set c1i,3
        .set c2r,4
        .set c2i,5
# Elements of the current column of b: registers 6-9.
# b2r/b2i reuse the registers of b0r/b0i, so b2 may only be loaded once
# b0 is no longer needed.
        .set b0r,6
        .set b0i,7
        .set b1r,8
        .set b1i,9
        .set b2r,6
        .set b2i,7
# Elements of a: every a0?, a1?, a2? name maps to register 10, 11, 12
# respectively, so each value must be consumed before the next lfs into
# the same register.  The order of loads and uses below relies on this.
        .set a00r,10
        .set a10r,11
        .set a20r,12
        .set a00i,10
        .set a10i,11
        .set a20i,12
        .set a01r,10
        .set a11r,11
        .set a21r,12
        .set a01i,10
        .set a11i,11
        .set a21i,12
        .set a02r,10
        .set a12r,11
        .set a22r,12
        .set a02i,10
        .set a12i,11
        .set a22i,12
# Defined but not referenced in the code below
        .set prefetch,13

######################################################################
# First dot product to be computed

#  b0r=b->e[0][j].real;  b0i=b->e[0][j].imag;
#  b1r=b->e[1][j].real;  b1i=b->e[1][j].imag;
#  b2r=b->e[2][j].real;  b2i=b->e[2][j].imag;

#  a00r=a->e[0][0].real;  a00i=a->e[0][0].imag;
#  a01r=a->e[0][1].real;  a01i=a->e[0][1].imag;
#  a02r=a->e[0][2].real;  a02i=a->e[0][2].imag;

#  c->e[0][j].real = a00r*b0r - a00i*b0i + a01r*b1r - a01i*b1i + a02r*b2r - a02i*b2i;
#  c->e[0][j].imag = a00r*b0i + a00i*b0r + a01r*b1i + a01i*b1r + a02r*b2i + a02i*b2r;

######################################################################
# Second dot product to be computed

#  a10r=a->e[1][0].real;  a10i=a->e[1][0].imag;
#  a11r=a->e[1][1].real;  a11i=a->e[1][1].imag;
#  a12r=a->e[1][2].real;  a12i=a->e[1][2].imag;

#  c->e[1][j].real = a10r*b0r - a10i*b0i + a11r*b1r - a11i*b1i + a12r*b2r - a12i*b2i;
#  c->e[1][j].imag = a10r*b0i + a10i*b0r + a11r*b1i + a11i*b1r + a12r*b2i + a12i*b2r;

######################################################################
# Third dot product to be computed

#  a20r=a->e[2][0].real;  a20i=a->e[2][0].imag;
#  a21r=a->e[2][1].real;  a21i=a->e[2][1].imag;
#  a22r=a->e[2][2].real;  a22i=a->e[2][2].imag;

#  c->e[2][j].real = a20r*b0r - a20i*b0i + a21r*b1r - a21i*b1i + a22r*b2r - a22i*b2i;
#  c->e[2][j].imag = a20r*b0i + a20i*b0r + a21r*b1i + a21i*b1r + a22r*b2i + a22i*b2r;

#  c->e[0][j].real <- c0r
#  c->e[0][j].imag <- c0i
#  c->e[1][j].real <- c1r
#  c->e[1][j].imag <- c1i
#  c->e[2][j].real <- c2r
#  c->e[2][j].imag <- c2i

######################################################################
#
#  Column j = 0 of product matrix
#
#  Reads b->e[0][0], b->e[1][0], b->e[2][0] and all of a;
#  writes c->e[0][0], c->e[1][0], c->e[2][0].
#
######################################################################

#  c0r = a00r*b0r
#  c1r = a10r*b0r
#  c2r = a20r*b0r

        lfs b0r,e00re(b)
        lfs a00r,e00re(a)
        lfs a10r,e10re(a)
        lfs a20r,e20re(a)
        fm c0r,a00r,b0r
        fm c1r,a10r,b0r
        fm c2r,a20r,b0r

####################
#  c0i = a00r*b0i
#  c1i = a10r*b0i
#  c2i = a20r*b0i

        lfs b0i,e00im(b)
        fm c0i,a00r,b0i
        fm c1i,a10r,b0i
        fm c2i,a20r,b0i

####################
#  c0r -= a00i*b0i
#  c1r -= a10i*b0i
#  c2r -= a20i*b0i

# These loads overwrite the a?0r values held in registers 10-12
        lfs a00i,e00im(a)
        lfs a10i,e10im(a)
        lfs a20i,e20im(a)
        fnms c0r,a00i,b0i,c0r
        fnms c1r,a10i,b0i,c1r
        fnms c2r,a20i,b0i,c2r

####################
#  c0i += a00i*b0r
#  c1i += a10i*b0r
#  c2i += a20i*b0r

        fma c0i,a00i,b0r,c0i
        fma c1i,a10i,b0r,c1i
        fma c2i,a20i,b0r,c2i

####################
#  c0r += a01r*b1r
#  c1r += a11r*b1r
#  c2r += a21r*b1r

        lfs b1r,e10re(b)
        lfs a01r,e01re(a)
        lfs a11r,e11re(a)
        lfs a21r,e21re(a)
        fma c0r,a01r,b1r,c0r
        fma c1r,a11r,b1r,c1r
        fma c2r,a21r,b1r,c2r

####################
#  c0i += a01r*b1i
#  c1i += a11r*b1i
#  c2i += a21r*b1i

        lfs b1i,e10im(b)
        fma c0i,a01r,b1i,c0i
        fma c1i,a11r,b1i,c1i
        fma c2i,a21r,b1i,c2i

####################
#  c0r -= a01i*b1i
#  c1r -= a11i*b1i
#  c2r -= a21i*b1i

        lfs a01i,e01im(a)
        lfs a11i,e11im(a)
        lfs a21i,e21im(a)
        fnms c0r,a01i,b1i,c0r
        fnms c1r,a11i,b1i,c1r
        fnms c2r,a21i,b1i,c2r

####################
#  c0i += a01i*b1r
#  c1i += a11i*b1r
#  c2i += a21i*b1r

        fma c0i,a01i,b1r,c0i
        fma c1i,a11i,b1r,c1i
        fma c2i,a21i,b1r,c2i

####################
#  c0r += a02r*b2r
#  c1r += a12r*b2r
#  c2r += a22r*b2r

# b2r lands in the register that held b0r; b0r is not used again in this column
        lfs b2r,e20re(b)
        lfs a02r,e02re(a)
        lfs a12r,e12re(a)
        lfs a22r,e22re(a)
        fma c0r,a02r,b2r,c0r
        fma c1r,a12r,b2r,c1r
        fma c2r,a22r,b2r,c2r

####################
#  c0i += a02r*b2i
#  c1i += a12r*b2i
#  c2i += a22r*b2i

        lfs b2i,e20im(b)
        fma c0i,a02r,b2i,c0i
        fma c1i,a12r,b2i,c1i
        fma c2i,a22r,b2i,c2i

####################
#  c0r -= a02i*b2i;
#  c1r -= a12i*b2i;
#  c2r -= a22i*b2i;

        lfs a02i,e02im(a)
        lfs a12i,e12im(a)
        lfs a22i,e22im(a)
        fnms c0r,a02i,b2i,c0r
        fnms c1r,a12i,b2i,c1r
        fnms c2r,a22i,b2i,c2r

####################
#  c0i += a02i*b2r;
#  c1i += a12i*b2r;
#  c2i += a22i*b2r;

        fma c0i,a02i,b2r,c0i
        fma c1i,a12i,b2r,c1i
        fma c2i,a22i,b2r,c2i

####################
#  Round and save result

        frsp c0r,c0r # Round to single precision
        frsp c1r,c1r # Round to single precision
        frsp c2r,c2r # Round to single precision
        frsp c0i,c0i # Round to single precision
        frsp c1i,c1i # Round to single precision
        frsp c2i,c2i # Round to single precision

# Column 0 of c is written here, before a is reloaded for column 1
        stfs c0r,e00re(c)
        stfs c1r,e10re(c)
        stfs c2r,e20re(c)
        stfs c0i,e00im(c)
        stfs c1i,e10im(c)
        stfs c2i,e20im(c)

######################################################################
#
#  Column j = 1 of product matrix
#
#  Same sequence as column 0, with b->e[k][1] as the b operands and
#  c->e[k][1] as the destinations.  All elements of a are loaded again.
#
######################################################################

#  c0r = a00r*b0r
#  c1r = a10r*b0r
#  c2r = a20r*b0r

        lfs b0r,e01re(b)
        lfs a00r,e00re(a)
        lfs a10r,e10re(a)
        lfs a20r,e20re(a)
        fm c0r,a00r,b0r
        fm c1r,a10r,b0r
        fm c2r,a20r,b0r

####################
#  c0i = a00r*b0i
#  c1i = a10r*b0i
#  c2i = a20r*b0i

        lfs b0i,e01im(b)
        fm c0i,a00r,b0i
        fm c1i,a10r,b0i
        fm c2i,a20r,b0i

####################
#  c0r -= a00i*b0i
#  c1r -= a10i*b0i
#  c2r -= a20i*b0i

        lfs a00i,e00im(a)
        lfs a10i,e10im(a)
        lfs a20i,e20im(a)
        fnms c0r,a00i,b0i,c0r
        fnms c1r,a10i,b0i,c1r
        fnms c2r,a20i,b0i,c2r

####################
#  c0i += a00i*b0r
#  c1i += a10i*b0r
#  c2i += a20i*b0r

        fma c0i,a00i,b0r,c0i
        fma c1i,a10i,b0r,c1i
        fma c2i,a20i,b0r,c2i

####################
#  c0r += a01r*b1r
#  c1r += a11r*b1r
#  c2r += a21r*b1r

        lfs b1r,e11re(b)
        lfs a01r,e01re(a)
        lfs a11r,e11re(a)
        lfs a21r,e21re(a)
        fma c0r,a01r,b1r,c0r
        fma c1r,a11r,b1r,c1r
        fma c2r,a21r,b1r,c2r

####################
#  c0i += a01r*b1i
#  c1i += a11r*b1i
#  c2i += a21r*b1i

        lfs b1i,e11im(b)
        fma c0i,a01r,b1i,c0i
        fma c1i,a11r,b1i,c1i
        fma c2i,a21r,b1i,c2i

####################
#  c0r -= a01i*b1i
#  c1r -= a11i*b1i
#  c2r -= a21i*b1i

        lfs a01i,e01im(a)
        lfs a11i,e11im(a)
        lfs a21i,e21im(a)
        fnms c0r,a01i,b1i,c0r
        fnms c1r,a11i,b1i,c1r
        fnms c2r,a21i,b1i,c2r

####################
#  c0i += a01i*b1r
#  c1i += a11i*b1r
#  c2i += a21i*b1r

        fma c0i,a01i,b1r,c0i
        fma c1i,a11i,b1r,c1i
        fma c2i,a21i,b1r,c2i

####################
#  c0r += a02r*b2r
#  c1r += a12r*b2r
#  c2r += a22r*b2r

        lfs b2r,e21re(b)
        lfs a02r,e02re(a)
        lfs a12r,e12re(a)
        lfs a22r,e22re(a)
        fma c0r,a02r,b2r,c0r
        fma c1r,a12r,b2r,c1r
        fma c2r,a22r,b2r,c2r

####################
#  c0i += a02r*b2i
#  c1i += a12r*b2i
#  c2i += a22r*b2i

        lfs b2i,e21im(b)
        fma c0i,a02r,b2i,c0i
        fma c1i,a12r,b2i,c1i
        fma c2i,a22r,b2i,c2i

####################
#  c0r -= a02i*b2i;
#  c1r -= a12i*b2i;
#  c2r -= a22i*b2i;

        lfs a02i,e02im(a)
        lfs a12i,e12im(a)
        lfs a22i,e22im(a)
        fnms c0r,a02i,b2i,c0r
        fnms c1r,a12i,b2i,c1r
        fnms c2r,a22i,b2i,c2r

####################
#  c0i += a02i*b2r;
#  c1i += a12i*b2r;
#  c2i += a22i*b2r;

        fma c0i,a02i,b2r,c0i
        fma c1i,a12i,b2r,c1i
        fma c2i,a22i,b2r,c2i

####################
#  Round and save result

        frsp c0r,c0r # Round to single precision
        frsp c1r,c1r # Round to single precision
        frsp c2r,c2r # Round to single precision
        frsp c0i,c0i # Round to single precision
        frsp c1i,c1i # Round to single precision
        frsp c2i,c2i # Round to single precision

# Column 1 of c is written here, before a is reloaded for column 2
        stfs c0r,e01re(c)
        stfs c1r,e11re(c)
        stfs c2r,e21re(c)
        stfs c0i,e01im(c)
        stfs c1i,e11im(c)
        stfs c2i,e21im(c)

######################################################################
#
#  Column j = 2 of product matrix
#
#  Same sequence again, with b->e[k][2] as the b operands and
#  c->e[k][2] as the destinations.
#
######################################################################

#  c0r = a00r*b0r
#  c1r = a10r*b0r
#  c2r = a20r*b0r

        lfs b0r,e02re(b)
        lfs a00r,e00re(a)
        lfs a10r,e10re(a)
        lfs a20r,e20re(a)
        fm c0r,a00r,b0r
        fm c1r,a10r,b0r
        fm c2r,a20r,b0r

####################
#  c0i = a00r*b0i
#  c1i = a10r*b0i
#  c2i = a20r*b0i

        lfs b0i,e02im(b)
        fm c0i,a00r,b0i
        fm c1i,a10r,b0i
        fm c2i,a20r,b0i

####################
#  c0r -= a00i*b0i
#  c1r -= a10i*b0i
#  c2r -= a20i*b0i

        lfs a00i,e00im(a)
        lfs a10i,e10im(a)
        lfs a20i,e20im(a)
        fnms c0r,a00i,b0i,c0r
        fnms c1r,a10i,b0i,c1r
        fnms c2r,a20i,b0i,c2r

####################
#  c0i += a00i*b0r
#  c1i += a10i*b0r
#  c2i += a20i*b0r

        fma c0i,a00i,b0r,c0i
        fma c1i,a10i,b0r,c1i
        fma c2i,a20i,b0r,c2i

####################
#  c0r += a01r*b1r
#  c1r += a11r*b1r
#  c2r += a21r*b1r

        lfs b1r,e12re(b)
        lfs a01r,e01re(a)
        lfs a11r,e11re(a)
        lfs a21r,e21re(a)
        fma c0r,a01r,b1r,c0r
        fma c1r,a11r,b1r,c1r
        fma c2r,a21r,b1r,c2r

####################
#  c0i += a01r*b1i
#  c1i += a11r*b1i
#  c2i += a21r*b1i

        lfs b1i,e12im(b)
        fma c0i,a01r,b1i,c0i
        fma c1i,a11r,b1i,c1i
        fma c2i,a21r,b1i,c2i

####################
#  c0r -= a01i*b1i
#  c1r -= a11i*b1i
#  c2r -= a21i*b1i

        lfs a01i,e01im(a)
        lfs a11i,e11im(a)
        lfs a21i,e21im(a)
        fnms c0r,a01i,b1i,c0r
        fnms c1r,a11i,b1i,c1r
        fnms c2r,a21i,b1i,c2r

####################
#  c0i += a01i*b1r
#  c1i += a11i*b1r
#  c2i += a21i*b1r

        fma c0i,a01i,b1r,c0i
        fma c1i,a11i,b1r,c1i
        fma c2i,a21i,b1r,c2i

####################
#  c0r += a02r*b2r
#  c1r += a12r*b2r
#  c2r += a22r*b2r

        lfs b2r,e22re(b)
        lfs a02r,e02re(a)
        lfs a12r,e12re(a)
        lfs a22r,e22re(a)
        fma c0r,a02r,b2r,c0r
        fma c1r,a12r,b2r,c1r
        fma c2r,a22r,b2r,c2r

####################
#  c0i += a02r*b2i
#  c1i += a12r*b2i
#  c2i += a22r*b2i

        lfs b2i,e22im(b)
        fma c0i,a02r,b2i,c0i
        fma c1i,a12r,b2i,c1i
        fma c2i,a22r,b2i,c2i

####################
#  c0r -= a02i*b2i;
#  c1r -= a12i*b2i;
#  c2r -= a22i*b2i;

        lfs a02i,e02im(a)
        lfs a12i,e12im(a)
        lfs a22i,e22im(a)
        fnms c0r,a02i,b2i,c0r
        fnms c1r,a12i,b2i,c1r
        fnms c2r,a22i,b2i,c2r

####################
#  c0i += a02i*b2r;
#  c1i += a12i*b2r;
#  c2i += a22i*b2r;

        fma c0i,a02i,b2r,c0i
        fma c1i,a12i,b2r,c1i
        fma c2i,a22i,b2r,c2i

####################
#  Round and save result

        frsp c0r,c0r # Round to single precision
        frsp c1r,c1r # Round to single precision
        frsp c2r,c2r # Round to single precision
        frsp c0i,c0i # Round to single precision
        frsp c1i,c1i # Round to single precision
        frsp c2i,c2i # Round to single precision

        stfs c0r,e02re(c)
        stfs c1r,e12re(c)
        stfs c2r,e22re(c)
        stfs c0i,e02im(c)
        stfs c1i,e12im(c)
        stfs c2i,e22im(c)

# Return
        br