// mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, // su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c) // c <- A[0]*B0+A[1]*B1+A[2]*B2+A[3]*B3 // file m_mv_s_4dir.m4, i860 assembler version of m_mv_s_4dir.c // // Register usage: // f8,f9 = b[0] // f10,f11 = b[1] // f12,f13 = b[2] // f14,f15 = a[1][0] // f16,f17 = a[1][1] // f18,f19 = a[1][2] // f20,f21 = c[0] // f22,f23 = c[1] // f24,f25 = c[2] // f26,f27 = a[0][0] and a[2][0] // f28,f29 = a[0][1] and a[2][1] // f30,f31 = a[0][2] and a[2][2] define(A,r16) // address of vector of su3_matrices define(B0,r17) // address of vector to be multiplied define(B1,r18) // address of vector to be multiplied define(B2,r19) // address of vector to be multiplied define(B3,r20) // address of vector to be multiplied define(C,r21) // address of result define(B,r22) // temporary address of vector define(INC,r23) // increment for loop define(COUNT,r24) // loop counter define(b0,f8) // complex number = register pair define(b0r,f8) // real part define(b0i,f9) // imag part define(b1,f10) define(b1r,f10) define(b1i,f11) define(b2,f12) define(b2r,f12) define(b2i,f13) define(c0,f20) define(c0r,f20) define(c0i,f21) define(c1,f22) define(c1r,f22) define(c1i,f23) define(c2,f24) define(c2r,f24) define(c2i,f25) define(a00,f26) define(a00r,f26) define(a00i,f27) define(a01,f28) define(a01r,f28) define(a01i,f29) define(a02,f30) define(a02r,f30) define(a02i,f31) define(a10,f14) define(a10r,f14) define(a10i,f15) define(a11,f16) define(a11r,f16) define(a11i,f17) define(a12,f18) define(a12r,f18) define(a12i,f19) define(a20,f26) define(a20r,f26) define(a20i,f27) define(a21,f28) define(a21r,f28) define(a21i,f29) define(a22,f30) define(a22r,f30) define(a22i,f31) // First accumulate c0 real and imaginary parts and c1 real part, // then c2 real and imaginary and c1.imag .text //First vector .align 8 _mult_su3_mat_vec_sum_4dir: .align 8 // start dual mode, start fetching // enter zeroes into pipeline d.pfadd.ss f0,f0,f0; fld.d 0(A),a00 d.pfadd.ss f0,f0,f0; fld.d 0(B0),b0 d.pfadd.ss f0,f0,f0; fld.d 24(A),a10 // start zero'th column of A times B0 down pipeline d.pfmul.ss a00r,b0r,f0; nop d.pfmul.ss a00r,b0i,f0; fld.d 8(B0),b1 d.pfmul.ss a10r,b0r,f0; nop d.m12apm.ss a00i,b0i,f0; fld.d 8(A),a01 d.m12apm.ss a00i,b0r,f0; nop d.m12apm.ss a10i,b0i,f0; fld.d 32(A),a11 // first column of A d.m12asm.ss a01r,b1r,f0; adds -1,r0,INC d.m12apm.ss a01r,b1i,f0; fld.d 16(A),a02 d.m12asm.ss a11r,b1r,f0; or 2,r0,COUNT d.m12apm.ss a01i,b1i,f0; fld.d 16(B0),b2 d.m12apm.ss a01i,b1r,f0; bla INC,COUNT,DUMMY d.m12apm.ss a11i,b1i,f0; fld.d 40(A),a12 // second column of A DUMMY: d.m12asm.ss a02r,b2r,f0; nop d.m12apm.ss a02r,b2i,f0; fld.d 48(A),a20 d.m12asm.ss a12r,b2r,f0; nop d.m12apm.ss a02i,b2i,f0; fld.d 56(A),a21 m12apm.ss a02i,b2r,f0; nop m12apm.ss a12i,b2i,f0; fld.d 64(A),a22 // start multiplies for second half, where we accumulate c[2] // real and imaginary and c[1] imaginary. Sums from first half // still going through adder. m12asm.ss a20r,b0r,f0 m12apm.ss a20r,b0i,f0 m12asm.ss a10r,b0i,f0 // Now enter zeroes into adder pipe, while results from first // half are coming out. pfadd.ss f0,f0,c0r pfadd.ss f0,f0,c0i pfadd.ss f0,f0,c1r // continue with multiplies in second half, column 0 of A LOOP: m12apm.ss a20i,b0i,f0 m12apm.ss a20i,b0r,f0 m12apm.ss a10i,b0r,f0 // Row 1 of A m12asm.ss a21r,b1r,f0 m12apm.ss a21r,b1i,f0 m12apm.ss a11r,b1i,f0 m12apm.ss a21i,b1i,f0 .align 8 d.m12apm.ss a21i,b1r,f0 d.m12apm.ss a11i,b1r,f0 // Row 2 of A, start fetching for next vector d.m12asm.ss a22r,b2r,f0; adds 72,A,A d.m12apm.ss a22r,b2i,f0; fld.d 0(A),a00 d.m12apm.ss a12r,b2i,f0; mov B1,B d.m12apm.ss a22i,b2i,f0; fld.d 0(B),b0 d.m12apm.ss a22i,b2r,f0; mov B2,B1 d.m12apm.ss a12i,b2r,f0; fld.d 24(A),a10 // Next vector // First accumulate c0 real and imaginary parts and c1 real part, // then c2 real and imaginary and c1.imag // start zero'th column of A times B down pipeline // previous vector still going into adder d.m12asm.ss a00r,b0r,f0; mov B3,B2 d.m12apm.ss a00r,b0i,f0; fld.d 8(B),b1 d.m12apm.ss a10r,b0r,f0; nop // enter C into pipeline, results from last vector coming out d.pfadd.ss f0,c0r,c2r; nop d.pfadd.ss f0,c0i,c2i; nop d.pfadd.ss f0,c1r,c1i; nop // continue with multiplications for new vector d.m12apm.ss a00i,b0i,f0; fld.d 8(A),a01 d.m12apm.ss a00i,b0r,f0; nop d.m12apm.ss a10i,b0i,f0; fld.d 32(A),a11 // first column of A d.m12asm.ss a01r,b1r,f0; nop d.m12apm.ss a01r,b1i,f0; fld.d 16(A),a02 d.m12asm.ss a11r,b1r,f0; nop d.m12apm.ss a01i,b1i,f0; fld.d 16(B),b2 d.m12apm.ss a01i,b1r,f0; nop d.m12apm.ss a11i,b1i,f0; fld.d 40(A),a12 // second column of A d.m12asm.ss a02r,b2r,f0; nop d.m12apm.ss a02r,b2i,f0; fld.d 48(A),a20 d.m12asm.ss a12r,b2r,f0; nop d.m12apm.ss a02i,b2i,f0; fld.d 56(A),a21 d.m12apm.ss a02i,b2r,f0; nop d.m12apm.ss a12i,b2i,f0; nop // start multiplies for second half, where we accumulate c[2] // real and imaginary and c[1] imaginary. Sums from first half // still going through adder. // end dual mode m12asm.ss a20r,b0r,f0; nop m12apm.ss a20r,b0i,f0; fld.d 64(A),a22 m12asm.ss a10r,b0i,f0 // Now enter C into adder pipe, while results from first // half are coming out. pfadd.ss f0,c2r,c0r pfadd.ss f0,c2i,c0i bla INC,COUNT,LOOP pfadd.ss f0,c1i,c1r // go to LOOP from here // continue with multiplies in second half, column 0 of A m12apm.ss a20i,b0i,f0 m12apm.ss a20i,b0r,f0 m12apm.ss a10i,b0r,f0 // Row 1 of A m12asm.ss a21r,b1r,f0 m12apm.ss a21r,b1i,f0 m12apm.ss a11r,b1i,f0 m12apm.ss a21i,b1i,f0 m12apm.ss a21i,b1r,f0 m12apm.ss a11i,b1r,f0 // Row 2 of A m12asm.ss a22r,b2r,f0 m12apm.ss a22r,b2i,f0 m12apm.ss a12r,b2i,f0 m12apm.ss a22i,b2i,f0 m12apm.ss a22i,b2r,f0 m12apm.ss a12i,b2r,f0 // Empty multiplier pipe m12asm.ss f0,f0,f0 m12apm.ss f0,f0,f0 .align 8 d.m12apm.ss f0,f0,f0 // empty adder pipe, store results d.pfadd.ss f0,f0,c2r pfadd.ss f0,f0,c2i; fst.d c0,0(C) pfadd.ss f0,f0,c1i; fst.d c2,16(C) bri r1 fst.d c1,8(C) .globl _mult_su3_mat_vec_sum_4dir