#include #define PRECISION float #ifdef DEBUG extern show(); #endif /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ #ifdef __STDMPL__ void mpl_rec_smul( int trans, register plural PRECISION aps0, register plural PRECISION aps1, register plural PRECISION bps0, register plural PRECISION bps1, register plural PRECISION *c0, register plural PRECISION *c1, int m, int n, int p ) #else void mpl_rec_smul(trans, aps0, aps1, bps0, bps1, c0, c1, m, n, p) int trans; register plural PRECISION aps0, aps1, bps0, bps1; register plural PRECISION *c0, *c1; int m, n, p; #endif /* matrix multiplication on a 1:2 machine standard systolic algorithm (Cannon) Prereq: It is assumed that A and B have been preskewed by the mpl_rec_ps subroutines trans = 'n' or 'N' : no transpose for A or B - matrix A should have been preskewed north - matrix B should have been preskewed west trans = 't' or 'T' : both A and B are stored transposed - matrix A should have been preskewed west - matrix B should have been preskewed north This version has been tuned with: - loop unrolling to level 2 Comments: - Currently the multiplication loop always runs through nxproc cycles, although only p cycles is really necessary. But this would require wrap-around on the p*p corner block. Written by Erik Boman (erikb@ii.uib.no), July 1990 Last revised: Mar 92 INPUT PARAMETERS: trans : 'n' for no transpose, 't' if both A and B lie transposed aps0,aps1 : The A matrix (preskewed) bps0,bps1 : The B matrix (preskewed) *c0, *c1 : The C matrix (plain) m, n, p : A is m*p, B is p*n, C is m*n Warning: Dimensions are given in mpfortran style. Thus, m is the number of rows in the A matrix, but is the number of columns on the MasPar PE grid ! OUTPUT PARAMETERS: *c0, *c1 : C = C + A*B */ { register plural PRECISION cc0,cc1,tmp; register int i, ncycles; #ifdef DEBUG { plural double tmpa = aps0, tmpb =bps0, tmpc = *c0; show(&tmpa,nyproc,nxproc,"Aps in mult"); show(&tmpb,nyproc,nxproc,"Bps in mult"); show(&tmpc,nyproc,nxproc,"C before mult"); } #endif /* do ncycles iterations */ ncycles = nxproc; cc0 = 0.0; cc1 = 0.0; /* unrolling the loop to level 2 in order to avoid moving b elements from registers to registers */ if ((trans == 'n') || (trans == 'N')){ for (i=0; i<(ncycles>>1); i++){ cc0 += aps0*bps0; cc1 += aps1*bps1; xnetN[1].aps0 = aps0; xnetW[1].bps0 = bps0; xnetW[1].bps1 = bps1; cc0 += aps1*bps0; cc1 += aps0*bps1; xnetN[1].aps1 = aps1; xnetW[1].bps0 = bps0; xnetW[1].bps1 = bps1; } /* if odd #cycles, remember to do the last iteration */ if (ncycles &1){ cc0 += aps0*bps0; cc1 += aps1*bps1; tmp = aps0; aps0 = aps1; xnetN[1].aps1 = tmp; xnetW[1].bps0 = bps0; xnetW[1].bps1 = bps1; } } else { /* trans == 't' || trans == 'T' */ for (i=0; i<(ncycles>>1); i++){ cc0 += aps0*bps0; cc1 += aps1*bps1; xnetW[1].aps0 = aps0; xnetW[1].aps1 = aps1; xnetN[1].bps0 = bps0; cc0 += aps0*bps1; cc1 += aps1*bps0; xnetW[1].aps0 = aps0; xnetW[1].aps1 = aps1; xnetN[1].bps1 = bps1; } /* if odd #cycles, remember to do the last iteration */ if (ncycles &1){ cc0 += aps0*bps0; cc1 += aps1*bps1; xnetW[1].aps0 = aps0; xnetW[1].aps1 = aps1; tmp = bps0; bps0 = bps1; xnetN[1].bps1 = tmp; } } /* Add A*B to C. Mask out all processors outside the upper left m*n corner. This is important as other processors may have garbage in this memory location. */ if (ixproc