#include #define PRECISION float #ifdef DEBUG extern show(); #endif /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ #ifdef __STDMPL__ void mpl_sq_smul( int trans, plural PRECISION aps_in, plural PRECISION bps_in, plural PRECISION *c, int m, int n , int p ) #else void mpl_sq_smul(trans, aps_in, bps_in, c, m, n, p) int trans; plural PRECISION aps_in,bps_in; plural PRECISION *c; int m, n, p; #endif /* matrix multiplication on a square machine standard systolic algorithm (Cannon) Prereq: It is assumed that A and B have been preskewed by the mpl_sq_ps subroutines trans = 'n' or 'N' : no transpose for A or B - matrix A should have been preskewed north - matrix B should have been preskewed west trans = 't' or 'T' : both A and B are stored transposed - matrix A should have been preskewed west - matrix B should have been preskewed north Comments: - Currently the multiplication loop always runs through nxproc cycles, although only p cycles is really necessary. But this would require wrap-around on the p*p corner block. INPUT PARAMETERS: trans : 'n' for no transpose, 't' if both A and B lie transposed aps : The A matrix (preskewed) bps : The B matrix (preskewed) *c : The C matrix (plain) m, n, p : A is m*p, B is p*n, C is m*n OUTPUT PARAMETERS: *c : C = C + A*B */ { register plural PRECISION cc; register int i, ncycles; register plural PRECISION aps=aps_in, bps=bps_in; #ifdef DEBUG { plural double tmpa = aps, tmpb =bps, tmpc= *c; show(&tmpa,nyproc,nxproc,"Aps in mult"); show(&tmpb,nyproc,nxproc,"Bps in mult"); show(&tmpc,nyproc,nxproc,"C before mult"); } #endif /* do ncycles iterations */ ncycles = nxproc; cc = 0.0; if ((trans == 'n') || (trans == 'N')){ for (i=0; i< ncycles; i++){ cc += aps*bps; xnetN[1].aps = aps; xnetW[1].bps = bps; } } else { for (i=0; i< ncycles; i++){ cc += aps*bps; xnetW[1].aps = aps; xnetN[1].bps = bps; } } /* Add A*B to C. Mask out all processors outside the upper left m*n corner. This is important as other processors may have garbage in this memory location. */ if ((ixproc