#include #include #include #include "mpl_blas.h" #define PRECISION float #ifdef DEBUG extern show(); #endif /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ #ifdef __STDMPL__ void mpl_sgemm1 (int m, int n, int p, PRECISION alpha, plural PRECISION *a, plural PRECISION *b, plural PRECISION *c) #else void mpl_sgemm1 (m, n, p, alpha, a, b, c) int m, n, p; PRECISION alpha; plural PRECISION *a,*b,*c; #endif /* ************************************************************************ *** Incoming Arguments: *** *** m integer a is m X p *** n integer b is p X n *** p integer c is m X n *** alpha float scalar multiplying AB *** a plural float, array first matrix factor -- its address *** b plural float, array second matrix factor -- its address *** c plural float, array matrix added to -- its address ************************************************************************ *** Outgoing Arguments: *** *** c plural float, array alpha * a * b + c *** ************************************************************************ BLAS 3 general matrix matrix multiply on a MasPar. Based on the standard systolic algorithm (Cannon 69) Comments on this version : - poor performance on rest-blocks (non-multiples of nxproc) - too many function calls - register use should be optimized *********************************************************************** ** Implemented by : Erik Boman *** ** Implementation date: 15 Oct 90 *** ** Latest update : 20 Mar 92 *** *********************************************************************** */ { /* local variables */ int nbp,nbm,nbn; plural PRECISION bps0,bps1; plural PRECISION aa0,aa1; register plural PRECISION *aps0,*aps1; register int i,j,k; register int nx=nxproc; int restp; /* test whether you are running on a square or a rectangular machine */ if (SQUARE){ /* square processor grid */ #ifdef DEBUG printf("In mpl_sgemm1: square MasPar\n"); { plural float debug; debug = a[0]; show(&debug,nyproc,nxproc,"A"); debug = b[0]; show(&debug,nyproc,nxproc,"B"); } #endif if ( m <= nx && n <= nx && p <= nx){ /* only one block */ /* * Form C := alpha*A*B + C */ /* preskew A */ mpl_sq_spsN( *a, &aa0, p, m); /* preskew B */ mpl_sq_spsW( *b, &bps0, n, p); /* scale with alpha */ if (alpha != one){ bps0 *= alpha; } /* C += A*B */ mpl_sq_smul( 'n', aa0, bps0, c, m, n, p); } else { /* compute number of nxproc by nxproc blocks */ nbm = NBX(m); nbp = NBX(p); nbn = NBX(n); /* Allocate storage for the preskewed B-blocks */ if (! (aps0 = (plural PRECISION *)p_malloc(nbm*sizeof(PRECISION)))){ printf("Error in dgemm: Not enough pmem\n"); exit(1); } /* Do the block multiplication in k,j,i order. */ /* * Form C := alpha*A*B + C */ for (k=0; k < nbp; k++){ /* preskew blocks A(:,k) */ restp = MIN(nx,p-k*nx); for (i=0; i