#include #include #include #include "mpl_blas.h" #include /* For test purposes only*/ #define PRECISION float /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ /* The following routine performs lower triangular solve with multiple * right hand sides using blocking on the Maspar MP-1. * Each block is square and has sides == nxproc. On rectangular machines * it will therefore use two grids per block. * The routine performs triangular solves on the diagonal blocks in the * A matrix and the B-blocks level to them, and updates the B-blocks below * using matrix multiplication. * The routine expects data to be stored in the Fortran 90 fashion with * cut and stack. * * Last bug fix: July 6. -92, TS * */ #ifdef __STDMPL__ void mpl_strsm_lon( int int_diag, int m, int n, plural PRECISION *a, int lda, plural PRECISION *b ) #else void mpl_strsm_lon(int_diag,m,n,a,lda,b) int int_diag, m,n; plural PRECISION *a; int lda; plural PRECISION *b; #endif { char diag; /* * Local variables */ plural PRECISION *b0_ps,*b1_ps,a0_ps,a1_ps; register int nx=nxproc,ny=nyproc,nbxA,nbyA,nbxB,nbyB,i,j,k,bn,am,t; register plural int ix=ixproc,iy=iyproc; int l; if (int_diag) diag = 'u'; else diag = 'n'; /* * NB ! Below it is meant by "number of blocks", the number of grids * used to store the arrays. The following four variables that appear * on the lhs's are used to keep track of the stacking of the A and B * matrices on the dpu. */ nbxA = NBX(m); /* Number of blocks in row direction on A matrix */ nbyA = NBY(m); /* Number of blocks in column direction on A matrix */ nbxB = NBX(m); /* Number of blocks in row direction on B matrix */ nbyB = NBY(n); /* Number of blocks in column direction on B matrix */ /* Variables b0_ps and b1_ps are temporaries used to hold preskewed blocks * of the B matrix to be used in matrix multiplications. They are arrays * and hold multiple blocks. Each block is divided into two parts such that * the first half is stored in b0_ps[k] and the second is stored b1_ps[k]. * The same goes for the variables a0_ps and a1_ps, except that they are not * arrays, and they are used for the A matrix blocks. */ b0_ps = (plural PRECISION *) p_malloc(((nbyB+1)/2)*sizeof(PRECISION)); /* initializing boundary blocks to zero on processors off bounds */ bn = n&(ny-1); am = m&(nx-1); if ((ix >= am)&&(am>0)){ for (i=1;i<=nbyB; i++) b[i*nbxB-1] = 0.0; } if ((iy >= bn)&&(bn>0)) { for (i=0;i