#include #include "mpl_blas.h" #define PRECISION float /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ /* The following routine performs lower triangular solve with multiple * right hand sides on m*m A-matrices and m*n B matrices with * (1<= m <= nxproc) and (1<= n <= nxproc). Here the A matrix is the * triangle, while the B matrix represents the collection of rhs's * arranged columnwise. * After execution the B matrix is overwritten by the answers. * This implementation is based on the standard column oriented forward * substitution algorithm. * The systems are solved concurrently. This is done by spreading the * current column of the A matrix to all columns on the PE grid, so * that it can be used in processing each B column. That is, each * B column has a copy of the current A column aligned with it. * This routine applies to rectangular machines only, and the data * storage is as follows: * * If m <= nyproc, then each PE in the upper left m*m square of the dpu * contains one element each of the A matrix, and the upper left m*n * rectangle contains one element each of the B matrix, in the variables * a0 and b0 respectively. * If m > nyproc, the variables a1 and b1 are used to contain the rest of * the matrices. This is done such that the (nyproc+1)'th row of matrices * A and B is stored in the first row of PE's, the (nyproc+2)'th row in * the second row, and so on. Some PE's will then contain two matrix * elements. * Virtually this then looks like this (A matrix): * * _____________________ * * * ... * * ...* * * * 0 * * ..* *....... * * : :0 * * : * * : :..:0.* * .... * * --------------------- * 0 ......0 0 * ...* * * 0 ......0 * * * 0 ......0 0 ..0. * * 0 ......0 00000..0 * * --------------------- * * It appears transposed because that's the way MPF stores matrices. * * Last bug fix: July 3. -92, TS * */ #ifdef __STDMPL__ void mpl_stri_sln(register int diag, register int m, register int n, register plural PRECISION a0, plural PRECISION *b0) #else void mpl_stri_sln(diag,m,n,a0,b0) register int diag; register int m,n; register plural PRECISION a0; plural PRECISION *b0; #endif { /* * ix,iy,nx, and ny stores ixproc,iyproc.. in registers to increase * performance */ register int i,nx=nxproc,ny=nyproc; register plural int ix=ixproc,iy=iyproc; register plural PRECISION b0_wrk,temp0,a_wrk; b0_wrk = *b0; /* * This branch is executed if the A matrix is exactly * nxproc * nxproc large */ if (m == nx) { if ((diag != 'u') && (diag != 'U')) /* Scale A matrix by its diagonal */ { if (ix == iy) xnetcE[nx].temp0 = a0; if (ix>iy) a0 /= temp0 ; } /* Done scaling */ /* * Loop performing column-update on the righthandsides */ for (i =0; i i) { b0_wrk -= a_wrk * temp0; } } /* scale result by diagonal */ if ((diag != 'u') && (diag != 'U')) { if (ix == iy) xnetcS[ny].temp0 = a0; b0_wrk /= temp0; } } else /* if (m != nx) **********************************/ /* * This branch is executed for problems where the A matrix * is less than nxproc */ { if ((diag != 'u') && (diag != 'U')) /* Scale A matrix by its diagonal */ { if (ix == iy) xnetcE[nx].temp0 = a0; if ((ix>iy) && (ix i) && (ix < m)) { b0_wrk -= a_wrk * temp0; } } /* scale result by diagonal */ if ((diag != 'u') && (diag != 'U')) { if (ix == iy) xnetcS[ny].temp0 = a0; if (ix < m) { b0_wrk /= temp0; } } } if (ix < m) { if (iy < n) *b0 =b0_wrk; } }