#include <mpl.h> 
#define PRECISION float

/***************************************************************************
*                                                                          *
*   DATA PARALLEL BLAS based on MPL                                        *
*                                                                          *
*   Internal routine, this routine is not supposed to be                   *
*                     called by user programs.                             *
*                                                                          *
*   Version 1.0   1/4-92 ,                                                 *
*   For MasPar MP-1 computers                                              *
*                                                                          *
*   para//ab, University of Bergen, NORWAY                                 *
*                                                                          *
*   The calling sequence may be changed in a future version.               *
*   Please report any BUGs, ideas for improvement or other                 *
*   comments to                                                            *
*                    adm@parallab.uib.no                                   *
*                                                                          *
*   Future versions may then reflect your suggestions.                     *
*   The most current version of this software is available                 *
*   from netlib@nac.no , send the message `send index from maspar'         *
*                                                                          *
*   REVISIONS:                                                             *
*                                                                          *
***************************************************************************/

/* An upper triangularsolver for square  Matrix 
 on a Square  MasPar MP-1. 
 
The physical lay-out on the
processor array, transposes the matrix.

The algorithm is the straightforward column update algorithm,
(but remember: Matrix columns = Physical rows of processors,
Thus we update data on rows of processors)

The two parts are stack upon each other. Thus
l(i,j) is a resident of processor (j,i(mod 64)).

The elements of b, b(i), rest on (0,j) and travels downwards
as the update take place. When they arrive at the diagonal
the division takes place and they take the value of the unknown
(x), which is return in the final column.

version2: Here we actually compute LDx = b; where L is lower 
unit triangular. We then don't have to do the division of the pivot
in n step, but in one single in the beginning. Since division is slow
on MasPar we expect this to give great improvement in the performence


*/


#ifdef __STDMPL__
void mpl_sussolve(
	register int diag,
	register int m,
	register plural PRECISION a0,
	plural PRECISION *b0
	)
#else
void mpl_sussolve(diag,m,a0,b0)
	register int diag;
	register int m;
	register plural PRECISION a0;
	plural PRECISION *b0;
#endif

{
register int i,j,nx= nxproc,ny= nyproc,iter,mmod;
register plural int ix = ixproc, iy = iyproc;
register plural PRECISION btemp,temp;

/* Putting the righthand side into correct starting position */
mmod = m%ny-1;
if (mmod>=0)
	{if (iy== mmod ) btemp = xnetpN[mmod].*b0; else btemp = 0.0;}
else
 	{if (iy== ny-1 ) btemp = xnetS[1].*b0; else btemp = 0.0;}


/* scaling by the diagonal */
if (ix == iy) xnetcW[m].temp = a0;
if (ix < iy && iy < m) a0 = a0/temp;
   
for (i=m-1; i>=0; i--) {
   if (iy == i) {
	if(ix ==i) xnetcW[i].temp = btemp; 
   	if (ix < i) {
		btemp -= a0*temp;
		xnetN[1].btemp = btemp;
	}
   }
}

/* if scaled, scale the results, spread it out and put into the pointer */

temp = btemp;
if (diag != 'U' && diag != 'u')	if ((ix == iy)&&(iy<m)) btemp = btemp/a0;
if (ix == iy) xnetcW[nx-1].btemp = btemp;
*b0 = btemp;

}