#include <mpl.h>
#include "mpl_blas.h"
#define PRECISION float

/***************************************************************************
*                                                                          *
*   DATA PARALLEL BLAS based on MPL                                        *
*                                                                          *
*   Internal routine, this routine is not supposed to be                   *
*                     called by user programs.                             *
*                                                                          *
*   Version 1.0   1/4-92 ,                                                 *
*   For MasPar MP-1 computers                                              *
*                                                                          *
*   para//ab, University of Bergen, NORWAY                                 *
*                                                                          *
*   The calling sequence may be changed in a future version.               *
*   Please report any BUGs, ideas for improvement or other                 *
*   comments to                                                            *
*                    adm@parallab.uib.no                                   *
*                                                                          *
*   Future versions may then reflect your suggestions.                     *
*   The most current version of this software is available                 *
*   from netlib@nac.no , send the message `send index from maspar'         *
*                                                                          *
*   REVISIONS:                                                             *
*                                                                          *
***************************************************************************/

/* The following routine performs upper triangular solve with multiple
 * right hand sides on m*m A-matrices and m*n B matrices with
 * (1<= m <= nxproc) and (1<= n <= nxproc).
 * Here the A matrix is the triangle, while the B matrix represents the
 * collection of rhs's arranged columnwise.
 * After execution the B matrix is overwritten by the answers.
 * This implementation is based on the standard column oriented back
 * substitution algorithm. The systems are solved concurrently. This is
 * done by spreading the appropriate column of the A matrix to all
 * columns on the PE grid, so that it can be used in processing each
 * B vector.
 * This routine applies to square machines only, and the data storage
 * is as follows:
 * The m uppermost and m leftmost PE's contains one element each of the
 * A matrix in the plural variable a, while m uppermost and n leftmost
 * PE's contains the B matrix in the plural variable b.  Both variables
 * are parameters to the routine, and the matrices are expected to come
 * transposed.
 *
 * Virtually it then looks like this (A matrix):
 *
 *
 *
 *       | *  0  0  0  0  0  0 . . .|
 *       | *  *  0  0  0  0  0      |
 *   a   | *  *  *  0  0  0  0      |
 *       | *  *  *  *  0  0  0      |  nyproc
 *       | *  *  *  *  *  0  0   ?  |
 *       | *  *  *  *  *  *  0      |
 *       | *  *  *  *  *  *  *      |
 *       | .                   .    |
 *       | .        ?            .  |
 *       | .                       .|
 *
 *       <---------  nxproc -------->
 *
 *  Last bug fix: July 6. -92, TS
 *
 */

#ifdef __STDMPL__

void mpl_stri_sun(register int diag,
                  register int m,
                  register int n,
                  register plural PRECISION a,
                  plural PRECISION          *b)
#else

void mpl_stri_sun(diag,m,n,a,b)

register int          diag;
register int           m,n;
register plural PRECISION a;
plural PRECISION          *b;
#endif

{
 register int             i,nx=nxproc,ny=nyproc;
 register plural int      ix=ixproc,iy=iyproc;
 plural PRECISION             b_wrk,temp,a_wrk;

 b_wrk = *b;
 /*
  * This branch is executed if the A matrix is exactly
  * nxproc * nxproc large
  */
 if (m == nx)
   {
    if ((diag != 'u') && (diag != 'U'))
      {                  /* Scaling A-matrix by its diagonal */
       if (ix == iy)
	 xnetcW[nx].temp = a;
       if (ix<iy)
	 {
	  a /= temp;
         }
      }
            /* Done scaling */


    /*
     * Loop performing column-update on the righthandsides
     */
    for (i =ny-1;i>=0; i--)
      {
       /* Spreading (i+nyproc)'th A-column to all other columns */

       if (iy == i)
	 xnetcN[ny].a_wrk = a;

       /* Updating entries above row i */

       if (ix == i)
	 xnetcW[nx].temp = b_wrk;
       if (ix < i)
	 b_wrk -= a_wrk * temp;
      }

    if ((diag != 'u') && (diag != 'U')) /* scaling result by diagonal */
      {
       if (ix == iy)
	 xnetcS[ny].temp = a;
       b_wrk /= temp;
      }
   }


 else /* if (m != nx) *****************************************/

  /*
   * This branch is executed for problems where the A matrix
   * is less than nxproc * nxproc
   */

   {
    if ((diag != 'u') && (diag != 'U'))
      {                      /* Scaling A matrix by its diagonal */
       if (ix == iy)
	 xnetcW[nx].temp = a;
       if ((ix<iy) && (iy < m))
	 a /= temp;
      }
			    /* Done scaling */


    /*
     * Loop performing column-update on the righthandsides
     */
      for (i=m-1;i>=0;i--)
	{
	 /* Spreading i'th A-column to all other columns */

	 if (iy == i)
	   xnetcN[ny].a_wrk = a;

         /* Updating entries below row i */

	 if (ix == i)
	   xnetcW[nx].temp = b_wrk;
	 if (ix < i)
	    b_wrk -= a_wrk * temp;
	}


    if ((diag != 'u') && (diag != 'U'))
      {                      /* scaling result by diagonal */
       if (ix == iy)
	 xnetcS[ny].temp = a;
       if (ix < m)
	 b_wrk /= temp;
      }
   }
 if ((iy < n) && (ix < m)) 
   *b =b_wrk;
}