#include <mpl.h>
#include "mpl_blas.h"
#define PRECISION float

/***************************************************************************
*                                                                          *
*   DATA PARALLEL BLAS based on MPL                                        *
*                                                                          *
*   Internal routine, this routine is not supposed to be                   *
*                     called by user programs.                             *
*                                                                          *
*   Version 1.0   1/4-92 ,                                                 *
*   For MasPar MP-1 computers                                              *
*                                                                          *
*   para//ab, University of Bergen, NORWAY                                 *
*                                                                          *
*   The calling sequence may be changed in a future version.               *
*   Please report any BUGs, ideas for improvement or other                 *
*   comments to                                                            *
*                    adm@parallab.uib.no                                   *
*                                                                          *
*   Future versions may then reflect your suggestions.                     *
*   The most current version of this software is available                 *
*   from netlib@nac.no , send the message `send index from maspar'         *
*                                                                          *
*   REVISIONS:                                                             *
*                                                                          *
***************************************************************************/

/* The following routine performs lower triangular solve with multiple
 * right hand sides on m*m A-matrices and m*n B matrices with
 * (1<= m <= nxproc) and (1<= n <= nxproc).  Here the A matrix is the
 * triangle, while the B matrix represents the collection of rhs's
 * arranged columnwise.
 * After execution the B matrix is overwritten by the answers.
 * This implementation is based on the standard column oriented forward
 * substitution algorithm.
 * The systems are solved concurrently. This is done by spreading the
 * current column of the A matrix to all columns on the PE grid, so
 * that it can be used in processing each B column. That is, each 
 * B column has a copy of the current A column aligned with it.
 * This routine applies to rectangular machines only, and the data
 * storage is as follows:
 *
 * If m <= nyproc, then each PE in the upper left m*m square of the dpu
 * contains one element each of the A matrix, and the upper left m*n
 * rectangle contains one element each of the B matrix, in the variables
 * a0 and b0 respectively.
 * If m > nyproc, the variables a1 and b1 are used to contain the rest of
 * the matrices. This is done such that the (nyproc+1)'th row of matrices
 * A and B is stored in the first row of PE's, the (nyproc+2)'th row in
 * the second row, and so on. Some PE's will then contain two matrix
 * elements.
 * Virtually this then looks like this (A matrix):
 *
 *        _____________________
 *         * * ... * * ...* * * 
 *         0 * * ..* *....... * 
 *         : :0 *  * :        * 
 *         : :..:0.* * ....   * 
 *        --------------------- 
 *         0 ......0 0 * ...* * 
 *         0 ......0     *    * 
 *         0 ......0 0 ..0. *   
 *         0 ......0 00000..0 * 
 *        ---------------------
 *
 * It appears transposed because that's the way MPF stores matrices.
 *
 *  Last bug fix: July 3. -92, TS
 *
 */
#ifdef __STDMPL__

void mpl_stri_sln(register int diag,
                  register int m,
                  register int n,
                  register plural PRECISION a0,
                  plural PRECISION          *b0)
#else

void mpl_stri_sln(diag,m,n,a0,b0)

register int          diag;
register int           m,n;
register plural PRECISION a0;
plural PRECISION          *b0;
#endif
{
 /*
  * ix,iy,nx, and ny stores ixproc,iyproc.. in registers to increase
  * performance
  */
 register int             i,nx=nxproc,ny=nyproc;
 register plural int      ix=ixproc,iy=iyproc;
 register plural PRECISION   b0_wrk,temp0,a_wrk;

 b0_wrk = *b0;
 /*
  * This branch is executed if the A matrix is exactly
  * nxproc * nxproc large
  */
 if (m == nx)
   {
    if ((diag != 'u') && (diag != 'U')) /* Scale A matrix by its diagonal */
      {
       if (ix == iy)
	 xnetcE[nx].temp0 = a0;
       if (ix>iy)
	 a0 /= temp0 ;
      }                    /* Done scaling */


    /*
     * Loop performing column-update on the righthandsides
     */
    for (i =0; i<ny; i++)
      {
       if (iy == i)      /* Spread i'th A-column to all other columns */
	 xnetcS[ny].a_wrk = a0;

      /* Updating entries below row i */ 

       if (ix == i)
	 {
	  xnetcE[nx].temp0 = b0_wrk;
	 }
       if (ix > i)
         {
	  b0_wrk -= a_wrk * temp0;
         }
      }

       /* scale result by diagonal */
    if ((diag != 'u') && (diag != 'U'))
      {
       if (ix == iy)
	 xnetcS[ny].temp0 = a0;
       b0_wrk /= temp0;
      }
   }
 else /* if (m != nx)   **********************************/

 /*
  * This branch is executed for problems where the A matrix
  * is less than nxproc
  */
   {
    if ((diag != 'u') && (diag != 'U')) /* Scale A matrix by its diagonal */
      {
       if (ix == iy)
	 xnetcE[nx].temp0 = a0;
       if ((ix>iy) && (ix<m))
	 a0 /= temp0 ;
      }                    /* Done scaling */


    /*
     * Loop performing column-update on the righthandsides
     */
    for (i =0; (i<ny && i<m); i++)
      {
       if (iy == i)      /* Spread i'th A-column to all other columns */
	 xnetcS[ny].a_wrk = a0;

      /* Updating entries below row i */ 

       if (ix == i)
	 {
	  xnetcE[nx].temp0 = b0_wrk;
	 }
       if ((ix > i) && (ix < m))
         {
	  b0_wrk -= a_wrk * temp0;
         }
      }


      /* scale result by diagonal */
    if ((diag != 'u') && (diag != 'U'))
      {
       if (ix == iy)
	 xnetcS[ny].temp0 = a0;
       if (ix < m)
	 {
	  b0_wrk /= temp0;
	 }
      }
   }
 if (ix < m) 
   {
    if (iy < n) 
      *b0 =b0_wrk;
   }
}