#include <mpl.h>
#include "mpl_blas.h"
#define PRECISION float

/***************************************************************************
*                                                                          *
*   DATA PARALLEL BLAS based on MPL                                        *
*                                                                          *
*   Internal routine, this routine is not supposed to be                   *
*                     called by user programs.                             *
*                                                                          *
*   Version 1.0   1/4-92 ,                                                 *
*   For MasPar MP-1 computers                                              *
*                                                                          *
*   para//ab, University of Bergen, NORWAY                                 *
*                                                                          *
*   The calling sequence may be changed in a future version.               *
*   Please report any BUGs, ideas for improvement or other                 *
*   comments to                                                            *
*                    adm@parallab.uib.no                                   *
*                                                                          *
*   Future versions may then reflect your suggestions.                     *
*   The most current version of this software is available                 *
*   from netlib@nac.no , send the message `send index from maspar'         *
*                                                                          *
*   REVISIONS:                                                             *
*                                                                          *
***************************************************************************/
/*  The following routine performs upper triangular solve with multiple
 *  right hand sides using blocking.
 *  Each block is square and has sides == nxproc. On rectangular machines
 *  it therefore uses two grids per block.
 *  The routine performs triangular solves on the diagonal blocks in the
 *  A matrix and the B-blocks level to them, and updates B-blocks below
 *  using matrix multiplication.
 *
 *  Last bug fix: July 8. -92, TS
 *
 */

#ifdef __STDMPL__
void mpl_strsm_upn(
	int                int_diag,
	int  m,
	int  n,
	plural PRECISION       *a,
	int                lda,
	plural PRECISION       *b
	)
#else
void mpl_strsm_upn(int_diag,m,n,a,lda,b)
	int                int_diag,
			   m,
			   n;
	plural PRECISION       *a;
	int                lda;
	plural PRECISION       *b;
#endif

{
 char  diag;


 plural PRECISION       b0_ps,b1_ps,a0_ps,a1_ps;
 register int        nx=nxproc,ny=nyproc,am,bn,i,j,k,nbxA,nbyA,nbyB,nbxB;
 register plural int ix=ixproc,iy=iyproc;

 if (int_diag)
   diag = 'u';
 else
   diag = 'n';

/* NB ! Below we mean by "number of blocks", the number of grids used
 * to store the arrays
 */
 nbxA = NBX(m);/* Number of blocks in row direction on A matrix */
 nbyB = NBY(n);/* Number of blocks in column direction on B matrix */
 nbyA = NBY(m);/* Number of blocks in column direction on A matrix */
 nbxB = NBX(m);/* Number of blocks in row direction on B matrix */


 /* Variables b0_ps and b1_ps are temporaries used to hold preskewed  blocks
  * of the B matrix to be used in matrix multiplications. 
  * Similarly for the variables a0_ps and a1_ps. 
  */

 /* In case of rectangular MP-1, routines called from this routines
    always operate on pairs of blocks A and B. That means that in 
    cases of odd number of blocks in column direction it operates on data not
    initialized. This may cause problems!!! */

 /* initializing boundary blocks to zero on processors off bounds */

 bn = n&(ny-1);
 am = m&(nx-1);

 if ((ix >= am)&&(am>0)){
   for (i=1;i<=nbyB; i++)
        b[i*nbxB-1] = 0.0;
 }
 if ((iy >= bn)&&(bn>0)) {
   for (i=0;i<nbxB; i++)
        b[i+(nbyB-1)*nbxB] = 0.0;
 }

 /* Start processing. Using seperate code for square and rectangular
    MP-1 */

 if (SQUARE) /**************** if machine is square ****************/
   {
    for (i=nbxA-1; i>= 0; i--){
      if ((i== (nbxA-1)) && (m&(nx-1)))
        am = m&(nx-1);
      else
        am = nx;

      /* solve for all blocks in bloc-row i of B */

      for(k=0;k<nbyB;k++){
        if ((k == nbyB-1) && (n&(nx-1)))
          bn = n&(nx-1); /* n%nxproc */
        else
          bn = nx;
        mpl_stri_sun(diag,am,bn,a[i*nbxA+i],&b[i+nbxB*k]);

        /* preskewing B blocks */

        mpl_sq_spsW(b[i+nbxB*k],&b0_ps,bn,am);

        /* Updating upper part of B using matrixmultiplication */

        for(j=i-1;j>=0;j--){
          mpl_sq_spsN(a[j+nbxA*i],&a0_ps,am,nx);
          a0_ps *= -1;
          mpl_sq_smul('n',a0_ps,b0_ps,&b[j+nbxB*k],nx,bn,am);
       }
      }
      }
    }
 else  /**************** if machine is rectangular ****************/
   {
    for (i=nbxA-1; i>= 0; i--){
      if ((i== (nbxA-1)) && (m&(nx-1)))
        am = m&(nx-1);
      else
        am = nx;

      /* solve for all blocks in bloc-row i of B */

      for(k=0;k<(nbyB+1)/2;k++){
        if ((k == (nbyB+1)/2-1) && (n&(nx-1)))
          bn = n&(nx-1); /* n%nxproc */
        else
          bn = nx;
        mpl_stri_run(diag,am,bn,a[i*nbxA*2+i],a[(2*i+1)*nbxA+i],
                  &b[i+nbxB*2*k],&b[(2*k+1)*nbxB+i]);

        /* preskewing B blocks */

        mpl_rec_spsW(b[i+nbxB*2*k],b[(2*k+1)*nbxB+i],&b0_ps,&b1_ps,
               bn,am);

        /* Updating upper part of B using matrixmultiplication */

        for(j=i-1;j>=0;j--){
          mpl_rec_spsN(a[j+nbxA*2*i],a[(2*i+1)*nbxA+j],&a0_ps,&a1_ps,am,nx);
          a0_ps *= -1;
          a1_ps *= -1;
          mpl_rec_smul('n',a0_ps,a1_ps,b0_ps,b1_ps,
                    &b[j+nbxB*2*k],&b[(2*k+1)*nbxB+j],nx,bn,am);
       }
      }
      }
    }
}