#include <mpl.h>
#define PRECISION float

#ifdef DEBUG
extern show();
#endif

/***************************************************************************
*                                                                          *
*   DATA PARALLEL BLAS based on MPL                                        *
*                                                                          *
*   Internal routine, this routine is not supposed to be                   *
*                     called by user programs.                             *
*                                                                          *
*   Version 1.0   1/4-92 ,                                                 *
*   For MasPar MP-1 computers                                              *
*                                                                          *
*   para//ab, University of Bergen, NORWAY                                 *
*                                                                          *
*   The calling sequence may be changed in a future version.               *
*   Please report any BUGs, ideas for improvement or other                 *
*   comments to                                                            *
*                    adm@parallab.uib.no                                   *
*                                                                          *
*   Future versions may then reflect your suggestions.                     *
*   The most current version of this software is available                 *
*   from netlib@nac.no , send the message `send index from maspar'         *
*                                                                          *
*   REVISIONS:                                                             *
*                                                                          *
***************************************************************************/
#ifdef __STDMPL__
void mpl_sq_smul(
	int trans,
	plural PRECISION aps_in,
	plural PRECISION bps_in,
	plural PRECISION *c,
	int m, int n , int p
	)
#else
void mpl_sq_smul(trans, aps_in, bps_in, c, m, n, p)
	int trans;
	plural PRECISION aps_in,bps_in;
	plural PRECISION *c;
	int m, n, p;
#endif

	/*
		matrix multiplication on a square machine
		standard systolic algorithm (Cannon)

		Prereq: It is assumed that A and B have been preskewed by
		the mpl_sq_ps subroutines

		trans = 'n' or 'N' : no transpose for A or B
		 - matrix A should have been preskewed north
		 - matrix B should have been preskewed west
		
		trans = 't' or 'T' : both A and B are stored transposed
		 - matrix A should have been preskewed west
		 - matrix B should have been preskewed north

		Comments:
		 - Currently the multiplication loop always runs through
		   nxproc cycles, although only p cycles is really necessary.
		   But this would require wrap-around on the p*p corner block.


		INPUT PARAMETERS:
			trans     : 'n' for no transpose, 't' if both A and B lie transposed
			aps       : The A matrix (preskewed)
			bps       : The B matrix (preskewed)
			*c        : The C matrix (plain)
			m, n, p   : A is m*p, B is p*n, C is m*n


		OUTPUT PARAMETERS:
			*c        : C = C + A*B

    */


{
	register plural PRECISION cc;
	register int i, ncycles;
	register plural PRECISION aps=aps_in,
				  bps=bps_in;

#ifdef DEBUG
{
plural double tmpa = aps, tmpb =bps, tmpc= *c;
show(&tmpa,nyproc,nxproc,"Aps in mult");
show(&tmpb,nyproc,nxproc,"Bps in mult");
show(&tmpc,nyproc,nxproc,"C before mult");
}
#endif

	/* do ncycles iterations */
	ncycles = nxproc;

	cc = 0.0;

	if ((trans == 'n') || (trans == 'N')){
		for (i=0; i< ncycles; i++){
			cc += aps*bps;
			xnetN[1].aps = aps;
			xnetW[1].bps = bps;
		}
	} else {
		for (i=0; i< ncycles; i++){
			cc += aps*bps;
			xnetW[1].aps = aps;
			xnetN[1].bps = bps;
		}
	}
	
	/* Add A*B to C.
	   Mask out all processors outside the upper left m*n corner.
	   This is important as other processors may have garbage
	   in this memory location.
	*/

	if ((ixproc<m) && (iyproc<n)){
		*c += cc;
	}

#ifdef DEBUG
{ 
  plural double tmpc= *c;
  show(&tmpc,nyproc,nxproc,"C after mult");
}
#endif

}