#include <mpl.h>
#define PRECISION float

#ifdef DEBUG
extern show();
#endif

/***************************************************************************
*                                                                          *
*   DATA PARALLEL BLAS based on MPL                                        *
*                                                                          *
*   Internal routine, this routine is not supposed to be                   *
*                     called by user programs.                             *
*                                                                          *
*   Version 1.0   1/4-92 ,                                                 *
*   For MasPar MP-1 computers                                              *
*                                                                          *
*   para//ab, University of Bergen, NORWAY                                 *
*                                                                          *
*   The calling sequence may be changed in a future version.               *
*   Please report any BUGs, ideas for improvement or other                 *
*   comments to                                                            *
*                    adm@parallab.uib.no                                   *
*                                                                          *
*   Future versions may then reflect your suggestions.                     *
*   The most current version of this software is available                 *
*   from netlib@nac.no , send the message `send index from maspar'         *
*                                                                          *
*   REVISIONS:                                                             *
*                                                                          *
***************************************************************************/
#ifdef __STDMPL__
void mpl_rec_smul(
	int trans,
	register plural PRECISION aps0, 
	register plural PRECISION aps1, 
	register plural PRECISION bps0, 
	register plural PRECISION bps1,
	register plural PRECISION *c0, 
	register plural PRECISION *c1,
	int m, int n, int p
	)
#else
void mpl_rec_smul(trans, aps0, aps1, bps0, bps1, c0, c1, m, n, p)
	int trans;
	register plural PRECISION aps0, aps1, bps0, bps1;
	register plural PRECISION *c0, *c1;
	int m, n, p;
#endif

	/*
		matrix multiplication on a 1:2 machine
		standard systolic algorithm (Cannon)

		Prereq: It is assumed that A and B have been preskewed by
		the mpl_rec_ps subroutines

		trans = 'n' or 'N' : no transpose for A or B
		 - matrix A should have been preskewed north
		 - matrix B should have been preskewed west
		
		trans = 't' or 'T' : both A and B are stored transposed
		 - matrix A should have been preskewed west
		 - matrix B should have been preskewed north

		This version has been tuned with:
		 - loop unrolling to level 2

		Comments:
		 - Currently the multiplication loop always runs through
		   nxproc cycles, although only p cycles is really necessary.
		   But this would require wrap-around on the p*p corner block.

		Written by Erik Boman (erikb@ii.uib.no), July 1990
		Last revised: Mar 92


		INPUT PARAMETERS:
			trans     : 'n' for no transpose, 't' if both A and B lie transposed
			aps0,aps1 : The A matrix (preskewed)
			bps0,bps1 : The B matrix (preskewed)
			*c0, *c1  : The C matrix (plain)
			m, n, p   : A is m*p, B is p*n, C is m*n

			Warning: Dimensions are given in mpfortran style.
			         Thus, m is the number of rows in the A matrix,
			         but is the number of columns on the MasPar PE grid !

		OUTPUT PARAMETERS:
			*c0, *c1  : C = C + A*B
    */

{
	register plural PRECISION cc0,cc1,tmp;
	register int i, ncycles;
		
#ifdef DEBUG
{
plural double tmpa = aps0, tmpb =bps0, tmpc = *c0;
show(&tmpa,nyproc,nxproc,"Aps in mult");
show(&tmpb,nyproc,nxproc,"Bps in mult");
show(&tmpc,nyproc,nxproc,"C before mult");
}
#endif

	/* do ncycles iterations */
	ncycles = nxproc;

	cc0 = 0.0;
	cc1 = 0.0;

	/* unrolling the loop to level 2 in order to avoid moving b
		elements from registers to registers  */

	if ((trans == 'n') || (trans == 'N')){

		for (i=0; i<(ncycles>>1); i++){
	
			cc0 += aps0*bps0;
			cc1 += aps1*bps1;
			xnetN[1].aps0 = aps0;
			xnetW[1].bps0 = bps0;
			xnetW[1].bps1 = bps1;
	
			cc0 += aps1*bps0;
			cc1 += aps0*bps1;
			xnetN[1].aps1 = aps1;
			xnetW[1].bps0 = bps0;
			xnetW[1].bps1 = bps1;
	
		}
	
		/* if odd #cycles, remember to do the last iteration */
	
		if (ncycles &1){
			cc0 += aps0*bps0;
			cc1 += aps1*bps1;
			tmp = aps0;
			aps0 = aps1;
			xnetN[1].aps1 = tmp;
			xnetW[1].bps0 = bps0;
			xnetW[1].bps1 = bps1;
		}

	}
	else { /* trans == 't' || trans == 'T' */
	
		for (i=0; i<(ncycles>>1); i++){
	
			cc0 += aps0*bps0;
			cc1 += aps1*bps1;
			xnetW[1].aps0 = aps0;
			xnetW[1].aps1 = aps1;
			xnetN[1].bps0 = bps0;
	
			cc0 += aps0*bps1;
			cc1 += aps1*bps0;
			xnetW[1].aps0 = aps0;
			xnetW[1].aps1 = aps1;
			xnetN[1].bps1 = bps1;
	
		}
	
		/* if odd #cycles, remember to do the last iteration */
	
		if (ncycles &1){
			cc0 += aps0*bps0;
			cc1 += aps1*bps1;
			xnetW[1].aps0 = aps0;
			xnetW[1].aps1 = aps1;
			tmp = bps0;
			bps0 = bps1;
			xnetN[1].bps1 = tmp;
		}
	}
		
	/* Add A*B to C.
	   Mask out all processors outside the upper left m*n corner.
	   This is important as other processors may have garbage 
	   in this memory location.
	*/

	if (ixproc<m){
		if (iyproc < n)        *c0 += cc0;
		if (iyproc+nyproc < n) *c1 += cc1;
	}
		
#ifdef DEBUG
{
plural double tmpc = *c0;
show(&tmpc,nyproc,nxproc,"C after mult");
}
#endif

}