#include <mpl.h>
#include <stdio.h>
#include <stdlib.h>
#include "mpl_blas.h"
/* Element type of the scalar alpha and of the plural matrix operands
   a, b and c handled in this file. */
#define PRECISION float

#ifdef DEBUG
/* Debug-only display routine, defined elsewhere.  It is called in this
   file as show(&block, nyproc, nxproc, label).  It is declared without a
   prototype, so no argument checking is done on those calls. */
extern show();
#endif

/***************************************************************************
*                                                                          *
*   DATA PARALLEL BLAS based on MPL                                        *
*                                                                          *
*   Internal routine, this routine is not supposed to be                   *
*                     called by user programs.                             *
*                                                                          *
*   Version 1.0   1/4-92 ,                                                 *
*   For MasPar MP-1 computers                                              *
*                                                                          *
*   para//ab, University of Bergen, NORWAY                                 *
*                                                                          *
*   The calling sequence may be changed in a future version.               *
*   Please report any BUGs, ideas for improvement or other                 *
*   comments to                                                            *
*                    adm@parallab.uib.no                                   *
*                                                                          *
*   Future versions may then reflect your suggestions.                     *
*   The most current version of this software is available                 *
*   from netlib@nac.no , send the message `send index from maspar'         *
*                                                                          *
*   REVISIONS:                                                             *
*                                                                          *
***************************************************************************/

#ifdef __STDMPL__ 

void mpl_sgemm1 (int m, int n, int p, 
        PRECISION alpha,
	plural PRECISION *a,
	plural PRECISION *b,
	plural PRECISION *c)

#else

void mpl_sgemm1 (m, n, p, alpha, a, b, c)
	int m, n, p;
        PRECISION alpha;
	plural PRECISION *a,*b,*c;

#endif
/*
************************************************************************
*** mpl_sgemm1 :  C := alpha * A * B + C     (no transposition)
************************************************************************
*** Incoming Arguments:
***
***    m      integer   number of rows of a and of c
***    n      integer   number of columns of b and of c
***    p      integer   number of columns of a = number of rows of b
***                     (a is m X p, b is p X n, c is m X n)
***    alpha  float     scalar multiplying AB
***    a      plural float, array   first matrix factor -- its address
***    b      plural float, array   second matrix factor -- its address
***    c      plural float, array   matrix added to -- its address
************************************************************************
*** Outgoing Arguments:                                              ***
***    c      plural float, array   alpha * a * b + c                ***
************************************************************************
*** Block layout, as indexed by this routine (nb? = NBX(?)):
***
***    square machine      A(i,k) = a[k*nbm + i]
***                        B(k,j) = b[j*nbp + k]
***                        C(i,j) = c[j*nbm + i]
***
***    rectangular machine each logical block is a pair of plural
***                        elements:
***                        A(i,k) = a[2*k*nbm + i], a[(2*k+1)*nbm + i]
***                        B(k,j) = b[2*j*nbp + k], b[(2*j+1)*nbp + k]
***                        C(i,j) = c[2*j*nbm + i], c[(2*j+1)*nbm + i]
***
*** Error handling:
***    If the work space for the preskewed A blocks cannot be
***    allocated, a message is printed and the program exits with
***    status 1.
***
*** Quick return:
***    If m, n or p is not positive the product is empty and c is
***    left untouched.
************************************************************************

		BLAS 3 general matrix matrix multiply on a MasPar.

		Based on the standard systolic algorithm (Cannon 69)

		Comments on this version :
			- poor performance on rest-blocks (non-multiples of nxproc)
			- too many function calls
			- register use should be optimized 

***********************************************************************
** Implemented by :      Erik Boman                                 ***
** Implementation date:  15 Oct  90                                 ***
** Latest update :       20 Mar  92                                 ***
***********************************************************************
*/

{
	/* local variables */
	int nbp,nbm,nbn;                        /* number of blocks per dimension */
	plural PRECISION bps0,bps1;             /* preskewed (and scaled) B block */
	plural PRECISION aa0,aa1;               /* preskewed A block, one-block case */
	register plural PRECISION *aps0,*aps1;  /* preskewed A(:,k) blocks */
	register int i,j,k;
	register int nx=nxproc;
	int restp;                              /* extent of block k along p */
	int restn;                              /* extent of block j along n */

	/* quick return: an empty product leaves C unchanged.  This also
	   avoids requesting a zero-sized work space below. */

	if (m <= 0 || n <= 0 || p <= 0){
		return;
	}

	/* test whether you are running on a square or 
	   a rectangular machine */

	if (SQUARE){

		/* square processor grid */
#ifdef DEBUG
printf("In mpl_sgemm1: square MasPar\n");
{ 
  plural float debug;
  debug = a[0];
  show(&debug,nyproc,nxproc,"A");
  debug = b[0];
  show(&debug,nyproc,nxproc,"B");
}
#endif

		if ( m <= nx && n <= nx && p <= nx){

			/* only one block */

/*
*			Form  C := alpha*A*B + C
*/

			/* preskew A */

			mpl_sq_spsN( *a, &aa0, p, m);

			/* preskew B */
	
			mpl_sq_spsW( *b, &bps0, n, p);
		
			/* scale with alpha: alpha*A*B is formed as A*(alpha*B) */
	
			if (alpha != one){
				bps0 *= alpha;
			}

			/* C += A*B */

			mpl_sq_smul( 'n', aa0, bps0, c, m, n, p);
		}

		else {

			/* compute number of nxproc by nxproc blocks */

			nbm = NBX(m);
			nbp = NBX(p);
			nbn = NBX(n);

			/* Allocate storage for the preskewed A-blocks:
			   one plural element per block row of A */

			if (! (aps0 = (plural PRECISION *)p_malloc(nbm*sizeof(PRECISION)))){
				printf("Error in mpl_sgemm1: Not enough pmem\n");
				exit(1);
			}

			/*
				Do the block multiplication in k,j,i order.
			*/
		
/*
*			Form  C := alpha*A*B + C
*/

			for (k=0; k < nbp; k++){
			
				/* preskew blocks A(:,k); they are reused for every j */
			
				restp = MIN(nx,p-k*nx);
				for (i=0; i<nbm; i++){
					mpl_sq_spsN(
						a[k*nbm + i], 
						&aps0[i], 
						restp, MIN(nx, m-i*nx)
					);
				}
		
				for (j=0; j<nbn; j++){
			
					/* preskew block B(k,j) */
			
					restn = MIN(nx,n-j*nx);
					mpl_sq_spsW(
						b[j*nbp + k],
						&bps0, 
						restn, restp
					);

					/* scale by alpha */
			
					if (alpha != one){
						bps0 *= alpha;
					}
				
					for (i=0; i<nbm; i++){
			
						/* C(i,j) += A(i,k)*B(k,j) */
		
						mpl_sq_smul(
							'n',
							aps0[i], bps0, 
							&c[j*nbm + i], 
							MIN(nx,m-i*nx),
							restn,
							restp
						);
					}

				} 
			}
			p_free(aps0);
		}
	}

	else{ 

		/* rectangular  processor grid */
#ifdef DEBUG
printf("In mpl_sgemm1: rectangular MasPar\n");
{ 
  plural float debug;
  debug = a[0];
  show(&debug,nyproc,nxproc,"A");
  debug = b[0];
  show(&debug,nyproc,nxproc,"B");
}
#endif

		if ( m <= nx && n <= nx && p <= nx){

			/* only one block */

/*
*			Form  C := alpha*A*B + C
*/

			/* preskew A */
	
			mpl_rec_spsN(
				a[0], a[1], 
				&aa0, &aa1,
				p, m
			);

			/* preskew B */
	
			mpl_rec_spsW(
				b[0],b[1],
				&bps0, &bps1,
				n, p
			);
		
			/* scale with alpha: alpha*A*B is formed as A*(alpha*B) */
	
			if (alpha != one){
				bps0 *= alpha;
				bps1 *= alpha;
			}

			/* C += A*B */

			mpl_rec_smul(
				'n',
				aa0, aa1, bps0, bps1, 
				&c[0], &c[1],
				m, n, p
			);
		}

		else {

			/* compute number of nxproc by nxproc blocks */

			nbm = NBX(m);
			nbp = NBX(p);
			nbn = NBX(n);

			/* Allocate storage for the preskewed A-blocks:
			   two plural elements per block row of A */

			if (! (aps0 = (plural PRECISION *)p_malloc(nbm*sizeof(PRECISION)))){
				printf("Error in mpl_sgemm1: Not enough pmem\n");
				exit(1);
			}

			if (! (aps1 = (plural PRECISION *)p_malloc(nbm*sizeof(PRECISION)))){
				printf("Error in mpl_sgemm1: Not enough pmem\n");
				exit(1);
			}

			/*
				Do the block multiplication in k,j,i order.
				Use two nyproc*nxproc "machine blocks" as one logical
				nxproc*nxproc block.
			*/
		
/*
*			Form  C := alpha*A*B + C
*/

			for (k=0; k < nbp; k++){
			
				/* preskew blocks A(:,k); they are reused for every j */

				restp = MIN(nx,p-k*nx);
				for (i=0; i<nbm; i++){
					mpl_rec_spsN(
						a[2*k*nbm + i], a[(2*k+1)*nbm + i], 
						&aps0[i], &aps1[i],
						restp, MIN(nx, m-i*nx)
					);
				}

				for (j=0; j<nbn; j++){
			
					/* preskew block B(k,j) */
			
					restn = MIN(nx,n-j*nx);
					mpl_rec_spsW(
						b[2*j*nbp + k], b[(2*j+1)*nbp + k],
						&bps0, &bps1,
						restn, restp
					);

					/* scale by alpha */
			
					if (alpha != one){
						bps0 *= alpha;
						bps1 *= alpha;
					}
				
					for (i=0; i<nbm; i++){
			
						/* C(i,j) += A(i,k)*B(k,j) */
			
						mpl_rec_smul(
							'n',
							aps0[i], aps1[i], bps0, bps1, 
							&c[2*j*nbm + i], &c[(2*j+1)*nbm + i],
							MIN(nx,m-i*nx),
							restn,
							restp
						);
					}

				} 
			}
			p_free(aps0);
			p_free(aps1);
		}
	}

/*
    End of mpl_sgemm1 
*/
}