#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include "pxsp.h"

/*
 * Generic two-argument maximum.  Only defined when no earlier header has
 * already provided MAX.  Each argument appears twice in the expansion, so
 * do not pass expressions with side effects (e.g. MAX(i++, j)).
 */
#ifndef MAX
#define MAX(x,y) (((x) > (y) ) ? (x) : (y))
#endif

/*
 * Generic two-argument minimum.  Only defined when no earlier header has
 * already provided MIN.  Same double-evaluation caveat as MAX.
 */
#ifndef MIN
#define MIN(x,y) (((x) < (y) ) ? (x) : (y))
#endif




/*
 * Report an unrecoverable error raised inside pdtpmm_ on stderr and
 * terminate the process.  Unlike assert(), this is not compiled out when
 * NDEBUG is defined.
 */
static void pdtpmm_fatal( const char *what, int value )
{
   fprintf( stderr, "pdtpmm_: %s (value = %d)\n", what, value );
   abort();
}

void pdtpmm_( side, uplo, transa, diag,
              m, n, alpha, A, ia, ja, desc_A,
                           B, ib, jb, desc_B )
/*
*  .. Scalar Arguments ..
*/
   F_CHAR      diag, side, transa, uplo;
   int         * ia, * ib, * ja, * jb, * m, * n;
   complex16   * alpha;
/* ..
*  .. Array Arguments ..
*/
   int         desc_A[], desc_B[];
   double      A[], B[];

{

/*
*  Purpose
*  =======
*
*  PDTPMM performs one of the distributed matrix-matrix operations
*
*     sub( B ) := alpha*op( sub( A ) )*sub( B ),   or
*
*     sub( B ) := alpha*sub( B )*op( sub( A ) ),
*
*  where sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1)  if SIDE = 'L',
*        sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1)  if SIDE = 'R',
*
*        sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1),
*
*  alpha is a scalar, sub( B ) is an M-by-N distributed matrix, sub( A )
*  is a unit, or non-unit, upper or lower triangular distributed matrix
*  and op( A ) is one of
*
*     op( A ) = A   or   op( A ) = A'   or   op( A ) = conjg( A' ).
*
*  A is stored in packed storage.
*
*  This C routine is a thin driver: it queries pdtpmmf_ for the amount of
*  workspace it needs, provides that workspace, and then calls pdtpmmf_
*  again to do the actual computation.  If either call reports a non-zero
*  info, or the workspace cannot be allocated, a message is written to
*  stderr and the process is aborted.
*
*  Notes
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  Let A be a generic term for any 2D block cyclicly distributed array.
*  Such a global array has an associated description vector descA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN      EXPLANATION
*  --------------- -------------- --------------------------------------
*  DT_A   (global) descA[ DT_ ]   The descriptor type.  In this case,
*                                 DT_A = 1.
*  CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating
*                                 the BLACS process grid A is distribu-
*                                 ted over. The context itself is glo-
*                                 bal, but the handle (the integer
*                                 value) may vary.
*  M_A    (global) descA[ M_ ]    The number of rows in the global
*                                 array A.
*  N_A    (global) descA[ N_ ]    The number of columns in the global
*                                 array A.
*  MB_A   (global) descA[ MB_ ]   The blocking factor used to distribu-
*                                 te the rows of the array.
*  NB_A   (global) descA[ NB_ ]   The blocking factor used to distribu-
*                                 te the columns of the array.
*  RSRC_A (global) descA[ RSRC_ ] The process row over which the first
*                                 row of the array A is distributed.
*  CSRC_A (global) descA[ CSRC_ ] The process column over which the
*                                 first column of the array A is
*                                 distributed.
*  LLD_A  (local)  descA[ LLD_ ]  The leading dimension of the local
*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCr( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCc( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCr() and LOCc() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*  An upper bound for these quantities may be computed by:
*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
*  The triangular distributed matrix sub( A ) must be distributed
*  according to a square block cyclic decomposition, i.e. MB_A = NB_A, if
*  NA+MOD(IA-1,MB_A) > MB_A or NA+MOD(JA-1,NB_A) > NB_A.
*  If SIDE = 'Left', the distributed matrix sub( A ) is of order NA = M,
*  and NA = N if SIDE = 'Right'. If NA+MOD(IA-1,MB_A) > MB_A or
*  NA+MOD(JA-1,NB_A) > NB_A, then sub( A ) is not contained within a
*  single block, in which case IA-1 (resp. JA-1) must be a multiple of
*  MB_A (resp. NB_A).
*
*  If SIDE = 'L', the row process having the first entries of sub( B )
*  must also own the first entries of sub( A ).
*  If sub( A ) is not contained within a single block, IB-1 (resp. IA-1,
*  JA-1) must be a multiple of MB_B (resp. MB_A, NB_A = MB_A), and
*  the column block size of A should be equal to the row block size of
*  B, i.e. NB_A = MB_B.
*
*  If SIDE = 'R', the column process having the first entries of
*  sub( B ) must also own the first entries of sub( A ).
*  If sub( A ) is not contained within a single block, JB-1 (resp. IA-1,
*  JA-1) must be a multiple of NB_B (resp. MB_A, NB_A = MB_A), and
*  the row block size of A should be equal to the column block size of
*  B, i.e. MB_A = NB_B.
*
*  Parameters
*  ==========
*
*  SIDE    (global input) pointer to CHARACTER
*          On entry, SIDE specifies whether  op( sub( A ) ) multiplies
*          sub( B ) from the left or right as follows:
*
*          SIDE = 'L' or 'l'  sub( B ) := alpha*op( sub( A ) )*sub( B ),
*
*          SIDE = 'R' or 'r'  sub( B ) := alpha*sub( B )*op( sub( A ) ).
*
*  UPLO    (global input) pointer to CHARACTER
*          On entry, UPLO specifies whether the distributed matrix
*          sub( A ) is an upper or lower triangular distributed matrix
*          as follows:
*
*          UPLO = 'U' or 'u'  sub( A ) is an upper triangular
*                             distributed matrix,
*
*          UPLO = 'L' or 'l'  sub( A ) is a lower triangular
*                             distributed matrix.
*
*  TRANSA  (global input) pointer to CHARACTER
*          On entry, TRANSA specifies the form of op( A ) to be
*          used in the matrix multiplication as follows:
*
*          TRANSA = 'N' or 'n'   op( A ) = A,
*
*          TRANSA = 'T' or 't'   op( A ) = A',
*
*          TRANSA = 'C' or 'c'   op( A ) = conjg( A' ).
*
*  DIAG    (global input) pointer to CHARACTER
*          On entry, DIAG specifies whether or not sub( A ) is unit
*          triangular as follows:
*
*          DIAG = 'U' or 'u'  sub( A ) is assumed to be unit
*                             triangular,
*
*          DIAG = 'N' or 'n'  sub( A ) is not assumed to be unit
*                             triangular.
*
*  M       (global input) pointer to INTEGER
*          The number of rows to be operated on, i.e. the number of rows
*          of the distributed submatrix sub( B ). M >= 0.
*
*  N       (global input) pointer to INTEGER
*          The number of columns to be operated on, i.e. the number of
*          columns of the distributed submatrix sub( B ). N >= 0.
*
*  ALPHA   (global input) pointer to DOUBLE PRECISION
*          On entry, ALPHA specifies the scalar alpha.
*          NOTE(review): the parameter is declared as complex16 * in the
*          argument list although it is documented as DOUBLE PRECISION;
*          the pointer is only forwarded to pdtpmmf_.  Confirm the
*          intended type.
*
*  A       (local input) DOUBLE PRECISION pointer into the local memory
*          to an array of dimension (LLD_A, LOCc(JA+NA-1)). Before entry
*          with  UPLO = 'U' or 'u', the  leading NA-by-NA upper trian-
*          gular part of the distributed matrix sub( A ) must contain
*          the local pieces of the upper triangular distributed matrix
*          and its strictly lower triangular part is not referenced.
*          Before entry  with  UPLO = 'L' or 'l', the leading  NA-by-NA
*          lower triangular part of the distributed matrix sub( A ) must
*          contain the lower triangular distributed matrix and its
*          strictly upper triangular part is not referenced.  Note that
*          when  DIAG = 'U' or 'u', the diagonal elements of sub( A )
*          are not referenced either, but are assumed to be  unity.
*          NOTE(review): this full-storage description does not match
*          the "packed storage" statement under Purpose; confirm the
*          actual layout expected by pdtpmmf_.
*
*  IA      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix A to operate on.
*
*  JA      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix A to operate on.
*
*  DESCA   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix A.
*          NOTE(review): the notation table above lists nine descriptor
*          entries; confirm the actual descriptor length.
*
*  B       (local input/local output) DOUBLE PRECISION pointer into the
*          local memory to an array of dimension (LLD_B, LOCc(JB+N-1)).
*          Before entry, this array contains the local pieces of the
*          distributed matrix sub( B ). On exit, sub( B ) is overwritten
*          by the transformed distributed matrix.
*
*  IB      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix B to operate on.
*
*  JB      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix B to operate on.
*
*  DESCB   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix B.
*
*  =====================================================================
*/

   double   work1[1];      /* one-element buffer: query result / tiny workspace */
   double * heap = NULL;   /* heap workspace, NULL when work1 is used instead   */
   double * work;          /* workspace actually handed to pdtpmmf_             */
   int      lwork, ineed, info;

/*
*  Workspace query: call pdtpmmf_ with lwork = -1 and read the required
*  size back from work1[0].  work1[0] is pre-set so that it is never read
*  uninitialised.
*/
   work1[0] = 0.0;
   lwork = -1;
   info = 0;
   pdtpmmf_( side, uplo, transa, diag,
             m, n, alpha, A, ia, ja, desc_A,
                          B, ib, jb, desc_B,
             work1, &lwork, &info );
   if( info != 0 )
      pdtpmm_fatal( "workspace query failed", info );

   ineed = (int) work1[0];
   if( ineed < 0 )
      pdtpmm_fatal( "workspace query returned a negative size", ineed );

/*
*  A request for exactly one element is served from the stack buffer.
*  Any other size is allocated on the heap with one spare element beyond
*  the reported requirement.
*/
   if( ineed == 1 )
   {
      work = work1;
      lwork = 1;
   }
   else
   {
      lwork = ineed + 1;
      heap = malloc( (size_t) lwork * sizeof *heap );
      if( heap == NULL )
         pdtpmm_fatal( "unable to allocate workspace", lwork );
      work = heap;
   }

/*
*  Actual computation.
*/
   pdtpmmf_( side, uplo, transa, diag,
             m, n, alpha, A, ia, ja, desc_A,
                          B, ib, jb, desc_B,
             work, &lwork, &info );
   if( info != 0 )
      pdtpmm_fatal( "pdtpmmf_ failed", info );

/*
*  free( NULL ) is a no-op, so no test is needed for the stack-buffer case.
*/
   free( heap );
}

