/* ---------------------------------------------------------------------
*
*  -- PBLAS auxiliary routine (version 2.0) --
*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
*     and University of California, Berkeley.
*     April 1, 1998
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "../pblas.h"
#include "../PBpblas.h"
#include "../PBtools.h"
#include "../PBblacs.h"
#include "../PBblas.h"

#ifdef __STDC__
void PB_CpgemmAC( PBTYP_T * TYPE, char * DIRECA, char * DIRECC,
                  char * TRANSA, char * TRANSB, int M, int N, int K,
                  char * ALPHA, char * A, int IA, int JA, int * DESCA,
                  char * B, int IB, int JB, int * DESCB, char * BETA,
                  char * C, int IC, int JC, int * DESCC )
#else
void PB_CpgemmAC( TYPE, DIRECA, DIRECC, TRANSA, TRANSB, M, N, K, ALPHA,
                  A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC,
                  DESCC )
/*
*  .. Scalar Arguments ..
*/
   char           * DIRECA, * DIRECC, * TRANSA, * TRANSB;
   int            IA, IB, IC, JA, JB, JC, K, M, N;
   char           * ALPHA, * BETA;
   PBTYP_T        * TYPE;
/*
*  .. Array Arguments ..
*/
   int            * DESCA, * DESCB, * DESCC;
   char           * A, * B, * C;
#endif
{
/*
*  Purpose
*  =======
*
*  PB_CpgemmAC  performs one of the matrix-matrix operations
*
*     sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ),
*
*  where
*
*     sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1),  and, op( X )  is one  of
*     op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ).
*
*  Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1)  if TRANSA = 'N',
*                               A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T',
*                        conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C',
*
*  and,  op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1)  if TRANSB = 'N',
*                               B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T',
*                        conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'.
*
*  Alpha and beta are scalars.  A, B and C are matrices;  op( sub( A ) )
*  is an  m by k submatrix,  op( sub( B ) )  is an  k by n submatrix and
*  sub( C ) is an m by n submatrix.
*
*  This is the inner-product algorithm using the logical LCM algorithmic
*  blocking technique. The submatrix operand sub( B ) stays in place.
*
*  Notes
*  =====
*
*  A description  vector  is associated with each 2D block-cyclicly dis-
*  tributed matrix.  This  vector  stores  the  information  required to
*  establish the  mapping  between a  matrix entry and its corresponding
*  process and memory location.
*
*  In  the  following  comments,   the character _  should  be  read  as
*  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESC_A:
*
*  NOTATION         STORED IN       EXPLANATION
*  ---------------- --------------- ------------------------------------
*  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
*  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
*                                   the NPROW x NPCOL BLACS process grid
*                                   A  is  distributed over. The context
*                                   itself  is  global,  but  the handle
*                                   (the integer value) may vary.
*  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
*                                   ted matrix A, M_A >= 0.
*  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
*                                   buted matrix A, N_A >= 0.
*  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
*                                   block of the matrix A, IMB_A > 0.
*  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
*                                   left   block   of   the  matrix   A,
*                                   INB_A > 0.
*  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
*                                   bute the last  M_A-IMB_A  rows of A,
*                                   MB_A > 0.
*  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
*                                   bute the last  N_A-INB_A  columns of
*                                   A, NB_A > 0.
*  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
*                                   row of the matrix  A is distributed,
*                                   NPROW > RSRC_A >= 0.
*  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
*                                   first column of  A  is  distributed.
*                                   NPCOL > CSRC_A >= 0.
*  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
*                                   array  storing  the  local blocks of
*                                   the distributed matrix A,
*                                   IF( Lc( 1, N_A ) > 0 )
*                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
*                                   ELSE
*                                      LLD_A >= 1.
*
*  Let K be the number of  rows of a matrix A starting at the global in-
*  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
*  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
*  receive if these K rows were distributed over NPROW processes.  If  K
*  is the number of columns of a matrix  A  starting at the global index
*  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
*  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
*  these K columns were distributed over NPCOL processes.
*
*  The values of Lr() and Lc() may be determined via a call to the func-
*  tion PB_Cnumroc:
*  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
*  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
*
*  Arguments
*  =========
*
*  TYPE    (local input) pointer to a PBTYP_T structure
*          On entry,  TYPE  is a pointer to a structure of type PBTYP_T,
*          that contains type information (See pblas.h).
*
*  DIRECA  (global input) pointer to CHAR
*          On entry,  DIRECA  specifies  the direction in which the rows
*          or columns of sub( A ) should be looped over as follows:
*             DIRECA = 'F' or 'f'   forward  or increasing,
*             DIRECA = 'B' or 'b'   backward or decreasing.
*
*  DIRECC  (global input) pointer to CHAR
*          On entry,  DIRECC  specifies  the direction in which the rows
*          or columns of sub( C ) should be looped over as follows:
*             DIRECC = 'F' or 'f'   forward  or increasing,
*             DIRECC = 'B' or 'b'   backward or decreasing.
*
*  TRANSA  (global input) pointer to CHAR
*          On entry,  TRANSA  specifies the form of op( sub( A ) ) to be
*          used in the matrix multiplication as follows:
*
*             TRANSA = 'N' or 'n'   op( sub( A ) ) = sub( A ),
*             TRANSA = 'T' or 't'   op( sub( A ) ) = sub( A )',
*             TRANSA = 'C' or 'c'   op( sub( A ) ) = conjg( sub( A )' ).
*
*  TRANSB  (global input) pointer to CHAR
*          On entry,  TRANSB  specifies the form of op( sub( B ) ) to be
*          used in the matrix multiplication as follows:
*
*             TRANSB = 'N' or 'n'   op( sub( B ) ) = sub( B ),
*             TRANSB = 'T' or 't'   op( sub( B ) ) = sub( B )',
*             TRANSB = 'C' or 'c'   op( sub( B ) ) = conjg( sub( B )' ).
*
*  M       (global input) INTEGER
*          On entry,  M  specifies  the number of rows of the  submatrix
*          op( sub( A ) ) and of the submatrix sub( C ). M  must  be  at
*          least  zero.
*
*  N       (global input) INTEGER
*          On entry, N specifies the number of columns of the  submatrix
*          op( sub( B ) )  and  the  number of columns of the  submatrix
*          sub( C ). N must be at least zero.
*
*  K       (global input) INTEGER
*          On entry, K specifies the number of columns of the  submatrix
*          op( sub( A ) )  and  the  number of rows   of  the  submatrix
*          op( sub( B ) ). K must be at least  zero.
*
*  ALPHA   (global input) pointer to CHAR
*          On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
*          supplied  as zero then the local entries of the arrays  A and
*          B corresponding to the entries of  the  submatrices  sub( A )
*          and sub( B ) respectively need not be set on input.
*
*  A       (local input) pointer to CHAR
*          On entry, A is an array of dimension (LLD_A, Ka), where Ka is
*          at least Lc( 1, JA+K-1 ) when  TRANSA = 'N' or 'n', and is at
*          least  Lc( 1, JA+M-1 )  otherwise.  Before  entry, this array
*          contains the local entries of the matrix A.
*
*  IA      (global input) INTEGER
*          On entry, IA  specifies A's global row index, which points to
*          the beginning of the submatrix sub( A ).
*
*  JA      (global input) INTEGER
*          On entry, JA  specifies A's global column index, which points
*          to the beginning of the submatrix sub( A ).
*
*  DESCA   (global and local input) INTEGER array
*          On entry, DESCA  is an integer array of dimension DLEN_. This
*          is the array descriptor for the matrix A.
*
*  B       (local input) pointer to CHAR
*          On entry, B is an array of dimension (LLD_B, Kb), where Kb is
*          at least Lc( 1, JB+N-1 ) when  TRANSB = 'N' or 'n', and is at
*          least Lc( 1, JB+K-1 )  otherwise.  Before  entry,  this array
*          contains the local entries of the matrix B.
*
*  IB      (global input) INTEGER
*          On entry, IB  specifies B's global row index, which points to
*          the beginning of the submatrix sub( B ).
*
*  JB      (global input) INTEGER
*          On entry, JB  specifies B's global column index, which points
*          to the beginning of the submatrix sub( B ).
*
*  DESCB   (global and local input) INTEGER array
*          On entry, DESCB  is an integer array of dimension DLEN_. This
*          is the array descriptor for the matrix B.
*
*  BETA    (global input) pointer to CHAR
*          On entry,  BETA  specifies the scalar  beta.   When  BETA  is
*          supplied  as  zero  then  the  local entries of  the array  C
*          corresponding to  the  entries of the submatrix sub( C ) need
*          not be set on input.
*
*  C       (local input/local output) pointer to CHAR
*          On entry, C is an array of dimension (LLD_C, Kc), where Kc is
*          at least Lc( 1, JC+N-1 ).  Before  entry, this array contains
*          the local entries of the matrix  C.
*          On exit, the entries of this array corresponding to the local
*          entries of the  submatrix  sub( C )  are  overwritten  by the
*          local entries of the m by n updated submatrix.
*
*  IC      (global input) INTEGER
*          On entry, IC  specifies C's global row index, which points to
*          the beginning of the submatrix sub( C ).
*
*  JC      (global input) INTEGER
*          On entry, JC  specifies C's global column index, which points
*          to the beginning of the submatrix sub( C ).
*
*  DESCC   (global and local input) INTEGER array
*          On entry, DESCC  is an integer array of dimension DLEN_. This
*          is the array descriptor for the matrix C.
*
*  -- Written on April 1, 1998 by
*     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
*
*  ---------------------------------------------------------------------
*/
/*
*  .. Local Scalars ..
*/
   char           Aroc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta,
                  top, * zero;
   int            Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR, AinbD,
                  AinbR, Ainb1D, Ainb1R, AisR, Akk, Ald, AmyprocD, AmyprocR,
                  AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff, ArocD,
                  ArocR, AsrcR, Asrc_, Bcol, Bii, Bimb1, Binb1, Bjj, Bld, Bm,
                  Bmb, Bmp, Bn, Bnb, Bnq, Brow, Cbufld, Ccol, Ccurrow, Cfr,
                  Cfwd, Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld,
                  Cmb, Cmp, Cnb, Cnq, Coff, Crow, Csrc, WAfr, WCfr, WCsum,
                  ctxt, lcmb, m, maxp, maxpm1, maxq, mb, mbb, mycol, myrow,
                  ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size,
                  tmp;
   GEMM_T         gemm;
   GSUM2D_T       gsum2d;
/*
*  .. Local Arrays ..
*/
   PB_VM_T        VM;
   int            Bd0[DLEN_], DBUFA[DLEN_], DBUFC[DLEN_], WAd[DLEN_],
                  WCd[DLEN_];
   char           * Abuf = NULL, * Bptr = NULL, * Cbuf = NULL, * WA = NULL,
                  * WC   = NULL;
/* ..
*  .. Executable Statements ..
*
*/
   Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );

   Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD );
   Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD );
   nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN );
   notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN );

   size = TYPE->size; one  = TYPE->one; zero = TYPE->zero;
   gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
   mb   = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
/*
*  Compute local information for sub( A ), sub( B ) and sub( C )
*/
   if( nota )
   {
      AiD      = JA;           AiR      = IA;
      Asrc_    = RSRC_;        Aroc     = CROW;
      AinbR    = DESCA[IMB_ ]; AinbD    = DESCA[INB_];
      AnbR     = DESCA[MB_  ]; AnbD     = DESCA[NB_ ];
      AsrcR    = DESCA[Asrc_]; Ald      = DESCA[LLD_];
      AmyprocD = mycol;        AnprocsD = npcol;
      AmyprocR = myrow;        AnprocsR = nprow;
      PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
                   &AiiR, &AiiD, &ArocR, &ArocD );
   }
   else
   {
      AiD      = IA;           AiR      = JA;
      Asrc_    = CSRC_;        Aroc     = CCOLUMN;
      AinbD    = DESCA[IMB_ ]; AinbR    = DESCA[INB_];
      AnbD     = DESCA[MB_  ]; AnbR     = DESCA[NB_ ];
      AsrcR    = DESCA[Asrc_]; Ald      = DESCA[LLD_];
      AmyprocD = myrow;        AnprocsD = nprow;
      AmyprocR = mycol;        AnprocsR = npcol;
      PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
                   &AiiD, &AiiR, &ArocD, &ArocR );
   }
   Ainb1D = PB_Cfirstnb( K, AiD, AinbD, AnbD );
   AnpD   = PB_Cnumroc( K, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD );
   Ainb1R = PB_Cfirstnb( M, AiR, AinbR, AnbR );

   Cimb   = DESCC[IMB_ ]; Cinb = DESCC[INB_];
   Cmb    = DESCC[MB_  ]; Cnb  = DESCC[NB_ ];
   Csrc   = DESCC[RSRC_]; Cld  = DESCC[LLD_];
   PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
                &Crow, &Ccol );
   Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb );
   Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb );
   Cnq   = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
/*
*  Retrieve the BLACS combine topology, compute conjugate of alpha for the
*  conjugate transpose case and set the transpose parameters to be passed to
*  the BLAS matrix multiply routine.
*/
   if( notb )
   {
      Bm     = K; Bn     = N;
      top    = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
      talpha = ALPHA; GemmTa = ( nota ? CTRAN : TrA ); GemmTb = CNOTRAN;
   }
   else
   {
      Bm     = N; Bn     = K;
      top    = *PB_Ctop( &ctxt, COMBINE, ROW,    TOP_GET );
      if( TrB == CCOTRAN )
      {
         talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha );
         GemmTb = ( ( TrA == CCOTRAN ) ? CTRAN : CCOTRAN );
      }
      else
      {
         talpha = ALPHA;
         GemmTb = ( ( TrA == CCOTRAN ) ? CCOTRAN : CTRAN );
      }
      GemmTa = CNOTRAN;
   }
/*
*  Compute descriptor Bd0 for sub( B )
*/
   PB_Cdescribe( Bm, Bn, IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
                 &Bld, &Bimb1, &Binb1, &Bmb, &Bnb, &Brow, &Bcol, Bd0 );

   Bmp = PB_Cnumroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow );
   Bnq = PB_Cnumroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol );
   if( ( Bmp > 0 ) && ( Bnq > 0 ) ) Bptr = Mptr( B, Bii, Bjj, Bld, size );
/*
*  When sub( A ) is not replicated and backward pass on sub( A ), find the
*  virtual process q owning the last row or column of sub( A ).
*/
   if( !( AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ) ) && !Afwd )
   {
      tmp = PB_Cindxg2p( M - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
      q   = MModSub( tmp, ArocR, AnprocsR );
   }
/*
*  When sub( C ) is not replicated and backward pass on sub( C ), find the
*  virtual process p owning the last row or column of sub( C ).
*/
   if( !( CisR = ( ( Crow < 0 ) || ( nprow == 1 ) ) ) && !Cfwd )
   {
      tmp = PB_Cindxg2p( M - 1, Cimb1, Cmb, Crow, Crow, nprow );
      p   = MModSub( tmp, Crow, nprow );
   }
/*
*  Loop over the virtual process grid induced by the rows or columns of
*  sub( A ) and sub( C ).
*/
   lcmb   = PB_Clcm( ( maxp = ( CisR ? 1 : nprow    ) ) * Cmb,
                     ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR );
   m      = M;
   maxpm1 = maxp - 1;

   while( m > 0 )
   {
/*
*  Initialize local virtual matrix in process (p,q)
*/
      AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) );
      Akk      = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR );
      AnpR     = PB_Cnumroc( M, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );

      Ccurrow  = ( CisR ? -1 : MModAdd( Crow,  p, nprow    ) );
      Ckk      = PB_Cg2lrem( IC, Cimb, Cmb, Ccurrow, Csrc, nprow );
      Cmp      = PB_Cnumroc( M, 0, Cimb1, Cmb, Ccurrow, Crow, nprow );

      PB_CVMinit( &VM, 0, Cmp, AnpR, Cimb1, Ainb1R, Cmb, AnbR, p, q,
                  maxp, maxq, lcmb );
/*
*  Find how many diagonals in this virtual process
*/
      npq = PB_CVMnpq( &VM );

      m  -= npq;
/*
*  Re-adjust the number of rows or columns to be (un)packed, in order to
*  average the message sizes.
*/
      if( npq ) mbb = npq / ( ( npq - 1 ) / mb + 1 );

      while( npq )
      {
         mbb = MIN( mbb, npq );
/*
*  Find out how many rows or columns of sub( A ) and sub( C ) are contiguous
*/
         PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff );

         if( nota )
         {
/*
*  Compute the descriptor DBUFA for the buffer that will contained the packed
*  columns of sub( A ).
*/
            if( ( Afr = ( ncpq < mbb ) ) != 0 )
            {
/*
*  If rows of sub( A ) are not contiguous, then allocate the buffer and
*  pack the mbb rows of sub( A ).
*/
               Abufld = mbb;
               if( AisR || ( AmyprocR == AcurrocR ) )
               {
                  Abuf   = PB_Cmalloc( AnpD * mbb * size );
                  PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb,
                              AnpD, one, Mptr( A, Akk, AiiD, Ald, size ), Ald,
                              zero, Abuf, Abufld );
               }
            }
            else
            {
/*
*  Otherwise, re-use sub( B ) directly.
*/
               Abufld = Ald;
               if( AisR || ( AmyprocR == AcurrocR ) )
                  Abuf = Mptr( A, Akk+Aoff, AiiD, Ald, size );
            }
            PB_Cdescset( DBUFA, mbb, K, mbb, Ainb1D, mbb, AnbD, AcurrocR,
                         ArocD, ctxt, Abufld );
         }
         else
         {
/*
*  Compute the descriptor DBUFA for the buffer that will contained the packed
*  columns of sub( A ).
*/
            if( ( Afr = ( ncpq < mbb ) ) != 0 )
            {
/*
*  If columns of sub( A ) are not contiguous, then allocate the buffer and pack
*  the mbb columns of sub( A ).
*/
               Abufld = MAX( 1, AnpD );
               if( AisR || ( AmyprocR == AcurrocR ) )
               {
                  Abuf   = PB_Cmalloc( AnpD * mbb * size );
                  PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb,
                              AnpD, one, Mptr( A, AiiD, Akk, Ald, size ), Ald,
                              zero, Abuf, Abufld );
               }
            }
            else
            {
/*
*  Otherwise, re-use sub( A ) directly.
*/
               Abufld = Ald;
               if( AisR || ( AmyprocR == AcurrocR ) )
                  Abuf = Mptr( A, AiiD, Akk+Aoff, Ald, size );
            }
            PB_Cdescset( DBUFA, K, mbb, Ainb1D, mbb, AnbD, mbb, ArocD,
                         AcurrocR, ctxt, Abufld );
         }

         if( notb )
         {
/*
*  Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA
*/
            PB_CInV( TYPE, NOCONJG, COLUMN, Bm, Bn, Bd0, mbb, Abuf, 0, 0,
                     DBUFA, &Aroc, &WA, WAd, &WAfr );
/*
*  Allocate space for temporary results in scope of sub( B ) -> WC
*/
            PB_COutV( TYPE, ROW,    INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
                      &WCsum );
/*
*  Local matrix-matrix multiply iff I own some data
*/
            if( Bmp > 0 && Bnq > 0 )
               gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &mbb, &Bnq, &Bmp,
                     talpha, WA, &WAd[LLD_], Bptr, &Bld, zero, WC, &WCd[LLD_] );
            if( WAfr ) free( WA );
            if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
               if( Abuf ) free( Abuf );
/*
*  Accumulate the intermediate results in WC
*/
            if( WCsum )
            {
               WCd[RSRC_] = Ccurrow;
               if( Bnq > 0 )
                  gsum2d( ctxt, COLUMN, &top, mbb, Bnq, WC, WCd[LLD_],
                          WCd[RSRC_], mycol );
            }
/*
*  Compute the descriptor DBUFC for the buffer that will contained the packed
*  rows of sub( C ). Allocate it.
*/
            if( ( Cfr = ( nrpq < mbb ) ) != 0 )
            {
/*
*  If rows of sub( C ) are not contiguous, then allocate the buffer
*/
               Cbufld = mbb; tbeta = zero;
               if( CisR || ( myrow == Ccurrow ) )
                  Cbuf = PB_Cmalloc( Cnq * mbb * size );
            }
            else
            {
/*
*  Otherwise re-use sub( C )
*/
               Cbufld = Cld; tbeta = BETA;
               if( CisR || ( myrow == Ccurrow ) )
                  Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size );
            }
            PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
                         ctxt, Cbufld );
/*
*  Cbuf := Cbuf + WC
*/
            PB_Cpaxpby( TYPE, NOCONJG, mbb, N, one, WC, 0, 0, WCd, ROW, tbeta,
                        Cbuf, 0, 0, DBUFC, ROW );
/*
*  Unpack the mbb rows of sub( C ) and release the buffer containing them.
*/
            if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
            {
               PB_CVMpack( TYPE, &VM, ROW, ROW,    UNPACKING, NOTRAN, mbb, Cnq,
                           BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
                           Cbufld );
               if( Cbuf ) free( Cbuf );
            }
            if( WCfr ) free( WC );
         }
         else
         {
/*
*  Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA
*/
            PB_CInV( TYPE, NOCONJG, ROW,    Bm, Bn, Bd0, mbb, Abuf, 0, 0,
                     DBUFA, &Aroc, &WA, WAd, &WAfr );
/*
*  Allocate space for temporary results in scope of sub( A ) -> WC
*/
            PB_COutV( TYPE, COLUMN, INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
                      &WCsum );
/*
*  Local matrix-matrix multiply iff I own some data
*/
            if( Bmp > 0 && Bnq > 0 )
               gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Bmp, &mbb, &Bnq,
                     talpha, Bptr, &Bld, WA, &WAd[LLD_], zero, WC, &WCd[LLD_] );
            if( WAfr ) free( WA );
            if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
               if( Abuf ) free( Abuf );
/*
*  Accumulate the intermediate results in WC
*/
            if( WCsum )
            {
               WCd[CSRC_] = 0;
               if( Bmp > 0 )
                  gsum2d( ctxt, ROW,    &top, Bmp, mbb, WC, WCd[LLD_], myrow,
                          WCd[CSRC_] );
            }
/*
*  Compute the descriptor DBUFC for the buffer that will contained the packed
*  rows of sub( C ). Allocate it.
*/
            if( ( Cfr = ( nrpq < mbb ) ) != 0 )
            {
/*
*  If rows of sub( C ) are not contiguous, then allocate the buffer
*/
               Cbufld = mbb; tbeta = zero;
               if( CisR || ( myrow == Ccurrow ) )
                  Cbuf = PB_Cmalloc( Cnq * mbb * size );
            }
            else
            {
/*
*  Otherwise re-use sub( C )
*/
               Cbufld = Cld; tbeta = BETA;
               if( CisR || ( myrow == Ccurrow ) )
                  Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size );
            }
            PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
                         ctxt, Cbufld );
/*
*  Cbuf := Cbuf + WC'
*/
            PB_Cpaxpby( TYPE, ( TrB == CCOTRAN ? CONJG : NOCONJG ), N, mbb,
                        one, WC, 0, 0, WCd, COLUMN, tbeta, Cbuf, 0, 0, DBUFC,
                        ROW    );
/*
*  Unpack the mbb rows of sub( C ) and release the buffer containing them.
*/
            if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
            {
               PB_CVMpack( TYPE, &VM, ROW, ROW,    UNPACKING, NOTRAN, mbb, Cnq,
                           BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
                           Cbufld );
               if( Cbuf ) free( Cbuf );
            }
            if( WCfr ) free( WC );
         }
/*
*  Update the local indexes of sub( B ) and sub( C )
*/
         PB_CVMupdate( &VM, mbb, &Ckk, &Akk );

         npq -= mbb;
      }
/*
*  Go to next or previous virtual process row or column
*/
      if( ( Cfwd      && ( p == maxpm1 ) ) ||
          ( !( Cfwd ) && ( p == 0      ) ) )
         q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
      p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
   }

   if( TrB == CCOTRAN ) free( talpha );
/*
*  End of PB_CpgemmAC
*/
}