/******************************************************************************
 *
 * Function:    Strassen-Winograd Matrix-Matrix Multiplication
 *
 * Reference:   Craig C. Douglas, Michael Heroux, Gordon Slishman, and
 *              Roger M. Smith,
 *              GEMMW:  A portable Level 3 BLAS Winograd variant of
 *              Strassen's  matrix--matrix multiply algorithm,
 *              Yale University Department of Computer Science Report
 *              YALEU/DCS/904, New Haven, CT, 1992 (submitted).
 *
 * Contacts:    na.cdouglas@na-net.ornl.gov, na.heroux@na-net.ornl.gov,
 *              slishmn@watson.ibm.com, and smith-roger@cs.yale.edu
 *
 * Date last
 * modified:    September 25, 1995
 *
 * Algorithm:   Winograd's variation of Strassen' algorithm, as outlined in
 *              "The Design and Analysis of Computer Algorithms,"
 *              by Aho, Hopcroft and Ullman, 1974, exercise 6.5, page 247.
 *
 * Returns:     C = alpha * op(A) * op(B) + beta * C, where
 *
 *                          { X,              if transX == 'N' or 'n'
 *                  op(X) = { X transpose,    if transX == 'T' or 't'
 *                          { X conj transp,  if transX == 'C' or 'c'
 *                          { X conjugate,    if transX == 'K' or 'k'
 *              and
 *                  alpha and beta are scalars of the same data type as
 *                  the matrices A, B, and C.
 *
 *              The dimensions of the matrices are as follows:
 *
 *                             rows    columns
 *                          +--------+--------+
 *                    op(A) |   M    |    K   |
 *                    op(B) |   K    |    N   |
 *                    C     |   M    |    N   |
 *                          +--------+--------+
 *
 * Naming
 * Convention:  _name, where the underscore is replaced by 
 *                  c   Single precision complex
 *                  d   Double precision real
 *                  s   Single precision real
 *                  z   Double precision complex
 *
 * Modules:     1.  _gemmw is the front end for the computational
 *                  kernel.  _gemmw maps the caller's parameters
 *                  (Fortran perhaps) to the kernel interface, validates
 *                  the parameters, and obtains sufficient auxiliary storage
 *                  (approx. 2/3*N**2 or 5/3*N**2 elements for square
 *                  matrices, as described below), calls the kernel, frees
 *                  auxiliary storage, and returns to the caller.
 *
 *              2.  _gemmb is a Level 3 BLAS header to _gemmw.
 *                  The calling sequence is identical to _gemm and _gemmw will
 *                  automatically allocate memory for its auxiliary space.
 *
 *              3.  _winos is the kernel.  It calls itself and
 *                  subroutines of some linear algebra library, for
 *                  example,
 *                      2A. LAPACK BLAS
 *                      2B. Cray scientific library BLAS
 *                      2C. IBM BLAS for the RISC System 6000
 *                      2D. IBM ESSL BLAS
 *                  The linear algebra subroutines perform ordinary
 *                  matrix and vector operations such as addition,
 *                  subtraction and multiplication of "small" matrices.
 *                  This is an internal routine that is not intended to be
 *                  called directly by the users of _gemmw.
 *
 *              4.  winolap checks to see if the active part of the matrix C
 *                  overlaps with the active part of some other matrix op(X).
 *                  It returns 0 if there is no overlap and 1 if there is.
 *                  This is an internal routine that is not intended to be
 *                  called directly by the users of _gemmw.
 *
 * Storage:     _gemmw obtains auxiliary storage dynamically for the
 *              caller (unless enough is provdied in the aux varaible).
 *              The amount obtained in elements is (rounded up):
 *
 *              (1)    (M*max(K,N) + K*N)/3 + (max(K,N) + M + K + 3N)/2 + 32    
 *
 *              If beta != 0 or either A or B overlaps with C in memory,
 *              then an additional MN amount of storage is needed.
 *
 *              The caller needs to provide adequate virtual storage,
 *              either explicitly (the aux variable) or implicitly
 *              (letting _gemmw do a malloc call itself).  Hence, the
 *              above calculation need never be performed by the user.
 *
 *              Equation (1) is easily derived.  Each level of recursion
 *              needs two major work areas, one of size bounded by
 *
 *                  (M/2+1/2)*(max(K,N)/2+1/2)
 *
 *              and the other bounded by
 *
 *                  (K/2+1/2)*(N/2+1/2).
 *
 *              When M is odd, a vector of length N/2 is required as
 *              well.  The sum is
 *
 *                  (M*max(K,N) + K*N + max(K,N) + M + K + 3N + 2)/4
 *
 *              As the storage requirement decreases by a factor of four
 *              with each level of recursion for the square terms and by
 *              a factor of two for the linear ones and as
 *
 *                  1/4 + 1/16 + 1/256 + ... = 1/3 
 *              and 
 *                  1/2 + 1/4 + 1/8 = 1,
 *
 *              the formula (1) follows trivially.
 *
 * Recursion:   The depth of recursion has no a priori bound applicable
 *              to all problems.  Instead a compilation parameter
 *              "mindim" states the value of min(M,N,K) below which a
 *              conventional matrix-matrix multiplication routine is
 *              invoked.
 *
 * Implementation:  The particular implementation used here follows the
 *                  steps below.  MK and KN are the two large buffers required
 *                  at each level of the recursion.   The quadrants of C
 *                  are used liberally.
 *
 *            Step    MK      c11     c12     c21     c22     KN      Operation
 *            ----    ----    ----    ----    ----    ----    ----    ---------
 *
 *             1.                                             S7      B22-B12
 *                                                            |
 *             2.     S3                                      |       A11-A21
 *                    |                                       |
 *             3.     +---------------------> M4 <------------+       S3*S7
 *                                            |
 *             4.     S1                      |                       A21+A22
 *                    |                       |
 *             5.     |                       |               S5      B12-B11
 *                    |                       |               |
 *             6.     +-----------------------|-----> M5 <----+       S1*S5
 *                    |                       |       |       |
 *             7.     |                       |       |       S6      B22-S5
 *                    |                       |       |       |
 *             8.     S2                      |       |       |       S1-A11
 *                    |                       |       |       |
 *             9.     +-----> M1 <------------|-------|-------+       S2*S6
 *                    |       |               |       |       |
 *            10.     S4      |               |       |       |       A12-S2
 *                    |       |               |       |       |
 *            11.     +-------|-----> M6      |       |       |       S4*B22
 *                            |       |       |       |       |
 *            12.             |       T3 <----|-------+       |       M5+M6
 *                            |       |       |       |       |
 *            13.     M2      |       |       |       |       |       A11*B11
 *                    |       |       |       |       |       |
 *            14.     +-----> T1      |       |       |       |       M1+M2
 *                    |       |       |       |       |       | 
 *            15.     |       +-----> C12     |       |       |       T1+T3
 *                    |       |               |       |       |
 *            16.     |       T2 <------------+       |       |       T1+M4
 *                    |       |                       |       |
 *            17.     |       |                       |       S8      S6-B21
 *                    |       |                       |       |
 *            18.     |       |               M7 <----|-------+       A22*S8
 *                    |       |               |       |
 *            19.     |       +-------------> C21     |               T2-M7
 *                    |       |                       |
 *            20.     |       +---------------------> C22             T2+M5
 *                    |
 *            21.     |       M3                                      A12*B21
 *                    |       |
 *            22.     +-----> C11                                     M2+M3
 *
 *
 * When M, N, or K is odd we get special cases of interest:
 *
 * Odd M/   Conceptually the approach is to imagine matrix A with its
 * odd N    odd middle row duplicated and/or B with its odd middle
 *          column duplicated.  The product would then have a duplicated
 *          middle row or middle column accordingly.  Letting the output
 *          quadrants overlap by one row or column eliminates the
 *          duplicated row and/or column produced from conceptual A and
 *          B.
 *
 * Odd K    Again, imagine B with its odd middle row duplicated, and
 *          imagine A with a column of zeroes inserted after its odd
 *          middle column.  In each operation involving A12 or A22, the
 *          first column is either omitted (because it's zero) or
 *          handled as a virtual column of zeroes.
 *
 * We recurse on conceptual A and B in all cases.
 *
 * Look for "if ( ko )" or "if ( mo )" in two places each in the code to
 * see how the special cases are handled.  There are absolutely no rank
 * one updates in this code.
 *
 *****************************************************************************/
 
#define __GEMMW_C_FILE

#include "gemmw.h"

/******************************************************************************
 *
 * Forward declarations of internal functions.
 *
 *****************************************************************************/

void    WINOS ();
extern  INT winolap ();   
 
/******************************************************************************
 *
 * _gemmw
 *
 * The front end to the computational kernel _winos.  This checks parameters,
 * allocates memory, does the final alpha*op(A)op(B) + beta*C step, and frees
 * memory.
 *
 *****************************************************************************/
 
void GEMMW ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
             aux, naux )
               char*   transa;      /* op(A) indicator                       */
               char*   transb;      /* op(B) indicator                       */
               INT*    m;           /* Rows of C and op(A)                   */
               INT*    n;           /* Cols of op(B) and C                   */
               INT*    k;           /* Cols of op(A) and Rows of op(B)       */
               FLOAT*  alpha;       /* Scalar multiplier of op(A)op(B)       */
               FLOAT*  a;           /* Pointer to coefficients of A          */
               INT*    lda;         /* Leading dimension of A, Fortran style */
               FLOAT*  b;           /* Pointer to coefficients of B          */
               INT*    ldb;         /* Leading dimension of B, Fortran style */
               FLOAT*  beta;        /* scalar multiplier of C                */
               FLOAT*  c;           /* Pointer to coefficients of C          */
               INT*    ldc;         /* Leading dimension of C, Fortran style */
               FLOAT*  aux;         /* Pointer to auxillary storage or NULL  */
               INT*    naux;        /* Number of elements in aux or <= 0     */
{
    INT     alpha_val;      /* alpha == 0, 1, or other indicator             */
    INT     beta_val;       /* beta  == 0, 1, or other indicator             */
    INT     olap;           /* olap  == 0, 1, according A or B intersects C  */
    FLOAT*  ci;             /* Temporary pointer                             */
    INT     i;              /* Temporary variable                            */
    INT     i0   = 0 ;      /* Integer 0 for routine calls                   */
    INT     i1   = 1 ;      /* Integer 1 for routine calls                   */
    INT     ldopab;         /* Leading dimension of op(A)op(B) matrix        */
    FLOAT*  mallocd;        /* Pointer to internal or user supplied aux area */
    FLOAT*  malloce;        /* Pointer to byte following aux area            */
    FLOAT*  mallocd_val;    /* Pointer to mallocd returned by malloc()       */
    FLOAT*  opab;           /* Pointer to op(A)op(B) matrix                  */
    FLOAT*  opabi;          /* Temporary pointer                             */
    INT     mn;             /* Temporary variable                            */
    INT     mx;             /* Temporary variable                            */
    INT     stg;            /* Minimum aux size required for _winos          */
 
    REAL    static neg1[] = { -1.0, 0.0 };      /* -1 in real or complex     */
    REAL    static pos1[] = {  1.0, 0.0 };      /* +1 in real or complex     */
    REAL    static zero[] = {  0.0, 0.0 };      /*  0 in real or complex     */

    /*******************************************************************
     * Are the matrices really compatible?
     ******************************************************************/
    if ( 1 > *m || 1 > *k || 1 >*n )
    {   fprintf( stderr,
                 "Winograd-Strassen received a bad k, m, or n: %d, %d, %d.\n",
                 *k, *m, *n );
        exit( 0 );
    }

    /*******************************************************************
     * Are the leading dimensions are greater than the number of rows?
     ******************************************************************/
    mx = 0;
    if ( *m > *lda || *lda <= 0 )
    {   fprintf( stderr,
                 "Winograd-Strassen received a bad LDA < M: %d < %d.\n",
                 *lda, *m );
        mx = 1;
    }
    if ( *k > *ldb || *ldb <= 0  )
    {   fprintf( stderr,
                 "Winograd-Strassen received a bad LDB < K: %d < %d.\n",
                 *ldb, *k );
        mx = 1;
    }
    if ( *m > *ldc || *ldc <= 0 )
    {   fprintf( stderr,
                 "Winograd-Strassen received a bad LDC < M: %d < %d.\n",
                 *ldc, *m );
        mx = 1;
    }
    if ( ! ( *transa == 'N' || *transa == 'n' ||
             *transa == 'T' || *transa == 't' ||
             *transa == 'C' || *transa == 'c' ||  
             *transa == 'K' || *transa == 'k' ) )
    {   fprintf( stderr,
                 "Winograd-Strassen received bad TRANSA: %.1s\n", transa );
        mx = 1;
    }
    if ( ! ( *transb == 'N' || *transb == 'n' ||
             *transb == 'T' || *transb == 't' ||
             *transb == 'C' || *transb == 'c' || 
             *transb == 'K' || *transb == 'k' ) )
    {   fprintf( stderr,
                 "Winograd-Strassen received bad TRANSB: %.1s\n", transb );
        mx = 1;
    }
    if ( mx )
        exit( 0 );
 
    /*******************************************************************
     * Check for overlap between A and C or B and C.
     ******************************************************************/
    i = ( *transa == 'N' || *transa == 'n' ||
          *transa == 'K' || *transa == 'k' );
    olap  = winolap( c, *ldc * sizeof(FLOAT), *n, *m * sizeof(FLOAT),
                     a, *lda * sizeof(FLOAT), 
                     i ? *k : *m, ( i ? *m : *k ) * sizeof(FLOAT) );
    i = ( *transb == 'N' || *transb == 'n' ||
          *transb == 'K' || *transb == 'k' );
    olap |= winolap( c, *ldc * sizeof(FLOAT), *n, *m * sizeof(FLOAT),
                     b, *ldb * sizeof(FLOAT), 
                     i ? *n : *k, ( i ? *k : *n ) * sizeof(FLOAT) );

    /*******************************************************************
     * Determine if alpha and beta are 0, 1, or other.  Set ..._val
     * to be 0, 1, or 2, respectively.
     ******************************************************************/
#if FLOAT_TYPE == 1 || FLOAT_TYPE == 2
    alpha_val = *alpha == 0.0 ? 0 : ( *alpha == 1.0 ? 1 : 2 );
    beta_val  = *beta  == 0.0 ? 0 : ( *beta  == 1.0 ? 1 : 2 );
#else
    if ( IM(*alpha) != 0.0 )
        alpha_val = 2;
    else
        alpha_val = RE(*alpha) == 0.0 ? 0 : ( RE(*alpha) == 1.0 ? 1 : 2 );
    if ( IM(*beta) != 0.0 )
        beta_val = 2;
    else
        beta_val = RE(*beta) == 0.0 ? 0 : ( RE(*beta) == 1.0 ? 1 : 2 );
#endif


    /*******************************************************************
     * Check for some special cases:
     *      alpha_val == 0 and beta_val == 0 => zero C
     *      beta_val == 2                    => scale C by beta
     *      alpha_val == 0 and beta_val == 1 => no work at all
     * Return if alpha == 0 since C = beta*C now.
     ******************************************************************/
    mn = *m * *n;
    if ( alpha_val == 0 && beta_val == 0 )
    {   if ( *ldc == *m )
            VYAX( &mn, zero, zero, &i0, c, &i1 );
        else
        {   ci = c;
            for ( i = 0 ; i < *n ; i++ )
            {   VYAX( m, zero, zero, &i0, ci, &i1 );
                ci += *ldc;
            }
        }
        return;
    }

    /*******************************************************************
     * How much auxillary space is needed and can we use what was
     * passed to us?  If beta  != 0, space is required for saving C.
     ******************************************************************/
    opab    = c;
    ldopab  = *ldc;
    mallocd = mallocd_val = aux;

    mx = *k > *n ? *k : *n;
    stg = (*m * mx + *k * *n)/3 + (mx + *m + *k  + 3 * *n)/2 + 32 ;
    if ( beta_val | olap )
        stg += *m * *n; 
    if ( *naux < stg )
    {   if ( (FLOAT*) 0 == ( mallocd_val = mallocd = 
                           ( FLOAT* ) malloc( stg*sizeof( FLOAT ) ) ) )
        {   fprintf( stderr, "Malloc error in Winograd-Strassen\n" );
            exit( 0 );
        }
    }

    if ( beta_val | olap )
    {   opab     = mallocd;
        ldopab   = *m;
        mallocd += *m * *n;
    }

    malloce = mallocd + (*naux > stg ? *naux : stg );

    /*******************************************************************
     * Call _winos to calculate op(A)op(B).
     ******************************************************************/
    if ( alpha_val )
    WINOS ( a, *lda, transa, 
            b, *ldb, transb,  
            opab, ldopab,
            *m, *k, *n,
            mallocd, malloce );
 
    /*******************************************************************
     * Complete C = alpha*op(A)op(B) + beta*C.
     *     beta != 0                => C = alpha * op(A)op(B) + beta * C
     *     beta == 0 and alpha != 1 => C = alpha * op(A)op(B)
     ******************************************************************/
    if ( beta_val | olap )
    {   if ( *ldc == *m )
        {   if ( beta_val != 1 )
                VYAX( &mn, beta, c, &i1, c, &i1 );
            VAXPY( &mn, alpha, opab, &i1, c, &i1 );
        }
        else
        {   ci = c;
            opabi = opab;
            for ( i = 0 ; i < *n ; i++ )
            {   if ( beta_val != 1 )
                    VYAX( m, beta, ci, &i1, ci, &i1 );
                VAXPY( m, alpha, opabi, &i1, ci, &i1 );
                ci += *ldc;
                opabi += *m;
            }
        }
    }
    else if ( alpha_val == 2 )
    {   if ( *ldc == *m )
            VYAX( &mn, alpha, c, &i1, c, &i1 );
        else
        {   ci = c;
            for ( i = 0 ; i < *n ; i++ )
            {  VYAX( m, alpha, ci, &i1, ci, &i1 );
                ci += *ldc;
            }
        }
    }

    /*******************************************************************
     * Free any space allocated in _gemmw and then return.
     ******************************************************************/
    if ( mallocd_val != aux )
        free( mallocd_val );

    return;
}

/******************************************************************************
 *
 * _gemmb
 *
 * This can be substituted for calls to _gemm without changing the arguments.
 * This is a strict Level 3 BLAS routine.  It merely calls _gemmw and lets
 * that routine do everything including dynamic memory allocation of aux.
 *
 *****************************************************************************/
 
void GEMMB ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc )
               char*   transa;      /* op(A) indicator                       */
               char*   transb;      /* op(B) indicator                       */
               INT*    m;           /* Rows of C and op(A)                   */
               INT*    n;           /* Cols of op(B) and C                   */
               INT*    k;           /* Cols of op(A) and Rows of op(B)       */
               FLOAT*  alpha;       /* Scalar multiplier of op(A)op(B)       */
               FLOAT*  a;           /* Pointer to coefficients of A          */
               INT*    lda;         /* Leading dimension of A, Fortran style */
               FLOAT*  b;           /* Pointer to coefficients of B          */
               INT*    ldb;         /* Leading dimension of B, Fortran style */
               FLOAT*  beta;        /* scalar multiplier of C                */
               FLOAT*  c;           /* Pointer to coefficients of C          */
               INT*    ldc;         /* Leading dimension of C, Fortran style */
{
    INT     n0;

    n0 = 0;
    GEMMW ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
            NULL, &n0 );
}

/******************************************************************************
 *
 * _winos
 *
 * The recursive routine that does all of the computational work.  The last
 * argument is a pointer into an already allocated chunk of memory.  It is
 * freed by strassen after strasse returns.
 *
 * This is an internal routine that is not intended to be called
 * directly by the users of _gemmw.
 *
 *****************************************************************************/
 
void WINOS ( a, lda, transa,  
             b, ldb, transb,  
             c, ldc,
             m, k, n,
             mallocd, malloce )

               FLOAT*  a;           /* Pointer to coefficients of A          */
               INT     lda;         /* Leading dimension of A, Fortran style */
               char*   transa;      /* transposition flag for matrix a       */
               FLOAT*  b;           /* Pointer to coefficients of B          */
               INT     ldb;         /* Leading dimension of B, Fortran style */
               char*   transb;      /* transposition flag for matrix b       */
               FLOAT*  c;           /* Pointer to coefficients of C          */
               INT     ldc;         /* Leading dimension of C, Fortran style */
               INT     m;           /* Rows of C and op(A)                   */
               INT     k;           /* Columns of C and op(A), rows of op(B) */
               INT     n;           /* Rows of op(B) and columns of op(A)    */
               FLOAT*  mallocd;     /* Pointer to iauxillary area            */ 
               FLOAT*  malloce;     /* Pointer to byte following aux         */ 
{
    char    static cn[] = { "N" };  /* Treat matrix normally                 */
    char    ptransa;                /* physically transposed                 */
    char    ptransb;                /* physically transposed                 */
    char    conjuga;                /* Conjugate only, no transpose          */
    char    conjugb;                /* Conjugate only, no transpose          */
    INT     i1   = 1 ;              /* Integer 1 for routine calls           */
    REAL    static neg1[] = { -1.0, 0.0 };      /* -1 in real or complex     */
    REAL    static pos1[] = {  1.0, 0.0 };      /* +1 in real or complex     */
    REAL    static zero[] = {  0.0, 0.0 };      /*  0 in real or complex     */
#ifdef Library_NAG
    INT     static  ifail;          /* NAG error return variable             */
#endif
 
    FLOAT*  mk;                     /* Pointer to mk workspace in aux        */
    FLOAT*  kn;                     /* Pointer to kn workspace in aux        */
    FLOAT*  m5;                     /* Pointer to row in aux for M5          */

    INT     kr;                     /* K/2 rounded down                      */
    INT     kt;                     /* K/2 rounded up                        */
    INT     ko;                     /* K odd indicator                       */
    INT     mr;                     /* M/2 rounded down                      */
    INT     mt;                     /* M/2 rounded up                        */
    INT     mo;                     /* M odd indicator                       */
    INT     nr;                     /* N/2 rounded down                      */
    INT     nt;                     /* N/2 rounded up                        */
    INT     no;                     /* N odd indicator                       */

    FLOAT*  a12;                    /* Pointer to upper right submatrix A    */
    FLOAT*  a21;                    /* Pointer to lower left  submatrix A    */
    FLOAT*  a22;                    /* Pointer to lower right submatrix A    */
    FLOAT*  b12;                    /* Pointer to upper right submatrix B    */
    FLOAT*  b21;                    /* Pointer to lower left  submatrix B    */
    FLOAT*  b22;                    /* Pointer to lower right submatrix B    */

    INT     ar1;                    /* Offset to next row of A               */
    INT     ac1;                    /* Offset to next col of A               */
    INT     br1;                    /* Offset to next row of B               */
    INT     bc1;                    /* Offset to next col of B               */
 
    /*******************************************************************
     * If the matrices are small, then use the classical algorithm.
     ******************************************************************/
    if ( m < mindim || k < mindim || n < mindim )
    {   mt = (REAL*)malloce - (REAL*)mallocd - 2;
        MATMUL( a, &lda, transa, b, &ldb, transb, c, &ldc, &m, &k, &n, 
                mallocd, &mt );                    
        return;
    }
 
    /*******************************************************************
     * This is the Winograd variant's of Strassen.
     ******************************************************************/
 
    /*******************************************************************
     * First, determine properties about the 4 submatrices of each of
     *  A, B, and C.
     ******************************************************************/
    mt = m / 2;         /* truncated m */
    mr = m - mt;        /* rounded up m */
    mo = mr - mt;       /* odd m indicator */
 
    kt = k / 2;         /* truncated k */
    kr = k - kt;        /* rounded up k */
    ko = kr - kt;       /* odd k indicator */
 
    nt = n / 2;         /* truncated n */
    nr = n - nt;        /* rounded up n */
    no = nr - nt;       /* odd n indicator */

    if ( *transa == 'N' || *transa == 'n' || *transa == 'K' || *transa == 'k' )
    {   a12 = a + kt*lda;
        a21 = a + mt; 
        a22 = a12 + mt;
        ar1 = 1;
        ac1 = lda;
        ptransa = 'N';
        conjuga = ( *transa == 'K' || *transa == 'k' ) ? 'K' : 'N';
    }
    else
    {   a12 = a + kt;
        a21 = a + mt*lda;
        a22 = a21 + kt;
        ar1 = lda;
        ac1 = 1;
        ptransa = 'T';
        conjuga = ( *transa == 'C' || *transa == 'c' ) ? 'K' : 'N';
    }
     
    if ( *transb == 'N' || *transb == 'n' || *transb == 'K' || *transb == 'k' )
    {   b12 = b + nt*ldb;
        b21 = b + kt; 
        b22 = b12 + kt;
        br1 = 1;
        bc1 = ldb;
        ptransb = 'N';
        conjugb = ( *transb == 'K' || *transb == 'k' ) ? 'K' : 'N';
    }
    else
    {   b12 = b + nt;
        b21 = b + kt*ldb;
        b22 = b21 + nt;
        br1 = ldb;
        bc1 = 1;
        ptransb = 'T';
        conjugb = ( *transb == 'C' || *transb == 'c' ) ? 'K' : 'N';
    }
 
    /*******************************************************************
     * Grab some free storage for the aux matrices and point to the
     * next level's scratch storage.
     ******************************************************************/
    mk      = mallocd;
    kn      = mk + (mr * (kr > nr ? kr : nr));
    mallocd = kn + (kr * nr);
 
    /*******************************************************************
     * Finally, do Strassen-Winograd, recursion and all.
     ******************************************************************/
 
/*S7*/  MATSUB(b22,         &ldb,  &ptransb,        /* B22-B12         */
               b12,         &ldb,  &ptransb,        /*                 */
               kn,          &kr,                    /*                 */
                            &kr,   &nr);            /* into kn         */
 
/*S3*/  MATSUB(a+ar1*mo,    &lda,  &ptransa,        /* A11-A21         */
               a21+ar1*mo,  &lda,  &ptransa,        /*                 */
               mk,          &mt,                    /*                 */
                            &mt,   &kr);            /* into mk         */
 
/*M4*/   WINOS(mk,          mt,    &conjuga,        /* S3*S7==mk*kn    */
               kn,          kr,    &conjugb,        /*                 */
               c+mr,        ldc,                    /*                 */
               mt,          kr,    nr,              /* into C21        */
               mallocd,     malloce );
 
/*S1*/  if ( ko )                                   /*                 */
             VCOPY(&mr,                             /* copy A21 col 1  */
                   a21,         &ar1,               /*                 */
                   mk,          &i1);               /* into mk         */
 
        MATADD(a21+ac1*ko,  &lda,  &ptransa,        /* A21+A22         */
               a22+ac1*ko,  &lda,  &ptransa,        /*                 */
               mk+mr*ko,    &mr,                    /*                 */
                            &mr,   &kt);            /* into mk         */
 
/*S5*/  MATSUB(b12,         &ldb,  &ptransb,        /* B12-B11         */
               b,           &ldb,  &ptransb,        /*                 */
               kn,          &kr,                    /*                 */
                            &kr,   &nr);            /* into kn         */
 
/*M5*/   WINOS(mk,          mr,    &conjuga,        /* S1*S5=mk*kn     */
               kn+kr*no,    kr,    &conjugb,        /*                 */
               c+ldc*nr+mt, ldc,                    /*                 */
               mr,          kr,    nt,              /* into C22        */
               mallocd,     malloce );

/*S6*/  MATSUB(b22,         &ldb,  &ptransb,        /* B22-S5          */
               kn,          &kr,   cn,              /*                 */
               kn,          &kr,                    /*                 */
                            &kr,   &nr);            /* into kn         */
 
/*S2*/  MATSUB(mk,          &mr,   cn,              /* S1-A11          */
               a,           &lda,  &ptransa,        /*                 */
               mk,          &mr,                    /*                 */
                            &mr,   &kr);            /* into mk         */
 
/*M1*/   WINOS(mk,          mr,    &conjuga,        /* S2*S6=mk*kn     */
               kn,          kr,    &conjugb,        /*                 */
               c,           ldc,                    /*                 */ 
               mr,          kr,    nr,              /* into C11        */
               mallocd,     malloce );
 
/*S4*/  if ( ko )                                   /* if common dim   */
              VYAX(&mr,         neg1,               /* is odd, 1st col */
                   mk,          &i1,                /* of A12 = 0.     */
                   mk,          &i1);               /* into mk         */
 
        MATSUB(a12+ac1*ko,  &lda,  &ptransa,        /* A12-S2=A12-mk   */
               mk+mr*ko,    &mr,   cn,              /*                 */
               mk+mr*ko,    &mr,                    /*                 */
                            &mr,   &kt);            /* into mk         */
 
/*M6*/ if ( mo )                                    /* M5/M6 stg prob. */
       {     m5 = mallocd;                          /* -> aux vector   */  
             mallocd += nt;                         /* length = nt     */
               
             VCOPY(&nt,                             /* top row M5 to   */
                   c+ldc*nr+mt, &ldc,               /* aux M5          */
                   m5,          &i1);                                    
       }
 
         WINOS(mk,          mr,    &conjuga,        /* S4*B22=mk*B22   */
               b22+bc1*no,  ldb,   transb,          /*                 */
               c+ldc*nr,    ldc,                    /*                 */
               mr,          kr,    nt,              /* into C12        */ 
               mallocd,     malloce );

       if ( mo )                                    /* if m odd        */
       {    VAXPY(&nt,          pos1,               /* row 0 of M6+M5  */
                  m5,          &i1,
                  c+ldc*nr,    &ldc);
            mallocd -= nt;
        }
 
        MATADD(c+ldc*nr+mo, &ldc,  cn,              /* M6+M5=C12+C22,  */
               c+ldc*nr+mr, &ldc,  cn,              /* excl. row 0     */
               c+ldc*nr+mo, &ldc,                   /*                 */
                            &mt,   &nt);            /* into C12        */
 
/*M2*/   WINOS(a,           lda,   transa,          /* A11*B11         */
               b,           ldb,   transb,          /*                 */
               mk,          mr,                     /*                 */
               mr,          kr,    nr,              /* into mk         */
               mallocd,     malloce );
 
/*T1*/  MATADD(mk,          &mr,   cn,              /* M1+M2, i.e.,    */
               c,           &ldc,  cn,              /* W1+C11          */
               c,           &ldc,                   /*                 */
                            &mr,   &nr);            /* into C11        */
 
/*C12*/ MATADD(c+ldc*no,    &ldc,  cn,              /* T1+M5+M6=C12+C22*/
               c+ldc*nr,    &ldc,  cn,              /*                 */
               c+ldc*nr,    &ldc,                   /*                 */
                            &mr,   &nt);            /* into C12 (done) */
 
/*T2*/  MATADD(c+mo,        &ldc,  cn,              /* T1+M4=C11+C21   */
               c+mr,        &ldc,  cn,              /*                 */
               c+mo,        &ldc,                   /*                 */
                            &mt,   &nr);            /* into C11        */
 
/*S8*/  MATSUB(kn,          &kr,   cn,              /* S6-B21=kn-B21   */
               b21,         &ldb,  &ptransb,        /*                 */
               kn,          &kr,                    /*                 */
                            &kr,   &nr);            /* into kn         */
 
/*M7*/   WINOS(a22+ac1*ko+ar1*mo,  lda, transa,     /* A22*S8=A22*kn   */
               kn+ko,       kr,    &conjugb,        /*                 */
               c+mr,        ldc,                    /*                 */
               mt,          kt,    nr,              /* into C21        */
               mallocd,     malloce );
 
/*C21*/ MATSUB(c+mo,        &ldc,  cn,              /* T2-M7=C21-C11   */
               c+mr,        &ldc,  cn,              /*                 */
               c+mr,        &ldc,                   /*                 */
                            &mt,   &nr);            /* into C21 (done) */
 
/*C22*/ MATADD(c+ldc*nr+mr, &ldc,  cn,              /* T2+M5=C21+M5    */
               c+ldc*no+mo, &ldc,  cn,              /*                 */
               c+ldc*nr+mr, &ldc,                   /*                 */
                            &mt,   &nt);            /* into C22 (done) */
 
/*M3*/   WINOS(a12+ac1*ko,  lda,   transa,          /* A12*B21         */
               b21+br1*ko,  ldb,   transb,          /*                 */
               c,           ldc,                    /*                 */
               mr,          kt,    nr,              /* into C11        */
               mallocd,     malloce );
 
/*C11*/ MATADD(c,           &ldc,  cn,              /* M2+M3=C11+mk    */
               mk,          &mr,   cn,              /*                 */
               c,           &ldc,                   /*                 */
                            &mr,   &nr);            /* into C11 (done) */
 
    /*******************************************************************
     * Return to either _winos or _gemmw.
     ******************************************************************/
    return;
}
