// File: s_bls2.cc -- implents class s_bls2
// Author: Suvrit Sra
// Date: 14 Nov, 2003

/*************************************************************************
 THE ORIGINAL SVDPAKC COPYRIGHT
                           (c) Copyright 1993
                        University of Tennessee
                          All Rights Reserved                          
 *************************************************************************/

#include <cstdio>
#include <cstdlib>
#include <cerrno>
#include <cmath>
#include <cstring>

#include <unistd.h>
#include <fcntl.h>

#include "s_bls2.h"

using namespace ssvd;
/***********************************************************************
 *                                                                     *
 *                        blklan2()                                    *
 *                  Block Lanczos SVD Alogrithm                        *
 *                                                                     *
 ***********************************************************************/
/***********************************************************************

   Description
   -----------

   blklan2.c is designed to compute singular values and singular vectors
   of a large sparse matrix A.  The singular values (and singular vectors)
   of A are determined by the eigenvalues (and eigenvectors) of the matrix
   B, where B = A'A.

   The eigenvalues of B are the squares of the singular values of A, the
   eigenvectors of B correspond to the right singular vectors of A only.
   The left singular vectors of A are determined by 

		u = 1/sigma A*v,

   where {u, sigma, v} is a singular triplet of A.  This version of blklan2 
   is designed to approximate the ik-largest singular triplets of A. 
   This hybrid block Lanczos procedure consists of five phases:

   Phase 1:  Block Lanczos outer iteration to yield a symmetric block
             tridiagonal matrix S whose eigenvalues approximate those
	     of the matrix B = A'A.  Total (or complete)
	     re-orthogonalization is used here.

   Phase 2:  Lanczos method for tridiagonalizing the S matrix from
	     Phase 1 to yield the tridiagonal matrix T whose eigenvalues
	     also approximate those of B.  Total (or complete)
	     re-orthogonalization is used for this Lanczos recursion.  
	     A polong Lanczos method (single vector) is used if a blocksize
	     of 1 is encountered via normal deflation.

   Phase 3:  Apply an appropriate QL iteration to diagonalize T and
	     hence produce approximate eigenvalues (array alpha)
	     of matrix B = A'A and hence the squares of the singular
	     values of the original sparse matrix A.

   Phase 4:  Convergence test using a user-supplied residual tolerance.

   Phase 5:  Iteration restart with orthogonalization with respect
	     to any (all) converged eigenvectors of B = A'A.



   Arguments
   ---------

   (input)
   fp       polonger to output file
   nnzero   number of nonzeros in matrix A
   m        row dimension of the sparse matrix A whose SVD is sought
   n        column dimension of the sparse matrix A whose SVD is sought
   ik       number of singular triplets desired
   ib       initial block size for outer iteration
   ic       upper bound for dimension of Krylov subspace generated via
	      outer iteration.  It is the maximum dimension for the block
	      upper-bidiagonal matrix S generated in Phase 1 above.
   tol     user-specified tolerance for approximate singular triplets
   maxit   maximum number of outer iterations allowed

   (output)
   iko     number of singular triplets approximated
   ico     last bound for dimension of Krylov subspace used within outer 
	      iteration
   ibo     final block size used in outer iteration
   eig     linear array containing the iko approximate singular values
   v       two-dimensional array containing the iko approximate right 
	      singular vectors corresponding to the approximate singular 
	      values in array eig
   res     linear array containing the iko residuals of the approximate 
	   singular triplets
   memory  memory storage needed in bytes


   External parameters
   -------------------

   Defined and documented in bls2.h


   Local parameters
   ----------------

    k        current # of desired triplets (exit when reduced to 0)
    k0       count of triplets found in current iteration
    nb       current block size
    nc       size of current subspace
    ns       number of blocks in current iteration
   

   Functions called
   --------------

   BLAS         dgemv, dgemm2, tql2, orthg
   MISC         validate, random, imin
   BLS2         polong2, block2


   NOTE:  Unlike Fortran, C stores arrays in row-major order.  For the
	  sake of efficiency, most matrices and vectors in the algorithm
	  are transposed, i.e., each vector and each column of a
	  matrix are stored row-wise.

 ***********************************************************************/

long s_bls2::blklan2(FILE* fp, long nnzero, long m, long n, long ik, double *v,
	     double *eig, long ic, long ib, double tol, double *res, 
	     long maxit, long *iko, long *ico, long *ibo, long *memory)

{
   long irand;
   long nbold, cont, ns, nb, nc, k, k0, length, memsize;
   long jj, ii, kk, ll, final, flag, i, j, *index;
   double **sp, **rp, **bigsp, **workptr2, **tempptr2;
   double *s, *r, *bigs, *workptr1, *tempptr1;
   double *ptr1, *ptr2;

   nb = ib;
   nc = ic;
   k  = ik;
   k0 = 0;
   ns = nc / nb;

   /* reset converged vector counter */
   iconv = 0;
   irand = (long) SEED;

   /* determine number of blocks for first iteration */
   if (ns < MINBLKS) {
      nb = nc / MINBLKS;
      ns = nc / nb;
   }
   /* return upon error on input */
   if (validate(fp, nnzero, m, n, *ico, *ibo, ik, tol))
      return(-1);

   /* allocate memory for linear arrays of doubles s, r, bigs, y, vv
    * v0, tres, uvtmp, z, p, q, t, alpha, beta, pp, qq, v */

   memsize = sizeof(double) * 
	     (nb * (nc + nc - nb + n + 1)     +
              nc * (nb + 2 * n + 3 + nc + nc) +
	      ik * (n + n)                    + 
	      4 * n);                   

   *memory += memsize;
 /***********************************************************************
  *         Allocate work area and initialize polongers                  *
  *   polonger              size                                         *
  * s     (sp)          nc      by nb                                   *
  * r     (rp)          (nc-nb) by nb                                   *
  * bigs  (bigsp)       nc      by (nb+1)                               *
  * y     (yp)          nb      by n                                    *
  * vv    (vvp)         nc      by n                                    *
  * v0                  ik      by n                                    *
  * tres                nb                                              *
  * uvtmp (uvtmpp)      (nc+ik) by n                                    *
  * z                   n                                               *
  * p                   n                                               *
  * q                   n                                               *
  * t                   n                                               *
  * alpha               nc                                              *
  * beta                nc                                              *
  * pp    (ppp)         nc      by nc                                   *
  * qq    (qqp)         nc      by nc                                   * 
  * v                   ik      by n  (allocated in bls2.c)             *
  * ztemp (ztempp)      nc      by n  (allocated in bls2.c)             *
  * index               nb                                              *
  ***********************************************************************/

   if (!(workptr1 = (double *)malloc(memsize)) ||
       !(index   = (long   *)malloc(sizeof(long) * nb))){
      perror("FIRST MALLOC FAILED in BLKLAN2()");
      exit(errno);
   }

   /* memory for linear array index */

   *memory += sizeof(long) * nb;

   /* allocate memory for arrays of polongers sp, rp, bigsp, yp, vvp,
    * uvtmpp, ppp, qqp.  This will allow the memory areas s, r, bigs, y,
    * vv, uvtmp, pp and qq to be addressed as 2-dimensional arrays */

   memsize = sizeof(double *) * (7 * nc + nb + ik);
   *memory += memsize;
   if (!(workptr2 = (double **)malloc(memsize))){
      perror("SECOND MALLOC FAILED in BLKLAN2()");
      exit(errno);
   }
   tempptr1  = workptr1;
   tempptr2  = workptr2;

   length    = nc * nb;
   s         = tempptr1;
   tempptr1 += length;
   sp        = tempptr2;
   tempptr2 += nc;
   j = 0;
   for (i = 0; i < length; i += nb) sp[j++] = &s[i];

   length = (nc - nb) * nb;
   r         = tempptr1; 
   tempptr1 += length;
   rp        = tempptr2;
   tempptr2 += (nc - nb);
   j = 0;
   for (i = 0; i < length; i += nb) rp[j++] = &r[i];

   length    = nc * (nb + 1);
   bigs      = tempptr1;
   tempptr1 += length;
   bigsp     = tempptr2;
   tempptr2 += nc;
   j = 0;
   for (i = 0; i < length; i += nb + 1) bigsp[j++] = &bigs[i];

   length    = n * nb;
   y         = tempptr1;
   tempptr1 += length;
   yp        = tempptr2;
   tempptr2 += nb;
   j = 0;
   for (i = 0; i < length; i += n) yp[j++] = &y[i];

   length    = n * nc;
   vv        = tempptr1;
   tempptr1 += length;
   vvp       = tempptr2;
   tempptr2 += nc;
   j = 0;
   for (i = 0; i < length; i += n) vvp[j++] = &vv[i];

   v0        = tempptr1;
   tempptr1 += n * ik;

   tres      = tempptr1;
   tempptr1 += nb;

   length    = n * (nc + ik);
   uvtmp     = tempptr1;
   tempptr1 += length;
   uvtmpp    = tempptr2;
   tempptr2 += nc + ik;
   j = 0;
   for (i = 0; i < length; i += n) uvtmpp[j++] = &uvtmp[i];

   z         = tempptr1;
   tempptr1 += n;

   p         = tempptr1;
   tempptr1 += n;

   q         = tempptr1;
   tempptr1 += n;

   t         = tempptr1;
   tempptr1 += n;

   alpha     = tempptr1;
   tempptr1 += nc;

   beta      = tempptr1;
   tempptr1 += nc;

   length    = nc * nc;
   pp        = tempptr1;
   tempptr1 += length;
   qq        = tempptr1;
   tempptr1 += length;
   ppp       = tempptr2;
   tempptr2 += nc;
   qqp       = tempptr2;
   tempptr2 += nc;
   j = 0;
   for (i = 0; i < length; i += nc) {
      ppp[j]   = &pp[i];
      qqp[j++] = &qq[i];
   }
   length = n * nb;
   for (i = 0; i < length; i++) vv[i] = mrandom(&irand);

   orthg(nb, 0, n, yp, vvp, ztemp); 
   iter = 0;
   while (iter < maxit) {
      nn = nb * ns;
      cont = TRUE;
      iter += 1;

      /*------------------------------------------------------------------*
       *           PHASE 1 and PHASE 2 (block algorithm)                  *
       *------------------------------------------------------------------*/

      if (nb > 1) 
	 block2(sp, rp, bigsp, m, n, nb, ns, &irand);
      else {
	 if (nbold != 1) k0 = 0;
	 /*------------------------------------------------------------------*
	  *                   PHASE 2A (polong algorithm)                     *
	  *------------------------------------------------------------------*/

	 cont = polong2(&k, m, n, v, res, eig, tol);
      }
      if (!cont) break;

      /*------------------------------------------------------------------*
       *                        PHASE 3                                   *
       *------------------------------------------------------------------*/

      /* solve symmetric tridiagonal EVP */
      for (i = 0; i < nn; i++) 
         for (j = 0; j < nn; j++) qqp[i][j] = ZERO;
      for (i = 0; i < nn; i++) qqp[i][i] = ONE;

      /* resort alpha's and rows of pp (need descending order) */
      if (tql2(nn, alpha, beta, qqp)) break;

      i = nn - 1;
      for (jj = 0; jj < nn; jj++) {
	 z[jj] = alpha[i];
	 for (ii = 0; ii < nn; ii++) uvtmpp[jj][ii] = qqp[i][ii];
	 i--;
      }
      for (jj = 0; jj < nn; jj++) {
	 alpha[jj] = z[jj];
	 for (ii = 0; ii < nn; ii++) qqp[jj][ii] = uvtmpp[jj][ii];
      }

      /*------------------------------------------------------------------*
       *                        PHASE 4                                   *
       *------------------------------------------------------------------*/

      nbold = nb;
      if (iter > 1 && nb > 1) {
	 k0 = 0;
	 final = imin(nb, k);
	 for (i = 0; i < final; i++) {
	    if (fabs(tres[i]) <= tol) {
	      index[k0] = i;
	      eig[iconv + k0] = alpha[i];
	      res[iconv + k0] = tres[i];
	      k0 += 1;
            }
         }
	 if (nb >= k) nb -= k0;
	 else nb = imin(nb, k - k0);
	 nc -= k0;
	 k -= k0;
	 if (k) ns = nc / nb;
         if (ns < MINBLKS) {
            nb = nc / MINBLKS;
            ns = nc / nb;
         }
      }

      /*------------------------------------------------------------------*
       *                  PHASE 5 (back transformation)                   *
       *------------------------------------------------------------------*/

      if (nbold > 1) {
         dgemm2(NTRANSP, NTRANSP, nn, nn, nn, ONE, qqp, ppp, ZERO, ztempp);
         dgemm2(NTRANSP, NTRANSP, nb + k0, n, nn, ONE, ztempp, vvp, ZERO, uvtmpp);
	 ptr1 = uvtmp;
	 length = (nb + k0) * n;
	 for (i = 0; i < length; i++) v[i] = *ptr1++;
      }
      else dgemv(TRANSP, nn, n, ONE, vvp, qq, ZERO, v);
      if (k0) {
         final = iconv + k0;
         ii = 0;
         for (jj = iconv; jj < final; jj++) {
	    ptr1 =  &v[n * index[ii++]];
	    ptr2 = &v0[n * jj];
            for (i = 0; i < n; i++) *ptr2++ = *ptr1++;
         }
         iconv = final;
         if (k <= 0) {
	    iter -= 1;
	    cont = FALSE;
         }
         if (cont) {
            kk = 0;
            final = nb + k0;
            for (jj = 0; jj < final; jj++) {
	       flag = FALSE;
	       ll = 0;
	       while (ll < k0 && !flag) {
	          if (index[ll] != jj) ll++;
	          else flag = TRUE;
	       }
	       if (!flag) {
	          ptr1 = &vv[n * kk];
	          ptr2 =  &v[n * jj];
                  for (i = 0; i < n; i++) *ptr1++ = *ptr2++;
	          kk++;
	       }
            }
	 }
      }
      else { 
         ptr1 =  v;
         for (jj = 0; jj < nb; jj++)
            for (i = 0; i < n; i++) vvp[jj][i] = *ptr1++; 
      }
      if (!cont) break;
   }
   ptr1 =  v;
   ptr2 = v0;
   for (j = 0; j < iconv; j++) 
      for (i = 0; i < n; i++) *ptr1++ = *ptr2++;
   *iko = ik - k;
   *ibo  = nb;
   *ico = nc;

   free(workptr1);
   free(workptr2);
   free(index);
   return(0);
}


/***********************************************************************
 *                                                                     *
 *                        polong2()                                     *
 *                                                                     *
 ***********************************************************************/
/***********************************************************************

   Description
   -----------

   This function is a single-vector Lanczos tridiagonalization procedure
   (degenerate case of block size = 1 for block2.c) which is used when
   normal deflation reduces the current block size to 1.

   The function returns DONE when the number of remaining triplets to be 
   approximated is less than or equal to zero.  Otherwise, it returns
   status CONTINUE.
   

   Arguments
   ---------

   (input)
   k        current # of desired triplets
   m        row dimension of the sparse matrix A whose SVD is sought
   n        column dimension of the sparse matrix A whose SVD is sought
   tol     user-specified tolerance for approximate singular triplets
   wp      work space

   (output)
   sing    linear array containing the iko approximate singular values
   res     linear array containing the iko residuals of the approximate 
	   singular triplets
   alpha   diagonal elements of symmetric tridiagonal matrix from the 
	   inner recursion
   beta    off-diagonal elements of symmetric tridiagonal matrix from the 
	   inner recursion

   External parameters
   -------------------

   Defined and documented in bls2.h


   Functions called
   --------------

   BLAS         ddot, enorm, daxpy, orthg
   USER         opb

 ***********************************************************************/

long s_bls2::polong2(long *k, long m, long n, 
            double *v, double *res, double *eig, double tol)

{
   double *ptr, dum, znm;
   long i, j, jj, convpj;

   for (j = 0; j < n; j++) p[j] = vv[j];
   for (j = 0; j < n; j++) z[j] = ZERO;

   for (j = 0; j < nn; j++) {
      opb(m, n, p, q);
      daxpy(n, ONE, q, 1, z, 1);
      alpha[j] = ddot(n, p, 1, z, 1);
      if (j == nn - 1) return(CONTINUE);

      /* compute Z[j] := Z[j] - alpha[j] * P */
      if (alpha[j] != ZERO) {
	 daxpy(n, -alpha[j], p, 1, z, 1);
	 if (!j) {
	    if ((znm = enorm(n, z)) <= tol) {
	       ptr = &v[iconv * n];
               for (i = 0; i < n; i++) *ptr++ = p[i];
	       eig[iconv] = alpha[0];
	       res[iconv] = znm;
               *k -= 1;
	       iter -= 1;
	       return(DONE);
            }
	 }

	 /* orthogonalize Z w.r.t. converged right S-vectors and previous 
	    VV's */
         convpj = iconv + j;
	 ptr = v0;
	 for (jj = 0; jj < iconv; jj++)
	    for (i = 0; i < n; i++)
	       uvtmpp[jj][i] = *ptr++;
	 ptr = vv;
	 for (jj = iconv; jj <= convpj; jj++)
	    for (i = 0; i < n; i++)
	       uvtmpp[jj][i] = *ptr++;
         if (convpj) {
	    ptr = uvtmpp[convpj + 1];
	    for (i = 0; i < n; i++) *ptr++ = z[i];
	    orthg(1, convpj + 1, n, yp, uvtmpp, ztemp);
	    ptr = uvtmpp[convpj + 1];
	    for (i = 0; i < n; i++) z[i] = *ptr++;
	 }
	 else {
	    dum = -ddot(n, uvtmp, 1, z, 1);
	    daxpy(n, dum, uvtmp, 1, z, 1);
	 }

	 /* compute beta[j] */
	 beta[j] = enorm(n,z);
	 if (beta[j] != ZERO) {

            /* compute P[j+1] := Z[j] / beta[j] */
	    ptr = vvp[j + 1];
	    for (i = 0; i < n; i++) {
	       t[i] = p[i];
	       p[i] = z[i] / beta[j];
	       *ptr++ = p[i];
	       z[i] = -beta[j] * t[i];
	    }
         }
	 else return(CONTINUE);
      }
      else return(CONTINUE);
   }
}


/***********************************************************************
 *                                                                     *
 *                        block2()                                     *
 *                                                                     *
 ***********************************************************************/
/***********************************************************************

   Description
   -----------

   This function implements the first two phases of the hybrid block
   Lanczos procedure.  In the first phase, which is also known as the
   block Lanczos outer iteration, a symmetric block tridiagonal matrix S
   is formed.  The eigenvalues of the matrix S approximate those 
   of matrix B, where B = A'A and A is the original sparse matrix.  
   Total (or complete) re-orthogonalization is used.

   In the second phase, single-vector Lanczos tridiagonalization is used
   to reduce (preserving eigenvalues of S) the block matrix S to a 
   symmetric tridiagonal matrix T. 


   Arguments
   ---------

   (input)
   w, wp     work space
   sp        diagonal blocks (symmetric submatrices) of the
		symmetric block tridiagonal matrix S 
   rp        super-diagonal blocks (upper-triangular submatrices)
		of the symmetric block tridiagonal matrix S 
   bigsp     symmetric block tridiagonal matrix S
   m         row dimension of sparse matrix A
   n         column dimension of sparse matrix A
   nb        current block size
   ns        number of blocks in current iteration
   irand     seed for random number generator
   

   (output - globally defined)
   alpha     diagonal elements of symmetric tridiagonal matrix T (reduced 
                from matrix S)
   beta      off-diagonal elements of symmetric tridiagonal matrix T 
		(reduced from matrix S)
   tres      residuals of approximate eigenvalues determined from
	     a previous set of block Lanczos outer iterations.
   ppp       array of eigenvectors of S


   External parameters
   -------------------

   Defined and documented in bls2.h


   Functions called
   --------------

   BLAS         ddot, daxpy, enorm, dgemm2, orthg, dsbmv
   USER         opm
   MISC         random
   BLS2         formbigs

 ***********************************************************************/

void s_bls2::block2(double **sp, double **rp, double **bigsp, long m, long n, 
            long nb, long ns, long *irand)

{
   long jinc, nk, nj, i, j, k, blkptr;
   double *ptr, dum, pnm;

   for (i = 0; i < nn; i++)
      for (j = 0; j < nb; j++) sp[i][j] = ZERO;
   /* ns (number of blocks) is assumed to be at least 2 */
   nk = nn - nb;
   for (i = 0; i < nk; i++)
      for (j = 0; j < nb; j++) rp[i][j] = ZERO;

   opm(m, n, nb, vvp, yp);

   dgemm2(NTRANSP, TRANSP, nb, nb, n, ONE, vvp, yp, ZERO, sp);

   blkptr = 0;
   for (j = 1; j < ns; j++) {

      dgemm2(TRANSP, NTRANSP, nb, n, nb, -ONE, &sp[blkptr],
            &vvp[blkptr], ONE, yp);
      if (j > 1)
         dgemm2(NTRANSP, NTRANSP, nb, n, nb, -ONE, &rp[blkptr - nb],
               &vvp[blkptr - nb], ONE, yp);

      if (j == 1 && iter > 1)
         for (i = 0; i < nb; i++) tres[i] = enorm(n, yp[i]);

      if (iconv) {
         nk = nb * j;
         nj = nk + iconv;

         ptr = vv;
         for (k = 0; k < nk; k++) 
            for (i = 0; i < n; i++) uvtmpp[k][i] = *ptr++; 
         ptr = v0;
         for (k = nk; k < nj; k++) 
            for (i = 0; i < n; i++) uvtmpp[k][i] = *ptr++; 
      }
      else {
         nj = nb * j;
         ptr = vv;
         for (k = 0; k < nj; k++) 
            for (i = 0; i < n; i++) uvtmpp[k][i] = *ptr++;
      }
      ptr = y;
      for (k = 0; k < nb; k++) 
         for (i = 0; i < n; i++) {
	    uvtmpp[nj + k][i] = *ptr;
	    *ptr++ = ZERO;
         }
      orthg(nb, nj, n, yp, uvtmpp, ztemp);

      for (k = 0; k < nb; k++) 
         for (i = k; i < nb; i++) rp[blkptr + k][i] = yp[i][k];

      jinc = blkptr + nb;
      ptr = vvp[jinc];
      for (k = nj; k < nj + nb; k++) 
         for (i = 0; i < n; i++) *ptr++ = uvtmpp[k][i]; 
  
      opm(m, n, nb, &vvp[jinc], yp);
      dgemm2(NTRANSP, TRANSP, nb, nb, n, ONE, &vvp[jinc], yp, ZERO, &sp[jinc]);
      blkptr += nb;
   }
   formbigs(nn, nb, rp, sp, bigsp);

   for (i = 0; i < nn; i++) p[i] = mrandom(irand);
   pnm = enorm(nn,p);

   ptr = pp;
   for (i = 0; i < nn; i++) {
      p[i] /= pnm;
      *ptr++ = p[i];
      z[i] = ZERO;
   }
   for (j = 0; j < nn; j++) {
      dsbmv(nn, nb, ONE, bigsp, p, ONE, z);
      alpha[j] = ddot(nn, p, 1, z, 1);
      if (j == nn - 1) break;

      /* compute Z[j] := Z[j] - alpha[j] * P */
      daxpy(nn, -alpha[j], p, 1, z, 1);

      /* orthogonalize Z w.r.t. previous PP's */
      for (k = 0; k <= j; k++)
         for (i = 0; i < nn; i++) uvtmpp[k][i] = ppp[k][i];
      if (j) {
         ptr = uvtmpp[j + 1];
         for (i = 0; i < nn; i++) *ptr++ = z[i];
	 orthg(1, j+1, nn, yp, uvtmpp, ztemp);
	 ptr = uvtmpp[j + 1];
         for (i = 0; i < nn; i++) z[i] = *ptr++;
      }
      else {
         dum = -ddot(nn, uvtmp, 1, z, 1);
	 daxpy(nn, dum, uvtmp, 1, z, 1);
      }
      beta[j] = enorm(nn,z);
      if (beta[j] != ZERO) {

	 /* compute P[j+1] := Z[j] / beta[j] */
         ptr = ppp[j + 1];
         for (i = 0; i < nn; i++) {
	    t[i] = p[i];
	    p[i] = z[i] /beta[j];
	    *ptr++ = p[i];
	    z[i] = -beta[j] * t[i];
	 }
      }
      else return;
   }
   return;
}

