/* $Id: ClpCholeskyDense.cpp 1910 2013-01-27 02:00:13Z stefan $ */
/*
  Copyright (C) 2003, International Business Machines Corporation
  and others.  All Rights Reserved.

  This code is licensed under the terms of the Eclipse Public License (EPL).
*/
#include "CoinPragma.hpp"
#include "CoinHelperFunctions.hpp"
#include "ClpConfig.h"
#include "ClpHelperFunctions.hpp"
#include "ClpInterior.hpp"
#include "ClpCholeskyDense.hpp"
#include "ClpMessage.hpp"
#include "ClpQuadraticObjective.hpp"
#if CLP_HAS_ABC
#include "CoinAbcCommon.hpp"
#endif
#if CLP_HAS_ABC<3
#undef cilk_for 
#undef cilk_spawn
#undef cilk_sync
#define cilk_for for 
#define cilk_spawn
#define cilk_sync
#endif

/*#############################################################################*/
/* Constructors / Destructor / Assignment*/
/*#############################################################################*/

/*-------------------------------------------------------------------*/
/* Default Constructor */
/*-------------------------------------------------------------------*/
ClpCholeskyDense::ClpCholeskyDense ()
     : ClpCholeskyBase(),
       borrowSpace_(false)
{
     type_ = 11;;
}

/*-------------------------------------------------------------------*/
/* Copy constructor */
/*-------------------------------------------------------------------*/
ClpCholeskyDense::ClpCholeskyDense (const ClpCholeskyDense & rhs)
     : ClpCholeskyBase(rhs),
       borrowSpace_(rhs.borrowSpace_)
{
     assert(!rhs.borrowSpace_ || !rhs.sizeFactor_); /* can't do if borrowing space*/
}


/*-------------------------------------------------------------------*/
/* Destructor */
/*-------------------------------------------------------------------*/
ClpCholeskyDense::~ClpCholeskyDense ()
{
     if (borrowSpace_) {
          /* set NULL*/
          sparseFactor_ = NULL;
          workDouble_ = NULL;
          diagonal_ = NULL;
     }
}

/*----------------------------------------------------------------*/
/* Assignment operator */
/*-------------------------------------------------------------------*/
ClpCholeskyDense &
ClpCholeskyDense::operator=(const ClpCholeskyDense& rhs)
{
     if (this != &rhs) {
          assert(!rhs.borrowSpace_ || !rhs.sizeFactor_); /* can't do if borrowing space*/
          ClpCholeskyBase::operator=(rhs);
          borrowSpace_ = rhs.borrowSpace_;
     }
     return *this;
}
/*-------------------------------------------------------------------*/
/* Clone*/
/*-------------------------------------------------------------------*/
ClpCholeskyBase * ClpCholeskyDense::clone() const
{
     return new ClpCholeskyDense(*this);
}
/* If not power of 2 then need to redo a bit*/
#define BLOCK 16
#define BLOCKSHIFT 4
/* Block unroll if power of 2 and at least 8*/
#define BLOCKUNROLL

#define BLOCKSQ ( BLOCK*BLOCK )
#define BLOCKSQSHIFT ( BLOCKSHIFT+BLOCKSHIFT )
#define number_blocks(x) (((x)+BLOCK-1)>>BLOCKSHIFT)
#define number_rows(x) ((x)<<BLOCKSHIFT)
#define number_entries(x) ((x)<<BLOCKSQSHIFT)
/* Gets space */
int
ClpCholeskyDense::reserveSpace(const ClpCholeskyBase * factor, int numberRows)
{
     numberRows_ = numberRows;
     int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
     /* allow one stripe extra*/
     numberBlocks = numberBlocks + ((numberBlocks * (numberBlocks + 1)) / 2);
     sizeFactor_ = numberBlocks * BLOCKSQ;
     /*#define CHOL_COMPARE*/
#ifdef CHOL_COMPARE
     sizeFactor_ += 95000;
#endif
     if (!factor) {
          sparseFactor_ = new longDouble [sizeFactor_];
          rowsDropped_ = new char [numberRows_];
          memset(rowsDropped_, 0, numberRows_);
          workDouble_ = new longDouble[numberRows_];
          diagonal_ = new longDouble[numberRows_];
     } else {
          borrowSpace_ = true;
          int numberFull = factor->numberRows();
          sparseFactor_ = factor->sparseFactor() + (factor->size() - sizeFactor_);
          workDouble_ = factor->workDouble() + (numberFull - numberRows_);
          diagonal_ = factor->diagonal() + (numberFull - numberRows_);
     }
     numberRowsDropped_ = 0;
     return 0;
}
/* Returns space needed */
CoinBigIndex
ClpCholeskyDense::space( int numberRows) const
{
     int numberBlocks = (numberRows + BLOCK - 1) >> BLOCKSHIFT;
     /* allow one stripe extra*/
     numberBlocks = numberBlocks + ((numberBlocks * (numberBlocks + 1)) / 2);
     CoinBigIndex sizeFactor = numberBlocks * BLOCKSQ;
#ifdef CHOL_COMPARE
     sizeFactor += 95000;
#endif
     return sizeFactor;
}
/* Orders rows and saves pointer to matrix.and model */
int
ClpCholeskyDense::order(ClpInterior * model)
{
     model_ = model;
     int numberRows;
     int numberRowsModel = model_->numberRows();
     int numberColumns = model_->numberColumns();
     if (!doKKT_) {
          numberRows = numberRowsModel;
     } else {
          numberRows = 2 * numberRowsModel + numberColumns;
     }
     reserveSpace(NULL, numberRows);
     rowCopy_ = model->clpMatrix()->reverseOrderedCopy();
     return 0;
}
/* Does Symbolic factorization given permutation.
   This is called immediately after order.  If user provides this then
   user must provide factorize and solve.  Otherwise the default factorization is used
   returns non-zero if not enough memory */
int
ClpCholeskyDense::symbolic()
{
     return 0;
}
/* Factorize - filling in rowsDropped and returning number dropped */
int
ClpCholeskyDense::factorize(const CoinWorkDouble * diagonal, int * rowsDropped)
{
     assert (!borrowSpace_);
     const CoinBigIndex * columnStart = model_->clpMatrix()->getVectorStarts();
     const int * columnLength = model_->clpMatrix()->getVectorLengths();
     const int * row = model_->clpMatrix()->getIndices();
     const double * element = model_->clpMatrix()->getElements();
     const CoinBigIndex * rowStart = rowCopy_->getVectorStarts();
     const int * rowLength = rowCopy_->getVectorLengths();
     const int * column = rowCopy_->getIndices();
     const double * elementByRow = rowCopy_->getElements();
     int numberColumns = model_->clpMatrix()->getNumCols();
     CoinZeroN(sparseFactor_, sizeFactor_);
     /*perturbation*/
     CoinWorkDouble perturbation = model_->diagonalPerturbation() * model_->diagonalNorm();
     perturbation = perturbation * perturbation;
     if (perturbation > 1.0) {
#ifdef COIN_DEVELOP
          /*if (model_->model()->logLevel()&4) */
          std::cout << "large perturbation " << perturbation << std::endl;
#endif
          perturbation = CoinSqrt(perturbation);;
          perturbation = 1.0;
     }
     int iRow;
     int newDropped = 0;
     CoinWorkDouble largest = 1.0;
     CoinWorkDouble smallest = COIN_DBL_MAX;
     CoinWorkDouble delta2 = model_->delta(); /* add delta*delta to diagonal*/
     delta2 *= delta2;
     if (!doKKT_) {
          longDouble * work = sparseFactor_;
          work--; /* skip diagonal*/
          int addOffset = numberRows_ - 1;
          const CoinWorkDouble * diagonalSlack = diagonal + numberColumns;
          /* largest in initial matrix*/
          CoinWorkDouble largest2 = 1.0e-20;
          for (iRow = 0; iRow < numberRows_; iRow++) {
               if (!rowsDropped_[iRow]) {
                    CoinBigIndex startRow = rowStart[iRow];
                    CoinBigIndex endRow = rowStart[iRow] + rowLength[iRow];
                    CoinWorkDouble diagonalValue = diagonalSlack[iRow] + delta2;
                    for (CoinBigIndex k = startRow; k < endRow; k++) {
                         int iColumn = column[k];
                         CoinBigIndex start = columnStart[iColumn];
                         CoinBigIndex end = columnStart[iColumn] + columnLength[iColumn];
                         CoinWorkDouble multiplier = diagonal[iColumn] * elementByRow[k];
                         for (CoinBigIndex j = start; j < end; j++) {
                              int jRow = row[j];
                              if (!rowsDropped_[jRow]) {
                                   if (jRow > iRow) {
                                        CoinWorkDouble value = element[j] * multiplier;
                                        work[jRow] += value;
                                   } else if (jRow == iRow) {
                                        CoinWorkDouble value = element[j] * multiplier;
                                        diagonalValue += value;
                                   }
                              }
                         }
                    }
                    for (int j = iRow + 1; j < numberRows_; j++)
                         largest2 = CoinMax(largest2, CoinAbs(work[j]));
                    diagonal_[iRow] = diagonalValue;
                    largest2 = CoinMax(largest2, CoinAbs(diagonalValue));
               } else {
                    /* dropped*/
                    diagonal_[iRow] = 1.0;
               }
               addOffset--;
               work += addOffset;
          }
          /*check sizes*/
          largest2 *= 1.0e-20;
          largest = CoinMin(largest2, CHOL_SMALL_VALUE);
          int numberDroppedBefore = 0;
          for (iRow = 0; iRow < numberRows_; iRow++) {
               int dropped = rowsDropped_[iRow];
               /* Move to int array*/
               rowsDropped[iRow] = dropped;
               if (!dropped) {
                    CoinWorkDouble diagonal = diagonal_[iRow];
                    if (diagonal > largest2) {
                         diagonal_[iRow] = diagonal + perturbation;
                    } else {
                         diagonal_[iRow] = diagonal + perturbation;
                         rowsDropped[iRow] = 2;
                         numberDroppedBefore++;
                    }
               }
          }
          doubleParameters_[10] = CoinMax(1.0e-20, largest);
          integerParameters_[20] = 0;
          doubleParameters_[3] = 0.0;
          doubleParameters_[4] = COIN_DBL_MAX;
          integerParameters_[34] = 0; /* say all must be positive*/
#ifdef CHOL_COMPARE
          if (numberRows_ < 200)
               factorizePart3(rowsDropped);
#endif
          factorizePart2(rowsDropped);
          newDropped = integerParameters_[20] + numberDroppedBefore;
          largest = doubleParameters_[3];
          smallest = doubleParameters_[4];
          if (model_->messageHandler()->logLevel() > 1)
               std::cout << "Cholesky - largest " << largest << " smallest " << smallest << std::endl;
          choleskyCondition_ = largest / smallest;
          /*drop fresh makes some formADAT easier*/
          if (newDropped || numberRowsDropped_) {
               newDropped = 0;
               for (int i = 0; i < numberRows_; i++) {
                    char dropped = static_cast<char>(rowsDropped[i]);
                    rowsDropped_[i] = dropped;
                    if (dropped == 2) {
                         /*dropped this time*/
                         rowsDropped[newDropped++] = i;
                         rowsDropped_[i] = 0;
                    }
               }
               numberRowsDropped_ = newDropped;
               newDropped = -(2 + newDropped);
          }
     } else {
          /* KKT*/
          CoinPackedMatrix * quadratic = NULL;
          ClpQuadraticObjective * quadraticObj =
               (dynamic_cast< ClpQuadraticObjective*>(model_->objectiveAsObject()));
          if (quadraticObj)
               quadratic = quadraticObj->quadraticObjective();
          /* matrix*/
          int numberRowsModel = model_->numberRows();
          int numberColumns = model_->numberColumns();
          int numberTotal = numberColumns + numberRowsModel;
          longDouble * work = sparseFactor_;
          work--; /* skip diagonal*/
          int addOffset = numberRows_ - 1;
          int iColumn;
          if (!quadratic) {
               for (iColumn = 0; iColumn < numberColumns; iColumn++) {
                    CoinWorkDouble value = diagonal[iColumn];
                    if (CoinAbs(value) > 1.0e-100) {
                         value = 1.0 / value;
                         largest = CoinMax(largest, CoinAbs(value));
                         diagonal_[iColumn] = -value;
                         CoinBigIndex start = columnStart[iColumn];
                         CoinBigIndex end = columnStart[iColumn] + columnLength[iColumn];
                         for (CoinBigIndex j = start; j < end; j++) {
                              /*choleskyRow_[numberElements]=row[j]+numberTotal;*/
                              /*sparseFactor_[numberElements++]=element[j];*/
                              work[row[j] + numberTotal] = element[j];
                              largest = CoinMax(largest, CoinAbs(element[j]));
                         }
                    } else {
                         diagonal_[iColumn] = -value;
                    }
                    addOffset--;
                    work += addOffset;
               }
          } else {
               /* Quadratic*/
               const int * columnQuadratic = quadratic->getIndices();
               const CoinBigIndex * columnQuadraticStart = quadratic->getVectorStarts();
               const int * columnQuadraticLength = quadratic->getVectorLengths();
               const double * quadraticElement = quadratic->getElements();
               for (iColumn = 0; iColumn < numberColumns; iColumn++) {
                    CoinWorkDouble value = diagonal[iColumn];
                    CoinBigIndex j;
                    if (CoinAbs(value) > 1.0e-100) {
                         value = 1.0 / value;
                         for (j = columnQuadraticStart[iColumn];
                                   j < columnQuadraticStart[iColumn] + columnQuadraticLength[iColumn]; j++) {
                              int jColumn = columnQuadratic[j];
                              if (jColumn > iColumn) {
                                   work[jColumn] = -quadraticElement[j];
                              } else if (iColumn == jColumn) {
                                   value += quadraticElement[j];
                              }
                         }
                         largest = CoinMax(largest, CoinAbs(value));
                         diagonal_[iColumn] = -value;
                         CoinBigIndex start = columnStart[iColumn];
                         CoinBigIndex end = columnStart[iColumn] + columnLength[iColumn];
                         for (j = start; j < end; j++) {
                              work[row[j] + numberTotal] = element[j];
                              largest = CoinMax(largest, CoinAbs(element[j]));
                         }
                    } else {
                         value = 1.0e100;
                         diagonal_[iColumn] = -value;
                    }
                    addOffset--;
                    work += addOffset;
               }
          }
          /* slacks*/
          for (iColumn = numberColumns; iColumn < numberTotal; iColumn++) {
               CoinWorkDouble value = diagonal[iColumn];
               if (CoinAbs(value) > 1.0e-100) {
                    value = 1.0 / value;
                    largest = CoinMax(largest, CoinAbs(value));
               } else {
                    value = 1.0e100;
               }
               diagonal_[iColumn] = -value;
               work[iColumn-numberColumns+numberTotal] = -1.0;
               addOffset--;
               work += addOffset;
          }
          /* Finish diagonal*/
          for (iRow = 0; iRow < numberRowsModel; iRow++) {
               diagonal_[iRow+numberTotal] = delta2;
          }
          /*check sizes*/
          largest *= 1.0e-20;
          largest = CoinMin(largest, CHOL_SMALL_VALUE);
          doubleParameters_[10] = CoinMax(1.0e-20, largest);
          integerParameters_[20] = 0;
          doubleParameters_[3] = 0.0;
          doubleParameters_[4] = COIN_DBL_MAX;
          /* Set up LDL cutoff*/
          integerParameters_[34] = numberTotal;
          /* KKT*/
          int * rowsDropped2 = new int[numberRows_];
          CoinZeroN(rowsDropped2, numberRows_);
#ifdef CHOL_COMPARE
          if (numberRows_ < 200)
               factorizePart3(rowsDropped2);
#endif
          factorizePart2(rowsDropped2);
          newDropped = integerParameters_[20];
          largest = doubleParameters_[3];
          smallest = doubleParameters_[4];
          if (model_->messageHandler()->logLevel() > 1)
	    COIN_DETAIL_PRINT(std::cout << "Cholesky - largest " << largest << " smallest " << smallest << std::endl);
          choleskyCondition_ = largest / smallest;
          /* Should save adjustments in ..R_*/
          int n1 = 0, n2 = 0;
          CoinWorkDouble * primalR = model_->primalR();
          CoinWorkDouble * dualR = model_->dualR();
          for (iRow = 0; iRow < numberTotal; iRow++) {
               if (rowsDropped2[iRow]) {
                    n1++;
                    /*printf("row region1 %d dropped\n",iRow);*/
                    /*rowsDropped_[iRow]=1;*/
                    rowsDropped_[iRow] = 0;
                    primalR[iRow] = doubleParameters_[20];
               } else {
                    rowsDropped_[iRow] = 0;
                    primalR[iRow] = 0.0;
               }
          }
          for (; iRow < numberRows_; iRow++) {
               if (rowsDropped2[iRow]) {
                    n2++;
                    /*printf("row region2 %d dropped\n",iRow);*/
                    /*rowsDropped_[iRow]=1;*/
                    rowsDropped_[iRow] = 0;
                    dualR[iRow-numberTotal] = doubleParameters_[34];
               } else {
                    rowsDropped_[iRow] = 0;
                    dualR[iRow-numberTotal] = 0.0;
               }
          }
     }
     return 0;
}
/* Factorize - filling in rowsDropped and returning number dropped */
void
ClpCholeskyDense::factorizePart3( int * rowsDropped)
{
     int iColumn;
     longDouble * xx = sparseFactor_;
     longDouble * yy = diagonal_;
     diagonal_ = sparseFactor_ + 40000;
     sparseFactor_ = diagonal_ + numberRows_;
     /*memcpy(sparseFactor_,xx,sizeFactor_*sizeof(double));*/
     CoinMemcpyN(xx, 40000, sparseFactor_);
     CoinMemcpyN(yy, numberRows_, diagonal_);
     int numberDropped = 0;
     CoinWorkDouble largest = 0.0;
     CoinWorkDouble smallest = COIN_DBL_MAX;
     double dropValue = doubleParameters_[10];
     int firstPositive = integerParameters_[34];
     longDouble * work = sparseFactor_;
     /* Allow for triangular*/
     int addOffset = numberRows_ - 1;
     work--;
     for (iColumn = 0; iColumn < numberRows_; iColumn++) {
          int iRow;
          int addOffsetNow = numberRows_ - 1;;
          longDouble * workNow = sparseFactor_ - 1 + iColumn;
          CoinWorkDouble diagonalValue = diagonal_[iColumn];
          for (iRow = 0; iRow < iColumn; iRow++) {
               double aj = *workNow;
               addOffsetNow--;
               workNow += addOffsetNow;
               diagonalValue -= aj * aj * workDouble_[iRow];
          }
          bool dropColumn = false;
          if (iColumn < firstPositive) {
               /* must be negative*/
               if (diagonalValue <= -dropValue) {
                    smallest = CoinMin(smallest, -diagonalValue);
                    largest = CoinMax(largest, -diagonalValue);
                    workDouble_[iColumn] = diagonalValue;
                    diagonalValue = 1.0 / diagonalValue;
               } else {
                    dropColumn = true;
                    workDouble_[iColumn] = -1.0e100;
                    diagonalValue = 0.0;
                    integerParameters_[20]++;
               }
          } else {
               /* must be positive*/
               if (diagonalValue >= dropValue) {
                    smallest = CoinMin(smallest, diagonalValue);
                    largest = CoinMax(largest, diagonalValue);
                    workDouble_[iColumn] = diagonalValue;
                    diagonalValue = 1.0 / diagonalValue;
               } else {
                    dropColumn = true;
                    workDouble_[iColumn] = 1.0e100;
                    diagonalValue = 0.0;
                    integerParameters_[20]++;
               }
          }
          if (!dropColumn) {
               diagonal_[iColumn] = diagonalValue;
               for (iRow = iColumn + 1; iRow < numberRows_; iRow++) {
                    double value = work[iRow];
                    workNow = sparseFactor_ - 1;
                    int addOffsetNow = numberRows_ - 1;;
                    for (int jColumn = 0; jColumn < iColumn; jColumn++) {
                         double aj = workNow[iColumn];
                         double multiplier = workDouble_[jColumn];
                         double ai = workNow[iRow];
                         addOffsetNow--;
                         workNow += addOffsetNow;
                         value -= aj * ai * multiplier;
                    }
                    work[iRow] = value * diagonalValue;
               }
          } else {
               /* drop column*/
               rowsDropped[iColumn] = 2;
               numberDropped++;
               diagonal_[iColumn] = 0.0;
               for (iRow = iColumn + 1; iRow < numberRows_; iRow++) {
                    work[iRow] = 0.0;
               }
          }
          addOffset--;
          work += addOffset;
     }
     doubleParameters_[3] = largest;
     doubleParameters_[4] = smallest;
     integerParameters_[20] = numberDropped;
     sparseFactor_ = xx;
     diagonal_ = yy;
}
/*#define POS_DEBUG*/
#ifdef POS_DEBUG
static int counter = 0;
int ClpCholeskyDense::bNumber(const longDouble * array, int &iRow, int &iCol)
{
     int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
     longDouble * a = sparseFactor_ + BLOCKSQ * numberBlocks;
     int k = array - a;
     assert ((k % BLOCKSQ) == 0);
     iCol = 0;
     int kk = k >> BLOCKSQSHIFT;
     /*printf("%d %d %d %d\n",k,kk,BLOCKSQ,BLOCKSQSHIFT);*/
     k = kk;
     while (k >= numberBlocks) {
          iCol++;
          k -= numberBlocks;
          numberBlocks--;
     }
     iRow = iCol + k;
     counter++;
     if(counter > 100000)
          exit(77);
     return kk;
}
#endif
/* Factorize - filling in rowsDropped and returning number dropped */
void
ClpCholeskyDense::factorizePart2( int * rowsDropped)
{
     int iColumn;
     /*longDouble * xx = sparseFactor_;*/
     /*longDouble * yy = diagonal_;*/
     /*diagonal_ = sparseFactor_ + 40000;*/
     /*sparseFactor_=diagonal_ + numberRows_;*/
     /*memcpy(sparseFactor_,xx,sizeFactor_*sizeof(double));*/
     /*memcpy(sparseFactor_,xx,40000*sizeof(double));*/
     /*memcpy(diagonal_,yy,numberRows_*sizeof(double));*/
     int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
     /* later align on boundary*/
     longDouble * a = sparseFactor_ + BLOCKSQ * numberBlocks;
     int n = numberRows_;
     int nRound = numberRows_ & (~(BLOCK - 1));
     /* adjust if exact*/
     if (nRound == n)
          nRound -= BLOCK;
     int sizeLastBlock = n - nRound;
     int get = n * (n - 1) / 2; /* ? as no diagonal*/
     int block = numberBlocks * (numberBlocks + 1) / 2;
     int ifOdd;
     int rowLast;
     if (sizeLastBlock != BLOCK) {
          longDouble * aa = &a[(block-1)*BLOCKSQ];
          rowLast = nRound - 1;
          ifOdd = 1;
          int put = BLOCKSQ;
          /* do last separately*/
          put -= (BLOCK - sizeLastBlock) * (BLOCK + 1);
          for (iColumn = numberRows_ - 1; iColumn >= nRound; iColumn--) {
               int put2 = put;
               put -= BLOCK;
               for (int iRow = numberRows_ - 1; iRow > iColumn; iRow--) {
                    aa[--put2] = sparseFactor_[--get];
                    assert (aa + put2 >= sparseFactor_ + get);
               }
               /* save diagonal as well*/
               aa[--put2] = diagonal_[iColumn];
          }
          n = nRound;
          block--;
     } else {
          /* exact fit*/
          rowLast = numberRows_ - 1;
          ifOdd = 0;
     }
     /* Now main loop*/
     int nBlock = 0;
     for (; n > 0; n -= BLOCK) {
          longDouble * aa = &a[(block-1)*BLOCKSQ];
          longDouble * aaLast = NULL;
          int put = BLOCKSQ;
          int putLast = 0;
          /* see if we have small block*/
          if (ifOdd) {
               aaLast = &a[(block-1)*BLOCKSQ];
               aa = aaLast - BLOCKSQ;
               putLast = BLOCKSQ - BLOCK + sizeLastBlock;
          }
          for (iColumn = n - 1; iColumn >= n - BLOCK; iColumn--) {
               if (aaLast) {
                    /* last bit*/
                    for (int iRow = numberRows_ - 1; iRow > rowLast; iRow--) {
                         aaLast[--putLast] = sparseFactor_[--get];
                         assert (aaLast + putLast >= sparseFactor_ + get);
                    }
                    putLast -= BLOCK - sizeLastBlock;
               }
               longDouble * aPut = aa;
               int j = rowLast;
               for (int jBlock = 0; jBlock <= nBlock; jBlock++) {
                    int put2 = put;
                    int last = CoinMax(j - BLOCK, iColumn);
                    for (int iRow = j; iRow > last; iRow--) {
                         aPut[--put2] = sparseFactor_[--get];
                         assert (aPut + put2 >= sparseFactor_ + get);
                    }
                    if (j - BLOCK < iColumn) {
                         /* save diagonal as well*/
                         aPut[--put2] = diagonal_[iColumn];
                    }
                    j -= BLOCK;
                    aPut -= BLOCKSQ;
               }
               put -= BLOCK;
          }
          nBlock++;
          block -= nBlock + ifOdd;
     }
     ClpCholeskyDenseC  info;
     info.diagonal_ = diagonal_;
     info.doubleParameters_[0] = doubleParameters_[10];
     info.integerParameters_[0] = integerParameters_[34];
#ifndef CLP_CILK
     ClpCholeskyCfactor(&info, a, numberRows_, numberBlocks,
                        diagonal_, workDouble_, rowsDropped);
#else
     info.a = a;
     info.n = numberRows_;
     info.numberBlocks = numberBlocks;
     info.work = workDouble_;
     info.rowsDropped = rowsDropped;
     info.integerParameters_[1] = model_->numberThreads();
     ClpCholeskySpawn(&info);
#endif
     double largest = 0.0;
     double smallest = COIN_DBL_MAX;
     int numberDropped = 0;
     for (int i = 0; i < numberRows_; i++) {
          if (diagonal_[i]) {
               largest = CoinMax(largest, CoinAbs(diagonal_[i]));
               smallest = CoinMin(smallest, CoinAbs(diagonal_[i]));
          } else {
               numberDropped++;
          }
     }
     doubleParameters_[3] = CoinMax(doubleParameters_[3], 1.0 / smallest);
     doubleParameters_[4] = CoinMin(doubleParameters_[4], 1.0 / largest);
     integerParameters_[20] += numberDropped;
}
/* Non leaf recursive factor*/
void
ClpCholeskyCfactor(ClpCholeskyDenseC * thisStruct, longDouble * a, int n, int numberBlocks,
                   longDouble * diagonal, longDouble * work, int * rowsDropped)
{
     if (n <= BLOCK) {
          ClpCholeskyCfactorLeaf(thisStruct, a, n, diagonal, work, rowsDropped);
     } else {
          int nb = number_blocks((n + 1) >> 1);
          int nThis = number_rows(nb);
          longDouble * aother;
          int nLeft = n - nThis;
          int nintri = (nb * (nb + 1)) >> 1;
          int nbelow = (numberBlocks - nb) * nb;
          ClpCholeskyCfactor(thisStruct, a, nThis, numberBlocks, diagonal, work, rowsDropped);
          ClpCholeskyCtriRec(thisStruct, a, nThis, a + number_entries(nb), diagonal, work, nLeft, nb, 0, numberBlocks);
          aother = a + number_entries(nintri + nbelow);
          ClpCholeskyCrecTri(thisStruct, a + number_entries(nb), nLeft, nThis, nb, 0, aother, diagonal, work, numberBlocks);
          ClpCholeskyCfactor(thisStruct, aother, nLeft,
                             numberBlocks - nb, diagonal + nThis, work + nThis, rowsDropped);
     }
}
/* Non leaf recursive triangle rectangle update*/
void
ClpCholeskyCtriRec(ClpCholeskyDenseC * thisStruct, longDouble * aTri, int nThis, longDouble * aUnder,
                   longDouble * diagonal, longDouble * work,
                   int nLeft, int iBlock, int jBlock,
                   int numberBlocks)
{
     if (nThis <= BLOCK && nLeft <= BLOCK) {
          ClpCholeskyCtriRecLeaf(/*thisStruct,*/ aTri, aUnder, diagonal, work, nLeft);
     } else if (nThis < nLeft) {
          int nb = number_blocks((nLeft + 1) >> 1);
          int nLeft2 = number_rows(nb);
          cilk_spawn ClpCholeskyCtriRec(thisStruct, aTri, nThis, aUnder, diagonal, work, nLeft2, iBlock, jBlock, numberBlocks);
          ClpCholeskyCtriRec(thisStruct, aTri, nThis, aUnder + number_entries(nb), diagonal, work, nLeft - nLeft2,
                             iBlock + nb, jBlock, numberBlocks);
	  cilk_sync;
     } else {
          int nb = number_blocks((nThis + 1) >> 1);
          int nThis2 = number_rows(nb);
          longDouble * aother;
          int kBlock = jBlock + nb;
          int i;
          int nintri = (nb * (nb + 1)) >> 1;
          int nbelow = (numberBlocks - nb) * nb;
          ClpCholeskyCtriRec(thisStruct, aTri, nThis2, aUnder, diagonal, work, nLeft, iBlock, jBlock, numberBlocks);
          /* and rectangular update */
          i = ((numberBlocks - jBlock) * (numberBlocks - jBlock - 1) -
               (numberBlocks - jBlock - nb) * (numberBlocks - jBlock - nb - 1)) >> 1;
          aother = aUnder + number_entries(i);
          ClpCholeskyCrecRec(thisStruct, aTri + number_entries(nb), nThis - nThis2, nLeft, nThis2, aUnder, aother,
                             work, kBlock, jBlock, numberBlocks);
          ClpCholeskyCtriRec(thisStruct, aTri + number_entries(nintri + nbelow), nThis - nThis2, aother, diagonal + nThis2,
                             work + nThis2, nLeft,
                             iBlock - nb, kBlock - nb, numberBlocks - nb);
     }
}
/* Non leaf recursive rectangle triangle update*/
void
ClpCholeskyCrecTri(ClpCholeskyDenseC * thisStruct, longDouble * aUnder, int nTri, int nDo,
                   int iBlock, int jBlock, longDouble * aTri,
                   longDouble * diagonal, longDouble * work,
                   int numberBlocks)
{
     if (nTri <= BLOCK && nDo <= BLOCK) {
          ClpCholeskyCrecTriLeaf(/*thisStruct,*/ aUnder, aTri,/*diagonal,*/work, nTri);
     } else if (nTri < nDo) {
          int nb = number_blocks((nDo + 1) >> 1);
          int nDo2 = number_rows(nb);
          longDouble * aother;
          int i;
          ClpCholeskyCrecTri(thisStruct, aUnder, nTri, nDo2, iBlock, jBlock, aTri, diagonal, work, numberBlocks);
          i = ((numberBlocks - jBlock) * (numberBlocks - jBlock - 1) -
               (numberBlocks - jBlock - nb) * (numberBlocks - jBlock - nb - 1)) >> 1;
          aother = aUnder + number_entries(i);
          ClpCholeskyCrecTri(thisStruct, aother, nTri, nDo - nDo2, iBlock - nb, jBlock, aTri, diagonal + nDo2,
                             work + nDo2, numberBlocks - nb);
     } else {
          int nb = number_blocks((nTri + 1) >> 1);
          int nTri2 = number_rows(nb);
          longDouble * aother;
          int i;
          cilk_spawn ClpCholeskyCrecTri(thisStruct, aUnder, nTri2, nDo, iBlock, jBlock, aTri, diagonal, work, numberBlocks);
          /* and rectangular update */
          i = ((numberBlocks - iBlock) * (numberBlocks - iBlock + 1) -
               (numberBlocks - iBlock - nb) * (numberBlocks - iBlock - nb + 1)) >> 1;
          aother = aTri + number_entries(nb);
          ClpCholeskyCrecRec(thisStruct, aUnder, nTri2, nTri - nTri2, nDo, aUnder + number_entries(nb), aother,
                             work, iBlock, jBlock, numberBlocks);
          ClpCholeskyCrecTri(thisStruct, aUnder + number_entries(nb), nTri - nTri2, nDo, iBlock + nb, jBlock,
                             aTri + number_entries(i), diagonal, work, numberBlocks);
	  cilk_sync;
     }
}
/* Non leaf recursive rectangle rectangle update,
   nUnder is number of rows in iBlock,
   nUnderK is number of rows in kBlock
*/
void
ClpCholeskyCrecRec(ClpCholeskyDenseC * thisStruct, longDouble * above, int nUnder, int nUnderK,
                   int nDo, longDouble * aUnder, longDouble *aOther,
                   longDouble * work,
                   int iBlock, int jBlock,
                   int numberBlocks)
{
     if (nDo <= BLOCK && nUnder <= BLOCK && nUnderK <= BLOCK) {
          assert (nDo == BLOCK && nUnder == BLOCK);
          ClpCholeskyCrecRecLeaf(/*thisStruct,*/ above , aUnder ,  aOther, work, nUnderK);
     } else if (nDo <= nUnderK && nUnder <= nUnderK) {
          int nb = number_blocks((nUnderK + 1) >> 1);
          int nUnder2 = number_rows(nb);
          cilk_spawn ClpCholeskyCrecRec(thisStruct, above, nUnder, nUnder2, nDo, aUnder, aOther, work,
                             iBlock, jBlock, numberBlocks);
          ClpCholeskyCrecRec(thisStruct, above, nUnder, nUnderK - nUnder2, nDo, aUnder + number_entries(nb),
                             aOther + number_entries(nb), work, iBlock, jBlock, numberBlocks);
	  cilk_sync;
     } else if (nUnderK <= nDo && nUnder <= nDo) {
          int nb = number_blocks((nDo + 1) >> 1);
          int nDo2 = number_rows(nb);
          int i;
          ClpCholeskyCrecRec(thisStruct, above, nUnder, nUnderK, nDo2, aUnder, aOther, work,
                             iBlock, jBlock, numberBlocks);
          i = ((numberBlocks - jBlock) * (numberBlocks - jBlock - 1) -
               (numberBlocks - jBlock - nb) * (numberBlocks - jBlock - nb - 1)) >> 1;
          ClpCholeskyCrecRec(thisStruct, above + number_entries(i), nUnder, nUnderK, nDo - nDo2,
                             aUnder + number_entries(i),
                             aOther, work + nDo2,
                             iBlock - nb, jBlock, numberBlocks - nb);
     } else {
          int nb = number_blocks((nUnder + 1) >> 1);
          int nUnder2 = number_rows(nb);
          int i;
          cilk_spawn ClpCholeskyCrecRec(thisStruct, above, nUnder2, nUnderK, nDo, aUnder, aOther, work,
                             iBlock, jBlock, numberBlocks);
          i = ((numberBlocks - iBlock) * (numberBlocks - iBlock - 1) -
               (numberBlocks - iBlock - nb) * (numberBlocks - iBlock - nb - 1)) >> 1;
          ClpCholeskyCrecRec(thisStruct, above + number_entries(nb), nUnder - nUnder2, nUnderK, nDo, aUnder,
                             aOther + number_entries(i), work, iBlock + nb, jBlock, numberBlocks);
	  cilk_sync;
     }
}
/* Leaf recursive factor*/
void
ClpCholeskyCfactorLeaf(ClpCholeskyDenseC * thisStruct, longDouble * a, int n,
                       longDouble * diagonal, longDouble * work, int * rowsDropped)
{
     double dropValue = thisStruct->doubleParameters_[0];
     int firstPositive = thisStruct->integerParameters_[0];
     int rowOffset = static_cast<int>(diagonal - thisStruct->diagonal_);
     int i, j, k;
     CoinWorkDouble t00, temp1;
     longDouble * aa;
     aa = a - BLOCK;
     for (j = 0; j < n; j ++) {
          bool dropColumn;
          CoinWorkDouble useT00;
          aa += BLOCK;
          t00 = aa[j];
          for (k = 0; k < j; ++k) {
               CoinWorkDouble multiplier = work[k];
               t00 -= a[j + k * BLOCK] * a[j + k * BLOCK] * multiplier;
          }
          dropColumn = false;
          useT00 = t00;
          if (j + rowOffset < firstPositive) {
               /* must be negative*/
               if (t00 <= -dropValue) {
                    /*aa[j]=t00;*/
                    t00 = 1.0 / t00;
               } else {
                    dropColumn = true;
                    /*aa[j]=-1.0e100;*/
                    useT00 = -1.0e-100;
                    t00 = 0.0;
               }
          } else {
               /* must be positive*/
               if (t00 >= dropValue) {
                    /*aa[j]=t00;*/
                    t00 = 1.0 / t00;
               } else {
                    dropColumn = true;
                    /*aa[j]=1.0e100;*/
                    useT00 = 1.0e-100;
                    t00 = 0.0;
               }
          }
          if (!dropColumn) {
               diagonal[j] = t00;
               work[j] = useT00;
               temp1 = t00;
               for (i = j + 1; i < n; i++) {
                    t00 = aa[i];
                    for (k = 0; k < j; ++k) {
                         CoinWorkDouble multiplier = work[k];
                         t00 -= a[i + k * BLOCK] * a[j + k * BLOCK] * multiplier;
                    }
                    aa[i] = t00 * temp1;
               }
          } else {
               /* drop column*/
               rowsDropped[j+rowOffset] = 2;
               diagonal[j] = 0.0;
               /*aa[j]=1.0e100;*/
               work[j] = 1.0e100;
               for (i = j + 1; i < n; i++) {
                    aa[i] = 0.0;
               }
          }
     }
}
/* Leaf recursive triangle rectangle update*/
void
ClpCholeskyCtriRecLeaf(/*ClpCholeskyDenseC * thisStruct,*/ longDouble * aTri, longDouble * aUnder, longDouble * diagonal, longDouble * work,
          int nUnder)
{
#ifdef POS_DEBUG
     int iru, icu;
     int iu = bNumber(aUnder, iru, icu);
     int irt, ict;
     int it = bNumber(aTri, irt, ict);
     /*printf("%d %d\n",iu,it);*/
     printf("trirecleaf  under (%d,%d), tri (%d,%d)\n",
            iru, icu, irt, ict);
     assert (diagonal == thisStruct->diagonal_ + ict * BLOCK);
#endif
     int j;
     longDouble * aa;
#ifdef BLOCKUNROLL
     if (nUnder == BLOCK) {
          aa = aTri - 2 * BLOCK;
          for (j = 0; j < BLOCK; j += 2) {
               int i;
               CoinWorkDouble temp0 = diagonal[j];
               CoinWorkDouble temp1 = diagonal[j+1];
               aa += 2 * BLOCK;
               for ( i = 0; i < BLOCK; i += 2) {
                    CoinWorkDouble at1;
                    CoinWorkDouble t00 = aUnder[i+j*BLOCK];
                    CoinWorkDouble t10 = aUnder[i+ BLOCK + j*BLOCK];
                    CoinWorkDouble t01 = aUnder[i+1+j*BLOCK];
                    CoinWorkDouble t11 = aUnder[i+1+ BLOCK + j*BLOCK];
                    int k;
                    for (k = 0; k < j; ++k) {
                         CoinWorkDouble multiplier = work[k];
                         CoinWorkDouble au0 = aUnder[i + k * BLOCK] * multiplier;
                         CoinWorkDouble au1 = aUnder[i + 1 + k * BLOCK] * multiplier;
                         CoinWorkDouble at0 = aTri[j + k * BLOCK];
                         CoinWorkDouble at1 = aTri[j + 1 + k * BLOCK];
                         t00 -= au0 * at0;
                         t10 -= au0 * at1;
                         t01 -= au1 * at0;
                         t11 -= au1 * at1;
                    }
                    t00 *= temp0;
                    at1 = aTri[j + 1 + j*BLOCK] * work[j];
                    t10 -= t00 * at1;
                    t01 *= temp0;
                    t11 -= t01 * at1;
                    aUnder[i+j*BLOCK] = t00;
                    aUnder[i+1+j*BLOCK] = t01;
                    aUnder[i+ BLOCK + j*BLOCK] = t10 * temp1;
                    aUnder[i+1+ BLOCK + j*BLOCK] = t11 * temp1;
               }
          }
     } else {
#endif
          aa = aTri - BLOCK;
          for (j = 0; j < BLOCK; j ++) {
               int i;
               CoinWorkDouble temp1 = diagonal[j];
               aa += BLOCK;
               for (i = 0; i < nUnder; i++) {
                    int k;
                    CoinWorkDouble t00 = aUnder[i+j*BLOCK];
                    for ( k = 0; k < j; ++k) {
                         CoinWorkDouble multiplier = work[k];
                         t00 -= aUnder[i + k * BLOCK] * aTri[j + k * BLOCK] * multiplier;
                    }
                    aUnder[i+j*BLOCK] = t00 * temp1;
               }
          }
#ifdef BLOCKUNROLL
     }
#endif
}
/* Leaf recursive rectangle triangle update*/
void ClpCholeskyCrecTriLeaf(/*ClpCholeskyDenseC * thisStruct,*/ longDouble * aUnder, longDouble * aTri,
          /*longDouble * diagonal,*/ longDouble * work, int nUnder)
{
#ifdef POS_DEBUG
     int iru, icu;
     int iu = bNumber(aUnder, iru, icu);
     int irt, ict;
     int it = bNumber(aTri, irt, ict);
     /*printf("%d %d\n",iu,it);*/
     printf("rectrileaf  under (%d,%d), tri (%d,%d)\n",
            iru, icu, irt, ict);
     assert (diagonal == thisStruct->diagonal_ + icu * BLOCK);
#endif
     int i, j, k;
     CoinWorkDouble t00;
     longDouble * aa;
#ifdef BLOCKUNROLL
     if (nUnder == BLOCK) {
          longDouble * aUnder2 = aUnder - 2;
          aa = aTri - 2 * BLOCK;
          for (j = 0; j < BLOCK; j += 2) {
               CoinWorkDouble t00, t01, t10, t11;
               aa += 2 * BLOCK;
               aUnder2 += 2;
               t00 = aa[j];
               t01 = aa[j+1];
               t10 = aa[j+1+BLOCK];
               for (k = 0; k < BLOCK; ++k) {
                    CoinWorkDouble multiplier = work[k];
                    CoinWorkDouble a0 = aUnder2[k * BLOCK];
                    CoinWorkDouble a1 = aUnder2[1 + k * BLOCK];
                    CoinWorkDouble x0 = a0 * multiplier;
                    CoinWorkDouble x1 = a1 * multiplier;
                    t00 -= a0 * x0;
                    t01 -= a1 * x0;
                    t10 -= a1 * x1;
               }
               aa[j] = t00;
               aa[j+1] = t01;
               aa[j+1+BLOCK] = t10;
               for (i = j + 2; i < BLOCK; i += 2) {
                    t00 = aa[i];
                    t01 = aa[i+BLOCK];
                    t10 = aa[i+1];
                    t11 = aa[i+1+BLOCK];
                    for (k = 0; k < BLOCK; ++k) {
                         CoinWorkDouble multiplier = work[k];
                         CoinWorkDouble a0 = aUnder2[k * BLOCK] * multiplier;
                         CoinWorkDouble a1 = aUnder2[1 + k * BLOCK] * multiplier;
                         t00 -= aUnder[i + k * BLOCK] * a0;
                         t01 -= aUnder[i + k * BLOCK] * a1;
                         t10 -= aUnder[i + 1 + k * BLOCK] * a0;
                         t11 -= aUnder[i + 1 + k * BLOCK] * a1;
                    }
                    aa[i] = t00;
                    aa[i+BLOCK] = t01;
                    aa[i+1] = t10;
                    aa[i+1+BLOCK] = t11;
               }
          }
     } else {
#endif
          aa = aTri - BLOCK;
          for (j = 0; j < nUnder; j ++) {
               aa += BLOCK;
               for (i = j; i < nUnder; i++) {
                    t00 = aa[i];
                    for (k = 0; k < BLOCK; ++k) {
                         CoinWorkDouble multiplier = work[k];
                         t00 -= aUnder[i + k * BLOCK] * aUnder[j + k * BLOCK] * multiplier;
                    }
                    aa[i] = t00;
               }
          }
#ifdef BLOCKUNROLL
     }
#endif
}
/* Leaf recursive rectangle rectangle update,
   nUnder is number of rows in iBlock,
   nUnderK is number of rows in kBlock
*/
void
ClpCholeskyCrecRecLeaf(/*ClpCholeskyDenseC * thisStruct,*/
     const longDouble * COIN_RESTRICT above,
     const longDouble * COIN_RESTRICT aUnder,
     longDouble * COIN_RESTRICT aOther,
     const longDouble * COIN_RESTRICT work,
     int nUnder)
{
#ifdef POS_DEBUG
     int ira, ica;
     int ia = bNumber(above, ira, ica);
     int iru, icu;
     int iu = bNumber(aUnder, iru, icu);
     int iro, ico;
     int io = bNumber(aOther, iro, ico);
     /*printf("%d %d %d\n",ia,iu,io);*/
     printf("recrecleaf above (%d,%d), under (%d,%d), other (%d,%d)\n",
            ira, ica, iru, icu, iro, ico);
#endif
     int i, j, k;
     longDouble * aa;
#ifdef BLOCKUNROLL
     aa = aOther - 4 * BLOCK;
     if (nUnder == BLOCK) {
          /*#define INTEL*/
#ifdef INTEL
          aa += 2 * BLOCK;
          for (j = 0; j < BLOCK; j += 2) {
               aa += 2 * BLOCK;
               for (i = 0; i < BLOCK; i += 2) {
                    CoinWorkDouble t00 = aa[i+0*BLOCK];
                    CoinWorkDouble t10 = aa[i+1*BLOCK];
                    CoinWorkDouble t01 = aa[i+1+0*BLOCK];
                    CoinWorkDouble t11 = aa[i+1+1*BLOCK];
                    for (k = 0; k < BLOCK; k++) {
                         CoinWorkDouble multiplier = work[k];
                         CoinWorkDouble a00 = aUnder[i+k*BLOCK] * multiplier;
                         CoinWorkDouble a01 = aUnder[i+1+k*BLOCK] * multiplier;
                         t00 -= a00 * above[j + 0 + k * BLOCK];
                         t10 -= a00 * above[j + 1 + k * BLOCK];
                         t01 -= a01 * above[j + 0 + k * BLOCK];
                         t11 -= a01 * above[j + 1 + k * BLOCK];
                    }
                    aa[i+0*BLOCK] = t00;
                    aa[i+1*BLOCK] = t10;
                    aa[i+1+0*BLOCK] = t01;
                    aa[i+1+1*BLOCK] = t11;
               }
          }
#else
          for (j = 0; j < BLOCK; j += 4) {
               aa += 4 * BLOCK;
               for (i = 0; i < BLOCK; i += 4) {
                    CoinWorkDouble t00 = aa[i+0+0*BLOCK];
                    CoinWorkDouble t10 = aa[i+0+1*BLOCK];
                    CoinWorkDouble t20 = aa[i+0+2*BLOCK];
                    CoinWorkDouble t30 = aa[i+0+3*BLOCK];
                    CoinWorkDouble t01 = aa[i+1+0*BLOCK];
                    CoinWorkDouble t11 = aa[i+1+1*BLOCK];
                    CoinWorkDouble t21 = aa[i+1+2*BLOCK];
                    CoinWorkDouble t31 = aa[i+1+3*BLOCK];
                    CoinWorkDouble t02 = aa[i+2+0*BLOCK];
                    CoinWorkDouble t12 = aa[i+2+1*BLOCK];
                    CoinWorkDouble t22 = aa[i+2+2*BLOCK];
                    CoinWorkDouble t32 = aa[i+2+3*BLOCK];
                    CoinWorkDouble t03 = aa[i+3+0*BLOCK];
                    CoinWorkDouble t13 = aa[i+3+1*BLOCK];
                    CoinWorkDouble t23 = aa[i+3+2*BLOCK];
                    CoinWorkDouble t33 = aa[i+3+3*BLOCK];
                    const longDouble * COIN_RESTRICT aUnderNow = aUnder + i;
                    const longDouble * COIN_RESTRICT aboveNow = above + j;
                    for (k = 0; k < BLOCK; k++) {
                         CoinWorkDouble multiplier = work[k];
                         CoinWorkDouble a00 = aUnderNow[0] * multiplier;
                         CoinWorkDouble a01 = aUnderNow[1] * multiplier;
                         CoinWorkDouble a02 = aUnderNow[2] * multiplier;
                         CoinWorkDouble a03 = aUnderNow[3] * multiplier;
                         t00 -= a00 * aboveNow[0];
                         t10 -= a00 * aboveNow[1];
                         t20 -= a00 * aboveNow[2];
                         t30 -= a00 * aboveNow[3];
                         t01 -= a01 * aboveNow[0];
                         t11 -= a01 * aboveNow[1];
                         t21 -= a01 * aboveNow[2];
                         t31 -= a01 * aboveNow[3];
                         t02 -= a02 * aboveNow[0];
                         t12 -= a02 * aboveNow[1];
                         t22 -= a02 * aboveNow[2];
                         t32 -= a02 * aboveNow[3];
                         t03 -= a03 * aboveNow[0];
                         t13 -= a03 * aboveNow[1];
                         t23 -= a03 * aboveNow[2];
                         t33 -= a03 * aboveNow[3];
                         aUnderNow += BLOCK;
                         aboveNow += BLOCK;
                    }
                    aa[i+0+0*BLOCK] = t00;
                    aa[i+0+1*BLOCK] = t10;
                    aa[i+0+2*BLOCK] = t20;
                    aa[i+0+3*BLOCK] = t30;
                    aa[i+1+0*BLOCK] = t01;
                    aa[i+1+1*BLOCK] = t11;
                    aa[i+1+2*BLOCK] = t21;
                    aa[i+1+3*BLOCK] = t31;
                    aa[i+2+0*BLOCK] = t02;
                    aa[i+2+1*BLOCK] = t12;
                    aa[i+2+2*BLOCK] = t22;
                    aa[i+2+3*BLOCK] = t32;
                    aa[i+3+0*BLOCK] = t03;
                    aa[i+3+1*BLOCK] = t13;
                    aa[i+3+2*BLOCK] = t23;
                    aa[i+3+3*BLOCK] = t33;
               }
          }
#endif
     } else {
          int odd = nUnder & 1;
          int n = nUnder - odd;
          for (j = 0; j < BLOCK; j += 4) {
               aa += 4 * BLOCK;
               for (i = 0; i < n; i += 2) {
                    CoinWorkDouble t00 = aa[i+0*BLOCK];
                    CoinWorkDouble t10 = aa[i+1*BLOCK];
                    CoinWorkDouble t20 = aa[i+2*BLOCK];
                    CoinWorkDouble t30 = aa[i+3*BLOCK];
                    CoinWorkDouble t01 = aa[i+1+0*BLOCK];
                    CoinWorkDouble t11 = aa[i+1+1*BLOCK];
                    CoinWorkDouble t21 = aa[i+1+2*BLOCK];
                    CoinWorkDouble t31 = aa[i+1+3*BLOCK];
                    const longDouble * COIN_RESTRICT aUnderNow = aUnder + i;
                    const longDouble * COIN_RESTRICT aboveNow = above + j;
                    for (k = 0; k < BLOCK; k++) {
                         CoinWorkDouble multiplier = work[k];
                         CoinWorkDouble a00 = aUnderNow[0] * multiplier;
                         CoinWorkDouble a01 = aUnderNow[1] * multiplier;
                         t00 -= a00 * aboveNow[0];
                         t10 -= a00 * aboveNow[1];
                         t20 -= a00 * aboveNow[2];
                         t30 -= a00 * aboveNow[3];
                         t01 -= a01 * aboveNow[0];
                         t11 -= a01 * aboveNow[1];
                         t21 -= a01 * aboveNow[2];
                         t31 -= a01 * aboveNow[3];
                         aUnderNow += BLOCK;
                         aboveNow += BLOCK;
                    }
                    aa[i+0*BLOCK] = t00;
                    aa[i+1*BLOCK] = t10;
                    aa[i+2*BLOCK] = t20;
                    aa[i+3*BLOCK] = t30;
                    aa[i+1+0*BLOCK] = t01;
                    aa[i+1+1*BLOCK] = t11;
                    aa[i+1+2*BLOCK] = t21;
                    aa[i+1+3*BLOCK] = t31;
               }
               if (odd) {
                    CoinWorkDouble t0 = aa[n+0*BLOCK];
                    CoinWorkDouble t1 = aa[n+1*BLOCK];
                    CoinWorkDouble t2 = aa[n+2*BLOCK];
                    CoinWorkDouble t3 = aa[n+3*BLOCK];
                    CoinWorkDouble a0;
                    for (k = 0; k < BLOCK; k++) {
                         a0 = aUnder[n+k*BLOCK] * work[k];
                         t0 -= a0 * above[j + 0 + k * BLOCK];
                         t1 -= a0 * above[j + 1 + k * BLOCK];
                         t2 -= a0 * above[j + 2 + k * BLOCK];
                         t3 -= a0 * above[j + 3 + k * BLOCK];
                    }
                    aa[n+0*BLOCK] = t0;
                    aa[n+1*BLOCK] = t1;
                    aa[n+2*BLOCK] = t2;
                    aa[n+3*BLOCK] = t3;
               }
          }
     }
#else
     aa = aOther - BLOCK;
     for (j = 0; j < BLOCK; j ++) {
          aa += BLOCK;
          for (i = 0; i < nUnder; i++) {
               CoinWorkDouble t00 = aa[i+0*BLOCK];
               for (k = 0; k < BLOCK; k++) {
                    CoinWorkDouble a00 = aUnder[i+k*BLOCK] * work[k];
                    t00 -= a00 * above[j + k * BLOCK];
               }
               aa[i] = t00;
          }
     }
#endif
}
/* Uses factorization to solve. */
void
ClpCholeskyDense::solve (CoinWorkDouble * region)
{
#ifdef CHOL_COMPARE
     double * region2 = NULL;
     if (numberRows_ < 200) {
          longDouble * xx = sparseFactor_;
          longDouble * yy = diagonal_;
          diagonal_ = sparseFactor_ + 40000;
          sparseFactor_ = diagonal_ + numberRows_;
          region2 = ClpCopyOfArray(region, numberRows_);
          int iRow, iColumn;
          int addOffset = numberRows_ - 1;
          longDouble * work = sparseFactor_ - 1;
          for (iColumn = 0; iColumn < numberRows_; iColumn++) {
               double value = region2[iColumn];
               for (iRow = iColumn + 1; iRow < numberRows_; iRow++)
                    region2[iRow] -= value * work[iRow];
               addOffset--;
               work += addOffset;
          }
          for (iColumn = numberRows_ - 1; iColumn >= 0; iColumn--) {
               double value = region2[iColumn] * diagonal_[iColumn];
               work -= addOffset;
               addOffset++;
               for (iRow = iColumn + 1; iRow < numberRows_; iRow++)
                    value -= region2[iRow] * work[iRow];
               region2[iColumn] = value;
          }
          sparseFactor_ = xx;
          diagonal_ = yy;
     }
#endif
     /*longDouble * xx = sparseFactor_;*/
     /*longDouble * yy = diagonal_;*/
     /*diagonal_ = sparseFactor_ + 40000;*/
     /*sparseFactor_=diagonal_ + numberRows_;*/
     int numberBlocks = (numberRows_ + BLOCK - 1) >> BLOCKSHIFT;
     /* later align on boundary*/
     longDouble * a = sparseFactor_ + BLOCKSQ * numberBlocks;
     int iBlock;
     longDouble * aa = a;
     int iColumn;
     for (iBlock = 0; iBlock < numberBlocks; iBlock++) {
          int nChunk;
          int jBlock;
          int iDo = iBlock * BLOCK;
          int base = iDo;
          if (iDo + BLOCK > numberRows_) {
               nChunk = numberRows_ - iDo;
          } else {
               nChunk = BLOCK;
          }
          solveF1(aa, nChunk, region + iDo);
          for (jBlock = iBlock + 1; jBlock < numberBlocks; jBlock++) {
               base += BLOCK;
               aa += BLOCKSQ;
               if (base + BLOCK > numberRows_) {
                    nChunk = numberRows_ - base;
               } else {
                    nChunk = BLOCK;
               }
               solveF2(aa, nChunk, region + iDo, region + base);
          }
          aa += BLOCKSQ;
     }
     /* do diagonal outside*/
     for (iColumn = 0; iColumn < numberRows_; iColumn++)
          region[iColumn] *= diagonal_[iColumn];
     int offset = ((numberBlocks * (numberBlocks + 1)) >> 1);
     aa = a + number_entries(offset - 1);
     int lBase = (numberBlocks - 1) * BLOCK;
     for (iBlock = numberBlocks - 1; iBlock >= 0; iBlock--) {
          int nChunk;
          int jBlock;
          int triBase = iBlock * BLOCK;
          int iBase = lBase;
          for (jBlock = iBlock + 1; jBlock < numberBlocks; jBlock++) {
               if (iBase + BLOCK > numberRows_) {
                    nChunk = numberRows_ - iBase;
               } else {
                    nChunk = BLOCK;
               }
               solveB2(aa, nChunk, region + triBase, region + iBase);
               iBase -= BLOCK;
               aa -= BLOCKSQ;
          }
          if (triBase + BLOCK > numberRows_) {
               nChunk = numberRows_ - triBase;
          } else {
               nChunk = BLOCK;
          }
          solveB1(aa, nChunk, region + triBase);
          aa -= BLOCKSQ;
     }
#ifdef CHOL_COMPARE
     if (numberRows_ < 200) {
          for (int i = 0; i < numberRows_; i++) {
               assert(CoinAbs(region[i] - region2[i]) < 1.0e-3);
          }
          delete [] region2;
     }
#endif
}
/* Forward part of solve 1*/
void
ClpCholeskyDense::solveF1(longDouble * a, int n, CoinWorkDouble * region)
{
     int j, k;
     CoinWorkDouble t00;
     for (j = 0; j < n; j ++) {
          t00 = region[j];
          for (k = 0; k < j; ++k) {
               t00 -= region[k] * a[j + k * BLOCK];
          }
          /*t00*=a[j + j * BLOCK];*/
          region[j] = t00;
     }
}
/* Forward part of solve 2*/
void
ClpCholeskyDense::solveF2(longDouble * a, int n, CoinWorkDouble * region, CoinWorkDouble * region2)
{
     int j, k;
#ifdef BLOCKUNROLL
     if (n == BLOCK) {
          for (k = 0; k < BLOCK; k += 4) {
               CoinWorkDouble t0 = region2[0];
               CoinWorkDouble t1 = region2[1];
               CoinWorkDouble t2 = region2[2];
               CoinWorkDouble t3 = region2[3];
               t0 -= region[0] * a[0 + 0 * BLOCK];
               t1 -= region[0] * a[1 + 0 * BLOCK];
               t2 -= region[0] * a[2 + 0 * BLOCK];
               t3 -= region[0] * a[3 + 0 * BLOCK];

               t0 -= region[1] * a[0 + 1 * BLOCK];
               t1 -= region[1] * a[1 + 1 * BLOCK];
               t2 -= region[1] * a[2 + 1 * BLOCK];
               t3 -= region[1] * a[3 + 1 * BLOCK];

               t0 -= region[2] * a[0 + 2 * BLOCK];
               t1 -= region[2] * a[1 + 2 * BLOCK];
               t2 -= region[2] * a[2 + 2 * BLOCK];
               t3 -= region[2] * a[3 + 2 * BLOCK];

               t0 -= region[3] * a[0 + 3 * BLOCK];
               t1 -= region[3] * a[1 + 3 * BLOCK];
               t2 -= region[3] * a[2 + 3 * BLOCK];
               t3 -= region[3] * a[3 + 3 * BLOCK];

               t0 -= region[4] * a[0 + 4 * BLOCK];
               t1 -= region[4] * a[1 + 4 * BLOCK];
               t2 -= region[4] * a[2 + 4 * BLOCK];
               t3 -= region[4] * a[3 + 4 * BLOCK];

               t0 -= region[5] * a[0 + 5 * BLOCK];
               t1 -= region[5] * a[1 + 5 * BLOCK];
               t2 -= region[5] * a[2 + 5 * BLOCK];
               t3 -= region[5] * a[3 + 5 * BLOCK];

               t0 -= region[6] * a[0 + 6 * BLOCK];
               t1 -= region[6] * a[1 + 6 * BLOCK];
               t2 -= region[6] * a[2 + 6 * BLOCK];
               t3 -= region[6] * a[3 + 6 * BLOCK];

               t0 -= region[7] * a[0 + 7 * BLOCK];
               t1 -= region[7] * a[1 + 7 * BLOCK];
               t2 -= region[7] * a[2 + 7 * BLOCK];
               t3 -= region[7] * a[3 + 7 * BLOCK];
#if BLOCK>8
               t0 -= region[8] * a[0 + 8 * BLOCK];
               t1 -= region[8] * a[1 + 8 * BLOCK];
               t2 -= region[8] * a[2 + 8 * BLOCK];
               t3 -= region[8] * a[3 + 8 * BLOCK];

               t0 -= region[9] * a[0 + 9 * BLOCK];
               t1 -= region[9] * a[1 + 9 * BLOCK];
               t2 -= region[9] * a[2 + 9 * BLOCK];
               t3 -= region[9] * a[3 + 9 * BLOCK];

               t0 -= region[10] * a[0 + 10 * BLOCK];
               t1 -= region[10] * a[1 + 10 * BLOCK];
               t2 -= region[10] * a[2 + 10 * BLOCK];
               t3 -= region[10] * a[3 + 10 * BLOCK];

               t0 -= region[11] * a[0 + 11 * BLOCK];
               t1 -= region[11] * a[1 + 11 * BLOCK];
               t2 -= region[11] * a[2 + 11 * BLOCK];
               t3 -= region[11] * a[3 + 11 * BLOCK];

               t0 -= region[12] * a[0 + 12 * BLOCK];
               t1 -= region[12] * a[1 + 12 * BLOCK];
               t2 -= region[12] * a[2 + 12 * BLOCK];
               t3 -= region[12] * a[3 + 12 * BLOCK];

               t0 -= region[13] * a[0 + 13 * BLOCK];
               t1 -= region[13] * a[1 + 13 * BLOCK];
               t2 -= region[13] * a[2 + 13 * BLOCK];
               t3 -= region[13] * a[3 + 13 * BLOCK];

               t0 -= region[14] * a[0 + 14 * BLOCK];
               t1 -= region[14] * a[1 + 14 * BLOCK];
               t2 -= region[14] * a[2 + 14 * BLOCK];
               t3 -= region[14] * a[3 + 14 * BLOCK];

               t0 -= region[15] * a[0 + 15 * BLOCK];
               t1 -= region[15] * a[1 + 15 * BLOCK];
               t2 -= region[15] * a[2 + 15 * BLOCK];
               t3 -= region[15] * a[3 + 15 * BLOCK];
#endif
               region2[0] = t0;
               region2[1] = t1;
               region2[2] = t2;
               region2[3] = t3;
               region2 += 4;
               a += 4;
          }
     } else {
#endif
          for (k = 0; k < n; ++k) {
               CoinWorkDouble t00 = region2[k];
               for (j = 0; j < BLOCK; j ++) {
                    t00 -= region[j] * a[k + j * BLOCK];
               }
               region2[k] = t00;
          }
#ifdef BLOCKUNROLL
     }
#endif
}
/* Backward part of solve 1*/
void
ClpCholeskyDense::solveB1(longDouble * a, int n, CoinWorkDouble * region)
{
     int j, k;
     CoinWorkDouble t00;
     for (j = n - 1; j >= 0; j --) {
          t00 = region[j];
          for (k = j + 1; k < n; ++k) {
               t00 -= region[k] * a[k + j * BLOCK];
          }
          /*t00*=a[j + j * BLOCK];*/
          region[j] = t00;
     }
}
/* Backward part of solve 2*/
void
ClpCholeskyDense::solveB2(longDouble * a, int n, CoinWorkDouble * region, CoinWorkDouble * region2)
{
     int j, k;
#ifdef BLOCKUNROLL
     if (n == BLOCK) {
          for (j = 0; j < BLOCK; j += 4) {
               CoinWorkDouble t0 = region[0];
               CoinWorkDouble t1 = region[1];
               CoinWorkDouble t2 = region[2];
               CoinWorkDouble t3 = region[3];
               t0 -= region2[0] * a[0 + 0*BLOCK];
               t1 -= region2[0] * a[0 + 1*BLOCK];
               t2 -= region2[0] * a[0 + 2*BLOCK];
               t3 -= region2[0] * a[0 + 3*BLOCK];

               t0 -= region2[1] * a[1 + 0*BLOCK];
               t1 -= region2[1] * a[1 + 1*BLOCK];
               t2 -= region2[1] * a[1 + 2*BLOCK];
               t3 -= region2[1] * a[1 + 3*BLOCK];

               t0 -= region2[2] * a[2 + 0*BLOCK];
               t1 -= region2[2] * a[2 + 1*BLOCK];
               t2 -= region2[2] * a[2 + 2*BLOCK];
               t3 -= region2[2] * a[2 + 3*BLOCK];

               t0 -= region2[3] * a[3 + 0*BLOCK];
               t1 -= region2[3] * a[3 + 1*BLOCK];
               t2 -= region2[3] * a[3 + 2*BLOCK];
               t3 -= region2[3] * a[3 + 3*BLOCK];

               t0 -= region2[4] * a[4 + 0*BLOCK];
               t1 -= region2[4] * a[4 + 1*BLOCK];
               t2 -= region2[4] * a[4 + 2*BLOCK];
               t3 -= region2[4] * a[4 + 3*BLOCK];

               t0 -= region2[5] * a[5 + 0*BLOCK];
               t1 -= region2[5] * a[5 + 1*BLOCK];
               t2 -= region2[5] * a[5 + 2*BLOCK];
               t3 -= region2[5] * a[5 + 3*BLOCK];

               t0 -= region2[6] * a[6 + 0*BLOCK];
               t1 -= region2[6] * a[6 + 1*BLOCK];
               t2 -= region2[6] * a[6 + 2*BLOCK];
               t3 -= region2[6] * a[6 + 3*BLOCK];

               t0 -= region2[7] * a[7 + 0*BLOCK];
               t1 -= region2[7] * a[7 + 1*BLOCK];
               t2 -= region2[7] * a[7 + 2*BLOCK];
               t3 -= region2[7] * a[7 + 3*BLOCK];
#if BLOCK>8

               t0 -= region2[8] * a[8 + 0*BLOCK];
               t1 -= region2[8] * a[8 + 1*BLOCK];
               t2 -= region2[8] * a[8 + 2*BLOCK];
               t3 -= region2[8] * a[8 + 3*BLOCK];

               t0 -= region2[9] * a[9 + 0*BLOCK];
               t1 -= region2[9] * a[9 + 1*BLOCK];
               t2 -= region2[9] * a[9 + 2*BLOCK];
               t3 -= region2[9] * a[9 + 3*BLOCK];

               t0 -= region2[10] * a[10 + 0*BLOCK];
               t1 -= region2[10] * a[10 + 1*BLOCK];
               t2 -= region2[10] * a[10 + 2*BLOCK];
               t3 -= region2[10] * a[10 + 3*BLOCK];

               t0 -= region2[11] * a[11 + 0*BLOCK];
               t1 -= region2[11] * a[11 + 1*BLOCK];
               t2 -= region2[11] * a[11 + 2*BLOCK];
               t3 -= region2[11] * a[11 + 3*BLOCK];

               t0 -= region2[12] * a[12 + 0*BLOCK];
               t1 -= region2[12] * a[12 + 1*BLOCK];
               t2 -= region2[12] * a[12 + 2*BLOCK];
               t3 -= region2[12] * a[12 + 3*BLOCK];

               t0 -= region2[13] * a[13 + 0*BLOCK];
               t1 -= region2[13] * a[13 + 1*BLOCK];
               t2 -= region2[13] * a[13 + 2*BLOCK];
               t3 -= region2[13] * a[13 + 3*BLOCK];

               t0 -= region2[14] * a[14 + 0*BLOCK];
               t1 -= region2[14] * a[14 + 1*BLOCK];
               t2 -= region2[14] * a[14 + 2*BLOCK];
               t3 -= region2[14] * a[14 + 3*BLOCK];

               t0 -= region2[15] * a[15 + 0*BLOCK];
               t1 -= region2[15] * a[15 + 1*BLOCK];
               t2 -= region2[15] * a[15 + 2*BLOCK];
               t3 -= region2[15] * a[15 + 3*BLOCK];
#endif
               region[0] = t0;
               region[1] = t1;
               region[2] = t2;
               region[3] = t3;
               a += 4 * BLOCK;
               region += 4;
          }
     } else {
#endif
          for (j = 0; j < BLOCK; j ++) {
               CoinWorkDouble t00 = region[j];
               for (k = 0; k < n; ++k) {
                    t00 -= region2[k] * a[k + j * BLOCK];
               }
               region[j] = t00;
          }
#ifdef BLOCKUNROLL
     }
#endif
}
