Actual source code: sbaijfact2.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for SBAIJ format. 
  5: */

 7:  #include src/mat/impls/sbaij/seq/sbaij.h
 8:  #include src/mat/impls/baij/seq/baij.h
 9:  #include src/inline/ilu.h
 10:  #include src/inline/dot.h

 14: PetscErrorCode MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
 15: {
 16:   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
 17:   IS              isrow=a->row;
 18:   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
 19:   PetscErrorCode  ierr;
 20:   PetscInt        nz,*vj,k,*r,idx,k1;
 21:   PetscInt        bs=A->rmap.bs,bs2 = a->bs2;
 22:   MatScalar       *aa=a->a,*v,*diag;
 23:   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;

 26:   VecGetArray(bb,&b);
 27:   VecGetArray(xx,&x);
 28:   t  = a->solve_work;
 29:   ISGetIndices(isrow,&r);
 30:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

 32:   /* solve U^T * D * y = b by forward substitution */
 33:   xk = t;
 34:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 35:     idx   = bs*r[k];
 36:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 37:   }
 38:   for (k=0; k<mbs; k++){
 39:     v  = aa + bs2*ai[k];
 40:     xk = t + k*bs;      /* Dk*xk = k-th block of x */
 41:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
 42:     nz = ai[k+1] - ai[k];
 43:     vj = aj + ai[k];
 44:     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
 45:     while (nz--) {
 46:       /* x(:) += U(k,:)^T*(Dk*xk) */
 47:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 48:       vj++; xj = t + (*vj)*bs;
 49:       v += bs2;
 50:     }
 51:     /* xk = inv(Dk)*(Dk*xk) */
 52:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 53:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 54:   }

 56:   /* solve U*x = y by back substitution */
 57:   for (k=mbs-1; k>=0; k--){
 58:     v  = aa + bs2*ai[k];
 59:     xk = t + k*bs;        /* xk */
 60:     nz = ai[k+1] - ai[k];
 61:     vj = aj + ai[k];
 62:     xj = t + (*vj)*bs;
 63:     while (nz--) {
 64:       /* xk += U(k,:)*x(:) */
 65:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 66:       vj++;
 67:       v += bs2; xj = t + (*vj)*bs;
 68:     }
 69:     idx   = bs*r[k];
 70:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 71:   }

 73:   PetscFree(xk_tmp);
 74:   ISRestoreIndices(isrow,&r);
 75:   VecRestoreArray(bb,&b);
 76:   VecRestoreArray(xx,&x);
 77:   PetscLogFlops(bs2*(2*a->nz + mbs));
 78:   return(0);
 79: }

 83: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
 84: {
 85:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
 87:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
 88:   PetscInt       nz,*vj,k;
 89:   PetscInt       bs=A->rmap.bs,bs2 = a->bs2;
 90:   MatScalar      *aa=a->a,*v,*diag;
 91:   PetscScalar    *x,*xk,*xj,*b,*xk_tmp;

 94: 
 95:   VecGetArray(bb,&b);
 96:   VecGetArray(xx,&x);

 98:   PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);

100:   /* solve U^T * D * y = b by forward substitution */
101:   PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
102:   for (k=0; k<mbs; k++){
103:     v  = aa + bs2*ai[k];
104:     xk = x + k*bs;      /* Dk*xk = k-th block of x */
105:     PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
106:     nz = ai[k+1] - ai[k];
107:     vj = aj + ai[k];
108:     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
109:     while (nz--) {
110:       /* x(:) += U(k,:)^T*(Dk*xk) */
111:       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
112:       vj++; xj = x + (*vj)*bs;
113:       v += bs2;
114:     }
115:     /* xk = inv(Dk)*(Dk*xk) */
116:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
117:     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
118:   }

120:   /* solve U*x = y by back substitution */
121:   for (k=mbs-1; k>=0; k--){
122:     v  = aa + bs2*ai[k];
123:     xk = x + k*bs;        /* xk */
124:     nz = ai[k+1] - ai[k];
125:     vj = aj + ai[k];
126:     xj = x + (*vj)*bs;
127:     while (nz--) {
128:       /* xk += U(k,:)*x(:) */
129:       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
130:       vj++;
131:       v += bs2; xj = x + (*vj)*bs;
132:     }
133:   }

135:   PetscFree(xk_tmp);
136:   VecRestoreArray(bb,&b);
137:   VecRestoreArray(xx,&x);
138:   PetscLogFlops(bs2*(2*a->nz + mbs));
139:   return(0);
140: }

144: PetscErrorCode MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
145: {
146:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
147:   IS             isrow=a->row;
148:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
150:   PetscInt       nz,*vj,k,*r,idx;
151:   MatScalar      *aa=a->a,*v,*d;
152:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;

155:   VecGetArray(bb,&b);
156:   VecGetArray(xx,&x);
157:   t  = a->solve_work;
158:   ISGetIndices(isrow,&r);

160:   /* solve U^T * D * y = b by forward substitution */
161:   tp = t;
162:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
163:     idx   = 7*r[k];
164:     tp[0] = b[idx];
165:     tp[1] = b[idx+1];
166:     tp[2] = b[idx+2];
167:     tp[3] = b[idx+3];
168:     tp[4] = b[idx+4];
169:     tp[5] = b[idx+5];
170:     tp[6] = b[idx+6];
171:     tp += 7;
172:   }
173: 
174:   for (k=0; k<mbs; k++){
175:     v  = aa + 49*ai[k];
176:     vj = aj + ai[k];
177:     tp = t + k*7;
178:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
179:     nz = ai[k+1] - ai[k];
180:     tp = t + (*vj)*7;
181:     while (nz--) {
182:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
183:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
184:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
185:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
186:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
187:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
188:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
189:       vj++; tp = t + (*vj)*7;
190:       v += 49;
191:     }

193:     /* xk = inv(Dk)*(Dk*xk) */
194:     d  = aa+k*49;          /* ptr to inv(Dk) */
195:     tp    = t + k*7;
196:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
197:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
198:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
199:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
200:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
201:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
202:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
203:   }

205:   /* solve U*x = y by back substitution */
206:   for (k=mbs-1; k>=0; k--){
207:     v  = aa + 49*ai[k];
208:     vj = aj + ai[k];
209:     tp    = t + k*7;
210:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
211:     nz = ai[k+1] - ai[k];
212: 
213:     tp = t + (*vj)*7;
214:     while (nz--) {
215:       /* xk += U(k,:)*x(:) */
216:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
217:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
218:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
219:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
220:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
221:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
222:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
223:       vj++; tp = t + (*vj)*7;
224:       v += 49;
225:     }
226:     tp    = t + k*7;
227:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
228:     idx   = 7*r[k];
229:     x[idx]     = x0;
230:     x[idx+1]   = x1;
231:     x[idx+2]   = x2;
232:     x[idx+3]   = x3;
233:     x[idx+4]   = x4;
234:     x[idx+5]   = x5;
235:     x[idx+6]   = x6;
236:   }

238:   ISRestoreIndices(isrow,&r);
239:   VecRestoreArray(bb,&b);
240:   VecRestoreArray(xx,&x);
241:   PetscLogFlops(49*(2*a->nz + mbs));
242:   return(0);
243: }

247: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
248: {
249:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
251:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
252:   MatScalar      *aa=a->a,*v,*d;
253:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
254:   PetscInt       nz,*vj,k;

257:   VecGetArray(bb,&b);
258:   VecGetArray(xx,&x);
259: 
260:   /* solve U^T * D * y = b by forward substitution */
261:   PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
262:   for (k=0; k<mbs; k++){
263:     v  = aa + 49*ai[k];
264:     xp = x + k*7;
265:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
266:     nz = ai[k+1] - ai[k];
267:     vj = aj + ai[k];
268:     xp = x + (*vj)*7;
269:     while (nz--) {
270:       /* x(:) += U(k,:)^T*(Dk*xk) */
271:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
272:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
273:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
274:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
275:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
276:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
277:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
278:       vj++; xp = x + (*vj)*7;
279:       v += 49;
280:     }
281:     /* xk = inv(Dk)*(Dk*xk) */
282:     d  = aa+k*49;          /* ptr to inv(Dk) */
283:     xp = x + k*7;
284:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
285:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
286:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
287:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
288:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
289:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
290:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
291:   }

293:   /* solve U*x = y by back substitution */
294:   for (k=mbs-1; k>=0; k--){
295:     v  = aa + 49*ai[k];
296:     xp = x + k*7;
297:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
298:     nz = ai[k+1] - ai[k];
299:     vj = aj + ai[k];
300:     xp = x + (*vj)*7;
301:     while (nz--) {
302:       /* xk += U(k,:)*x(:) */
303:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
304:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
305:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
306:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
307:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
308:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
309:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
310:       vj++;
311:       v += 49; xp = x + (*vj)*7;
312:     }
313:     xp = x + k*7;
314:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
315:   }

317:   VecRestoreArray(bb,&b);
318:   VecRestoreArray(xx,&x);
319:   PetscLogFlops(49*(2*a->nz + mbs));
320:   return(0);
321: }

325: PetscErrorCode MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
326: {
327:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
328:   IS             isrow=a->row;
329:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
331:   PetscInt       nz,*vj,k,*r,idx;
332:   MatScalar      *aa=a->a,*v,*d;
333:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;

336:   VecGetArray(bb,&b);
337:   VecGetArray(xx,&x);
338:   t  = a->solve_work;
339:   ISGetIndices(isrow,&r);

341:   /* solve U^T * D * y = b by forward substitution */
342:   tp = t;
343:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
344:     idx   = 6*r[k];
345:     tp[0] = b[idx];
346:     tp[1] = b[idx+1];
347:     tp[2] = b[idx+2];
348:     tp[3] = b[idx+3];
349:     tp[4] = b[idx+4];
350:     tp[5] = b[idx+5];
351:     tp += 6;
352:   }
353: 
354:   for (k=0; k<mbs; k++){
355:     v  = aa + 36*ai[k];
356:     vj = aj + ai[k];
357:     tp = t + k*6;
358:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
359:     nz = ai[k+1] - ai[k];
360:     tp = t + (*vj)*6;
361:     while (nz--) {
362:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
363:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
364:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
365:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
366:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
367:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
368:       vj++; tp = t + (*vj)*6;
369:       v += 36;
370:     }

372:     /* xk = inv(Dk)*(Dk*xk) */
373:     d  = aa+k*36;          /* ptr to inv(Dk) */
374:     tp    = t + k*6;
375:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
376:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
377:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
378:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
379:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
380:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
381:   }

383:   /* solve U*x = y by back substitution */
384:   for (k=mbs-1; k>=0; k--){
385:     v  = aa + 36*ai[k];
386:     vj = aj + ai[k];
387:     tp    = t + k*6;
388:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
389:     nz = ai[k+1] - ai[k];
390: 
391:     tp = t + (*vj)*6;
392:     while (nz--) {
393:       /* xk += U(k,:)*x(:) */
394:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
395:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
396:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
397:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
398:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
399:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
400:       vj++; tp = t + (*vj)*6;
401:       v += 36;
402:     }
403:     tp    = t + k*6;
404:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
405:     idx   = 6*r[k];
406:     x[idx]     = x0;
407:     x[idx+1]   = x1;
408:     x[idx+2]   = x2;
409:     x[idx+3]   = x3;
410:     x[idx+4]   = x4;
411:     x[idx+5]   = x5;
412:   }

414:   ISRestoreIndices(isrow,&r);
415:   VecRestoreArray(bb,&b);
416:   VecRestoreArray(xx,&x);
417:   PetscLogFlops(36*(2*a->nz + mbs));
418:   return(0);
419: }

423: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
424: {
425:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
426:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
427:   MatScalar      *aa=a->a,*v,*d;
428:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3,x4,x5;
430:   PetscInt       nz,*vj,k;

433: 
434:   VecGetArray(bb,&b);
435:   VecGetArray(xx,&x);
436: 
437:   /* solve U^T * D * y = b by forward substitution */
438:   PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
439:   for (k=0; k<mbs; k++){
440:     v  = aa + 36*ai[k];
441:     xp = x + k*6;
442:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
443:     nz = ai[k+1] - ai[k];
444:     vj = aj + ai[k];
445:     xp = x + (*vj)*6;
446:     while (nz--) {
447:       /* x(:) += U(k,:)^T*(Dk*xk) */
448:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
449:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
450:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
451:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
452:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
453:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
454:       vj++; xp = x + (*vj)*6;
455:       v += 36;
456:     }
457:     /* xk = inv(Dk)*(Dk*xk) */
458:     d  = aa+k*36;          /* ptr to inv(Dk) */
459:     xp = x + k*6;
460:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
461:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
462:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
463:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
464:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
465:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
466:   }

468:   /* solve U*x = y by back substitution */
469:   for (k=mbs-1; k>=0; k--){
470:     v  = aa + 36*ai[k];
471:     xp = x + k*6;
472:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
473:     nz = ai[k+1] - ai[k];
474:     vj = aj + ai[k];
475:     xp = x + (*vj)*6;
476:     while (nz--) {
477:       /* xk += U(k,:)*x(:) */
478:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
479:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
480:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
481:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
482:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
483:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
484:       vj++;
485:       v += 36; xp = x + (*vj)*6;
486:     }
487:     xp = x + k*6;
488:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
489:   }

491:   VecRestoreArray(bb,&b);
492:   VecRestoreArray(xx,&x);
493:   PetscLogFlops(36*(2*a->nz + mbs));
494:   return(0);
495: }

499: PetscErrorCode MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
500: {
501:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
502:   IS             isrow=a->row;
503:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
505:   PetscInt       nz,*vj,k,*r,idx;
506:   MatScalar      *aa=a->a,*v,*diag;
507:   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;

510:   VecGetArray(bb,&b);
511:   VecGetArray(xx,&x);
512:   t  = a->solve_work;
513:   ISGetIndices(isrow,&r);

515:   /* solve U^T * D * y = b by forward substitution */
516:   tp = t;
517:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
518:     idx   = 5*r[k];
519:     tp[0] = b[idx];
520:     tp[1] = b[idx+1];
521:     tp[2] = b[idx+2];
522:     tp[3] = b[idx+3];
523:     tp[4] = b[idx+4];
524:     tp += 5;
525:   }
526: 
527:   for (k=0; k<mbs; k++){
528:     v  = aa + 25*ai[k];
529:     vj = aj + ai[k];
530:     tp = t + k*5;
531:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
532:     nz = ai[k+1] - ai[k];

534:     tp = t + (*vj)*5;
535:     while (nz--) {
536:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
537:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
538:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
539:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
540:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
541:       vj++; tp = t + (*vj)*5;
542:       v += 25;
543:     }

545:     /* xk = inv(Dk)*(Dk*xk) */
546:     diag  = aa+k*25;          /* ptr to inv(Dk) */
547:     tp    = t + k*5;
548:       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
549:       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
550:       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
551:       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
552:       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
553:   }

555:   /* solve U*x = y by back substitution */
556:   for (k=mbs-1; k>=0; k--){
557:     v  = aa + 25*ai[k];
558:     vj = aj + ai[k];
559:     tp    = t + k*5;
560:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
561:     nz = ai[k+1] - ai[k];
562: 
563:     tp = t + (*vj)*5;
564:     while (nz--) {
565:       /* xk += U(k,:)*x(:) */
566:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
567:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
568:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
569:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
570:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
571:       vj++; tp = t + (*vj)*5;
572:       v += 25;
573:     }
574:     tp    = t + k*5;
575:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
576:     idx   = 5*r[k];
577:     x[idx]     = x0;
578:     x[idx+1]   = x1;
579:     x[idx+2]   = x2;
580:     x[idx+3]   = x3;
581:     x[idx+4]   = x4;
582:   }

584:   ISRestoreIndices(isrow,&r);
585:   VecRestoreArray(bb,&b);
586:   VecRestoreArray(xx,&x);
587:   PetscLogFlops(25*(2*a->nz + mbs));
588:   return(0);
589: }

593: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
594: {
595:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
596:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
597:   MatScalar      *aa=a->a,*v,*diag;
598:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3,x4;
600:   PetscInt       nz,*vj,k;

603: 
604:   VecGetArray(bb,&b);
605:   VecGetArray(xx,&x);

607:   /* solve U^T * D * y = b by forward substitution */
608:   PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
609:   for (k=0; k<mbs; k++){
610:     v  = aa + 25*ai[k];
611:     xp = x + k*5;
612:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
613:     nz = ai[k+1] - ai[k];
614:     vj = aj + ai[k];
615:     xp = x + (*vj)*5;
616:     while (nz--) {
617:       /* x(:) += U(k,:)^T*(Dk*xk) */
618:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
619:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
620:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
621:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
622:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
623:       vj++; xp = x + (*vj)*5;
624:       v += 25;
625:     }
626:     /* xk = inv(Dk)*(Dk*xk) */
627:     diag = aa+k*25;          /* ptr to inv(Dk) */
628:     xp   = x + k*5;
629:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
630:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
631:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
632:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
633:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
634:   }

636:   /* solve U*x = y by back substitution */
637:   for (k=mbs-1; k>=0; k--){
638:     v  = aa + 25*ai[k];
639:     xp = x + k*5;
640:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
641:     nz = ai[k+1] - ai[k];
642:     vj = aj + ai[k];
643:     xp = x + (*vj)*5;
644:     while (nz--) {
645:       /* xk += U(k,:)*x(:) */
646:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
647:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
648:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
649:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
650:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
651:       vj++;
652:       v += 25; xp = x + (*vj)*5;
653:     }
654:     xp = x + k*5;
655:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
656:   }

658:   VecRestoreArray(bb,&b);
659:   VecRestoreArray(xx,&x);
660:   PetscLogFlops(25*(2*a->nz + mbs));
661:   return(0);
662: }

666: PetscErrorCode MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
667: {
668:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
669:   IS             isrow=a->row;
670:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
672:   PetscInt       nz,*vj,k,*r,idx;
673:   MatScalar      *aa=a->a,*v,*diag;
674:   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;

677:   VecGetArray(bb,&b);
678:   VecGetArray(xx,&x);
679:   t  = a->solve_work;
680:   ISGetIndices(isrow,&r);

682:   /* solve U^T * D * y = b by forward substitution */
683:   tp = t;
684:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
685:     idx   = 4*r[k];
686:     tp[0] = b[idx];
687:     tp[1] = b[idx+1];
688:     tp[2] = b[idx+2];
689:     tp[3] = b[idx+3];
690:     tp += 4;
691:   }
692: 
693:   for (k=0; k<mbs; k++){
694:     v  = aa + 16*ai[k];
695:     vj = aj + ai[k];
696:     tp = t + k*4;
697:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
698:     nz = ai[k+1] - ai[k];

700:     tp = t + (*vj)*4;
701:     while (nz--) {
702:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
703:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
704:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
705:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
706:       vj++; tp = t + (*vj)*4;
707:       v += 16;
708:     }

710:     /* xk = inv(Dk)*(Dk*xk) */
711:     diag  = aa+k*16;          /* ptr to inv(Dk) */
712:     tp    = t + k*4;
713:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
714:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
715:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
716:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
717:   }

719:   /* solve U*x = y by back substitution */
720:   for (k=mbs-1; k>=0; k--){
721:     v  = aa + 16*ai[k];
722:     vj = aj + ai[k];
723:     tp    = t + k*4;
724:     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
725:     nz = ai[k+1] - ai[k];
726: 
727:     tp = t + (*vj)*4;
728:     while (nz--) {
729:       /* xk += U(k,:)*x(:) */
730:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
731:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
732:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
733:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
734:       vj++; tp = t + (*vj)*4;
735:       v += 16;
736:     }
737:     tp    = t + k*4;
738:     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
739:     idx        = 4*r[k];
740:     x[idx]     = x0;
741:     x[idx+1]   = x1;
742:     x[idx+2]   = x2;
743:     x[idx+3]   = x3;
744:   }

746:   ISRestoreIndices(isrow,&r);
747:   VecRestoreArray(bb,&b);
748:   VecRestoreArray(xx,&x);
749:   PetscLogFlops(16*(2*a->nz + mbs));
750:   return(0);
751: }

753: /*
754:    Special case where the matrix was factored in the natural ordering. 
755:    This eliminates the need for the column and row permutation.
756: */
759: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
760: {
761:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
762:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
763:   MatScalar      *aa=a->a,*v,*diag;
764:   PetscScalar    *x,*xp,*b,x0,x1,x2,x3;
766:   PetscInt       nz,*vj,k;

769: 
770:   VecGetArray(bb,&b);
771:   VecGetArray(xx,&x);

773:   /* solve U^T * D * y = b by forward substitution */
774:   PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
775:   for (k=0; k<mbs; k++){
776:     v  = aa + 16*ai[k];
777:     xp = x + k*4;
778:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
779:     nz = ai[k+1] - ai[k];
780:     vj = aj + ai[k];
781:     xp = x + (*vj)*4;
782:     while (nz--) {
783:       /* x(:) += U(k,:)^T*(Dk*xk) */
784:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
785:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
786:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
787:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
788:       vj++; xp = x + (*vj)*4;
789:       v += 16;
790:     }
791:     /* xk = inv(Dk)*(Dk*xk) */
792:     diag = aa+k*16;          /* ptr to inv(Dk) */
793:     xp   = x + k*4;
794:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
795:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
796:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
797:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
798:   }

800:   /* solve U*x = y by back substitution */
801:   for (k=mbs-1; k>=0; k--){
802:     v  = aa + 16*ai[k];
803:     xp = x + k*4;
804:     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
805:     nz = ai[k+1] - ai[k];
806:     vj = aj + ai[k];
807:     xp = x + (*vj)*4;
808:     while (nz--) {
809:       /* xk += U(k,:)*x(:) */
810:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
811:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
812:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
813:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
814:       vj++;
815:       v += 16; xp = x + (*vj)*4;
816:     }
817:     xp = x + k*4;
818:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
819:   }

821:   VecRestoreArray(bb,&b);
822:   VecRestoreArray(xx,&x);
823:   PetscLogFlops(16*(2*a->nz + mbs));
824:   return(0);
825: }

829: PetscErrorCode MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
830: {
831:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
832:   IS             isrow=a->row;
833:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
835:   PetscInt       nz,*vj,k,*r,idx;
836:   MatScalar      *aa=a->a,*v,*diag;
837:   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;

840:   VecGetArray(bb,&b);
841:   VecGetArray(xx,&x);
842:   t  = a->solve_work;
843:   ISGetIndices(isrow,&r);

845:   /* solve U^T * D * y = b by forward substitution */
846:   tp = t;
847:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
848:     idx   = 3*r[k];
849:     tp[0] = b[idx];
850:     tp[1] = b[idx+1];
851:     tp[2] = b[idx+2];
852:     tp += 3;
853:   }
854: 
855:   for (k=0; k<mbs; k++){
856:     v  = aa + 9*ai[k];
857:     vj = aj + ai[k];
858:     tp = t + k*3;
859:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
860:     nz = ai[k+1] - ai[k];

862:     tp = t + (*vj)*3;
863:     while (nz--) {
864:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
865:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
866:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
867:       vj++; tp = t + (*vj)*3;
868:       v += 9;
869:     }

871:     /* xk = inv(Dk)*(Dk*xk) */
872:     diag  = aa+k*9;          /* ptr to inv(Dk) */
873:     tp    = t + k*3;
874:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
875:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
876:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
877:   }

879:   /* solve U*x = y by back substitution */
880:   for (k=mbs-1; k>=0; k--){
881:     v  = aa + 9*ai[k];
882:     vj = aj + ai[k];
883:     tp    = t + k*3;
884:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
885:     nz = ai[k+1] - ai[k];
886: 
887:     tp = t + (*vj)*3;
888:     while (nz--) {
889:       /* xk += U(k,:)*x(:) */
890:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
891:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
892:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
893:       vj++; tp = t + (*vj)*3;
894:       v += 9;
895:     }
896:     tp    = t + k*3;
897:     tp[0] = x0; tp[1] = x1; tp[2] = x2;
898:     idx      = 3*r[k];
899:     x[idx]   = x0;
900:     x[idx+1] = x1;
901:     x[idx+2] = x2;
902:   }

904:   ISRestoreIndices(isrow,&r);
905:   VecRestoreArray(bb,&b);
906:   VecRestoreArray(xx,&x);
907:   PetscLogFlops(9*(2*a->nz + mbs));
908:   return(0);
909: }

911: /*
912:    Special case where the matrix was factored in the natural ordering. 
913:    This eliminates the need for the column and row permutation.
914: */
917: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
918: {
919:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
920:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
921:   MatScalar      *aa=a->a,*v,*diag;
922:   PetscScalar    *x,*xp,*b,x0,x1,x2;
924:   PetscInt       nz,*vj,k;

927: 
928:   VecGetArray(bb,&b);
929:   VecGetArray(xx,&x);

931:   /* solve U^T * D * y = b by forward substitution */
932:   PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
933:   for (k=0; k<mbs; k++){
934:     v  = aa + 9*ai[k];
935:     xp = x + k*3;
936:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
937:     nz = ai[k+1] - ai[k];
938:     vj = aj + ai[k];
939:     xp = x + (*vj)*3;
940:     while (nz--) {
941:       /* x(:) += U(k,:)^T*(Dk*xk) */
942:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
943:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
944:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
945:       vj++; xp = x + (*vj)*3;
946:       v += 9;
947:     }
948:     /* xk = inv(Dk)*(Dk*xk) */
949:     diag = aa+k*9;          /* ptr to inv(Dk) */
950:     xp   = x + k*3;
951:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
952:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
953:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
954:   }

956:   /* solve U*x = y by back substitution */
957:   for (k=mbs-1; k>=0; k--){
958:     v  = aa + 9*ai[k];
959:     xp = x + k*3;
960:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
961:     nz = ai[k+1] - ai[k];
962:     vj = aj + ai[k];
963:     xp = x + (*vj)*3;
964:     while (nz--) {
965:       /* xk += U(k,:)*x(:) */
966:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
967:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
968:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
969:       vj++;
970:       v += 9; xp = x + (*vj)*3;
971:     }
972:     xp = x + k*3;
973:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
974:   }

976:   VecRestoreArray(bb,&b);
977:   VecRestoreArray(xx,&x);
978:   PetscLogFlops(9*(2*a->nz + mbs));
979:   return(0);
980: }

984: PetscErrorCode MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
985: {
986:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
987:   IS             isrow=a->row;
988:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
990:   PetscInt       nz,*vj,k,k2,*r,idx;
991:   MatScalar      *aa=a->a,*v,*diag;
992:   PetscScalar    *x,*b,x0,x1,*t;

995:   VecGetArray(bb,&b);
996:   VecGetArray(xx,&x);
997:   t  = a->solve_work;
998:   /* printf("called MatSolve_SeqSBAIJ_2\n"); */
999:   ISGetIndices(isrow,&r);

1001:   /* solve U^T * D * y = perm(b) by forward substitution */
1002:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1003:     idx = 2*r[k];
1004:     t[k*2]   = b[idx];
1005:     t[k*2+1] = b[idx+1];
1006:   }
1007:   for (k=0; k<mbs; k++){
1008:     v  = aa + 4*ai[k];
1009:     vj = aj + ai[k];
1010:     k2 = k*2;
1011:     x0 = t[k2]; x1 = t[k2+1];
1012:     nz = ai[k+1] - ai[k];
1013:     while (nz--) {
1014:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1015:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1016:       vj++; v += 4;
1017:     }
1018:     diag = aa+k*4;  /* ptr to inv(Dk) */
1019:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1020:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1021:   }

1023:   /* solve U*x = y by back substitution */
1024:   for (k=mbs-1; k>=0; k--){
1025:     v  = aa + 4*ai[k];
1026:     vj = aj + ai[k];
1027:     k2 = k*2;
1028:     x0 = t[k2]; x1 = t[k2+1];
1029:     nz = ai[k+1] - ai[k];
1030:     while (nz--) {
1031:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1032:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1033:       vj++; v += 4;
1034:     }
1035:     t[k2]    = x0;
1036:     t[k2+1]  = x1;
1037:     idx      = 2*r[k];
1038:     x[idx]   = x0;
1039:     x[idx+1] = x1;
1040:   }

1042:   ISRestoreIndices(isrow,&r);
1043:   VecRestoreArray(bb,&b);
1044:   VecRestoreArray(xx,&x);
1045:   PetscLogFlops(4*(2*a->nz + mbs));
1046:   return(0);
1047: }

1049: /*
1050:    Special case where the matrix was factored in the natural ordering. 
1051:    This eliminates the need for the column and row permutation.
1052: */
1055: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1056: {
1057:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1058:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1059:   MatScalar      *aa=a->a,*v,*diag;
1060:   PetscScalar    *x,*b,x0,x1;
1062:   PetscInt       nz,*vj,k,k2;

1065: 
1066:   VecGetArray(bb,&b);
1067:   VecGetArray(xx,&x);

1069:   /* solve U^T * D * y = b by forward substitution */
1070:   PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1071:   for (k=0; k<mbs; k++){
1072:     v  = aa + 4*ai[k];
1073:     vj = aj + ai[k];
1074:     k2 = k*2;
1075:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1076:     nz = ai[k+1] - ai[k];
1077: 
1078:     while (nz--) {
1079:       /* x(:) += U(k,:)^T*(Dk*xk) */
1080:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1081:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1082:       vj++; v += 4;
1083:     }
1084:     /* xk = inv(Dk)*(Dk*xk) */
1085:     diag = aa+k*4;          /* ptr to inv(Dk) */
1086:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1087:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1088:   }

1090:   /* solve U*x = y by back substitution */
1091:   for (k=mbs-1; k>=0; k--){
1092:     v  = aa + 4*ai[k];
1093:     vj = aj + ai[k];
1094:     k2 = k*2;
1095:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1096:     nz = ai[k+1] - ai[k];
1097:     while (nz--) {
1098:       /* xk += U(k,:)*x(:) */
1099:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1100:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1101:       vj++; v += 4;
1102:     }
1103:     x[k2]     = x0;
1104:     x[k2+1]   = x1;
1105:   }

1107:   VecRestoreArray(bb,&b);
1108:   VecRestoreArray(xx,&x);
1109:   PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1110:   return(0);
1111: }

1115: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1116: {
1117:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1118:   IS             isrow=a->row;
1120:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*rip;
1121:   MatScalar      *aa=a->a,*v;
1122:   PetscScalar    *x,*b,xk,*t;
1123:   PetscInt       nz,*vj,k;

1126:   if (!mbs) return(0);
1127:   /* printf(" MatSolve_SeqSBAIJ_1 is called\n"); */

1129:   VecGetArray(bb,&b);
1130:   VecGetArray(xx,&x);
1131:   t    = a->solve_work;

1133:   ISGetIndices(isrow,&rip);
1134: 
1135:   /* solve U^T*D*y = perm(b) by forward substitution */
1136:   for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1137:   for (k=0; k<mbs; k++){
1138:     v  = aa + ai[k] + 1;
1139:     vj = aj + ai[k] + 1;
1140:     xk = t[k];
1141:     nz = ai[k+1] - ai[k] - 1;
1142:     while (nz--) t[*vj++] += (*v++) * xk;
1143:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1144:   }

1146:   /* solve U*x = y by back substitution */
1147:   for (k=mbs-1; k>=0; k--){
1148:     v  = aa + ai[k] + 1;
1149:     vj = aj + ai[k] + 1;
1150:     xk = t[k];
1151:     nz = ai[k+1] - ai[k] - 1;
1152:     while (nz--) xk += (*v++) * t[*vj++];
1153:     t[k]      = xk;
1154:     x[rip[k]] = xk;
1155:   }

1157:   ISRestoreIndices(isrow,&rip);
1158:   VecRestoreArray(bb,&b);
1159:   VecRestoreArray(xx,&x);
1160:   PetscLogFlops(4*a->nz + A->rmap.N);
1161:   return(0);
1162: }

1166: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1167: {
1168:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;

1172:   if (A->rmap.bs == 1) {
1173:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1174:   } else {
1175:     IS              isrow=a->row;
1176:     PetscInt             mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1177:     MatScalar       *aa=a->a,*v;
1178:     PetscScalar     *x,*b,*t;
1179:     PetscInt             nz,*vj,k,n;
1180:     if (bb->n > a->solves_work_n) {
1181:       PetscFree(a->solves_work);
1182:       PetscMalloc(bb->n*A->rmap.N*sizeof(PetscScalar),&a->solves_work);
1183:       a->solves_work_n = bb->n;
1184:     }
1185:     n    = bb->n;
1186:     VecGetArray(bb->v,&b);
1187:     VecGetArray(xx->v,&x);
1188:     t    = a->solves_work;

1190:     ISGetIndices(isrow,&rip);
1191: 
1192:     /* solve U^T*D*y = perm(b) by forward substitution */
1193:     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1194:     for (k=0; k<mbs; k++){
1195:       v  = aa + ai[k];
1196:       vj = aj + ai[k];
1197:       nz = ai[k+1] - ai[k];
1198:       while (nz--) {
1199:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1200:         v++;vj++;
1201:       }
1202:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1203:     }
1204: 
1205:     /* solve U*x = y by back substitution */
1206:     for (k=mbs-1; k>=0; k--){
1207:       v  = aa + ai[k];
1208:       vj = aj + ai[k];
1209:       nz = ai[k+1] - ai[k];
1210:       while (nz--) {
1211:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1212:         v++;vj++;
1213:       }
1214:       for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1215:     }

1217:     ISRestoreIndices(isrow,&rip);
1218:     VecRestoreArray(bb->v,&b);
1219:     VecRestoreArray(xx->v,&x);
1220:     PetscLogFlops(bb->n*(4*a->nz + A->rmap.N));
1221:   }
1222:   return(0);
1223: }

1225: /*
1226:       Special case where the matrix was ILU(0) factored in the natural
1227:    ordering. This eliminates the need for the column and row permutation.
1228: */
1231: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1232: {
1233:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1235:   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1236:   MatScalar      *aa=a->a,*v;
1237:   PetscScalar    *x,*b,xk;
1238:   PetscInt       nz,*vj,k;

1241:   VecGetArray(bb,&b);
1242:   VecGetArray(xx,&x);
1243: 
1244:   /* solve U^T*D*y = b by forward substitution */
1245:   PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1246:   for (k=0; k<mbs; k++){
1247:     v  = aa + ai[k] + 1;
1248:     vj = aj + ai[k] + 1;
1249:     xk = x[k];
1250:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1251:     while (nz--) x[*vj++] += (*v++) * xk;
1252:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
1253:   }

1255:   /* solve U*x = y by back substitution */
1256:   for (k=mbs-2; k>=0; k--){
1257:     v  = aa + ai[k] + 1;
1258:     vj = aj + ai[k] + 1;
1259:     xk = x[k];
1260:     nz = ai[k+1] - ai[k] - 1;
1261:     while (nz--) xk += (*v++) * x[*vj++];
1262:     x[k] = xk;
1263:   }

1265:   VecRestoreArray(bb,&b);
1266:   VecRestoreArray(xx,&x);
1267:   PetscLogFlops(4*a->nz + A->rmap.N);
1268:   return(0);
1269: }

1271: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1274: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1275: {
1276:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
1278:   PetscInt       *rip,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1279:   PetscInt       *jutmp,bs = A->rmap.bs,bs2=a->bs2;
1280:   PetscInt       m,reallocs = 0,*levtmp;
1281:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
1282:   PetscInt       incrlev,*lev,shift,prow,nz;
1283:   PetscReal      f = info->fill,levels = info->levels;
1284:   PetscTruth     perm_identity;

1287:   /* check whether perm is the identity mapping */
1288:   ISIdentity(perm,&perm_identity);

1290:   if (perm_identity){
1291:     a->permute = PETSC_FALSE;
1292:     ai = a->i; aj = a->j;
1293:   } else { /*  non-trivial permutation */
1294:     a->permute = PETSC_TRUE;
1295:     MatReorderingSeqSBAIJ(A, perm);
1296:     ai = a->inew; aj = a->jnew;
1297:   }
1298: 
1299:   /* initialization */
1300:   ISGetIndices(perm,&rip);
1301:   umax  = (PetscInt)(f*ai[mbs] + 1);
1302:   PetscMalloc(umax*sizeof(PetscInt),&lev);
1303:   umax += mbs + 1;
1304:   shift = mbs + 1;
1305:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1306:   PetscMalloc(umax*sizeof(PetscInt),&ju);
1307:   iu[0] = mbs + 1;
1308:   juidx = mbs + 1;
1309:   /* prowl: linked list for pivot row */
1310:   PetscMalloc((3*mbs+1)*sizeof(PetscInt),&prowl);
1311:   /* q: linked list for col index */
1312:   q       = prowl + mbs;
1313:   levtmp  = q     + mbs;
1314: 
1315:   for (i=0; i<mbs; i++){
1316:     prowl[i] = mbs;
1317:     q[i] = 0;
1318:   }

1320:   /* for each row k */
1321:   for (k=0; k<mbs; k++){
1322:     nzk  = 0;
1323:     q[k] = mbs;
1324:     /* copy current row into linked list */
1325:     nz = ai[rip[k]+1] - ai[rip[k]];
1326:     j = ai[rip[k]];
1327:     while (nz--){
1328:       vj = rip[aj[j++]];
1329:       if (vj > k){
1330:         qm = k;
1331:         do {
1332:           m  = qm; qm = q[m];
1333:         } while(qm < vj);
1334:         if (qm == vj) {
1335:           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1336:         }
1337:         nzk++;
1338:         q[m]       = vj;
1339:         q[vj]      = qm;
1340:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1341:       }
1342:     }

1344:     /* modify nonzero structure of k-th row by computing fill-in
1345:        for each row prow to be merged in */
1346:     prow = k;
1347:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1348: 
1349:     while (prow < k){
1350:       /* merge row prow into k-th row */
1351:       jmin = iu[prow] + 1;
1352:       jmax = iu[prow+1];
1353:       qm = k;
1354:       for (j=jmin; j<jmax; j++){
1355:         incrlev = lev[j-shift] + 1;
1356:         if (incrlev > levels) continue;
1357:         vj      = ju[j];
1358:         do {
1359:           m = qm; qm = q[m];
1360:         } while (qm < vj);
1361:         if (qm != vj){      /* a fill */
1362:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1363:           levtmp[vj] = incrlev;
1364:         } else {
1365:           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1366:         }
1367:       }
1368:       prow = prowl[prow]; /* next pivot row */
1369:     }
1370: 
1371:     /* add k to row list for first nonzero element in k-th row */
1372:     if (nzk > 1){
1373:       i = q[k]; /* col value of first nonzero element in k_th row of U */
1374:       prowl[k] = prowl[i]; prowl[i] = k;
1375:     }
1376:     iu[k+1] = iu[k] + nzk;

1378:     /* allocate more space to ju and lev if needed */
1379:     if (iu[k+1] > umax) {
1380:       /* estimate how much additional space we will need */
1381:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1382:       /* just double the memory each time */
1383:       maxadd = umax;
1384:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1385:       umax += maxadd;

1387:       /* allocate a longer ju */
1388:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1389:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
1390:       PetscFree(ju);
1391:       ju       = jutmp;

1393:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1394:       PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
1395:       PetscFree(lev);
1396:       lev      = jutmp;
1397:       reallocs += 2; /* count how many times we realloc */
1398:     }

1400:     /* save nonzero structure of k-th row in ju */
1401:     i=k;
1402:     while (nzk --) {
1403:       i                = q[i];
1404:       ju[juidx]        = i;
1405:       lev[juidx-shift] = levtmp[i];
1406:       juidx++;
1407:     }
1408:   }
1409: 
1410: #if defined(PETSC_USE_INFO)
1411:   if (ai[mbs] != 0) {
1412:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1413:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
1414:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
1415:     PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
1416:     PetscInfo(A,"for best performance.\n");
1417:   } else {
1418:     PetscInfo(A,"Empty matrix.\n");
1419:   }
1420: #endif

1422:   ISRestoreIndices(perm,&rip);
1423:   PetscFree(prowl);
1424:   PetscFree(lev);

1426:   /* put together the new matrix */
1427:   MatCreate(A->comm,B);
1428:   MatSetSizes(*B,bs*mbs,bs*mbs,bs*mbs,bs*mbs);
1429:   MatSetType(*B,A->type_name);
1430:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(*B,bs,0,PETSC_NULL);

1432:   /* PetscLogObjectParent(*B,iperm); */
1433:   b    = (Mat_SeqSBAIJ*)(*B)->data;
1434:   PetscFree2(b->imax,b->ilen);
1435:   b->singlemalloc = PETSC_FALSE;
1436:   b->free_a       = PETSC_TRUE;
1437:   b->free_ij      = PETSC_TRUE;
1438:   /* the next line frees the default space generated by the Create() */
1439:   PetscFree3(b->a,b->j,b->i);
1440:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1441:   b->j    = ju;
1442:   b->i    = iu;
1443:   b->diag = 0;
1444:   b->ilen = 0;
1445:   b->imax = 0;
1446: 
1447:   if (b->row) {
1448:     ISDestroy(b->row);
1449:   }
1450:   if (b->icol) {
1451:     ISDestroy(b->icol);
1452:   }
1453:   b->row  = perm;
1454:   b->icol = perm;
1455:   PetscObjectReference((PetscObject)perm);
1456:   PetscObjectReference((PetscObject)perm);
1457:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1458:   /* In b structure:  Free imax, ilen, old a, old j.  
1459:      Allocate idnew, solve_work, new a, new j */
1460:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
1461:   b->maxnz = b->nz = iu[mbs];
1462: 
1463:   (*B)->factor                 = FACTOR_CHOLESKY;
1464:   (*B)->info.factor_mallocs    = reallocs;
1465:   (*B)->info.fill_ratio_given  = f;
1466:   if (ai[mbs] != 0) {
1467:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1468:   } else {
1469:     (*B)->info.fill_ratio_needed = 0.0;
1470:   }

1472:   if (perm_identity){
1473:     switch (bs) {
1474:       case 1:
1475:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1476:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1477:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1478:         (*B)->ops->solves                = MatSolves_SeqSBAIJ_1;
1479:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
1480:         break;
1481:       case 2:
1482:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1483:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1484:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1485:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
1486:         break;
1487:       case 3:
1488:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1489:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1490:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1491:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
1492:         break;
1493:       case 4:
1494:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1495:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1496:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1497:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
1498:         break;
1499:       case 5:
1500:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1501:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1502:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1503:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
1504:         break;
1505:       case 6:
1506:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1507:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1508:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1509:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
1510:         break;
1511:       case 7:
1512:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1513:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1514:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1515:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
1516:       break;
1517:       default:
1518:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1519:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1520:         (*B)->ops->solvetranspose        = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1521:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
1522:       break;
1523:     }
1524:   }

1526:   return(0);
1527: }

1529:  #include petscbt.h
1530:  #include src/mat/utils/freespace.h
1533: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
1534: {
1535:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
1536:   Mat_SeqSBAIJ       *b;
1537:   Mat                B;
1538:   PetscErrorCode     ierr;
1539:   PetscTruth         perm_identity,free_ij = PETSC_TRUE;
1540:   PetscInt           bs=A->rmap.bs,am=a->mbs;
1541:   PetscInt           reallocs=0,*rip,i,*ai,*aj,*ui;
1542:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
1543:   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
1544:   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
1545:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1546:   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
1547:   PetscBT            lnkbt;
1548:   PetscScalar        *ua;

1551:   /*  
1552:    This code originally uses Modified Sparse Row (MSR) storage
1553:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
1554:    Then it is rewritten so the factor B takes seqsbaij format. However the associated 
1555:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 
1556:    thus the original code in MSR format is still used for these cases. 
1557:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 
1558:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
1559:   */
1560:   if (bs > 1){
1561:     MatICCFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);
1562:     return(0);
1563:   }

1565:   /* check whether perm is the identity mapping */
1566:   ISIdentity(perm,&perm_identity);
1567: 
1568:   /* special case that simply copies fill pattern */
1569:   if (!levels && perm_identity) {
1570:     a->permute = PETSC_FALSE;
1571:     /* reuse the column pointers and row offsets for memory savings */
1572:     ui           = a->i;
1573:     uj           = a->j;
1574:     free_ij      = PETSC_FALSE;
1575:     ratio_needed = 1.0;
1576:   } else { /* case: levels>0 || (levels=0 && !perm_identity) */
1577:     if (perm_identity){
1578:       a->permute = PETSC_FALSE;
1579:       ai = a->i; aj = a->j;
1580:     } else { /*  non-trivial permutation */
1581:       a->permute = PETSC_TRUE;
1582:       MatReorderingSeqSBAIJ(A, perm);
1583:       ai   = a->inew; aj = a->jnew;
1584:     }
1585:     ISGetIndices(perm,&rip);

1587:     /* initialization */
1588:     PetscMalloc((am+1)*sizeof(PetscInt),&ui);
1589:     ui[0] = 0;

1591:     /* jl: linked list for storing indices of the pivot rows 
1592:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1593:     PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);
1594:     il         = jl + am;
1595:     uj_ptr     = (PetscInt**)(il + am);
1596:     uj_lvl_ptr = (PetscInt**)(uj_ptr + am);
1597:     for (i=0; i<am; i++){
1598:       jl[i] = am; il[i] = 0;
1599:     }

1601:     /* create and initialize a linked list for storing column indices of the active row k */
1602:     nlnk = am + 1;
1603:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

1605:     /* initial FreeSpace size is fill*(ai[am]+1) */
1606:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
1607:     current_space = free_space;
1608:     PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
1609:     current_space_lvl = free_space_lvl;

1611:     for (k=0; k<am; k++){  /* for each active row k */
1612:       /* initialize lnk by the column indices of row rip[k] */
1613:       nzk   = 0;
1614:       ncols = ai[rip[k]+1] - ai[rip[k]];
1615:       cols  = aj+ai[rip[k]];
1616:       PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
1617:       nzk += nlnk;

1619:       /* update lnk by computing fill-in for each pivot row to be merged in */
1620:       prow = jl[k]; /* 1st pivot row */
1621: 
1622:       while (prow < k){
1623:         nextprow = jl[prow];
1624: 
1625:         /* merge prow into k-th row */
1626:         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
1627:         jmax = ui[prow+1];
1628:         ncols = jmax-jmin;
1629:         i     = jmin - ui[prow];
1630:         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1631:         j     = *(uj_lvl_ptr[prow] + i - 1);
1632:         cols_lvl = uj_lvl_ptr[prow]+i;
1633:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
1634:         nzk += nlnk;

1636:         /* update il and jl for prow */
1637:         if (jmin < jmax){
1638:           il[prow] = jmin;
1639:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
1640:         }
1641:         prow = nextprow;
1642:       }

1644:       /* if free space is not available, make more free space */
1645:       if (current_space->local_remaining<nzk) {
1646:         i = am - k + 1; /* num of unfactored rows */
1647:         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1648:         PetscFreeSpaceGet(i,&current_space);
1649:         PetscFreeSpaceGet(i,&current_space_lvl);
1650:         reallocs++;
1651:       }

1653:       /* copy data into free_space and free_space_lvl, then initialize lnk */
1654:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

1656:       /* add the k-th row into il and jl */
1657:       if (nzk-1 > 0){
1658:         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1659:         jl[k] = jl[i]; jl[i] = k;
1660:         il[k] = ui[k] + 1;
1661:       }
1662:       uj_ptr[k]     = current_space->array;
1663:       uj_lvl_ptr[k] = current_space_lvl->array;

1665:       current_space->array           += nzk;
1666:       current_space->local_used      += nzk;
1667:       current_space->local_remaining -= nzk;
1668:       current_space_lvl->array           += nzk;
1669:       current_space_lvl->local_used      += nzk;
1670:       current_space_lvl->local_remaining -= nzk;

1672:       ui[k+1] = ui[k] + nzk;
1673:     }

1675: #if defined(PETSC_USE_INFO)
1676:     if (ai[am] != 0) {
1677:       PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1678:       PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
1679:       PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
1680:       PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
1681:     } else {
1682:       PetscInfo(A,"Empty matrix.\n");
1683:     }
1684: #endif

1686:     ISRestoreIndices(perm,&rip);
1687:     PetscFree(jl);

1689:     /* destroy list of free space and other temporary array(s) */
1690:     PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
1691:     PetscFreeSpaceContiguous(&free_space,uj);
1692:     PetscIncompleteLLDestroy(lnk,lnkbt);
1693:     PetscFreeSpaceDestroy(free_space_lvl);
1694:     if (ai[am] != 0) {
1695:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1696:     } else {
1697:       ratio_needed = 0.0;
1698:     }
1699:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

1701:   /* put together the new matrix in MATSEQSBAIJ format */
1702:   PetscMalloc((ui[am]+1)*sizeof(MatScalar),&ua);
1703:   MatCreateSeqSBAIJWithArrays(PETSC_COMM_SELF,1,am,am,ui,uj,ua,fact);
1704:   B = *fact;
1705:   b = (Mat_SeqSBAIJ*)B->data;
1706:   PetscFree2(b->imax,b->ilen);
1707:   b->singlemalloc = PETSC_FALSE;
1708:   b->free_a  = PETSC_TRUE;
1709:   b->free_ij = free_ij;
1710:   b->diag    = 0;
1711:   b->ilen    = 0;
1712:   b->imax    = 0;
1713:   b->row     = perm;
1714:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1715:   PetscObjectReference((PetscObject)perm);
1716:   b->icol = perm;
1717:   PetscObjectReference((PetscObject)perm);
1718:   PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
1719:   b->maxnz = b->nz = ui[am];
1720: 
1721:   B->factor                 = FACTOR_CHOLESKY;
1722:   B->info.factor_mallocs    = reallocs;
1723:   B->info.fill_ratio_given  = fill;
1724:   B->info.fill_ratio_needed = ratio_needed;

1726:   if (perm_identity){
1727:     switch (bs) {
1728:       case 1:
1729:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1730:         B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1731:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1732:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
1733:         break;
1734:       case 2:
1735:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1736:         B->ops->solve                 = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1737:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1738:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
1739:         break;
1740:       case 3:
1741:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1742:         B->ops->solve                 = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1743:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1744:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
1745:         break;
1746:       case 4:
1747:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1748:         B->ops->solve                 = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1749:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1750:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
1751:         break;
1752:       case 5:
1753:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1754:         B->ops->solve                 = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1755:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1756:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
1757:         break;
1758:       case 6:
1759:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1760:         B->ops->solve                 = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1761:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1762:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
1763:         break;
1764:       case 7:
1765:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1766:         B->ops->solve                 = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1767:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1768:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
1769:       break;
1770:       default:
1771:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1772:         B->ops->solve                 = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1773:         B->ops->solvetranspose        = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1774:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
1775:       break;
1776:     }
1777:   }
1778:   return(0);
1779: }