Actual source code: sbaijfact2.c
2: /*
3: Factorization code for SBAIJ format.
4: */
6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
7: #include <../src/mat/impls/baij/seq/baij.h>
8: #include <petsc/private/kernels/blockinvert.h>
10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11: {
12: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
13: IS isrow=a->row;
14: PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
15: PetscErrorCode ierr;
16: const PetscInt *r;
17: PetscInt nz,*vj,k,idx,k1;
18: PetscInt bs =A->rmap->bs,bs2 = a->bs2;
19: const MatScalar *aa=a->a,*v,*diag;
20: PetscScalar *x,*xk,*xj,*xk_tmp,*t;
21: const PetscScalar *b;
24: VecGetArrayRead(bb,&b);
25: VecGetArray(xx,&x);
26: t = a->solve_work;
27: ISGetIndices(isrow,&r);
28: PetscMalloc1(bs,&xk_tmp);
30: /* solve U^T * D * y = b by forward substitution */
31: xk = t;
32: for (k=0; k<mbs; k++) { /* t <- perm(b) */
33: idx = bs*r[k];
34: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35: }
36: for (k=0; k<mbs; k++) {
37: v = aa + bs2*ai[k];
38: xk = t + k*bs; /* Dk*xk = k-th block of x */
39: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
40: nz = ai[k+1] - ai[k];
41: vj = aj + ai[k];
42: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43: while (nz--) {
44: /* x(:) += U(k,:)^T*(Dk*xk) */
45: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46: vj++; xj = t + (*vj)*bs;
47: v += bs2;
48: }
49: /* xk = inv(Dk)*(Dk*xk) */
50: diag = aa+k*bs2; /* ptr to inv(Dk) */
51: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52: }
54: /* solve U*x = y by back substitution */
55: for (k=mbs-1; k>=0; k--) {
56: v = aa + bs2*ai[k];
57: xk = t + k*bs; /* xk */
58: nz = ai[k+1] - ai[k];
59: vj = aj + ai[k];
60: xj = t + (*vj)*bs;
61: while (nz--) {
62: /* xk += U(k,:)*x(:) */
63: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64: vj++;
65: v += bs2; xj = t + (*vj)*bs;
66: }
67: idx = bs*r[k];
68: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69: }
71: PetscFree(xk_tmp);
72: ISRestoreIndices(isrow,&r);
73: VecRestoreArrayRead(bb,&b);
74: VecRestoreArray(xx,&x);
75: PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
76: return(0);
77: }
79: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
80: {
82: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
83: }
85: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
86: {
88: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
89: }
91: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
92: {
93: PetscErrorCode ierr;
94: PetscInt nz,k;
95: const PetscInt *vj,bs2 = bs*bs;
96: const MatScalar *v,*diag;
97: PetscScalar *xk,*xj,*xk_tmp;
100: PetscMalloc1(bs,&xk_tmp);
101: for (k=0; k<mbs; k++) {
102: v = aa + bs2*ai[k];
103: xk = x + k*bs; /* Dk*xk = k-th block of x */
104: PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
105: nz = ai[k+1] - ai[k];
106: vj = aj + ai[k];
107: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108: while (nz--) {
109: /* x(:) += U(k,:)^T*(Dk*xk) */
110: PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111: vj++; xj = x + (*vj)*bs;
112: v += bs2;
113: }
114: /* xk = inv(Dk)*(Dk*xk) */
115: diag = aa+k*bs2; /* ptr to inv(Dk) */
116: PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117: }
118: PetscFree(xk_tmp);
119: return(0);
120: }
122: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123: {
124: PetscInt nz,k;
125: const PetscInt *vj,bs2 = bs*bs;
126: const MatScalar *v;
127: PetscScalar *xk,*xj;
130: for (k=mbs-1; k>=0; k--) {
131: v = aa + bs2*ai[k];
132: xk = x + k*bs; /* xk */
133: nz = ai[k+1] - ai[k];
134: vj = aj + ai[k];
135: xj = x + (*vj)*bs;
136: while (nz--) {
137: /* xk += U(k,:)*x(:) */
138: PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139: vj++;
140: v += bs2; xj = x + (*vj)*bs;
141: }
142: }
143: return(0);
144: }
146: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147: {
148: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
149: PetscErrorCode ierr;
150: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
151: PetscInt bs =A->rmap->bs;
152: const MatScalar *aa=a->a;
153: PetscScalar *x;
154: const PetscScalar *b;
157: VecGetArrayRead(bb,&b);
158: VecGetArray(xx,&x);
160: /* solve U^T * D * y = b by forward substitution */
161: PetscArraycpy(x,b,bs*mbs); /* x <- b */
162: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
164: /* solve U*x = y by back substitution */
165: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
167: VecRestoreArrayRead(bb,&b);
168: VecRestoreArray(xx,&x);
169: PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
170: return(0);
171: }
173: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174: {
175: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
176: PetscErrorCode ierr;
177: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
178: PetscInt bs =A->rmap->bs;
179: const MatScalar *aa=a->a;
180: const PetscScalar *b;
181: PetscScalar *x;
184: VecGetArrayRead(bb,&b);
185: VecGetArray(xx,&x);
186: PetscArraycpy(x,b,bs*mbs); /* x <- b */
187: MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
188: VecRestoreArrayRead(bb,&b);
189: VecRestoreArray(xx,&x);
190: PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
191: return(0);
192: }
194: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195: {
196: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
197: PetscErrorCode ierr;
198: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
199: PetscInt bs =A->rmap->bs;
200: const MatScalar *aa=a->a;
201: const PetscScalar *b;
202: PetscScalar *x;
205: VecGetArrayRead(bb,&b);
206: VecGetArray(xx,&x);
207: PetscArraycpy(x,b,bs*mbs);
208: MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
209: VecRestoreArrayRead(bb,&b);
210: VecRestoreArray(xx,&x);
211: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
212: return(0);
213: }
215: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216: {
217: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
218: IS isrow=a->row;
219: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220: PetscErrorCode ierr;
221: PetscInt nz,k,idx;
222: const MatScalar *aa=a->a,*v,*d;
223: PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224: const PetscScalar *b;
227: VecGetArrayRead(bb,&b);
228: VecGetArray(xx,&x);
229: t = a->solve_work;
230: ISGetIndices(isrow,&r);
232: /* solve U^T * D * y = b by forward substitution */
233: tp = t;
234: for (k=0; k<mbs; k++) { /* t <- perm(b) */
235: idx = 7*r[k];
236: tp[0] = b[idx];
237: tp[1] = b[idx+1];
238: tp[2] = b[idx+2];
239: tp[3] = b[idx+3];
240: tp[4] = b[idx+4];
241: tp[5] = b[idx+5];
242: tp[6] = b[idx+6];
243: tp += 7;
244: }
246: for (k=0; k<mbs; k++) {
247: v = aa + 49*ai[k];
248: vj = aj + ai[k];
249: tp = t + k*7;
250: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251: nz = ai[k+1] - ai[k];
252: tp = t + (*vj)*7;
253: while (nz--) {
254: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261: vj++;
262: tp = t + (*vj)*7;
263: v += 49;
264: }
266: /* xk = inv(Dk)*(Dk*xk) */
267: d = aa+k*49; /* ptr to inv(Dk) */
268: tp = t + k*7;
269: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276: }
278: /* solve U*x = y by back substitution */
279: for (k=mbs-1; k>=0; k--) {
280: v = aa + 49*ai[k];
281: vj = aj + ai[k];
282: tp = t + k*7;
283: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
284: nz = ai[k+1] - ai[k];
286: tp = t + (*vj)*7;
287: while (nz--) {
288: /* xk += U(k,:)*x(:) */
289: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296: vj++;
297: tp = t + (*vj)*7;
298: v += 49;
299: }
300: tp = t + k*7;
301: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302: idx = 7*r[k];
303: x[idx] = x0;
304: x[idx+1] = x1;
305: x[idx+2] = x2;
306: x[idx+3] = x3;
307: x[idx+4] = x4;
308: x[idx+5] = x5;
309: x[idx+6] = x6;
310: }
312: ISRestoreIndices(isrow,&r);
313: VecRestoreArrayRead(bb,&b);
314: VecRestoreArray(xx,&x);
315: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
316: return(0);
317: }
319: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320: {
321: const MatScalar *v,*d;
322: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
323: PetscInt nz,k;
324: const PetscInt *vj;
327: for (k=0; k<mbs; k++) {
328: v = aa + 49*ai[k];
329: xp = x + k*7;
330: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331: nz = ai[k+1] - ai[k];
332: vj = aj + ai[k];
333: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
334: PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
335: while (nz--) {
336: xp = x + (*vj)*7;
337: /* x(:) += U(k,:)^T*(Dk*xk) */
338: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345: vj++;
346: v += 49;
347: }
348: /* xk = inv(Dk)*(Dk*xk) */
349: d = aa+k*49; /* ptr to inv(Dk) */
350: xp = x + k*7;
351: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
352: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
353: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
354: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
355: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
356: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
357: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
358: }
359: return(0);
360: }
362: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
363: {
364: const MatScalar *v;
365: PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6;
366: PetscInt nz,k;
367: const PetscInt *vj;
370: for (k=mbs-1; k>=0; k--) {
371: v = aa + 49*ai[k];
372: xp = x + k*7;
373: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
374: nz = ai[k+1] - ai[k];
375: vj = aj + ai[k];
376: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
377: PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
378: while (nz--) {
379: xp = x + (*vj)*7;
380: /* xk += U(k,:)*x(:) */
381: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
382: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
383: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
384: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
385: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
386: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
387: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
388: vj++;
389: v += 49;
390: }
391: xp = x + k*7;
392: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
393: }
394: return(0);
395: }
397: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
398: {
399: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
400: PetscErrorCode ierr;
401: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
402: const MatScalar *aa=a->a;
403: PetscScalar *x;
404: const PetscScalar *b;
407: VecGetArrayRead(bb,&b);
408: VecGetArray(xx,&x);
410: /* solve U^T * D * y = b by forward substitution */
411: PetscArraycpy(x,b,7*mbs); /* x <- b */
412: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
414: /* solve U*x = y by back substitution */
415: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
417: VecRestoreArrayRead(bb,&b);
418: VecRestoreArray(xx,&x);
419: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
420: return(0);
421: }
423: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
424: {
425: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
426: PetscErrorCode ierr;
427: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
428: const MatScalar *aa=a->a;
429: PetscScalar *x;
430: const PetscScalar *b;
433: VecGetArrayRead(bb,&b);
434: VecGetArray(xx,&x);
435: PetscArraycpy(x,b,7*mbs);
436: MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
437: VecRestoreArrayRead(bb,&b);
438: VecRestoreArray(xx,&x);
439: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
440: return(0);
441: }
443: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
444: {
445: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
446: PetscErrorCode ierr;
447: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
448: const MatScalar *aa=a->a;
449: PetscScalar *x;
450: const PetscScalar *b;
453: VecGetArrayRead(bb,&b);
454: VecGetArray(xx,&x);
455: PetscArraycpy(x,b,7*mbs);
456: MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
457: VecRestoreArrayRead(bb,&b);
458: VecRestoreArray(xx,&x);
459: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
460: return(0);
461: }
463: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
464: {
465: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
466: IS isrow=a->row;
467: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
468: PetscErrorCode ierr;
469: PetscInt nz,k,idx;
470: const MatScalar *aa=a->a,*v,*d;
471: PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp;
472: const PetscScalar *b;
475: VecGetArrayRead(bb,&b);
476: VecGetArray(xx,&x);
477: t = a->solve_work;
478: ISGetIndices(isrow,&r);
480: /* solve U^T * D * y = b by forward substitution */
481: tp = t;
482: for (k=0; k<mbs; k++) { /* t <- perm(b) */
483: idx = 6*r[k];
484: tp[0] = b[idx];
485: tp[1] = b[idx+1];
486: tp[2] = b[idx+2];
487: tp[3] = b[idx+3];
488: tp[4] = b[idx+4];
489: tp[5] = b[idx+5];
490: tp += 6;
491: }
493: for (k=0; k<mbs; k++) {
494: v = aa + 36*ai[k];
495: vj = aj + ai[k];
496: tp = t + k*6;
497: x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
498: nz = ai[k+1] - ai[k];
499: tp = t + (*vj)*6;
500: while (nz--) {
501: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
502: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
503: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
504: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
505: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
506: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
507: vj++;
508: tp = t + (*vj)*6;
509: v += 36;
510: }
512: /* xk = inv(Dk)*(Dk*xk) */
513: d = aa+k*36; /* ptr to inv(Dk) */
514: tp = t + k*6;
515: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
516: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
517: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
518: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
519: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
520: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
521: }
523: /* solve U*x = y by back substitution */
524: for (k=mbs-1; k>=0; k--) {
525: v = aa + 36*ai[k];
526: vj = aj + ai[k];
527: tp = t + k*6;
528: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
529: nz = ai[k+1] - ai[k];
531: tp = t + (*vj)*6;
532: while (nz--) {
533: /* xk += U(k,:)*x(:) */
534: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
535: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
536: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
537: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
538: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
539: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
540: vj++;
541: tp = t + (*vj)*6;
542: v += 36;
543: }
544: tp = t + k*6;
545: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
546: idx = 6*r[k];
547: x[idx] = x0;
548: x[idx+1] = x1;
549: x[idx+2] = x2;
550: x[idx+3] = x3;
551: x[idx+4] = x4;
552: x[idx+5] = x5;
553: }
555: ISRestoreIndices(isrow,&r);
556: VecRestoreArrayRead(bb,&b);
557: VecRestoreArray(xx,&x);
558: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
559: return(0);
560: }
562: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
563: {
564: const MatScalar *v,*d;
565: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
566: PetscInt nz,k;
567: const PetscInt *vj;
570: for (k=0; k<mbs; k++) {
571: v = aa + 36*ai[k];
572: xp = x + k*6;
573: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
574: nz = ai[k+1] - ai[k];
575: vj = aj + ai[k];
576: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
577: PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
578: while (nz--) {
579: xp = x + (*vj)*6;
580: /* x(:) += U(k,:)^T*(Dk*xk) */
581: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
582: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
583: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
584: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
585: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
586: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
587: vj++;
588: v += 36;
589: }
590: /* xk = inv(Dk)*(Dk*xk) */
591: d = aa+k*36; /* ptr to inv(Dk) */
592: xp = x + k*6;
593: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
594: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
595: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
596: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
597: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
598: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
599: }
600: return(0);
601: }
602: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
603: {
604: const MatScalar *v;
605: PetscScalar *xp,x0,x1,x2,x3,x4,x5;
606: PetscInt nz,k;
607: const PetscInt *vj;
610: for (k=mbs-1; k>=0; k--) {
611: v = aa + 36*ai[k];
612: xp = x + k*6;
613: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
614: nz = ai[k+1] - ai[k];
615: vj = aj + ai[k];
616: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
617: PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
618: while (nz--) {
619: xp = x + (*vj)*6;
620: /* xk += U(k,:)*x(:) */
621: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
622: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
623: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
624: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
625: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
626: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
627: vj++;
628: v += 36;
629: }
630: xp = x + k*6;
631: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
632: }
633: return(0);
634: }
636: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
637: {
638: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
639: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
640: const MatScalar *aa=a->a;
641: PetscScalar *x;
642: const PetscScalar *b;
643: PetscErrorCode ierr;
646: VecGetArrayRead(bb,&b);
647: VecGetArray(xx,&x);
649: /* solve U^T * D * y = b by forward substitution */
650: PetscArraycpy(x,b,6*mbs); /* x <- b */
651: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
653: /* solve U*x = y by back substitution */
654: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
656: VecRestoreArrayRead(bb,&b);
657: VecRestoreArray(xx,&x);
658: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
659: return(0);
660: }
662: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
663: {
664: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
665: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
666: const MatScalar *aa=a->a;
667: PetscScalar *x;
668: const PetscScalar *b;
669: PetscErrorCode ierr;
672: VecGetArrayRead(bb,&b);
673: VecGetArray(xx,&x);
674: PetscArraycpy(x,b,6*mbs); /* x <- b */
675: MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
676: VecRestoreArrayRead(bb,&b);
677: VecRestoreArray(xx,&x);
678: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
679: return(0);
680: }
682: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
683: {
684: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
685: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
686: const MatScalar *aa=a->a;
687: PetscScalar *x;
688: const PetscScalar *b;
689: PetscErrorCode ierr;
692: VecGetArrayRead(bb,&b);
693: VecGetArray(xx,&x);
694: PetscArraycpy(x,b,6*mbs); /* x <- b */
695: MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
696: VecRestoreArrayRead(bb,&b);
697: VecRestoreArray(xx,&x);
698: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
699: return(0);
700: }
702: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
703: {
704: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
705: IS isrow=a->row;
706: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
707: PetscErrorCode ierr;
708: const PetscInt *r,*vj;
709: PetscInt nz,k,idx;
710: const MatScalar *aa=a->a,*v,*diag;
711: PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp;
712: const PetscScalar *b;
715: VecGetArrayRead(bb,&b);
716: VecGetArray(xx,&x);
717: t = a->solve_work;
718: ISGetIndices(isrow,&r);
720: /* solve U^T * D * y = b by forward substitution */
721: tp = t;
722: for (k=0; k<mbs; k++) { /* t <- perm(b) */
723: idx = 5*r[k];
724: tp[0] = b[idx];
725: tp[1] = b[idx+1];
726: tp[2] = b[idx+2];
727: tp[3] = b[idx+3];
728: tp[4] = b[idx+4];
729: tp += 5;
730: }
732: for (k=0; k<mbs; k++) {
733: v = aa + 25*ai[k];
734: vj = aj + ai[k];
735: tp = t + k*5;
736: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
737: nz = ai[k+1] - ai[k];
739: tp = t + (*vj)*5;
740: while (nz--) {
741: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
742: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
743: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
744: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
745: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
746: vj++;
747: tp = t + (*vj)*5;
748: v += 25;
749: }
751: /* xk = inv(Dk)*(Dk*xk) */
752: diag = aa+k*25; /* ptr to inv(Dk) */
753: tp = t + k*5;
754: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
755: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
756: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
757: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
758: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
759: }
761: /* solve U*x = y by back substitution */
762: for (k=mbs-1; k>=0; k--) {
763: v = aa + 25*ai[k];
764: vj = aj + ai[k];
765: tp = t + k*5;
766: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
767: nz = ai[k+1] - ai[k];
769: tp = t + (*vj)*5;
770: while (nz--) {
771: /* xk += U(k,:)*x(:) */
772: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
773: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
774: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
775: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
776: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
777: vj++;
778: tp = t + (*vj)*5;
779: v += 25;
780: }
781: tp = t + k*5;
782: tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
783: idx = 5*r[k];
784: x[idx] = x0;
785: x[idx+1] = x1;
786: x[idx+2] = x2;
787: x[idx+3] = x3;
788: x[idx+4] = x4;
789: }
791: ISRestoreIndices(isrow,&r);
792: VecRestoreArrayRead(bb,&b);
793: VecRestoreArray(xx,&x);
794: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
795: return(0);
796: }
798: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
799: {
800: const MatScalar *v,*diag;
801: PetscScalar *xp,x0,x1,x2,x3,x4;
802: PetscInt nz,k;
803: const PetscInt *vj;
806: for (k=0; k<mbs; k++) {
807: v = aa + 25*ai[k];
808: xp = x + k*5;
809: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
810: nz = ai[k+1] - ai[k];
811: vj = aj + ai[k];
812: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
813: PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
814: while (nz--) {
815: xp = x + (*vj)*5;
816: /* x(:) += U(k,:)^T*(Dk*xk) */
817: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
818: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
819: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
820: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
821: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
822: vj++;
823: v += 25;
824: }
825: /* xk = inv(Dk)*(Dk*xk) */
826: diag = aa+k*25; /* ptr to inv(Dk) */
827: xp = x + k*5;
828: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
829: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
830: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
831: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
832: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
833: }
834: return(0);
835: }
837: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
838: {
839: const MatScalar *v;
840: PetscScalar *xp,x0,x1,x2,x3,x4;
841: PetscInt nz,k;
842: const PetscInt *vj;
845: for (k=mbs-1; k>=0; k--) {
846: v = aa + 25*ai[k];
847: xp = x + k*5;
848: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
849: nz = ai[k+1] - ai[k];
850: vj = aj + ai[k];
851: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
852: PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
853: while (nz--) {
854: xp = x + (*vj)*5;
855: /* xk += U(k,:)*x(:) */
856: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
857: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
858: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
859: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
860: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
861: vj++;
862: v += 25;
863: }
864: xp = x + k*5;
865: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
866: }
867: return(0);
868: }
870: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
871: {
872: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
873: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
874: const MatScalar *aa=a->a;
875: PetscScalar *x;
876: const PetscScalar *b;
877: PetscErrorCode ierr;
880: VecGetArrayRead(bb,&b);
881: VecGetArray(xx,&x);
883: /* solve U^T * D * y = b by forward substitution */
884: PetscArraycpy(x,b,5*mbs); /* x <- b */
885: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
887: /* solve U*x = y by back substitution */
888: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
890: VecRestoreArrayRead(bb,&b);
891: VecRestoreArray(xx,&x);
892: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
893: return(0);
894: }
896: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
897: {
898: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
899: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
900: const MatScalar *aa=a->a;
901: PetscScalar *x;
902: const PetscScalar *b;
903: PetscErrorCode ierr;
906: VecGetArrayRead(bb,&b);
907: VecGetArray(xx,&x);
908: PetscArraycpy(x,b,5*mbs); /* x <- b */
909: MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
910: VecRestoreArrayRead(bb,&b);
911: VecRestoreArray(xx,&x);
912: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
913: return(0);
914: }
916: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
917: {
918: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
919: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
920: const MatScalar *aa=a->a;
921: PetscScalar *x;
922: const PetscScalar *b;
923: PetscErrorCode ierr;
926: VecGetArrayRead(bb,&b);
927: VecGetArray(xx,&x);
928: PetscArraycpy(x,b,5*mbs);
929: MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
930: VecRestoreArrayRead(bb,&b);
931: VecRestoreArray(xx,&x);
932: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
933: return(0);
934: }
936: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
937: {
938: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
939: IS isrow=a->row;
940: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
941: PetscErrorCode ierr;
942: const PetscInt *r,*vj;
943: PetscInt nz,k,idx;
944: const MatScalar *aa=a->a,*v,*diag;
945: PetscScalar *x,x0,x1,x2,x3,*t,*tp;
946: const PetscScalar *b;
949: VecGetArrayRead(bb,&b);
950: VecGetArray(xx,&x);
951: t = a->solve_work;
952: ISGetIndices(isrow,&r);
954: /* solve U^T * D * y = b by forward substitution */
955: tp = t;
956: for (k=0; k<mbs; k++) { /* t <- perm(b) */
957: idx = 4*r[k];
958: tp[0] = b[idx];
959: tp[1] = b[idx+1];
960: tp[2] = b[idx+2];
961: tp[3] = b[idx+3];
962: tp += 4;
963: }
965: for (k=0; k<mbs; k++) {
966: v = aa + 16*ai[k];
967: vj = aj + ai[k];
968: tp = t + k*4;
969: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
970: nz = ai[k+1] - ai[k];
972: tp = t + (*vj)*4;
973: while (nz--) {
974: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
975: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
976: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
977: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
978: vj++;
979: tp = t + (*vj)*4;
980: v += 16;
981: }
983: /* xk = inv(Dk)*(Dk*xk) */
984: diag = aa+k*16; /* ptr to inv(Dk) */
985: tp = t + k*4;
986: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
987: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
988: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
989: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
990: }
992: /* solve U*x = y by back substitution */
993: for (k=mbs-1; k>=0; k--) {
994: v = aa + 16*ai[k];
995: vj = aj + ai[k];
996: tp = t + k*4;
997: x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
998: nz = ai[k+1] - ai[k];
1000: tp = t + (*vj)*4;
1001: while (nz--) {
1002: /* xk += U(k,:)*x(:) */
1003: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1004: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1005: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1006: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1007: vj++;
1008: tp = t + (*vj)*4;
1009: v += 16;
1010: }
1011: tp = t + k*4;
1012: tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1013: idx = 4*r[k];
1014: x[idx] = x0;
1015: x[idx+1] = x1;
1016: x[idx+2] = x2;
1017: x[idx+3] = x3;
1018: }
1020: ISRestoreIndices(isrow,&r);
1021: VecRestoreArrayRead(bb,&b);
1022: VecRestoreArray(xx,&x);
1023: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1024: return(0);
1025: }
1027: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1028: {
1029: const MatScalar *v,*diag;
1030: PetscScalar *xp,x0,x1,x2,x3;
1031: PetscInt nz,k;
1032: const PetscInt *vj;
1035: for (k=0; k<mbs; k++) {
1036: v = aa + 16*ai[k];
1037: xp = x + k*4;
1038: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1039: nz = ai[k+1] - ai[k];
1040: vj = aj + ai[k];
1041: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1042: PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1043: while (nz--) {
1044: xp = x + (*vj)*4;
1045: /* x(:) += U(k,:)^T*(Dk*xk) */
1046: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1047: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1048: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1049: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1050: vj++;
1051: v += 16;
1052: }
1053: /* xk = inv(Dk)*(Dk*xk) */
1054: diag = aa+k*16; /* ptr to inv(Dk) */
1055: xp = x + k*4;
1056: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1057: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1058: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1059: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1060: }
1061: return(0);
1062: }
1064: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1065: {
1066: const MatScalar *v;
1067: PetscScalar *xp,x0,x1,x2,x3;
1068: PetscInt nz,k;
1069: const PetscInt *vj;
1072: for (k=mbs-1; k>=0; k--) {
1073: v = aa + 16*ai[k];
1074: xp = x + k*4;
1075: x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1076: nz = ai[k+1] - ai[k];
1077: vj = aj + ai[k];
1078: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1079: PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1080: while (nz--) {
1081: xp = x + (*vj)*4;
1082: /* xk += U(k,:)*x(:) */
1083: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1084: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1085: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1086: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1087: vj++;
1088: v += 16;
1089: }
1090: xp = x + k*4;
1091: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1092: }
1093: return(0);
1094: }
1096: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1097: {
1098: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1099: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1100: const MatScalar *aa=a->a;
1101: PetscScalar *x;
1102: const PetscScalar *b;
1103: PetscErrorCode ierr;
1106: VecGetArrayRead(bb,&b);
1107: VecGetArray(xx,&x);
1109: /* solve U^T * D * y = b by forward substitution */
1110: PetscArraycpy(x,b,4*mbs); /* x <- b */
1111: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1113: /* solve U*x = y by back substitution */
1114: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1115: VecRestoreArrayRead(bb,&b);
1116: VecRestoreArray(xx,&x);
1117: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1118: return(0);
1119: }
1121: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1122: {
1123: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1124: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1125: const MatScalar *aa=a->a;
1126: PetscScalar *x;
1127: const PetscScalar *b;
1128: PetscErrorCode ierr;
1131: VecGetArrayRead(bb,&b);
1132: VecGetArray(xx,&x);
1133: PetscArraycpy(x,b,4*mbs); /* x <- b */
1134: MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1135: VecRestoreArrayRead(bb,&b);
1136: VecRestoreArray(xx,&x);
1137: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1138: return(0);
1139: }
1141: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1142: {
1143: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1144: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1145: const MatScalar *aa=a->a;
1146: PetscScalar *x;
1147: const PetscScalar *b;
1148: PetscErrorCode ierr;
1151: VecGetArrayRead(bb,&b);
1152: VecGetArray(xx,&x);
1153: PetscArraycpy(x,b,4*mbs);
1154: MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1155: VecRestoreArrayRead(bb,&b);
1156: VecRestoreArray(xx,&x);
1157: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1158: return(0);
1159: }
1161: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1162: {
1163: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1164: IS isrow=a->row;
1165: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1166: PetscErrorCode ierr;
1167: const PetscInt *r;
1168: PetscInt nz,k,idx;
1169: const PetscInt *vj;
1170: const MatScalar *aa=a->a,*v,*diag;
1171: PetscScalar *x,x0,x1,x2,*t,*tp;
1172: const PetscScalar *b;
1175: VecGetArrayRead(bb,&b);
1176: VecGetArray(xx,&x);
1177: t = a->solve_work;
1178: ISGetIndices(isrow,&r);
1180: /* solve U^T * D * y = b by forward substitution */
1181: tp = t;
1182: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1183: idx = 3*r[k];
1184: tp[0] = b[idx];
1185: tp[1] = b[idx+1];
1186: tp[2] = b[idx+2];
1187: tp += 3;
1188: }
1190: for (k=0; k<mbs; k++) {
1191: v = aa + 9*ai[k];
1192: vj = aj + ai[k];
1193: tp = t + k*3;
1194: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1195: nz = ai[k+1] - ai[k];
1197: tp = t + (*vj)*3;
1198: while (nz--) {
1199: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1200: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1201: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1202: vj++;
1203: tp = t + (*vj)*3;
1204: v += 9;
1205: }
1207: /* xk = inv(Dk)*(Dk*xk) */
1208: diag = aa+k*9; /* ptr to inv(Dk) */
1209: tp = t + k*3;
1210: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1211: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1212: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1213: }
1215: /* solve U*x = y by back substitution */
1216: for (k=mbs-1; k>=0; k--) {
1217: v = aa + 9*ai[k];
1218: vj = aj + ai[k];
1219: tp = t + k*3;
1220: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
1221: nz = ai[k+1] - ai[k];
1223: tp = t + (*vj)*3;
1224: while (nz--) {
1225: /* xk += U(k,:)*x(:) */
1226: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1227: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1228: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1229: vj++;
1230: tp = t + (*vj)*3;
1231: v += 9;
1232: }
1233: tp = t + k*3;
1234: tp[0] = x0; tp[1] = x1; tp[2] = x2;
1235: idx = 3*r[k];
1236: x[idx] = x0;
1237: x[idx+1] = x1;
1238: x[idx+2] = x2;
1239: }
1241: ISRestoreIndices(isrow,&r);
1242: VecRestoreArrayRead(bb,&b);
1243: VecRestoreArray(xx,&x);
1244: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1245: return(0);
1246: }
1248: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1249: {
1250: const MatScalar *v,*diag;
1251: PetscScalar *xp,x0,x1,x2;
1252: PetscInt nz,k;
1253: const PetscInt *vj;
1256: for (k=0; k<mbs; k++) {
1257: v = aa + 9*ai[k];
1258: xp = x + k*3;
1259: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1260: nz = ai[k+1] - ai[k];
1261: vj = aj + ai[k];
1262: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1263: PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1264: while (nz--) {
1265: xp = x + (*vj)*3;
1266: /* x(:) += U(k,:)^T*(Dk*xk) */
1267: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1268: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1269: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1270: vj++;
1271: v += 9;
1272: }
1273: /* xk = inv(Dk)*(Dk*xk) */
1274: diag = aa+k*9; /* ptr to inv(Dk) */
1275: xp = x + k*3;
1276: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1277: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1278: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1279: }
1280: return(0);
1281: }
1283: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1284: {
1285: const MatScalar *v;
1286: PetscScalar *xp,x0,x1,x2;
1287: PetscInt nz,k;
1288: const PetscInt *vj;
1291: for (k=mbs-1; k>=0; k--) {
1292: v = aa + 9*ai[k];
1293: xp = x + k*3;
1294: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
1295: nz = ai[k+1] - ai[k];
1296: vj = aj + ai[k];
1297: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1298: PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1299: while (nz--) {
1300: xp = x + (*vj)*3;
1301: /* xk += U(k,:)*x(:) */
1302: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1303: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1304: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1305: vj++;
1306: v += 9;
1307: }
1308: xp = x + k*3;
1309: xp[0] = x0; xp[1] = x1; xp[2] = x2;
1310: }
1311: return(0);
1312: }
1314: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1315: {
1316: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1317: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1318: const MatScalar *aa=a->a;
1319: PetscScalar *x;
1320: const PetscScalar *b;
1321: PetscErrorCode ierr;
1324: VecGetArrayRead(bb,&b);
1325: VecGetArray(xx,&x);
1327: /* solve U^T * D * y = b by forward substitution */
1328: PetscArraycpy(x,b,3*mbs);
1329: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1331: /* solve U*x = y by back substitution */
1332: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1334: VecRestoreArrayRead(bb,&b);
1335: VecRestoreArray(xx,&x);
1336: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1337: return(0);
1338: }
1340: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1341: {
1342: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1343: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1344: const MatScalar *aa=a->a;
1345: PetscScalar *x;
1346: const PetscScalar *b;
1347: PetscErrorCode ierr;
1350: VecGetArrayRead(bb,&b);
1351: VecGetArray(xx,&x);
1352: PetscArraycpy(x,b,3*mbs);
1353: MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1354: VecRestoreArrayRead(bb,&b);
1355: VecRestoreArray(xx,&x);
1356: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1357: return(0);
1358: }
1360: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1361: {
1362: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1363: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1364: const MatScalar *aa=a->a;
1365: PetscScalar *x;
1366: const PetscScalar *b;
1367: PetscErrorCode ierr;
1370: VecGetArrayRead(bb,&b);
1371: VecGetArray(xx,&x);
1372: PetscArraycpy(x,b,3*mbs);
1373: MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1374: VecRestoreArrayRead(bb,&b);
1375: VecRestoreArray(xx,&x);
1376: PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1377: return(0);
1378: }
1380: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1381: {
1382: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1383: IS isrow=a->row;
1384: const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j;
1385: PetscErrorCode ierr;
1386: const PetscInt *r,*vj;
1387: PetscInt nz,k,k2,idx;
1388: const MatScalar *aa=a->a,*v,*diag;
1389: PetscScalar *x,x0,x1,*t;
1390: const PetscScalar *b;
1393: VecGetArrayRead(bb,&b);
1394: VecGetArray(xx,&x);
1395: t = a->solve_work;
1396: ISGetIndices(isrow,&r);
1398: /* solve U^T * D * y = perm(b) by forward substitution */
1399: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1400: idx = 2*r[k];
1401: t[k*2] = b[idx];
1402: t[k*2+1] = b[idx+1];
1403: }
1404: for (k=0; k<mbs; k++) {
1405: v = aa + 4*ai[k];
1406: vj = aj + ai[k];
1407: k2 = k*2;
1408: x0 = t[k2]; x1 = t[k2+1];
1409: nz = ai[k+1] - ai[k];
1410: while (nz--) {
1411: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1412: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1413: vj++; v += 4;
1414: }
1415: diag = aa+k*4; /* ptr to inv(Dk) */
1416: t[k2] = diag[0]*x0 + diag[2]*x1;
1417: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1418: }
1420: /* solve U*x = y by back substitution */
1421: for (k=mbs-1; k>=0; k--) {
1422: v = aa + 4*ai[k];
1423: vj = aj + ai[k];
1424: k2 = k*2;
1425: x0 = t[k2]; x1 = t[k2+1];
1426: nz = ai[k+1] - ai[k];
1427: while (nz--) {
1428: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1429: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1430: vj++;
1431: v += 4;
1432: }
1433: t[k2] = x0;
1434: t[k2+1] = x1;
1435: idx = 2*r[k];
1436: x[idx] = x0;
1437: x[idx+1] = x1;
1438: }
1440: ISRestoreIndices(isrow,&r);
1441: VecRestoreArrayRead(bb,&b);
1442: VecRestoreArray(xx,&x);
1443: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1444: return(0);
1445: }
1447: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1448: {
1449: const MatScalar *v,*diag;
1450: PetscScalar x0,x1;
1451: PetscInt nz,k,k2;
1452: const PetscInt *vj;
1455: for (k=0; k<mbs; k++) {
1456: v = aa + 4*ai[k];
1457: vj = aj + ai[k];
1458: k2 = k*2;
1459: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1460: nz = ai[k+1] - ai[k];
1461: PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1462: PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1463: while (nz--) {
1464: /* x(:) += U(k,:)^T*(Dk*xk) */
1465: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1466: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1467: vj++; v += 4;
1468: }
1469: /* xk = inv(Dk)*(Dk*xk) */
1470: diag = aa+k*4; /* ptr to inv(Dk) */
1471: x[k2] = diag[0]*x0 + diag[2]*x1;
1472: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1473: }
1474: return(0);
1475: }
1477: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1478: {
1479: const MatScalar *v;
1480: PetscScalar x0,x1;
1481: PetscInt nz,k,k2;
1482: const PetscInt *vj;
1485: for (k=mbs-1; k>=0; k--) {
1486: v = aa + 4*ai[k];
1487: vj = aj + ai[k];
1488: k2 = k*2;
1489: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1490: nz = ai[k+1] - ai[k];
1491: PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1492: PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1493: while (nz--) {
1494: /* xk += U(k,:)*x(:) */
1495: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1496: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1497: vj++;
1498: v += 4;
1499: }
1500: x[k2] = x0;
1501: x[k2+1] = x1;
1502: }
1503: return(0);
1504: }
1506: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1507: {
1508: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1509: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1510: const MatScalar *aa=a->a;
1511: PetscScalar *x;
1512: const PetscScalar *b;
1513: PetscErrorCode ierr;
1516: VecGetArrayRead(bb,&b);
1517: VecGetArray(xx,&x);
1519: /* solve U^T * D * y = b by forward substitution */
1520: PetscArraycpy(x,b,2*mbs);
1521: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1523: /* solve U*x = y by back substitution */
1524: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1526: VecRestoreArrayRead(bb,&b);
1527: VecRestoreArray(xx,&x);
1528: PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1529: return(0);
1530: }
1532: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1533: {
1534: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1535: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1536: const MatScalar *aa=a->a;
1537: PetscScalar *x;
1538: const PetscScalar *b;
1539: PetscErrorCode ierr;
1542: VecGetArrayRead(bb,&b);
1543: VecGetArray(xx,&x);
1544: PetscArraycpy(x,b,2*mbs);
1545: MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1546: VecRestoreArrayRead(bb,&b);
1547: VecRestoreArray(xx,&x);
1548: PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1549: return(0);
1550: }
1552: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1553: {
1554: Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data;
1555: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1556: const MatScalar *aa=a->a;
1557: PetscScalar *x;
1558: const PetscScalar *b;
1559: PetscErrorCode ierr;
1562: VecGetArrayRead(bb,&b);
1563: VecGetArray(xx,&x);
1564: PetscArraycpy(x,b,2*mbs);
1565: MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1566: VecRestoreArrayRead(bb,&b);
1567: VecRestoreArray(xx,&x);
1568: PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1569: return(0);
1570: }
1572: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1573: {
1574: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1575: IS isrow=a->row;
1576: PetscErrorCode ierr;
1577: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1578: const MatScalar *aa=a->a,*v;
1579: const PetscScalar *b;
1580: PetscScalar *x,xk,*t;
1581: PetscInt nz,k,j;
1584: VecGetArrayRead(bb,&b);
1585: VecGetArray(xx,&x);
1586: t = a->solve_work;
1587: ISGetIndices(isrow,&rp);
1589: /* solve U^T*D*y = perm(b) by forward substitution */
1590: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1591: for (k=0; k<mbs; k++) {
1592: v = aa + ai[k];
1593: vj = aj + ai[k];
1594: xk = t[k];
1595: nz = ai[k+1] - ai[k] - 1;
1596: for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1597: t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */
1598: }
1600: /* solve U*perm(x) = y by back substitution */
1601: for (k=mbs-1; k>=0; k--) {
1602: v = aa + adiag[k] - 1;
1603: vj = aj + adiag[k] - 1;
1604: nz = ai[k+1] - ai[k] - 1;
1605: for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1606: x[rp[k]] = t[k];
1607: }
1609: ISRestoreIndices(isrow,&rp);
1610: VecRestoreArrayRead(bb,&b);
1611: VecRestoreArray(xx,&x);
1612: PetscLogFlops(4.0*a->nz - 3.0*mbs);
1613: return(0);
1614: }
1616: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1617: {
1618: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1619: IS isrow=a->row;
1620: PetscErrorCode ierr;
1621: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1622: const MatScalar *aa=a->a,*v;
1623: PetscScalar *x,xk,*t;
1624: const PetscScalar *b;
1625: PetscInt nz,k;
1628: VecGetArrayRead(bb,&b);
1629: VecGetArray(xx,&x);
1630: t = a->solve_work;
1631: ISGetIndices(isrow,&rp);
1633: /* solve U^T*D*y = perm(b) by forward substitution */
1634: for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1635: for (k=0; k<mbs; k++) {
1636: v = aa + ai[k] + 1;
1637: vj = aj + ai[k] + 1;
1638: xk = t[k];
1639: nz = ai[k+1] - ai[k] - 1;
1640: while (nz--) t[*vj++] += (*v++) * xk;
1641: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1642: }
1644: /* solve U*perm(x) = y by back substitution */
1645: for (k=mbs-1; k>=0; k--) {
1646: v = aa + ai[k] + 1;
1647: vj = aj + ai[k] + 1;
1648: nz = ai[k+1] - ai[k] - 1;
1649: while (nz--) t[k] += (*v++) * t[*vj++];
1650: x[rp[k]] = t[k];
1651: }
1653: ISRestoreIndices(isrow,&rp);
1654: VecRestoreArrayRead(bb,&b);
1655: VecRestoreArray(xx,&x);
1656: PetscLogFlops(4.0*a->nz - 3*mbs);
1657: return(0);
1658: }
1660: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1661: {
1662: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1663: IS isrow=a->row;
1664: PetscErrorCode ierr;
1665: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1666: const MatScalar *aa=a->a,*v;
1667: PetscReal diagk;
1668: PetscScalar *x,xk;
1669: const PetscScalar *b;
1670: PetscInt nz,k;
1673: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1674: VecGetArrayRead(bb,&b);
1675: VecGetArray(xx,&x);
1676: ISGetIndices(isrow,&rp);
1678: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1679: for (k=0; k<mbs; k++) {
1680: v = aa + ai[k];
1681: vj = aj + ai[k];
1682: xk = x[k];
1683: nz = ai[k+1] - ai[k] - 1;
1684: while (nz--) x[*vj++] += (*v++) * xk;
1686: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1687: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1688: x[k] = xk*PetscSqrtReal(diagk);
1689: }
1690: ISRestoreIndices(isrow,&rp);
1691: VecRestoreArrayRead(bb,&b);
1692: VecRestoreArray(xx,&x);
1693: PetscLogFlops(2.0*a->nz - mbs);
1694: return(0);
1695: }
1697: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1698: {
1699: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1700: IS isrow=a->row;
1701: PetscErrorCode ierr;
1702: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1703: const MatScalar *aa=a->a,*v;
1704: PetscReal diagk;
1705: PetscScalar *x,xk;
1706: const PetscScalar *b;
1707: PetscInt nz,k;
1710: /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1711: VecGetArrayRead(bb,&b);
1712: VecGetArray(xx,&x);
1713: ISGetIndices(isrow,&rp);
1715: for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1716: for (k=0; k<mbs; k++) {
1717: v = aa + ai[k] + 1;
1718: vj = aj + ai[k] + 1;
1719: xk = x[k];
1720: nz = ai[k+1] - ai[k] - 1;
1721: while (nz--) x[*vj++] += (*v++) * xk;
1723: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1724: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1725: x[k] = xk*PetscSqrtReal(diagk);
1726: }
1727: ISRestoreIndices(isrow,&rp);
1728: VecRestoreArrayRead(bb,&b);
1729: VecRestoreArray(xx,&x);
1730: PetscLogFlops(2.0*a->nz - mbs);
1731: return(0);
1732: }
1734: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1735: {
1736: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1737: IS isrow=a->row;
1738: PetscErrorCode ierr;
1739: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1740: const MatScalar *aa=a->a,*v;
1741: PetscReal diagk;
1742: PetscScalar *x,*t;
1743: const PetscScalar *b;
1744: PetscInt nz,k;
1747: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1748: VecGetArrayRead(bb,&b);
1749: VecGetArray(xx,&x);
1750: t = a->solve_work;
1751: ISGetIndices(isrow,&rp);
1753: for (k=mbs-1; k>=0; k--) {
1754: v = aa + ai[k];
1755: vj = aj + ai[k];
1756: diagk = PetscRealPart(aa[adiag[k]]);
1757: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1758: t[k] = b[k] * PetscSqrtReal(diagk);
1759: nz = ai[k+1] - ai[k] - 1;
1760: while (nz--) t[k] += (*v++) * t[*vj++];
1761: x[rp[k]] = t[k];
1762: }
1763: ISRestoreIndices(isrow,&rp);
1764: VecRestoreArrayRead(bb,&b);
1765: VecRestoreArray(xx,&x);
1766: PetscLogFlops(2.0*a->nz - mbs);
1767: return(0);
1768: }
1770: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1771: {
1772: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1773: IS isrow=a->row;
1774: PetscErrorCode ierr;
1775: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1776: const MatScalar *aa=a->a,*v;
1777: PetscReal diagk;
1778: PetscScalar *x,*t;
1779: const PetscScalar *b;
1780: PetscInt nz,k;
1783: /* solve D^(1/2)*U*perm(x) = b by back substitution */
1784: VecGetArrayRead(bb,&b);
1785: VecGetArray(xx,&x);
1786: t = a->solve_work;
1787: ISGetIndices(isrow,&rp);
1789: for (k=mbs-1; k>=0; k--) {
1790: v = aa + ai[k] + 1;
1791: vj = aj + ai[k] + 1;
1792: diagk = PetscRealPart(aa[ai[k]]);
1793: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1794: t[k] = b[k] * PetscSqrtReal(diagk);
1795: nz = ai[k+1] - ai[k] - 1;
1796: while (nz--) t[k] += (*v++) * t[*vj++];
1797: x[rp[k]] = t[k];
1798: }
1799: ISRestoreIndices(isrow,&rp);
1800: VecRestoreArrayRead(bb,&b);
1801: VecRestoreArray(xx,&x);
1802: PetscLogFlops(2.0*a->nz - mbs);
1803: return(0);
1804: }
1806: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1807: {
1808: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1812: if (A->rmap->bs == 1) {
1813: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1814: } else {
1815: IS isrow=a->row;
1816: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1817: const MatScalar *aa=a->a,*v;
1818: PetscScalar *x,*t;
1819: const PetscScalar *b;
1820: PetscInt nz,k,n,i,j;
1822: if (bb->n > a->solves_work_n) {
1823: PetscFree(a->solves_work);
1824: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1825: a->solves_work_n = bb->n;
1826: }
1827: n = bb->n;
1828: VecGetArrayRead(bb->v,&b);
1829: VecGetArray(xx->v,&x);
1830: t = a->solves_work;
1832: ISGetIndices(isrow,&rp);
1834: /* solve U^T*D*y = perm(b) by forward substitution */
1835: for (k=0; k<mbs; k++) {
1836: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1837: }
1838: for (k=0; k<mbs; k++) {
1839: v = aa + ai[k];
1840: vj = aj + ai[k];
1841: nz = ai[k+1] - ai[k] - 1;
1842: for (j=0; j<nz; j++) {
1843: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1844: v++;vj++;
1845: }
1846: for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1847: }
1849: /* solve U*perm(x) = y by back substitution */
1850: for (k=mbs-1; k>=0; k--) {
1851: v = aa + ai[k] - 1;
1852: vj = aj + ai[k] - 1;
1853: nz = ai[k+1] - ai[k] - 1;
1854: for (j=0; j<nz; j++) {
1855: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1856: v++;vj++;
1857: }
1858: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1859: }
1861: ISRestoreIndices(isrow,&rp);
1862: VecRestoreArrayRead(bb->v,&b);
1863: VecRestoreArray(xx->v,&x);
1864: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1865: }
1866: return(0);
1867: }
1869: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1870: {
1871: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1875: if (A->rmap->bs == 1) {
1876: MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1877: } else {
1878: IS isrow=a->row;
1879: const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1880: const MatScalar *aa=a->a,*v;
1881: PetscScalar *x,*t;
1882: const PetscScalar *b;
1883: PetscInt nz,k,n,i;
1885: if (bb->n > a->solves_work_n) {
1886: PetscFree(a->solves_work);
1887: PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1888: a->solves_work_n = bb->n;
1889: }
1890: n = bb->n;
1891: VecGetArrayRead(bb->v,&b);
1892: VecGetArray(xx->v,&x);
1893: t = a->solves_work;
1895: ISGetIndices(isrow,&rp);
1897: /* solve U^T*D*y = perm(b) by forward substitution */
1898: for (k=0; k<mbs; k++) {
1899: for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1900: }
1901: for (k=0; k<mbs; k++) {
1902: v = aa + ai[k];
1903: vj = aj + ai[k];
1904: nz = ai[k+1] - ai[k];
1905: while (nz--) {
1906: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1907: v++;vj++;
1908: }
1909: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1910: }
1912: /* solve U*perm(x) = y by back substitution */
1913: for (k=mbs-1; k>=0; k--) {
1914: v = aa + ai[k];
1915: vj = aj + ai[k];
1916: nz = ai[k+1] - ai[k];
1917: while (nz--) {
1918: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1919: v++;vj++;
1920: }
1921: for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1922: }
1924: ISRestoreIndices(isrow,&rp);
1925: VecRestoreArrayRead(bb->v,&b);
1926: VecRestoreArray(xx->v,&x);
1927: PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1928: }
1929: return(0);
1930: }
1932: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1933: {
1934: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1935: PetscErrorCode ierr;
1936: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1937: const MatScalar *aa=a->a,*v;
1938: const PetscScalar *b;
1939: PetscScalar *x,xi;
1940: PetscInt nz,i,j;
1943: VecGetArrayRead(bb,&b);
1944: VecGetArray(xx,&x);
1945: /* solve U^T*D*y = b by forward substitution */
1946: PetscArraycpy(x,b,mbs);
1947: for (i=0; i<mbs; i++) {
1948: v = aa + ai[i];
1949: vj = aj + ai[i];
1950: xi = x[i];
1951: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1952: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1953: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1954: }
1955: /* solve U*x = y by backward substitution */
1956: for (i=mbs-2; i>=0; i--) {
1957: xi = x[i];
1958: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
1959: vj = aj + adiag[i] - 1;
1960: nz = ai[i+1] - ai[i] - 1;
1961: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1962: x[i] = xi;
1963: }
1964: VecRestoreArrayRead(bb,&b);
1965: VecRestoreArray(xx,&x);
1966: PetscLogFlops(4.0*a->nz - 3*mbs);
1967: return(0);
1968: }
1970: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1971: {
1972: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1973: PetscErrorCode ierr;
1974: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1975: const MatScalar *aa=a->a,*v;
1976: const PetscScalar *b;
1977: PetscScalar *x,xi;
1978: PetscInt nz,i,j,neq,ldb,ldx;
1979: PetscBool isdense;
1982: if (!mbs) return(0);
1983: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1984: if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1985: if (X != B) {
1986: PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1987: if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1988: }
1989: MatDenseGetArrayRead(B,&b);
1990: MatDenseGetLDA(B,&ldb);
1991: MatDenseGetArray(X,&x);
1992: MatDenseGetLDA(X,&ldx);
1993: for (neq=0; neq<B->cmap->n; neq++) {
1994: /* solve U^T*D*y = b by forward substitution */
1995: PetscArraycpy(x,b,mbs);
1996: for (i=0; i<mbs; i++) {
1997: v = aa + ai[i];
1998: vj = aj + ai[i];
1999: xi = x[i];
2000: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2001: for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2002: x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2003: }
2004: /* solve U*x = y by backward substitution */
2005: for (i=mbs-2; i>=0; i--) {
2006: xi = x[i];
2007: v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2008: vj = aj + adiag[i] - 1;
2009: nz = ai[i+1] - ai[i] - 1;
2010: for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2011: x[i] = xi;
2012: }
2013: b += ldb;
2014: x += ldx;
2015: }
2016: MatDenseRestoreArrayRead(B,&b);
2017: MatDenseRestoreArray(X,&x);
2018: PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
2019: return(0);
2020: }
2022: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2023: {
2024: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2025: PetscErrorCode ierr;
2026: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2027: const MatScalar *aa=a->a,*v;
2028: PetscScalar *x,xk;
2029: const PetscScalar *b;
2030: PetscInt nz,k;
2033: VecGetArrayRead(bb,&b);
2034: VecGetArray(xx,&x);
2036: /* solve U^T*D*y = b by forward substitution */
2037: PetscArraycpy(x,b,mbs);
2038: for (k=0; k<mbs; k++) {
2039: v = aa + ai[k] + 1;
2040: vj = aj + ai[k] + 1;
2041: xk = x[k];
2042: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2043: while (nz--) x[*vj++] += (*v++) * xk;
2044: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2045: }
2047: /* solve U*x = y by back substitution */
2048: for (k=mbs-2; k>=0; k--) {
2049: v = aa + ai[k] + 1;
2050: vj = aj + ai[k] + 1;
2051: xk = x[k];
2052: nz = ai[k+1] - ai[k] - 1;
2053: while (nz--) {
2054: xk += (*v++) * x[*vj++];
2055: }
2056: x[k] = xk;
2057: }
2059: VecRestoreArrayRead(bb,&b);
2060: VecRestoreArray(xx,&x);
2061: PetscLogFlops(4.0*a->nz - 3*mbs);
2062: return(0);
2063: }
2065: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2066: {
2067: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2068: PetscErrorCode ierr;
2069: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2070: const MatScalar *aa=a->a,*v;
2071: PetscReal diagk;
2072: PetscScalar *x;
2073: const PetscScalar *b;
2074: PetscInt nz,k;
2077: /* solve U^T*D^(1/2)*x = b by forward substitution */
2078: VecGetArrayRead(bb,&b);
2079: VecGetArray(xx,&x);
2080: PetscArraycpy(x,b,mbs);
2081: for (k=0; k<mbs; k++) {
2082: v = aa + ai[k];
2083: vj = aj + ai[k];
2084: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2085: while (nz--) x[*vj++] += (*v++) * x[k];
2086: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2087: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2088: x[k] *= PetscSqrtReal(diagk);
2089: }
2090: VecRestoreArrayRead(bb,&b);
2091: VecRestoreArray(xx,&x);
2092: PetscLogFlops(2.0*a->nz - mbs);
2093: return(0);
2094: }
2096: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2097: {
2098: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2099: PetscErrorCode ierr;
2100: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2101: const MatScalar *aa=a->a,*v;
2102: PetscReal diagk;
2103: PetscScalar *x;
2104: const PetscScalar *b;
2105: PetscInt nz,k;
2108: /* solve U^T*D^(1/2)*x = b by forward substitution */
2109: VecGetArrayRead(bb,&b);
2110: VecGetArray(xx,&x);
2111: PetscArraycpy(x,b,mbs);
2112: for (k=0; k<mbs; k++) {
2113: v = aa + ai[k] + 1;
2114: vj = aj + ai[k] + 1;
2115: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
2116: while (nz--) x[*vj++] += (*v++) * x[k];
2117: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2118: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2119: x[k] *= PetscSqrtReal(diagk);
2120: }
2121: VecRestoreArrayRead(bb,&b);
2122: VecRestoreArray(xx,&x);
2123: PetscLogFlops(2.0*a->nz - mbs);
2124: return(0);
2125: }
2127: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2128: {
2129: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2130: PetscErrorCode ierr;
2131: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2132: const MatScalar *aa=a->a,*v;
2133: PetscReal diagk;
2134: PetscScalar *x;
2135: const PetscScalar *b;
2136: PetscInt nz,k;
2139: /* solve D^(1/2)*U*x = b by back substitution */
2140: VecGetArrayRead(bb,&b);
2141: VecGetArray(xx,&x);
2143: for (k=mbs-1; k>=0; k--) {
2144: v = aa + ai[k];
2145: vj = aj + ai[k];
2146: diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2147: if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2148: x[k] = PetscSqrtReal(diagk)*b[k];
2149: nz = ai[k+1] - ai[k] - 1;
2150: while (nz--) x[k] += (*v++) * x[*vj++];
2151: }
2152: VecRestoreArrayRead(bb,&b);
2153: VecRestoreArray(xx,&x);
2154: PetscLogFlops(2.0*a->nz - mbs);
2155: return(0);
2156: }
2158: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2159: {
2160: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2161: PetscErrorCode ierr;
2162: const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2163: const MatScalar *aa=a->a,*v;
2164: PetscReal diagk;
2165: PetscScalar *x;
2166: const PetscScalar *b;
2167: PetscInt nz,k;
2170: /* solve D^(1/2)*U*x = b by back substitution */
2171: VecGetArrayRead(bb,&b);
2172: VecGetArray(xx,&x);
2174: for (k=mbs-1; k>=0; k--) {
2175: v = aa + ai[k] + 1;
2176: vj = aj + ai[k] + 1;
2177: diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2178: if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2179: x[k] = PetscSqrtReal(diagk)*b[k];
2180: nz = ai[k+1] - ai[k] - 1;
2181: while (nz--) x[k] += (*v++) * x[*vj++];
2182: }
2183: VecRestoreArrayRead(bb,&b);
2184: VecRestoreArray(xx,&x);
2185: PetscLogFlops(2.0*a->nz - mbs);
2186: return(0);
2187: }
2189: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2190: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2191: {
2192: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2194: const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2195: PetscInt *jutmp,bs = A->rmap->bs,i;
2196: PetscInt m,reallocs = 0,*levtmp;
2197: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2198: PetscInt incrlev,*lev,shift,prow,nz;
2199: PetscReal f = info->fill,levels = info->levels;
2200: PetscBool perm_identity;
2203: /* check whether perm is the identity mapping */
2204: ISIdentity(perm,&perm_identity);
2206: if (perm_identity) {
2207: a->permute = PETSC_FALSE;
2208: ai = a->i; aj = a->j;
2209: } else { /* non-trivial permutation */
2210: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2211: }
2213: /* initialization */
2214: ISGetIndices(perm,&rip);
2215: umax = (PetscInt)(f*ai[mbs] + 1);
2216: PetscMalloc1(umax,&lev);
2217: umax += mbs + 1;
2218: shift = mbs + 1;
2219: PetscMalloc1(mbs+1,&iu);
2220: PetscMalloc1(umax,&ju);
2221: iu[0] = mbs + 1;
2222: juidx = mbs + 1;
2223: /* prowl: linked list for pivot row */
2224: PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2225: /* q: linked list for col index */
2227: for (i=0; i<mbs; i++) {
2228: prowl[i] = mbs;
2229: q[i] = 0;
2230: }
2232: /* for each row k */
2233: for (k=0; k<mbs; k++) {
2234: nzk = 0;
2235: q[k] = mbs;
2236: /* copy current row into linked list */
2237: nz = ai[rip[k]+1] - ai[rip[k]];
2238: j = ai[rip[k]];
2239: while (nz--) {
2240: vj = rip[aj[j++]];
2241: if (vj > k) {
2242: qm = k;
2243: do {
2244: m = qm; qm = q[m];
2245: } while (qm < vj);
2246: if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2247: nzk++;
2248: q[m] = vj;
2249: q[vj] = qm;
2250: levtmp[vj] = 0; /* initialize lev for nonzero element */
2251: }
2252: }
2254: /* modify nonzero structure of k-th row by computing fill-in
2255: for each row prow to be merged in */
2256: prow = k;
2257: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2259: while (prow < k) {
2260: /* merge row prow into k-th row */
2261: jmin = iu[prow] + 1;
2262: jmax = iu[prow+1];
2263: qm = k;
2264: for (j=jmin; j<jmax; j++) {
2265: incrlev = lev[j-shift] + 1;
2266: if (incrlev > levels) continue;
2267: vj = ju[j];
2268: do {
2269: m = qm; qm = q[m];
2270: } while (qm < vj);
2271: if (qm != vj) { /* a fill */
2272: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2273: levtmp[vj] = incrlev;
2274: } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2275: }
2276: prow = prowl[prow]; /* next pivot row */
2277: }
2279: /* add k to row list for first nonzero element in k-th row */
2280: if (nzk > 1) {
2281: i = q[k]; /* col value of first nonzero element in k_th row of U */
2282: prowl[k] = prowl[i]; prowl[i] = k;
2283: }
2284: iu[k+1] = iu[k] + nzk;
2286: /* allocate more space to ju and lev if needed */
2287: if (iu[k+1] > umax) {
2288: /* estimate how much additional space we will need */
2289: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2290: /* just double the memory each time */
2291: maxadd = umax;
2292: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2293: umax += maxadd;
2295: /* allocate a longer ju */
2296: PetscMalloc1(umax,&jutmp);
2297: PetscArraycpy(jutmp,ju,iu[k]);
2298: PetscFree(ju);
2299: ju = jutmp;
2301: PetscMalloc1(umax,&jutmp);
2302: PetscArraycpy(jutmp,lev,iu[k]-shift);
2303: PetscFree(lev);
2304: lev = jutmp;
2305: reallocs += 2; /* count how many times we realloc */
2306: }
2308: /* save nonzero structure of k-th row in ju */
2309: i=k;
2310: while (nzk--) {
2311: i = q[i];
2312: ju[juidx] = i;
2313: lev[juidx-shift] = levtmp[i];
2314: juidx++;
2315: }
2316: }
2318: #if defined(PETSC_USE_INFO)
2319: if (ai[mbs] != 0) {
2320: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2321: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2322: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2323: PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2324: PetscInfo(A,"for best performance.\n");
2325: } else {
2326: PetscInfo(A,"Empty matrix\n");
2327: }
2328: #endif
2330: ISRestoreIndices(perm,&rip);
2331: PetscFree3(prowl,q,levtmp);
2332: PetscFree(lev);
2334: /* put together the new matrix */
2335: MatSeqSBAIJSetPreallocation(B,bs,0,NULL);
2337: /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2338: b = (Mat_SeqSBAIJ*)(B)->data;
2339: PetscFree2(b->imax,b->ilen);
2341: b->singlemalloc = PETSC_FALSE;
2342: b->free_a = PETSC_TRUE;
2343: b->free_ij = PETSC_TRUE;
2344: /* the next line frees the default space generated by the Create() */
2345: PetscFree3(b->a,b->j,b->i);
2346: PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2347: b->j = ju;
2348: b->i = iu;
2349: b->diag = NULL;
2350: b->ilen = NULL;
2351: b->imax = NULL;
2353: ISDestroy(&b->row);
2354: ISDestroy(&b->icol);
2355: b->row = perm;
2356: b->icol = perm;
2357: PetscObjectReference((PetscObject)perm);
2358: PetscObjectReference((PetscObject)perm);
2359: PetscMalloc1(bs*mbs+bs,&b->solve_work);
2360: /* In b structure: Free imax, ilen, old a, old j.
2361: Allocate idnew, solve_work, new a, new j */
2362: PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2363: b->maxnz = b->nz = iu[mbs];
2365: (B)->info.factor_mallocs = reallocs;
2366: (B)->info.fill_ratio_given = f;
2367: if (ai[mbs] != 0) {
2368: (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2369: } else {
2370: (B)->info.fill_ratio_needed = 0.0;
2371: }
2372: MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2373: return(0);
2374: }
2376: /*
2377: See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2378: */
2379: #include <petscbt.h>
2380: #include <../src/mat/utils/freespace.h>
2381: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2382: {
2383: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
2384: PetscErrorCode ierr;
2385: PetscBool perm_identity,free_ij = PETSC_TRUE,missing;
2386: PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2387: const PetscInt *rip;
2388: PetscInt reallocs=0,i,*ui,*udiag,*cols;
2389: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2390: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2391: PetscReal fill =info->fill,levels=info->levels;
2392: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2393: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2394: PetscBT lnkbt;
2397: if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2398: MatMissingDiagonal(A,&missing,&d);
2399: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2400: if (bs > 1) {
2401: MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2402: return(0);
2403: }
2405: /* check whether perm is the identity mapping */
2406: ISIdentity(perm,&perm_identity);
2407: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2408: a->permute = PETSC_FALSE;
2410: PetscMalloc1(am+1,&ui);
2411: PetscMalloc1(am+1,&udiag);
2412: ui[0] = 0;
2414: /* ICC(0) without matrix ordering: simply rearrange column indices */
2415: if (!levels) {
2416: /* reuse the column pointers and row offsets for memory savings */
2417: for (i=0; i<am; i++) {
2418: ncols = ai[i+1] - ai[i];
2419: ui[i+1] = ui[i] + ncols;
2420: udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2421: }
2422: PetscMalloc1(ui[am]+1,&uj);
2423: cols = uj;
2424: for (i=0; i<am; i++) {
2425: aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2426: ncols = ai[i+1] - ai[i] -1;
2427: for (j=0; j<ncols; j++) *cols++ = aj[j];
2428: *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2429: }
2430: } else { /* case: levels>0 */
2431: ISGetIndices(perm,&rip);
2433: /* initialization */
2434: /* jl: linked list for storing indices of the pivot rows
2435: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2436: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2437: for (i=0; i<am; i++) {
2438: jl[i] = am; il[i] = 0;
2439: }
2441: /* create and initialize a linked list for storing column indices of the active row k */
2442: nlnk = am + 1;
2443: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2445: /* initial FreeSpace size is fill*(ai[am]+1) */
2446: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2448: current_space = free_space;
2450: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2452: current_space_lvl = free_space_lvl;
2454: for (k=0; k<am; k++) { /* for each active row k */
2455: /* initialize lnk by the column indices of row k */
2456: nzk = 0;
2457: ncols = ai[k+1] - ai[k];
2458: if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2459: cols = aj+ai[k];
2460: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2461: nzk += nlnk;
2463: /* update lnk by computing fill-in for each pivot row to be merged in */
2464: prow = jl[k]; /* 1st pivot row */
2466: while (prow < k) {
2467: nextprow = jl[prow];
2469: /* merge prow into k-th row */
2470: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2471: jmax = ui[prow+1];
2472: ncols = jmax-jmin;
2473: i = jmin - ui[prow];
2475: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2476: uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2477: j = *(uj - 1);
2478: PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2479: nzk += nlnk;
2481: /* update il and jl for prow */
2482: if (jmin < jmax) {
2483: il[prow] = jmin;
2485: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2486: }
2487: prow = nextprow;
2488: }
2490: /* if free space is not available, make more free space */
2491: if (current_space->local_remaining<nzk) {
2492: i = am - k + 1; /* num of unfactored rows */
2493: i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2494: PetscFreeSpaceGet(i,¤t_space);
2495: PetscFreeSpaceGet(i,¤t_space_lvl);
2496: reallocs++;
2497: }
2499: /* copy data into free_space and free_space_lvl, then initialize lnk */
2500: if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2501: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2503: /* add the k-th row into il and jl */
2504: if (nzk > 1) {
2505: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2506: jl[k] = jl[i]; jl[i] = k;
2507: il[k] = ui[k] + 1;
2508: }
2509: uj_ptr[k] = current_space->array;
2510: uj_lvl_ptr[k] = current_space_lvl->array;
2512: current_space->array += nzk;
2513: current_space->local_used += nzk;
2514: current_space->local_remaining -= nzk;
2515: current_space_lvl->array += nzk;
2516: current_space_lvl->local_used += nzk;
2517: current_space_lvl->local_remaining -= nzk;
2519: ui[k+1] = ui[k] + nzk;
2520: }
2522: ISRestoreIndices(perm,&rip);
2523: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2525: /* destroy list of free space and other temporary array(s) */
2526: PetscMalloc1(ui[am]+1,&uj);
2527: PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor */
2528: PetscIncompleteLLDestroy(lnk,lnkbt);
2529: PetscFreeSpaceDestroy(free_space_lvl);
2531: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2533: /* put together the new matrix in MATSEQSBAIJ format */
2534: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2536: b = (Mat_SeqSBAIJ*)(fact)->data;
2537: PetscFree2(b->imax,b->ilen);
2539: b->singlemalloc = PETSC_FALSE;
2540: b->free_a = PETSC_TRUE;
2541: b->free_ij = free_ij;
2543: PetscMalloc1(ui[am]+1,&b->a);
2545: b->j = uj;
2546: b->i = ui;
2547: b->diag = udiag;
2548: b->free_diag = PETSC_TRUE;
2549: b->ilen = NULL;
2550: b->imax = NULL;
2551: b->row = perm;
2552: b->col = perm;
2554: PetscObjectReference((PetscObject)perm);
2555: PetscObjectReference((PetscObject)perm);
2557: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2559: PetscMalloc1(am+1,&b->solve_work);
2560: PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));
2562: b->maxnz = b->nz = ui[am];
2564: fact->info.factor_mallocs = reallocs;
2565: fact->info.fill_ratio_given = fill;
2566: if (ai[am] != 0) {
2567: fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2568: } else {
2569: fact->info.fill_ratio_needed = 0.0;
2570: }
2571: #if defined(PETSC_USE_INFO)
2572: if (ai[am] != 0) {
2573: PetscReal af = fact->info.fill_ratio_needed;
2574: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2575: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2576: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2577: } else {
2578: PetscInfo(A,"Empty matrix\n");
2579: }
2580: #endif
2581: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2582: return(0);
2583: }
2585: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2586: {
2587: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
2588: Mat_SeqSBAIJ *b;
2589: PetscErrorCode ierr;
2590: PetscBool perm_identity,free_ij = PETSC_TRUE;
2591: PetscInt bs=A->rmap->bs,am=a->mbs;
2592: const PetscInt *cols,*rip,*ai=a->i,*aj=a->j;
2593: PetscInt reallocs=0,i,*ui;
2594: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2595: PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2596: PetscReal fill =info->fill,levels=info->levels,ratio_needed;
2597: PetscFreeSpaceList free_space =NULL,current_space=NULL;
2598: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2599: PetscBT lnkbt;
2602: /*
2603: This code originally uses Modified Sparse Row (MSR) storage
2604: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2605: Then it is rewritten so the factor B takes seqsbaij format. However the associated
2606: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2607: thus the original code in MSR format is still used for these cases.
2608: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2609: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2610: */
2611: if (bs > 1) {
2612: MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2613: return(0);
2614: }
2616: /* check whether perm is the identity mapping */
2617: ISIdentity(perm,&perm_identity);
2618: if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2619: a->permute = PETSC_FALSE;
2621: /* special case that simply copies fill pattern */
2622: if (!levels) {
2623: /* reuse the column pointers and row offsets for memory savings */
2624: ui = a->i;
2625: uj = a->j;
2626: free_ij = PETSC_FALSE;
2627: ratio_needed = 1.0;
2628: } else { /* case: levels>0 */
2629: ISGetIndices(perm,&rip);
2631: /* initialization */
2632: PetscMalloc1(am+1,&ui);
2633: ui[0] = 0;
2635: /* jl: linked list for storing indices of the pivot rows
2636: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2637: PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2638: for (i=0; i<am; i++) {
2639: jl[i] = am; il[i] = 0;
2640: }
2642: /* create and initialize a linked list for storing column indices of the active row k */
2643: nlnk = am + 1;
2644: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
2646: /* initial FreeSpace size is fill*(ai[am]+1) */
2647: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);
2649: current_space = free_space;
2651: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);
2653: current_space_lvl = free_space_lvl;
2655: for (k=0; k<am; k++) { /* for each active row k */
2656: /* initialize lnk by the column indices of row rip[k] */
2657: nzk = 0;
2658: ncols = ai[rip[k]+1] - ai[rip[k]];
2659: cols = aj+ai[rip[k]];
2660: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
2661: nzk += nlnk;
2663: /* update lnk by computing fill-in for each pivot row to be merged in */
2664: prow = jl[k]; /* 1st pivot row */
2666: while (prow < k) {
2667: nextprow = jl[prow];
2669: /* merge prow into k-th row */
2670: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2671: jmax = ui[prow+1];
2672: ncols = jmax-jmin;
2673: i = jmin - ui[prow];
2674: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2675: j = *(uj_lvl_ptr[prow] + i - 1);
2676: cols_lvl = uj_lvl_ptr[prow]+i;
2677: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
2678: nzk += nlnk;
2680: /* update il and jl for prow */
2681: if (jmin < jmax) {
2682: il[prow] = jmin;
2683: j = *cols;
2684: jl[prow] = jl[j];
2685: jl[j] = prow;
2686: }
2687: prow = nextprow;
2688: }
2690: /* if free space is not available, make more free space */
2691: if (current_space->local_remaining<nzk) {
2692: i = am - k + 1; /* num of unfactored rows */
2693: i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2694: PetscFreeSpaceGet(i,¤t_space);
2695: PetscFreeSpaceGet(i,¤t_space_lvl);
2696: reallocs++;
2697: }
2699: /* copy data into free_space and free_space_lvl, then initialize lnk */
2700: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
2702: /* add the k-th row into il and jl */
2703: if (nzk-1 > 0) {
2704: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2705: jl[k] = jl[i]; jl[i] = k;
2706: il[k] = ui[k] + 1;
2707: }
2708: uj_ptr[k] = current_space->array;
2709: uj_lvl_ptr[k] = current_space_lvl->array;
2711: current_space->array += nzk;
2712: current_space->local_used += nzk;
2713: current_space->local_remaining -= nzk;
2714: current_space_lvl->array += nzk;
2715: current_space_lvl->local_used += nzk;
2716: current_space_lvl->local_remaining -= nzk;
2718: ui[k+1] = ui[k] + nzk;
2719: }
2721: ISRestoreIndices(perm,&rip);
2722: PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);
2724: /* destroy list of free space and other temporary array(s) */
2725: PetscMalloc1(ui[am]+1,&uj);
2726: PetscFreeSpaceContiguous(&free_space,uj);
2727: PetscIncompleteLLDestroy(lnk,lnkbt);
2728: PetscFreeSpaceDestroy(free_space_lvl);
2729: if (ai[am] != 0) {
2730: ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2731: } else {
2732: ratio_needed = 0.0;
2733: }
2734: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2736: /* put together the new matrix in MATSEQSBAIJ format */
2737: MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
2739: b = (Mat_SeqSBAIJ*)(fact)->data;
2741: PetscFree2(b->imax,b->ilen);
2743: b->singlemalloc = PETSC_FALSE;
2744: b->free_a = PETSC_TRUE;
2745: b->free_ij = free_ij;
2747: PetscMalloc1(ui[am]+1,&b->a);
2749: b->j = uj;
2750: b->i = ui;
2751: b->diag = NULL;
2752: b->ilen = NULL;
2753: b->imax = NULL;
2754: b->row = perm;
2755: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2757: PetscObjectReference((PetscObject)perm);
2758: b->icol = perm;
2759: PetscObjectReference((PetscObject)perm);
2760: PetscMalloc1(am+1,&b->solve_work);
2762: b->maxnz = b->nz = ui[am];
2764: fact->info.factor_mallocs = reallocs;
2765: fact->info.fill_ratio_given = fill;
2766: fact->info.fill_ratio_needed = ratio_needed;
2767: #if defined(PETSC_USE_INFO)
2768: if (ai[am] != 0) {
2769: PetscReal af = fact->info.fill_ratio_needed;
2770: PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2771: PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2772: PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2773: } else {
2774: PetscInfo(A,"Empty matrix\n");
2775: }
2776: #endif
2777: if (perm_identity) {
2778: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2779: } else {
2780: fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2781: }
2782: return(0);
2783: }