Actual source code: baijfact11.c
1: /*
2: Factorization code for BAIJ format.
3: */
4: #include <../src/mat/impls/baij/seq/baij.h>
5: #include <petsc/private/kernels/blockinvert.h>
7: /*
8: Version for when blocks are 4 by 4
9: */
10: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
11: {
12: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13: IS isrow = b->row, isicol = b->icol;
14: const PetscInt *r, *ic;
15: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16: PetscInt *ajtmpold, *ajtmp, nz, row;
17: const PetscInt *diag_offset;
18: PetscInt idx, *ai = a->i, *aj = a->j, *pj;
19: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
20: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
22: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
23: MatScalar m13, m14, m15, m16;
24: MatScalar *ba = b->a, *aa = a->a;
25: PetscBool pivotinblocks = b->pivotinblocks;
26: PetscReal shift = info->shiftamount;
27: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
29: PetscFunctionBegin;
30: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
31: A->factortype = MAT_FACTOR_NONE;
32: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
33: A->factortype = MAT_FACTOR_ILU;
34: PetscCall(ISGetIndices(isrow, &r));
35: PetscCall(ISGetIndices(isicol, &ic));
36: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
37: allowzeropivot = PetscNot(A->erroriffailure);
39: for (i = 0; i < n; i++) {
40: nz = bi[i + 1] - bi[i];
41: ajtmp = bj + bi[i];
42: for (j = 0; j < nz; j++) {
43: x = rtmp + 16 * ajtmp[j];
44: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
45: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
46: }
47: /* load in initial (unfactored row) */
48: idx = r[i];
49: nz = ai[idx + 1] - ai[idx];
50: ajtmpold = aj + ai[idx];
51: v = aa + 16 * ai[idx];
52: for (j = 0; j < nz; j++) {
53: x = rtmp + 16 * ic[ajtmpold[j]];
54: x[0] = v[0];
55: x[1] = v[1];
56: x[2] = v[2];
57: x[3] = v[3];
58: x[4] = v[4];
59: x[5] = v[5];
60: x[6] = v[6];
61: x[7] = v[7];
62: x[8] = v[8];
63: x[9] = v[9];
64: x[10] = v[10];
65: x[11] = v[11];
66: x[12] = v[12];
67: x[13] = v[13];
68: x[14] = v[14];
69: x[15] = v[15];
70: v += 16;
71: }
72: row = *ajtmp++;
73: while (row < i) {
74: pc = rtmp + 16 * row;
75: p1 = pc[0];
76: p2 = pc[1];
77: p3 = pc[2];
78: p4 = pc[3];
79: p5 = pc[4];
80: p6 = pc[5];
81: p7 = pc[6];
82: p8 = pc[7];
83: p9 = pc[8];
84: p10 = pc[9];
85: p11 = pc[10];
86: p12 = pc[11];
87: p13 = pc[12];
88: p14 = pc[13];
89: p15 = pc[14];
90: p16 = pc[15];
91: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
92: pv = ba + 16 * diag_offset[row];
93: pj = bj + diag_offset[row] + 1;
94: x1 = pv[0];
95: x2 = pv[1];
96: x3 = pv[2];
97: x4 = pv[3];
98: x5 = pv[4];
99: x6 = pv[5];
100: x7 = pv[6];
101: x8 = pv[7];
102: x9 = pv[8];
103: x10 = pv[9];
104: x11 = pv[10];
105: x12 = pv[11];
106: x13 = pv[12];
107: x14 = pv[13];
108: x15 = pv[14];
109: x16 = pv[15];
110: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
111: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
112: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
113: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
115: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
116: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
117: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
118: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
120: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
121: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
122: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
123: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
125: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
126: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
127: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
128: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
130: nz = bi[row + 1] - diag_offset[row] - 1;
131: pv += 16;
132: for (j = 0; j < nz; j++) {
133: x1 = pv[0];
134: x2 = pv[1];
135: x3 = pv[2];
136: x4 = pv[3];
137: x5 = pv[4];
138: x6 = pv[5];
139: x7 = pv[6];
140: x8 = pv[7];
141: x9 = pv[8];
142: x10 = pv[9];
143: x11 = pv[10];
144: x12 = pv[11];
145: x13 = pv[12];
146: x14 = pv[13];
147: x15 = pv[14];
148: x16 = pv[15];
149: x = rtmp + 16 * pj[j];
150: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
151: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
152: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
153: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
155: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
156: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
157: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
158: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
160: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
161: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
162: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
163: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
165: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
166: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
167: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
168: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
170: pv += 16;
171: }
172: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
173: }
174: row = *ajtmp++;
175: }
176: /* finished row so stick it into b->a */
177: pv = ba + 16 * bi[i];
178: pj = bj + bi[i];
179: nz = bi[i + 1] - bi[i];
180: for (j = 0; j < nz; j++) {
181: x = rtmp + 16 * pj[j];
182: pv[0] = x[0];
183: pv[1] = x[1];
184: pv[2] = x[2];
185: pv[3] = x[3];
186: pv[4] = x[4];
187: pv[5] = x[5];
188: pv[6] = x[6];
189: pv[7] = x[7];
190: pv[8] = x[8];
191: pv[9] = x[9];
192: pv[10] = x[10];
193: pv[11] = x[11];
194: pv[12] = x[12];
195: pv[13] = x[13];
196: pv[14] = x[14];
197: pv[15] = x[15];
198: pv += 16;
199: }
200: /* invert diagonal block */
201: w = ba + 16 * diag_offset[i];
202: if (pivotinblocks) {
203: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
204: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
205: } else {
206: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
207: }
208: }
210: PetscCall(PetscFree(rtmp));
211: PetscCall(ISRestoreIndices(isicol, &ic));
212: PetscCall(ISRestoreIndices(isrow, &r));
214: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
215: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
216: C->assembled = PETSC_TRUE;
218: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
219: PetscFunctionReturn(PETSC_SUCCESS);
220: }
222: /* MatLUFactorNumeric_SeqBAIJ_4 -
223: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
224: PetscKernel_A_gets_A_times_B()
225: PetscKernel_A_gets_A_minus_B_times_C()
226: PetscKernel_A_gets_inverse_A()
227: */
229: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
230: {
231: Mat C = B;
232: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
233: IS isrow = b->row, isicol = b->icol;
234: const PetscInt *r, *ic;
235: PetscInt i, j, k, nz, nzL, row;
236: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
237: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
238: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
239: PetscInt flg;
240: PetscReal shift;
241: PetscBool allowzeropivot, zeropivotdetected;
243: PetscFunctionBegin;
244: allowzeropivot = PetscNot(A->erroriffailure);
245: PetscCall(ISGetIndices(isrow, &r));
246: PetscCall(ISGetIndices(isicol, &ic));
248: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
249: shift = 0;
250: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
251: shift = info->shiftamount;
252: }
254: /* generate work space needed by the factorization */
255: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
256: PetscCall(PetscArrayzero(rtmp, bs2 * n));
258: for (i = 0; i < n; i++) {
259: /* zero rtmp */
260: /* L part */
261: nz = bi[i + 1] - bi[i];
262: bjtmp = bj + bi[i];
263: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
265: /* U part */
266: nz = bdiag[i] - bdiag[i + 1];
267: bjtmp = bj + bdiag[i + 1] + 1;
268: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
270: /* load in initial (unfactored row) */
271: nz = ai[r[i] + 1] - ai[r[i]];
272: ajtmp = aj + ai[r[i]];
273: v = aa + bs2 * ai[r[i]];
274: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
276: /* elimination */
277: bjtmp = bj + bi[i];
278: nzL = bi[i + 1] - bi[i];
279: for (k = 0; k < nzL; k++) {
280: row = bjtmp[k];
281: pc = rtmp + bs2 * row;
282: for (flg = 0, j = 0; j < bs2; j++) {
283: if (pc[j] != 0.0) {
284: flg = 1;
285: break;
286: }
287: }
288: if (flg) {
289: pv = b->a + bs2 * bdiag[row];
290: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
291: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
293: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
294: pv = b->a + bs2 * (bdiag[row + 1] + 1);
295: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
296: for (j = 0; j < nz; j++) {
297: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
298: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
299: v = rtmp + bs2 * pj[j];
300: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
301: pv += bs2;
302: }
303: PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
304: }
305: }
307: /* finished row so stick it into b->a */
308: /* L part */
309: pv = b->a + bs2 * bi[i];
310: pj = b->j + bi[i];
311: nz = bi[i + 1] - bi[i];
312: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
314: /* Mark diagonal and invert diagonal for simpler triangular solves */
315: pv = b->a + bs2 * bdiag[i];
316: pj = b->j + bdiag[i];
317: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
318: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
319: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321: /* U part */
322: pv = b->a + bs2 * (bdiag[i + 1] + 1);
323: pj = b->j + bdiag[i + 1] + 1;
324: nz = bdiag[i] - bdiag[i + 1] - 1;
325: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
326: }
328: PetscCall(PetscFree2(rtmp, mwork));
329: PetscCall(ISRestoreIndices(isicol, &ic));
330: PetscCall(ISRestoreIndices(isrow, &r));
332: C->ops->solve = MatSolve_SeqBAIJ_4;
333: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
334: C->assembled = PETSC_TRUE;
336: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
337: PetscFunctionReturn(PETSC_SUCCESS);
338: }
340: PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
341: {
342: /*
343: Default Version for when blocks are 4 by 4 Using natural ordering
344: */
345: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
346: PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
347: PetscInt *ajtmpold, *ajtmp, nz, row;
348: const PetscInt *diag_offset;
349: PetscInt *ai = a->i, *aj = a->j, *pj;
350: MatScalar *pv, *v, *rtmp, *pc, *w, *x;
351: MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
352: MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
353: MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
354: MatScalar m13, m14, m15, m16;
355: MatScalar *ba = b->a, *aa = a->a;
356: PetscBool pivotinblocks = b->pivotinblocks;
357: PetscReal shift = info->shiftamount;
358: PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
360: PetscFunctionBegin;
361: /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
362: A->factortype = MAT_FACTOR_NONE;
363: PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
364: A->factortype = MAT_FACTOR_ILU;
365: allowzeropivot = PetscNot(A->erroriffailure);
366: PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
368: for (i = 0; i < n; i++) {
369: nz = bi[i + 1] - bi[i];
370: ajtmp = bj + bi[i];
371: for (j = 0; j < nz; j++) {
372: x = rtmp + 16 * ajtmp[j];
373: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
374: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
375: }
376: /* load in initial (unfactored row) */
377: nz = ai[i + 1] - ai[i];
378: ajtmpold = aj + ai[i];
379: v = aa + 16 * ai[i];
380: for (j = 0; j < nz; j++) {
381: x = rtmp + 16 * ajtmpold[j];
382: x[0] = v[0];
383: x[1] = v[1];
384: x[2] = v[2];
385: x[3] = v[3];
386: x[4] = v[4];
387: x[5] = v[5];
388: x[6] = v[6];
389: x[7] = v[7];
390: x[8] = v[8];
391: x[9] = v[9];
392: x[10] = v[10];
393: x[11] = v[11];
394: x[12] = v[12];
395: x[13] = v[13];
396: x[14] = v[14];
397: x[15] = v[15];
398: v += 16;
399: }
400: row = *ajtmp++;
401: while (row < i) {
402: pc = rtmp + 16 * row;
403: p1 = pc[0];
404: p2 = pc[1];
405: p3 = pc[2];
406: p4 = pc[3];
407: p5 = pc[4];
408: p6 = pc[5];
409: p7 = pc[6];
410: p8 = pc[7];
411: p9 = pc[8];
412: p10 = pc[9];
413: p11 = pc[10];
414: p12 = pc[11];
415: p13 = pc[12];
416: p14 = pc[13];
417: p15 = pc[14];
418: p16 = pc[15];
419: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
420: pv = ba + 16 * diag_offset[row];
421: pj = bj + diag_offset[row] + 1;
422: x1 = pv[0];
423: x2 = pv[1];
424: x3 = pv[2];
425: x4 = pv[3];
426: x5 = pv[4];
427: x6 = pv[5];
428: x7 = pv[6];
429: x8 = pv[7];
430: x9 = pv[8];
431: x10 = pv[9];
432: x11 = pv[10];
433: x12 = pv[11];
434: x13 = pv[12];
435: x14 = pv[13];
436: x15 = pv[14];
437: x16 = pv[15];
438: pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
439: pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
440: pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
441: pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
443: pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
444: pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
445: pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
446: pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
448: pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
449: pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
450: pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
451: pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
453: pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
454: pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
455: pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
456: pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
457: nz = bi[row + 1] - diag_offset[row] - 1;
458: pv += 16;
459: for (j = 0; j < nz; j++) {
460: x1 = pv[0];
461: x2 = pv[1];
462: x3 = pv[2];
463: x4 = pv[3];
464: x5 = pv[4];
465: x6 = pv[5];
466: x7 = pv[6];
467: x8 = pv[7];
468: x9 = pv[8];
469: x10 = pv[9];
470: x11 = pv[10];
471: x12 = pv[11];
472: x13 = pv[12];
473: x14 = pv[13];
474: x15 = pv[14];
475: x16 = pv[15];
476: x = rtmp + 16 * pj[j];
477: x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
478: x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
479: x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
480: x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
482: x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
483: x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
484: x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
485: x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
487: x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
488: x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
489: x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
490: x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
492: x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
493: x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
494: x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
495: x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
497: pv += 16;
498: }
499: PetscCall(PetscLogFlops(128.0 * nz + 112.0));
500: }
501: row = *ajtmp++;
502: }
503: /* finished row so stick it into b->a */
504: pv = ba + 16 * bi[i];
505: pj = bj + bi[i];
506: nz = bi[i + 1] - bi[i];
507: for (j = 0; j < nz; j++) {
508: x = rtmp + 16 * pj[j];
509: pv[0] = x[0];
510: pv[1] = x[1];
511: pv[2] = x[2];
512: pv[3] = x[3];
513: pv[4] = x[4];
514: pv[5] = x[5];
515: pv[6] = x[6];
516: pv[7] = x[7];
517: pv[8] = x[8];
518: pv[9] = x[9];
519: pv[10] = x[10];
520: pv[11] = x[11];
521: pv[12] = x[12];
522: pv[13] = x[13];
523: pv[14] = x[14];
524: pv[15] = x[15];
525: pv += 16;
526: }
527: /* invert diagonal block */
528: w = ba + 16 * diag_offset[i];
529: if (pivotinblocks) {
530: PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
531: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
532: } else {
533: PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
534: }
535: }
537: PetscCall(PetscFree(rtmp));
539: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
540: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
541: C->assembled = PETSC_TRUE;
543: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
544: PetscFunctionReturn(PETSC_SUCCESS);
545: }
547: /*
548: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
549: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
550: */
551: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
552: {
553: Mat C = B;
554: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
555: PetscInt i, j, k, nz, nzL, row;
556: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
557: const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
558: MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
559: PetscInt flg;
560: PetscReal shift;
561: PetscBool allowzeropivot, zeropivotdetected;
563: PetscFunctionBegin;
564: allowzeropivot = PetscNot(A->erroriffailure);
566: /* generate work space needed by the factorization */
567: PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
568: PetscCall(PetscArrayzero(rtmp, bs2 * n));
570: if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
571: shift = 0;
572: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
573: shift = info->shiftamount;
574: }
576: for (i = 0; i < n; i++) {
577: /* zero rtmp */
578: /* L part */
579: nz = bi[i + 1] - bi[i];
580: bjtmp = bj + bi[i];
581: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
583: /* U part */
584: nz = bdiag[i] - bdiag[i + 1];
585: bjtmp = bj + bdiag[i + 1] + 1;
586: for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
588: /* load in initial (unfactored row) */
589: nz = ai[i + 1] - ai[i];
590: ajtmp = aj + ai[i];
591: v = aa + bs2 * ai[i];
592: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
594: /* elimination */
595: bjtmp = bj + bi[i];
596: nzL = bi[i + 1] - bi[i];
597: for (k = 0; k < nzL; k++) {
598: row = bjtmp[k];
599: pc = rtmp + bs2 * row;
600: for (flg = 0, j = 0; j < bs2; j++) {
601: if (pc[j] != 0.0) {
602: flg = 1;
603: break;
604: }
605: }
606: if (flg) {
607: pv = b->a + bs2 * bdiag[row];
608: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
609: PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
611: pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
612: pv = b->a + bs2 * (bdiag[row + 1] + 1);
613: nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
614: for (j = 0; j < nz; j++) {
615: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
616: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
617: v = rtmp + bs2 * pj[j];
618: PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
619: pv += bs2;
620: }
621: PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
622: }
623: }
625: /* finished row so stick it into b->a */
626: /* L part */
627: pv = b->a + bs2 * bi[i];
628: pj = b->j + bi[i];
629: nz = bi[i + 1] - bi[i];
630: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
632: /* Mark diagonal and invert diagonal for simpler triangular solves */
633: pv = b->a + bs2 * bdiag[i];
634: pj = b->j + bdiag[i];
635: PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
636: PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
637: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
639: /* U part */
640: pv = b->a + bs2 * (bdiag[i + 1] + 1);
641: pj = b->j + bdiag[i + 1] + 1;
642: nz = bdiag[i] - bdiag[i + 1] - 1;
643: for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
644: }
645: PetscCall(PetscFree2(rtmp, mwork));
647: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
648: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
649: C->assembled = PETSC_TRUE;
651: PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
652: PetscFunctionReturn(PETSC_SUCCESS);
653: }