Actual source code: blockinvert.h
1: /*
2: Kernels used in sparse ILU (and LU) and in the resulting triangular
3: solves. These are for block algorithms where the block sizes are on
4: the order of 2-6+.
6: There are TWO versions of the macros below.
7: 1) standard for MatScalar == PetscScalar use the standard BLAS for
8: block size larger than 7 and
9: 2) handcoded Fortran single precision for the matrices, since BLAS
10: does not have some arguments in single and some in double.
12: */
15: #include <petscblaslapack.h>
17: /*
18: These are C kernels,they are contained in
19: src/mat/impls/baij/seq
20: */
22: PETSC_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*,PetscBool,PetscBool*);
23: PETSC_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal,PetscBool,PetscBool*);
25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal,PetscBool,PetscBool*);
27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) 0; do { \
28: MatScalar d, di = mat[0]; \
29: mat[0] = d = 1.0 / di; \
30: mat[4] *= -d; \
31: mat[8] *= -d; \
32: mat[12] *= -d; \
33: mat[1] *= d; \
34: mat[2] *= d; \
35: mat[3] *= d; \
36: mat[5] += mat[4] * mat[1] * di; \
37: mat[6] += mat[4] * mat[2] * di; \
38: mat[7] += mat[4] * mat[3] * di; \
39: mat[9] += mat[8] * mat[1] * di; \
40: mat[10] += mat[8] * mat[2] * di; \
41: mat[11] += mat[8] * mat[3] * di; \
42: mat[13] += mat[12] * mat[1] * di; \
43: mat[14] += mat[12] * mat[2] * di; \
44: mat[15] += mat[12] * mat[3] * di; \
45: di = mat[5]; \
46: mat[5] = d = 1.0 / di; \
47: mat[1] *= -d; \
48: mat[9] *= -d; \
49: mat[13] *= -d; \
50: mat[4] *= d; \
51: mat[6] *= d; \
52: mat[7] *= d; \
53: mat[0] += mat[1] * mat[4] * di; \
54: mat[2] += mat[1] * mat[6] * di; \
55: mat[3] += mat[1] * mat[7] * di; \
56: mat[8] += mat[9] * mat[4] * di; \
57: mat[10] += mat[9] * mat[6] * di; \
58: mat[11] += mat[9] * mat[7] * di; \
59: mat[12] += mat[13] * mat[4] * di; \
60: mat[14] += mat[13] * mat[6] * di; \
61: mat[15] += mat[13] * mat[7] * di; \
62: di = mat[10]; \
63: mat[10] = d = 1.0 / di; \
64: mat[2] *= -d; \
65: mat[6] *= -d; \
66: mat[14] *= -d; \
67: mat[8] *= d; \
68: mat[9] *= d; \
69: mat[11] *= d; \
70: mat[0] += mat[2] * mat[8] * di; \
71: mat[1] += mat[2] * mat[9] * di; \
72: mat[3] += mat[2] * mat[11] * di; \
73: mat[4] += mat[6] * mat[8] * di; \
74: mat[5] += mat[6] * mat[9] * di; \
75: mat[7] += mat[6] * mat[11] * di; \
76: mat[12] += mat[14] * mat[8] * di; \
77: mat[13] += mat[14] * mat[9] * di; \
78: mat[15] += mat[14] * mat[11] * di; \
79: di = mat[15]; \
80: mat[15] = d = 1.0 / di; \
81: mat[3] *= -d; \
82: mat[7] *= -d; \
83: mat[11] *= -d; \
84: mat[12] *= d; \
85: mat[13] *= d; \
86: mat[14] *= d; \
87: mat[0] += mat[3] * mat[12] * di; \
88: mat[1] += mat[3] * mat[13] * di; \
89: mat[2] += mat[3] * mat[14] * di; \
90: mat[4] += mat[7] * mat[12] * di; \
91: mat[5] += mat[7] * mat[13] * di; \
92: mat[6] += mat[7] * mat[14] * di; \
93: mat[8] += mat[11] * mat[12] * di; \
94: mat[9] += mat[11] * mat[13] * di; \
95: mat[10] += mat[11] * mat[14] * di; \
96: } while (0)
98: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal,PetscBool,PetscBool*);
99: # if defined(PETSC_HAVE_SSE)
100: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
101: # endif
102: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal,PetscBool,PetscBool*);
104: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal,PetscBool,PetscBool*);
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal,PetscBool,PetscBool*);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
108: /*
109: A = inv(A) A_gets_inverse_A
111: A - square bs by bs array stored in column major order
112: pivots - integer work array of length bs
113: W - bs by 1 work array
114: */
115: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W,allowzeropivot,zeropivotdetected) (PetscLINPACKgefa((A),(bs),(pivots),(allowzeropivot),(zeropivotdetected)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))
117: /* -----------------------------------------------------------------------*/
119: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
120: /*
121: Version that calls the BLAS directly
122: */
123: /*
124: A = A * B A_gets_A_times_B
126: A, B - square bs by bs arrays stored in column major order
127: W - square bs by bs work array
129: */
130: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do { \
131: PetscBLASInt _bbs; \
132: PetscScalar _one = 1.0,_zero = 0.0; \
133: PetscErrorCode _ierr; \
134: _PetscBLASIntCast(bs,&_bbs); \
135: _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr); \
139: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
140: } while (0)
142: /*
144: A = A - B * C A_gets_A_minus_B_times_C
146: A, B, C - square bs by bs arrays stored in column major order
147: */
148: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) do { \
149: PetscScalar _mone = -1.0,_one = 1.0; \
150: PetscBLASInt _bbs; \
151: PetscErrorCode _ierr; \
152: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
156: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
157: } while (0)
159: /*
161: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
163: A, B, C - square bs by bs arrays stored in column major order
164: */
165: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) do { \
166: PetscScalar _one = 1.0; \
167: PetscBLASInt _bbs; \
168: PetscErrorCode _ierr; \
169: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
173: PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
174: } while (0)
176: /*
177: v = v + A^T w v_gets_v_plus_Atranspose_times_w
179: v - array of length bs
180: A - square bs by bs array
181: w - array of length bs
182: */
183: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) do { \
184: PetscScalar _one = 1.0; \
185: PetscBLASInt _ione = 1, _bbs; \
186: PetscErrorCode _ierr; \
187: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
191: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
192: } while (0)
194: /*
195: v = v - A w v_gets_v_minus_A_times_w
197: v - array of length bs
198: A - square bs by bs array
199: w - array of length bs
200: */
201: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) do { \
202: PetscScalar _mone = -1.0,_one = 1.0; \
203: PetscBLASInt _ione = 1,_bbs; \
204: PetscErrorCode _ierr; \
205: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
209: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
210: } while (0)
212: /*
213: v = v - A w v_gets_v_minus_transA_times_w
215: v - array of length bs
216: A - square bs by bs array
217: w - array of length bs
218: */
219: #define PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) do { \
220: PetscScalar _mone = -1.0,_one = 1.0; \
221: PetscBLASInt _ione = 1,_bbs; \
222: PetscErrorCode _ierr; \
223: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
227: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
228: } while (0)
230: /*
231: v = v + A w v_gets_v_plus_A_times_w
233: v - array of length bs
234: A - square bs by bs array
235: w - array of length bs
236: */
237: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) do { \
238: PetscScalar _one = 1.0; \
239: PetscBLASInt _ione = 1,_bbs; \
240: PetscErrorCode _ierr; \
241: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
245: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
246: } while (0)
248: /*
249: v = v + A w v_gets_v_plus_Ar_times_w
251: v - array of length bs
252: A - square bs by bs array
253: w - array of length bs
254: */
255: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) do { \
256: PetscScalar _one = 1.0; \
257: PetscBLASInt _ione = 1,_bbs,_bncols; \
258: PetscErrorCode _ierr; \
259: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
260: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
264: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
265: } while (0)
267: /*
268: w <- w - A v w_gets_w_minus_Ar_times_v
270: v - array of length ncol
271: A(bs,ncols)
272: w - array of length bs
273: */
274: #define PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) do { \
275: PetscScalar _one = 1.0,_mone = -1.0; \
276: PetscBLASInt _ione = 1,_bbs,_bncols; \
277: PetscErrorCode _ierr; \
278: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
279: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
283: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
284: } while (0)
286: /*
287: w = A v w_gets_A_times_v
289: v - array of length bs
290: A - square bs by bs array
291: w - array of length bs
292: */
293: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) do { \
294: PetscScalar _zero = 0.0,_one = 1.0; \
295: PetscBLASInt _ione = 1,_bbs; \
296: PetscErrorCode _ierr; \
297: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
301: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
302: } while (0)
304: /*
305: w = A' v w_gets_transA_times_v
307: v - array of length bs
308: A - square bs by bs array
309: w - array of length bs
310: */
311: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w) do { \
312: PetscScalar _zero = 0.0,_one = 1.0; \
313: PetscBLASInt _ione = 1,_bbs; \
314: PetscErrorCode _ierr; \
315: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
319: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
320: } while (0)
322: /*
323: z = A*x
324: */
325: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) do { \
326: PetscScalar _one = 1.0,_zero = 0.0; \
327: PetscBLASInt _ione = 1,_bbs,_bncols; \
328: PetscErrorCode _ierr; \
329: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
330: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
334: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
335: } while (0)
337: /*
338: z = trans(A)*x
339: */
340: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) do { \
341: PetscScalar _one = 1.0; \
342: PetscBLASInt _ione = 1,_bbs,_bncols; \
343: PetscErrorCode _ierr; \
344: _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr); \
345: _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr); \
349: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
350: } while (0)
352: #else /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
353: /*
354: Version that calls Fortran routines; can handle different precision
355: of matrix (array) and vectors
356: */
357: /*
358: These are Fortran kernels: They replace certain BLAS routines but
359: have some arguments that may be single precision,rather than double
360: These routines are provided in src/fortran/kernels/sgemv.F
361: They are pretty pitiful but get the job done. The intention is
362: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
363: code is used.
364: */
366: #if defined(PETSC_HAVE_FORTRAN_CAPS)
367: #define msgemv_ MSGEMV
368: #define msgemvp_ MSGEMVP
369: #define msgemvm_ MSGEMVM
370: #define msgemvt_ MSGEMVT
371: #define msgemmi_ MSGEMMI
372: #define msgemm_ MSGEMM
373: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
374: #define msgemv_ msgemv
375: #define msgemvp_ msgemvp
376: #define msgemvm_ msgemvm
377: #define msgemvt_ msgemvt
378: #define msgemmi_ msgemmi
379: #define msgemm_ msgemm
380: #endif
382: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
383: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
384: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
385: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
386: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
387: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
389: /*
390: A = A * B A_gets_A_times_B
392: A, B - square bs by bs arrays stored in column major order
393: W - square bs by bs work array
395: */
396: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do { \
397: PetscErrorCode _ierr; \
398: _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr); \
399: msgemmi_(&bs,A,B,W); \
400: } while (0)
402: /*
404: A = A - B * C A_gets_A_minus_B_times_C
406: A, B, C - square bs by bs arrays stored in column major order
407: */
408: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) msgemm_(&(bs),(A),(B),(C))
410: /*
411: v = v - A w v_gets_v_minus_A_times_w
413: v - array of length bs
414: A - square bs by bs array
415: w - array of length bs
416: */
417: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) msgemvm_(&(bs),&(bs),(A),(w),(v))
419: /*
420: v = v + A w v_gets_v_plus_A_times_w
422: v - array of length bs
423: A - square bs by bs array
424: w - array of length bs
425: */
426: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) msgemvp_(&(bs),&(bs),(A),(w),(v))
428: /*
429: v = v + A w v_gets_v_plus_Ar_times_w
431: v - array of length bs
432: A - square bs by bs array
433: w - array of length bs
434: */
435: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) msgemvp_(&(bs),&(ncol),(A),(v),(w))
437: /*
438: w = A v w_gets_A_times_v
440: v - array of length bs
441: A - square bs by bs array
442: w - array of length bs
443: */
444: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) msgemv_(&(bs),&(bs),(A),(v),(w))
446: /*
447: z = A*x
448: */
449: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) msgemv_(&(bs),&(ncols),(A),(x),(z))
451: /*
452: z = trans(A)*x
453: */
454: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) msgemvt_(&(bs),&(ncols),(A),(x),(z))
456: /* These do not work yet */
457: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
458: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
460: #endif /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
462: #endif /* __ILU_H */