Actual source code: baijfact12.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for BAIJ format.
5: */
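/*
    This file contains the numeric LU factorization kernels for block size 4
    with the natural ordering: a portable C version and, when PETSC_HAVE_SSE
    is defined, SSE versions (including variants that read the column indices
    as unsigned shorts).
*/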
6: #include "src/mat/impls/baij/seq/baij.h"
7: #include "src/inline/ilu.h"
11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
12: {
13: /*
14: Default version for 4-by-4 blocks using the natural ordering
15: */
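/*
   Outline, one block row i at a time:
     - zero the 16-scalar work blocks of rtmp for the nonzero pattern of row i of the factor
     - copy the blocks of row i of A into rtmp
     - for each pivot row `row' < i in the pattern: form the multiplier
       M = rtmp(row)*inv(U(row,row)) (that diagonal block was already inverted)
       and subtract M times the rest of factor row `row' from rtmp
     - copy rtmp back into the factor and invert the new diagonal block
*/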
16: Mat C = *B;
17: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
19: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
20: PetscInt *ajtmpold,*ajtmp,nz,row;
21: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
22: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
23: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
24: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
25: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
26: MatScalar m13,m14,m15,m16;
27: MatScalar *ba = b->a,*aa = a->a;
28: PetscTruth pivotinblocks = b->pivotinblocks;
31: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
33: for (i=0; i<n; i++) {
34: nz = bi[i+1] - bi[i];
35: ajtmp = bj + bi[i];
36: for (j=0; j<nz; j++) {
37: x = rtmp+16*ajtmp[j];
38: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
39: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
40: }
41: /* load in the initial (unfactored) row */
42: nz = ai[i+1] - ai[i];
43: ajtmpold = aj + ai[i];
44: v = aa + 16*ai[i];
45: for (j=0; j<nz; j++) {
46: x = rtmp+16*ajtmpold[j];
47: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
48: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
49: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
50: x[14] = v[14]; x[15] = v[15];
51: v += 16;
52: }
53: row = *ajtmp++;
54: while (row < i) {
55: pc = rtmp + 16*row;
56: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
57: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
58: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
59: p15 = pc[14]; p16 = pc[15];
60: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
61: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
62: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
63: || p16 != 0.0) {
64: pv = ba + 16*diag_offset[row];
65: pj = bj + diag_offset[row] + 1;
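/* pv points at the already-inverted diagonal block U(row,row); form the multiplier
   M = pc*pv and store it back into pc.  The 16 scalars of a block are stored
   column-major, so pc[0..3] is column 0, pc[4..7] is column 1, and so on. */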
66: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
67: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
68: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
69: x15 = pv[14]; x16 = pv[15];
70: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
71: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
72: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
73: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
75: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
76: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
77: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
78: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
80: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
81: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
82: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
83: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
85: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
86: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
87: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
88: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
89: nz = bi[row+1] - diag_offset[row] - 1;
90: pv += 16;
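/* Subtract M times each remaining block of factor row `row' from the
   corresponding work block of row i: rtmp(pj[j]) -= M*pv_block */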
91: for (j=0; j<nz; j++) {
92: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
93: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
94: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
95: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
96: x = rtmp + 16*pj[j];
97: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
98: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
99: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
100: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
102: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
103: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
104: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
105: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
107: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
108: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
109: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
110: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
112: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
113: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
114: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
115: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
117: pv += 16;
118: }
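/* 112 = 16*(4 mult + 3 add) flops to form the multiplier;
   128 = 16*(4 mult + 4 add/sub) flops per updated block */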
119: PetscLogFlops(128*nz+112);
120: }
121: row = *ajtmp++;
122: }
123: /* finished row so stick it into b->a */
124: pv = ba + 16*bi[i];
125: pj = bj + bi[i];
126: nz = bi[i+1] - bi[i];
127: for (j=0; j<nz; j++) {
128: x = rtmp+16*pj[j];
129: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
130: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
131: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
132: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
133: pv += 16;
134: }
135: /* invert diagonal block */
136: w = ba + 16*diag_offset[i];
137: if (pivotinblocks) {
138: Kernel_A_gets_inverse_A_4(w);
139: } else {
140: Kernel_A_gets_inverse_A_4_nopivot(w);
141: }
142: }
144: PetscFree(rtmp);
145: C->factor = FACTOR_LU;
146: C->assembled = PETSC_TRUE;
147: PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
148: return(0);
149: }
152: #if defined(PETSC_HAVE_SSE)
154: #include PETSC_HAVE_SSE
156: /* SSE version for 4-by-4 blocks using the natural ordering */
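/*
   These kernels use packed single-precision SSE operations (four floats per
   XMM register), so MatScalar is assumed to be a 4-byte float here; each 4x4
   block is then 64 bytes and must be 16-byte aligned, which is checked below.
*/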
159: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat A,MatFactorInfo *info,Mat *B)
160: {
161: Mat C = *B;
162: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
164: int i,j,n = a->mbs;
165: int *bj = b->j,*bjtmp,*pj;
166: int row;
167: int *ajtmpold,nz,*bi=b->i;
168: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
169: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
170: MatScalar *ba = b->a,*aa = a->a;
171: int nonzero=0;
172: /* int nonzero=0,colscale = 16; */
173: PetscTruth pivotinblocks = b->pivotinblocks;
176: SSE_SCOPE_BEGIN;
178: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
179: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
180: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
181: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
182: /* if ((unsigned long)bj==(unsigned long)aj) { */
183: /* colscale = 4; */
184: /* } */
185: for (i=0; i<n; i++) {
186: nz = bi[i+1] - bi[i];
187: bjtmp = bj + bi[i];
188: /* zero out the 4x4 block accumulators */
189: /* zero out one register */
190: XOR_PS(XMM7,XMM7);
191: for (j=0; j<nz; j++) {
192: x = rtmp+16*bjtmp[j];
193: /* x = rtmp+4*bjtmp[j]; */
194: SSE_INLINE_BEGIN_1(x)
195: /* Copy zero register to memory locations */
196: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
197: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
198: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
199: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
200: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
201: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
202: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
203: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
204: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
205: SSE_INLINE_END_1;
206: }
207: /* load in the initial (unfactored) row */
208: nz = ai[i+1] - ai[i];
209: ajtmpold = aj + ai[i];
210: v = aa + 16*ai[i];
211: for (j=0; j<nz; j++) {
212: x = rtmp+16*ajtmpold[j];
213: /* x = rtmp+colscale*ajtmpold[j]; */
214: /* Copy v block into x block */
215: SSE_INLINE_BEGIN_2(v,x)
216: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
217: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
218: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
220: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
221: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
223: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
224: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
226: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
227: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
229: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
230: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
232: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
233: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
235: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
236: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
238: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
239: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
240: SSE_INLINE_END_2;
242: v += 16;
243: }
244: /* row = (*bjtmp++)/4; */
245: row = *bjtmp++;
246: while (row < i) {
247: pc = rtmp + 16*row;
248: SSE_INLINE_BEGIN_1(pc)
249: /* Load block from lower triangle */
250: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
251: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
252: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
254: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
255: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
257: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
258: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
260: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
261: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
263: /* Compare block to zero block */
265: SSE_COPY_PS(XMM4,XMM7)
266: SSE_CMPNEQ_PS(XMM4,XMM0)
268: SSE_COPY_PS(XMM5,XMM7)
269: SSE_CMPNEQ_PS(XMM5,XMM1)
271: SSE_COPY_PS(XMM6,XMM7)
272: SSE_CMPNEQ_PS(XMM6,XMM2)
274: SSE_CMPNEQ_PS(XMM7,XMM3)
276: /* Reduce the comparisons to one SSE register */
277: SSE_OR_PS(XMM6,XMM7)
278: SSE_OR_PS(XMM5,XMM4)
279: SSE_OR_PS(XMM5,XMM6)
280: SSE_INLINE_END_1;
282: /* Reduce the one SSE register to an integer register for branching */
283: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
284: MOVEMASK(nonzero,XMM5);
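/* nonzero now holds the sign-bit mask of the combined comparisons: it is
   nonzero iff some entry of the 4x4 block differs from zero, the SSE
   equivalent of the scalar test p1 != 0.0 || ... || p16 != 0.0 */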
286: /* If block is nonzero ... */
287: if (nonzero) {
288: pv = ba + 16*diag_offset[row];
289: PREFETCH_L1(&pv[16]);
290: pj = bj + diag_offset[row] + 1;
292: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
293: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
294: /* but the diagonal was inverted already */
295: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
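/* XMM0-XMM3 hold the four columns of the block being eliminated; XMM4-XMM7 are
   scratch.  Each column of the multiplier is built by broadcasting one scalar of
   the inverted diagonal block (LOAD_SS followed by SHUFFLE 0x00) and accumulating
   the scaled columns. */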
297: SSE_INLINE_BEGIN_2(pv,pc)
298: /* Column 0, product is accumulated in XMM4 */
299: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
300: SSE_SHUFFLE(XMM4,XMM4,0x00)
301: SSE_MULT_PS(XMM4,XMM0)
303: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
304: SSE_SHUFFLE(XMM5,XMM5,0x00)
305: SSE_MULT_PS(XMM5,XMM1)
306: SSE_ADD_PS(XMM4,XMM5)
308: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
309: SSE_SHUFFLE(XMM6,XMM6,0x00)
310: SSE_MULT_PS(XMM6,XMM2)
311: SSE_ADD_PS(XMM4,XMM6)
313: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
314: SSE_SHUFFLE(XMM7,XMM7,0x00)
315: SSE_MULT_PS(XMM7,XMM3)
316: SSE_ADD_PS(XMM4,XMM7)
318: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
319: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
321: /* Column 1, product is accumulated in XMM5 */
322: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
323: SSE_SHUFFLE(XMM5,XMM5,0x00)
324: SSE_MULT_PS(XMM5,XMM0)
326: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
327: SSE_SHUFFLE(XMM6,XMM6,0x00)
328: SSE_MULT_PS(XMM6,XMM1)
329: SSE_ADD_PS(XMM5,XMM6)
331: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
332: SSE_SHUFFLE(XMM7,XMM7,0x00)
333: SSE_MULT_PS(XMM7,XMM2)
334: SSE_ADD_PS(XMM5,XMM7)
336: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
337: SSE_SHUFFLE(XMM6,XMM6,0x00)
338: SSE_MULT_PS(XMM6,XMM3)
339: SSE_ADD_PS(XMM5,XMM6)
341: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
342: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
344: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
346: /* Column 2, product is accumulated in XMM6 */
347: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
348: SSE_SHUFFLE(XMM6,XMM6,0x00)
349: SSE_MULT_PS(XMM6,XMM0)
351: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
352: SSE_SHUFFLE(XMM7,XMM7,0x00)
353: SSE_MULT_PS(XMM7,XMM1)
354: SSE_ADD_PS(XMM6,XMM7)
356: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
357: SSE_SHUFFLE(XMM7,XMM7,0x00)
358: SSE_MULT_PS(XMM7,XMM2)
359: SSE_ADD_PS(XMM6,XMM7)
361: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
362: SSE_SHUFFLE(XMM7,XMM7,0x00)
363: SSE_MULT_PS(XMM7,XMM3)
364: SSE_ADD_PS(XMM6,XMM7)
365:
366: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
367: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
369: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
370: /* Column 3, product is accumulated in XMM0 */
371: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
372: SSE_SHUFFLE(XMM7,XMM7,0x00)
373: SSE_MULT_PS(XMM0,XMM7)
375: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
376: SSE_SHUFFLE(XMM7,XMM7,0x00)
377: SSE_MULT_PS(XMM1,XMM7)
378: SSE_ADD_PS(XMM0,XMM1)
380: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
381: SSE_SHUFFLE(XMM1,XMM1,0x00)
382: SSE_MULT_PS(XMM1,XMM2)
383: SSE_ADD_PS(XMM0,XMM1)
385: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
386: SSE_SHUFFLE(XMM7,XMM7,0x00)
387: SSE_MULT_PS(XMM3,XMM7)
388: SSE_ADD_PS(XMM0,XMM3)
390: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
391: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
393: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
394: /* This is code to be maintained and read by humans, after all. */
395: /* Copy Multiplier Col 3 into XMM3 */
396: SSE_COPY_PS(XMM3,XMM0)
397: /* Copy Multiplier Col 2 into XMM2 */
398: SSE_COPY_PS(XMM2,XMM6)
399: /* Copy Multiplier Col 1 into XMM1 */
400: SSE_COPY_PS(XMM1,XMM5)
401: /* Copy Multiplier Col 0 into XMM0 */
402: SSE_COPY_PS(XMM0,XMM4)
403: SSE_INLINE_END_2;
405: /* Update the row: */
406: nz = bi[row+1] - diag_offset[row] - 1;
407: pv += 16;
408: for (j=0; j<nz; j++) {
409: PREFETCH_L1(&pv[16]);
410: x = rtmp + 16*pj[j];
411: /* x = rtmp + 4*pj[j]; */
413: /* X:=X-M*PV, One column at a time */
414: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
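/* Same broadcast pattern as in the multiplier: each scalar of the pv block is
   replicated across a register, multiplied by the matching column of M, and
   subtracted from the corresponding column of x */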
415: SSE_INLINE_BEGIN_2(x,pv)
416: /* Load first column of X */
417: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
418: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
420: /* Matrix-Vector Product: */
421: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
422: SSE_SHUFFLE(XMM5,XMM5,0x00)
423: SSE_MULT_PS(XMM5,XMM0)
424: SSE_SUB_PS(XMM4,XMM5)
426: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
427: SSE_SHUFFLE(XMM6,XMM6,0x00)
428: SSE_MULT_PS(XMM6,XMM1)
429: SSE_SUB_PS(XMM4,XMM6)
431: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
432: SSE_SHUFFLE(XMM7,XMM7,0x00)
433: SSE_MULT_PS(XMM7,XMM2)
434: SSE_SUB_PS(XMM4,XMM7)
436: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
437: SSE_SHUFFLE(XMM5,XMM5,0x00)
438: SSE_MULT_PS(XMM5,XMM3)
439: SSE_SUB_PS(XMM4,XMM5)
441: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
442: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
444: /* Second Column */
445: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
446: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
448: /* Matrix-Vector Product: */
449: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
450: SSE_SHUFFLE(XMM6,XMM6,0x00)
451: SSE_MULT_PS(XMM6,XMM0)
452: SSE_SUB_PS(XMM5,XMM6)
454: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
455: SSE_SHUFFLE(XMM7,XMM7,0x00)
456: SSE_MULT_PS(XMM7,XMM1)
457: SSE_SUB_PS(XMM5,XMM7)
459: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
460: SSE_SHUFFLE(XMM4,XMM4,0x00)
461: SSE_MULT_PS(XMM4,XMM2)
462: SSE_SUB_PS(XMM5,XMM4)
464: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
465: SSE_SHUFFLE(XMM6,XMM6,0x00)
466: SSE_MULT_PS(XMM6,XMM3)
467: SSE_SUB_PS(XMM5,XMM6)
468:
469: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
470: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
472: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
474: /* Third Column */
475: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
476: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
478: /* Matrix-Vector Product: */
479: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
480: SSE_SHUFFLE(XMM7,XMM7,0x00)
481: SSE_MULT_PS(XMM7,XMM0)
482: SSE_SUB_PS(XMM6,XMM7)
484: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
485: SSE_SHUFFLE(XMM4,XMM4,0x00)
486: SSE_MULT_PS(XMM4,XMM1)
487: SSE_SUB_PS(XMM6,XMM4)
489: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
490: SSE_SHUFFLE(XMM5,XMM5,0x00)
491: SSE_MULT_PS(XMM5,XMM2)
492: SSE_SUB_PS(XMM6,XMM5)
494: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
495: SSE_SHUFFLE(XMM7,XMM7,0x00)
496: SSE_MULT_PS(XMM7,XMM3)
497: SSE_SUB_PS(XMM6,XMM7)
498:
499: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
500: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
501:
502: /* Fourth Column */
503: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
504: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
506: /* Matrix-Vector Product: */
507: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
508: SSE_SHUFFLE(XMM5,XMM5,0x00)
509: SSE_MULT_PS(XMM5,XMM0)
510: SSE_SUB_PS(XMM4,XMM5)
512: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
513: SSE_SHUFFLE(XMM6,XMM6,0x00)
514: SSE_MULT_PS(XMM6,XMM1)
515: SSE_SUB_PS(XMM4,XMM6)
517: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
518: SSE_SHUFFLE(XMM7,XMM7,0x00)
519: SSE_MULT_PS(XMM7,XMM2)
520: SSE_SUB_PS(XMM4,XMM7)
522: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
523: SSE_SHUFFLE(XMM5,XMM5,0x00)
524: SSE_MULT_PS(XMM5,XMM3)
525: SSE_SUB_PS(XMM4,XMM5)
526:
527: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
528: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
529: SSE_INLINE_END_2;
530: pv += 16;
531: }
532: PetscLogFlops(128*nz+112);
533: }
534: row = *bjtmp++;
535: /* row = (*bjtmp++)/4; */
536: }
537: /* finished row so stick it into b->a */
538: pv = ba + 16*bi[i];
539: pj = bj + bi[i];
540: nz = bi[i+1] - bi[i];
542: /* Copy x block back into pv block */
543: for (j=0; j<nz; j++) {
544: x = rtmp+16*pj[j];
545: /* x = rtmp+4*pj[j]; */
547: SSE_INLINE_BEGIN_2(x,pv)
548: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
549: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
550: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
552: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
553: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
555: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
556: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
558: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
559: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
561: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
562: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
564: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
565: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
567: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
568: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
570: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
571: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
572: SSE_INLINE_END_2;
573: pv += 16;
574: }
575: /* invert diagonal block */
576: w = ba + 16*diag_offset[i];
577: if (pivotinblocks) {
578: Kernel_A_gets_inverse_A_4(w);
579: } else {
580: Kernel_A_gets_inverse_A_4_nopivot(w);
581: }
582: /* Kernel_A_gets_inverse_A_4_SSE(w); */
583: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
584: }
586: PetscFree(rtmp);
587: C->factor = FACTOR_LU;
588: C->assembled = PETSC_TRUE;
589: PetscLogFlops(1.3333*64*b->mbs);
590: /* Flop Count from inverting diagonal blocks */
591: SSE_SCOPE_END;
592: return(0);
593: }
597: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat *B)
598: {
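/* In-place variant: the matrix and its factor are the same Mat, and the column
   indices are read through unsigned short pointers ("usj" presumably stands for
   unsigned-short j), which assumes all block column indices fit in 16 bits. */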
599: Mat A=*B,C=*B;
600: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
602: int i,j,n = a->mbs;
603: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
604: unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
605: unsigned int row;
606: int nz,*bi=b->i;
607: int *diag_offset = b->diag,*ai=a->i;
608: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
609: MatScalar *ba = b->a,*aa = a->a;
610: int nonzero=0;
611: /* int nonzero=0,colscale = 16; */
612: PetscTruth pivotinblocks = b->pivotinblocks;
615: SSE_SCOPE_BEGIN;
617: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
618: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
619: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
620: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
621: /* if ((unsigned long)bj==(unsigned long)aj) { */
622: /* colscale = 4; */
623: /* } */
624:
625: for (i=0; i<n; i++) {
626: nz = bi[i+1] - bi[i];
627: bjtmp = bj + bi[i];
628: /* zero out the 4x4 block accumulators */
629: /* zero out one register */
630: XOR_PS(XMM7,XMM7);
631: for (j=0; j<nz; j++) {
632: x = rtmp+16*((unsigned int)bjtmp[j]);
633: /* x = rtmp+4*bjtmp[j]; */
634: SSE_INLINE_BEGIN_1(x)
635: /* Copy zero register to memory locations */
636: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
637: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
638: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
639: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
640: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
641: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
642: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
643: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
644: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
645: SSE_INLINE_END_1;
646: }
647: /* load in the initial (unfactored) row */
648: nz = ai[i+1] - ai[i];
649: ajtmp = aj + ai[i];
650: v = aa + 16*ai[i];
651: for (j=0; j<nz; j++) {
652: x = rtmp+16*((unsigned int)ajtmp[j]);
653: /* x = rtmp+colscale*ajtmp[j]; */
654: /* Copy v block into x block */
655: SSE_INLINE_BEGIN_2(v,x)
656: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
657: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
658: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
660: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
661: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
663: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
664: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
666: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
667: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
669: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
670: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
672: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
673: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
675: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
676: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
678: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
679: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
680: SSE_INLINE_END_2;
682: v += 16;
683: }
684: /* row = (*bjtmp++)/4; */
685: row = (unsigned int)(*bjtmp++);
686: while (row < i) {
687: pc = rtmp + 16*row;
688: SSE_INLINE_BEGIN_1(pc)
689: /* Load block from lower triangle */
690: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
691: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
692: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
694: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
695: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
697: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
698: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
700: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
701: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
703: /* Compare block to zero block */
705: SSE_COPY_PS(XMM4,XMM7)
706: SSE_CMPNEQ_PS(XMM4,XMM0)
708: SSE_COPY_PS(XMM5,XMM7)
709: SSE_CMPNEQ_PS(XMM5,XMM1)
711: SSE_COPY_PS(XMM6,XMM7)
712: SSE_CMPNEQ_PS(XMM6,XMM2)
714: SSE_CMPNEQ_PS(XMM7,XMM3)
716: /* Reduce the comparisons to one SSE register */
717: SSE_OR_PS(XMM6,XMM7)
718: SSE_OR_PS(XMM5,XMM4)
719: SSE_OR_PS(XMM5,XMM6)
720: SSE_INLINE_END_1;
722: /* Reduce the one SSE register to an integer register for branching */
723: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
724: MOVEMASK(nonzero,XMM5);
726: /* If block is nonzero ... */
727: if (nonzero) {
728: pv = ba + 16*diag_offset[row];
729: PREFETCH_L1(&pv[16]);
730: pj = bj + diag_offset[row] + 1;
732: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
733: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
734: /* but the diagonal was inverted already */
735: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
737: SSE_INLINE_BEGIN_2(pv,pc)
738: /* Column 0, product is accumulated in XMM4 */
739: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
740: SSE_SHUFFLE(XMM4,XMM4,0x00)
741: SSE_MULT_PS(XMM4,XMM0)
743: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
744: SSE_SHUFFLE(XMM5,XMM5,0x00)
745: SSE_MULT_PS(XMM5,XMM1)
746: SSE_ADD_PS(XMM4,XMM5)
748: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
749: SSE_SHUFFLE(XMM6,XMM6,0x00)
750: SSE_MULT_PS(XMM6,XMM2)
751: SSE_ADD_PS(XMM4,XMM6)
753: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
754: SSE_SHUFFLE(XMM7,XMM7,0x00)
755: SSE_MULT_PS(XMM7,XMM3)
756: SSE_ADD_PS(XMM4,XMM7)
758: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
759: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
761: /* Column 1, product is accumulated in XMM5 */
762: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
763: SSE_SHUFFLE(XMM5,XMM5,0x00)
764: SSE_MULT_PS(XMM5,XMM0)
766: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
767: SSE_SHUFFLE(XMM6,XMM6,0x00)
768: SSE_MULT_PS(XMM6,XMM1)
769: SSE_ADD_PS(XMM5,XMM6)
771: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
772: SSE_SHUFFLE(XMM7,XMM7,0x00)
773: SSE_MULT_PS(XMM7,XMM2)
774: SSE_ADD_PS(XMM5,XMM7)
776: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
777: SSE_SHUFFLE(XMM6,XMM6,0x00)
778: SSE_MULT_PS(XMM6,XMM3)
779: SSE_ADD_PS(XMM5,XMM6)
781: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
782: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
784: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
786: /* Column 2, product is accumulated in XMM6 */
787: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
788: SSE_SHUFFLE(XMM6,XMM6,0x00)
789: SSE_MULT_PS(XMM6,XMM0)
791: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
792: SSE_SHUFFLE(XMM7,XMM7,0x00)
793: SSE_MULT_PS(XMM7,XMM1)
794: SSE_ADD_PS(XMM6,XMM7)
796: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
797: SSE_SHUFFLE(XMM7,XMM7,0x00)
798: SSE_MULT_PS(XMM7,XMM2)
799: SSE_ADD_PS(XMM6,XMM7)
801: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
802: SSE_SHUFFLE(XMM7,XMM7,0x00)
803: SSE_MULT_PS(XMM7,XMM3)
804: SSE_ADD_PS(XMM6,XMM7)
805:
806: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
807: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
809: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
810: /* Column 3, product is accumulated in XMM0 */
811: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
812: SSE_SHUFFLE(XMM7,XMM7,0x00)
813: SSE_MULT_PS(XMM0,XMM7)
815: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
816: SSE_SHUFFLE(XMM7,XMM7,0x00)
817: SSE_MULT_PS(XMM1,XMM7)
818: SSE_ADD_PS(XMM0,XMM1)
820: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
821: SSE_SHUFFLE(XMM1,XMM1,0x00)
822: SSE_MULT_PS(XMM1,XMM2)
823: SSE_ADD_PS(XMM0,XMM1)
825: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
826: SSE_SHUFFLE(XMM7,XMM7,0x00)
827: SSE_MULT_PS(XMM3,XMM7)
828: SSE_ADD_PS(XMM0,XMM3)
830: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
831: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
833: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
834: /* This is code to be maintained and read by humans, after all. */
835: /* Copy Multiplier Col 3 into XMM3 */
836: SSE_COPY_PS(XMM3,XMM0)
837: /* Copy Multiplier Col 2 into XMM2 */
838: SSE_COPY_PS(XMM2,XMM6)
839: /* Copy Multiplier Col 1 into XMM1 */
840: SSE_COPY_PS(XMM1,XMM5)
841: /* Copy Multiplier Col 0 into XMM0 */
842: SSE_COPY_PS(XMM0,XMM4)
843: SSE_INLINE_END_2;
845: /* Update the row: */
846: nz = bi[row+1] - diag_offset[row] - 1;
847: pv += 16;
848: for (j=0; j<nz; j++) {
849: PREFETCH_L1(&pv[16]);
850: x = rtmp + 16*((unsigned int)pj[j]);
851: /* x = rtmp + 4*pj[j]; */
853: /* X:=X-M*PV, One column at a time */
854: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
855: SSE_INLINE_BEGIN_2(x,pv)
856: /* Load first column of X */
857: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
858: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
860: /* Matrix-Vector Product: */
861: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
862: SSE_SHUFFLE(XMM5,XMM5,0x00)
863: SSE_MULT_PS(XMM5,XMM0)
864: SSE_SUB_PS(XMM4,XMM5)
866: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
867: SSE_SHUFFLE(XMM6,XMM6,0x00)
868: SSE_MULT_PS(XMM6,XMM1)
869: SSE_SUB_PS(XMM4,XMM6)
871: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
872: SSE_SHUFFLE(XMM7,XMM7,0x00)
873: SSE_MULT_PS(XMM7,XMM2)
874: SSE_SUB_PS(XMM4,XMM7)
876: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
877: SSE_SHUFFLE(XMM5,XMM5,0x00)
878: SSE_MULT_PS(XMM5,XMM3)
879: SSE_SUB_PS(XMM4,XMM5)
881: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
882: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
884: /* Second Column */
885: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
886: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
888: /* Matrix-Vector Product: */
889: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
890: SSE_SHUFFLE(XMM6,XMM6,0x00)
891: SSE_MULT_PS(XMM6,XMM0)
892: SSE_SUB_PS(XMM5,XMM6)
894: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
895: SSE_SHUFFLE(XMM7,XMM7,0x00)
896: SSE_MULT_PS(XMM7,XMM1)
897: SSE_SUB_PS(XMM5,XMM7)
899: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
900: SSE_SHUFFLE(XMM4,XMM4,0x00)
901: SSE_MULT_PS(XMM4,XMM2)
902: SSE_SUB_PS(XMM5,XMM4)
904: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
905: SSE_SHUFFLE(XMM6,XMM6,0x00)
906: SSE_MULT_PS(XMM6,XMM3)
907: SSE_SUB_PS(XMM5,XMM6)
908:
909: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
910: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
912: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
914: /* Third Column */
915: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
916: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
918: /* Matrix-Vector Product: */
919: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
920: SSE_SHUFFLE(XMM7,XMM7,0x00)
921: SSE_MULT_PS(XMM7,XMM0)
922: SSE_SUB_PS(XMM6,XMM7)
924: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
925: SSE_SHUFFLE(XMM4,XMM4,0x00)
926: SSE_MULT_PS(XMM4,XMM1)
927: SSE_SUB_PS(XMM6,XMM4)
929: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
930: SSE_SHUFFLE(XMM5,XMM5,0x00)
931: SSE_MULT_PS(XMM5,XMM2)
932: SSE_SUB_PS(XMM6,XMM5)
934: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
935: SSE_SHUFFLE(XMM7,XMM7,0x00)
936: SSE_MULT_PS(XMM7,XMM3)
937: SSE_SUB_PS(XMM6,XMM7)
938:
939: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
940: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
941:
942: /* Fourth Column */
943: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
944: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
946: /* Matrix-Vector Product: */
947: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
948: SSE_SHUFFLE(XMM5,XMM5,0x00)
949: SSE_MULT_PS(XMM5,XMM0)
950: SSE_SUB_PS(XMM4,XMM5)
952: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
953: SSE_SHUFFLE(XMM6,XMM6,0x00)
954: SSE_MULT_PS(XMM6,XMM1)
955: SSE_SUB_PS(XMM4,XMM6)
957: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
958: SSE_SHUFFLE(XMM7,XMM7,0x00)
959: SSE_MULT_PS(XMM7,XMM2)
960: SSE_SUB_PS(XMM4,XMM7)
962: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
963: SSE_SHUFFLE(XMM5,XMM5,0x00)
964: SSE_MULT_PS(XMM5,XMM3)
965: SSE_SUB_PS(XMM4,XMM5)
966:
967: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
968: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
969: SSE_INLINE_END_2;
970: pv += 16;
971: }
972: PetscLogFlops(128*nz+112);
973: }
974: row = (unsigned int)(*bjtmp++);
975: /* row = (*bjtmp++)/4; */
976: /* bjtmp++; */
977: }
978: /* finished row so stick it into b->a */
979: pv = ba + 16*bi[i];
980: pj = bj + bi[i];
981: nz = bi[i+1] - bi[i];
983: /* Copy x block back into pv block */
984: for (j=0; j<nz; j++) {
985: x = rtmp+16*((unsigned int)pj[j]);
986: /* x = rtmp+4*pj[j]; */
988: SSE_INLINE_BEGIN_2(x,pv)
989: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
990: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
991: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
993: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
994: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
996: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
997: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
999: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1000: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1002: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1003: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1005: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1006: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1008: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1009: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1011: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1012: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1013: SSE_INLINE_END_2;
1014: pv += 16;
1015: }
1016: /* invert diagonal block */
1017: w = ba + 16*diag_offset[i];
1018: if (pivotinblocks) {
1019: Kernel_A_gets_inverse_A_4(w);
1020: } else {
1021: Kernel_A_gets_inverse_A_4_nopivot(w);
1022: }
1023: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1024: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1025: }
1027: PetscFree(rtmp);
1028: C->factor = FACTOR_LU;
1029: C->assembled = PETSC_TRUE;
1030: PetscLogFlops(1.3333*64*b->mbs);
1031: /* Flop Count from inverting diagonal blocks */
1032: SSE_SCOPE_END;
1033: return(0);
1034: }
1038: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A,MatFactorInfo *info,Mat *B)
1039: {
1040: Mat C = *B;
1041: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1043: int i,j,n = a->mbs;
1044: unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1045: unsigned int row;
1046: int *ajtmpold,nz,*bi=b->i;
1047: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1048: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1049: MatScalar *ba = b->a,*aa = a->a;
1050: int nonzero=0;
1051: /* int nonzero=0,colscale = 16; */
1052: PetscTruth pivotinblocks = b->pivotinblocks;
1055: SSE_SCOPE_BEGIN;
1057: if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
1058: if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
1059: PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1060: if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
1061: /* if ((unsigned long)bj==(unsigned long)aj) { */
1062: /* colscale = 4; */
1063: /* } */
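/* When the factor shares its column index array with A, the factorization is
   in place; hand off to the in-place variant above. */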
1064: if ((unsigned long)bj==(unsigned long)aj) {
1065: return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(B));
1066: }
1067:
1068: for (i=0; i<n; i++) {
1069: nz = bi[i+1] - bi[i];
1070: bjtmp = bj + bi[i];
1071: /* zero out the 4x4 block accumulators */
1072: /* zero out one register */
1073: XOR_PS(XMM7,XMM7);
1074: for (j=0; j<nz; j++) {
1075: x = rtmp+16*((unsigned int)bjtmp[j]);
1076: /* x = rtmp+4*bjtmp[j]; */
1077: SSE_INLINE_BEGIN_1(x)
1078: /* Copy zero register to memory locations */
1079: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1080: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1081: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1082: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1083: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1084: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1085: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1086: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1087: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1088: SSE_INLINE_END_1;
1089: }
1090: /* load in the initial (unfactored) row */
1091: nz = ai[i+1] - ai[i];
1092: ajtmpold = aj + ai[i];
1093: v = aa + 16*ai[i];
1094: for (j=0; j<nz; j++) {
1095: x = rtmp+16*ajtmpold[j];
1096: /* x = rtmp+colscale*ajtmpold[j]; */
1097: /* Copy v block into x block */
1098: SSE_INLINE_BEGIN_2(v,x)
1099: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1100: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1101: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1103: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1104: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1106: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1107: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1109: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1110: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1112: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1113: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1115: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1116: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1118: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1119: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1121: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1122: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1123: SSE_INLINE_END_2;
1125: v += 16;
1126: }
1127: /* row = (*bjtmp++)/4; */
1128: row = (unsigned int)(*bjtmp++);
1129: while (row < i) {
1130: pc = rtmp + 16*row;
1131: SSE_INLINE_BEGIN_1(pc)
1132: /* Load block from lower triangle */
1133: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1134: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1135: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1137: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1138: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1140: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1141: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1143: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1144: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1146: /* Compare block to zero block */
1148: SSE_COPY_PS(XMM4,XMM7)
1149: SSE_CMPNEQ_PS(XMM4,XMM0)
1151: SSE_COPY_PS(XMM5,XMM7)
1152: SSE_CMPNEQ_PS(XMM5,XMM1)
1154: SSE_COPY_PS(XMM6,XMM7)
1155: SSE_CMPNEQ_PS(XMM6,XMM2)
1157: SSE_CMPNEQ_PS(XMM7,XMM3)
1159: /* Reduce the comparisons to one SSE register */
1160: SSE_OR_PS(XMM6,XMM7)
1161: SSE_OR_PS(XMM5,XMM4)
1162: SSE_OR_PS(XMM5,XMM6)
1163: SSE_INLINE_END_1;
1165: /* Reduce the one SSE register to an integer register for branching */
1166: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1167: MOVEMASK(nonzero,XMM5);
1169: /* If block is nonzero ... */
1170: if (nonzero) {
1171: pv = ba + 16*diag_offset[row];
1172: PREFETCH_L1(&pv[16]);
1173: pj = bj + diag_offset[row] + 1;
1175: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1176: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1177: /* but the diagonal was inverted already */
1178: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1180: SSE_INLINE_BEGIN_2(pv,pc)
1181: /* Column 0, product is accumulated in XMM4 */
1182: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1183: SSE_SHUFFLE(XMM4,XMM4,0x00)
1184: SSE_MULT_PS(XMM4,XMM0)
1186: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1187: SSE_SHUFFLE(XMM5,XMM5,0x00)
1188: SSE_MULT_PS(XMM5,XMM1)
1189: SSE_ADD_PS(XMM4,XMM5)
1191: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1192: SSE_SHUFFLE(XMM6,XMM6,0x00)
1193: SSE_MULT_PS(XMM6,XMM2)
1194: SSE_ADD_PS(XMM4,XMM6)
1196: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1197: SSE_SHUFFLE(XMM7,XMM7,0x00)
1198: SSE_MULT_PS(XMM7,XMM3)
1199: SSE_ADD_PS(XMM4,XMM7)
1201: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1202: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1204: /* Column 1, product is accumulated in XMM5 */
1205: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1206: SSE_SHUFFLE(XMM5,XMM5,0x00)
1207: SSE_MULT_PS(XMM5,XMM0)
1209: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1210: SSE_SHUFFLE(XMM6,XMM6,0x00)
1211: SSE_MULT_PS(XMM6,XMM1)
1212: SSE_ADD_PS(XMM5,XMM6)
1214: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1215: SSE_SHUFFLE(XMM7,XMM7,0x00)
1216: SSE_MULT_PS(XMM7,XMM2)
1217: SSE_ADD_PS(XMM5,XMM7)
1219: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1220: SSE_SHUFFLE(XMM6,XMM6,0x00)
1221: SSE_MULT_PS(XMM6,XMM3)
1222: SSE_ADD_PS(XMM5,XMM6)
1224: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1225: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1227: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1229: /* Column 2, product is accumulated in XMM6 */
1230: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1231: SSE_SHUFFLE(XMM6,XMM6,0x00)
1232: SSE_MULT_PS(XMM6,XMM0)
1234: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1235: SSE_SHUFFLE(XMM7,XMM7,0x00)
1236: SSE_MULT_PS(XMM7,XMM1)
1237: SSE_ADD_PS(XMM6,XMM7)
1239: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1240: SSE_SHUFFLE(XMM7,XMM7,0x00)
1241: SSE_MULT_PS(XMM7,XMM2)
1242: SSE_ADD_PS(XMM6,XMM7)
1244: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1245: SSE_SHUFFLE(XMM7,XMM7,0x00)
1246: SSE_MULT_PS(XMM7,XMM3)
1247: SSE_ADD_PS(XMM6,XMM7)
1248:
1249: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1250: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1252: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1253: /* Column 3, product is accumulated in XMM0 */
1254: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1255: SSE_SHUFFLE(XMM7,XMM7,0x00)
1256: SSE_MULT_PS(XMM0,XMM7)
1258: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1259: SSE_SHUFFLE(XMM7,XMM7,0x00)
1260: SSE_MULT_PS(XMM1,XMM7)
1261: SSE_ADD_PS(XMM0,XMM1)
1263: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1264: SSE_SHUFFLE(XMM1,XMM1,0x00)
1265: SSE_MULT_PS(XMM1,XMM2)
1266: SSE_ADD_PS(XMM0,XMM1)
1268: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1269: SSE_SHUFFLE(XMM7,XMM7,0x00)
1270: SSE_MULT_PS(XMM3,XMM7)
1271: SSE_ADD_PS(XMM0,XMM3)
1273: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1274: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1276: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1277: /* This is code to be maintained and read by humans, after all. */
1278: /* Copy Multiplier Col 3 into XMM3 */
1279: SSE_COPY_PS(XMM3,XMM0)
1280: /* Copy Multiplier Col 2 into XMM2 */
1281: SSE_COPY_PS(XMM2,XMM6)
1282: /* Copy Multiplier Col 1 into XMM1 */
1283: SSE_COPY_PS(XMM1,XMM5)
1284: /* Copy Multiplier Col 0 into XMM0 */
1285: SSE_COPY_PS(XMM0,XMM4)
1286: SSE_INLINE_END_2;
1288: /* Update the row: */
1289: nz = bi[row+1] - diag_offset[row] - 1;
1290: pv += 16;
1291: for (j=0; j<nz; j++) {
1292: PREFETCH_L1(&pv[16]);
1293: x = rtmp + 16*((unsigned int)pj[j]);
1294: /* x = rtmp + 4*pj[j]; */
1296: /* X:=X-M*PV, One column at a time */
1297: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1298: SSE_INLINE_BEGIN_2(x,pv)
1299: /* Load first column of X */
1300: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1301: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1303: /* Matrix-Vector Product: */
1304: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1305: SSE_SHUFFLE(XMM5,XMM5,0x00)
1306: SSE_MULT_PS(XMM5,XMM0)
1307: SSE_SUB_PS(XMM4,XMM5)
1309: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1310: SSE_SHUFFLE(XMM6,XMM6,0x00)
1311: SSE_MULT_PS(XMM6,XMM1)
1312: SSE_SUB_PS(XMM4,XMM6)
1314: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1315: SSE_SHUFFLE(XMM7,XMM7,0x00)
1316: SSE_MULT_PS(XMM7,XMM2)
1317: SSE_SUB_PS(XMM4,XMM7)
1319: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1320: SSE_SHUFFLE(XMM5,XMM5,0x00)
1321: SSE_MULT_PS(XMM5,XMM3)
1322: SSE_SUB_PS(XMM4,XMM5)
1324: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1325: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1327: /* Second Column */
1328: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1329: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1331: /* Matrix-Vector Product: */
1332: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1333: SSE_SHUFFLE(XMM6,XMM6,0x00)
1334: SSE_MULT_PS(XMM6,XMM0)
1335: SSE_SUB_PS(XMM5,XMM6)
1337: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1338: SSE_SHUFFLE(XMM7,XMM7,0x00)
1339: SSE_MULT_PS(XMM7,XMM1)
1340: SSE_SUB_PS(XMM5,XMM7)
1342: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1343: SSE_SHUFFLE(XMM4,XMM4,0x00)
1344: SSE_MULT_PS(XMM4,XMM2)
1345: SSE_SUB_PS(XMM5,XMM4)
1347: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1348: SSE_SHUFFLE(XMM6,XMM6,0x00)
1349: SSE_MULT_PS(XMM6,XMM3)
1350: SSE_SUB_PS(XMM5,XMM6)
1351:
1352: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1353: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1355: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1357: /* Third Column */
1358: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1359: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1361: /* Matrix-Vector Product: */
1362: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1363: SSE_SHUFFLE(XMM7,XMM7,0x00)
1364: SSE_MULT_PS(XMM7,XMM0)
1365: SSE_SUB_PS(XMM6,XMM7)
1367: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1368: SSE_SHUFFLE(XMM4,XMM4,0x00)
1369: SSE_MULT_PS(XMM4,XMM1)
1370: SSE_SUB_PS(XMM6,XMM4)
1372: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1373: SSE_SHUFFLE(XMM5,XMM5,0x00)
1374: SSE_MULT_PS(XMM5,XMM2)
1375: SSE_SUB_PS(XMM6,XMM5)
1377: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1378: SSE_SHUFFLE(XMM7,XMM7,0x00)
1379: SSE_MULT_PS(XMM7,XMM3)
1380: SSE_SUB_PS(XMM6,XMM7)
1381:
1382: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1383: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1384:
1385: /* Fourth Column */
1386: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1387: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1389: /* Matrix-Vector Product: */
1390: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1391: SSE_SHUFFLE(XMM5,XMM5,0x00)
1392: SSE_MULT_PS(XMM5,XMM0)
1393: SSE_SUB_PS(XMM4,XMM5)
1395: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1396: SSE_SHUFFLE(XMM6,XMM6,0x00)
1397: SSE_MULT_PS(XMM6,XMM1)
1398: SSE_SUB_PS(XMM4,XMM6)
1400: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1401: SSE_SHUFFLE(XMM7,XMM7,0x00)
1402: SSE_MULT_PS(XMM7,XMM2)
1403: SSE_SUB_PS(XMM4,XMM7)
1405: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1406: SSE_SHUFFLE(XMM5,XMM5,0x00)
1407: SSE_MULT_PS(XMM5,XMM3)
1408: SSE_SUB_PS(XMM4,XMM5)
1409:
1410: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1411: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1412: SSE_INLINE_END_2;
1413: pv += 16;
1414: }
1415: PetscLogFlops(128*nz+112);
1416: }
1417: row = (unsigned int)(*bjtmp++);
1418: /* row = (*bjtmp++)/4; */
1419: /* bjtmp++; */
1420: }
1421: /* finished row so stick it into b->a */
1422: pv = ba + 16*bi[i];
1423: pj = bj + bi[i];
1424: nz = bi[i+1] - bi[i];
1426: /* Copy x block back into pv block */
1427: for (j=0; j<nz; j++) {
1428: x = rtmp+16*((unsigned int)pj[j]);
1429: /* x = rtmp+4*pj[j]; */
1431: SSE_INLINE_BEGIN_2(x,pv)
1432: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1433: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1434: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1436: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1437: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1439: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1440: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1442: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1443: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1445: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1446: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1448: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1449: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1451: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1452: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1454: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1455: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1456: SSE_INLINE_END_2;
1457: pv += 16;
1458: }
1459: /* invert diagonal block */
1460: w = ba + 16*diag_offset[i];
1461: if (pivotinblocks) {
1462: Kernel_A_gets_inverse_A_4(w);
1463: } else {
1464: Kernel_A_gets_inverse_A_4_nopivot(w);
1465: }
1466: /* Kernel_A_gets_inverse_A_4_SSE(w); */
1467: /* Note: Using Cramer's rule, the flop count below might be unfairly high or low. */
1468: }
1470: PetscFree(rtmp);
1471: C->factor = FACTOR_LU;
1472: C->assembled = PETSC_TRUE;
1473: PetscLogFlops(1.3333*64*b->mbs);
1474: /* Flop Count from inverting diagonal blocks */
1475: SSE_SCOPE_END;
1476: return(0);
1477: }
1479: #endif