Actual source code: dalocalizationletkf.kokkos.cxx

  1: #include <petsc.h>
  2: #include <petscmat.h>
  3: #include <petsc_kokkos.hpp>

  5: typedef struct {
  6:   PetscReal distance;
  7:   PetscInt  obs_index;
  8: } DistObsPair;

 10: KOKKOS_INLINE_FUNCTION
 11: static PetscReal GaspariCohn(PetscReal distance, PetscReal radius)
 12: {
 13:   if (radius <= 0.0) return 0.0;
 14:   const PetscReal r = distance / radius;

 16:   if (r >= 2.0) return 0.0;

 18:   const PetscReal r2 = r * r;
 19:   const PetscReal r3 = r2 * r;
 20:   const PetscReal r4 = r3 * r;
 21:   const PetscReal r5 = r4 * r;

 23:   if (r <= 1.0) {
 24:     // Region [0, 1]
 25:     return -0.25 * r5 + 0.5 * r4 + 0.625 * r3 - (5.0 / 3.0) * r2 + 1.0;
 26:   } else {
 27:     // Region [1, 2]
 28:     return (1.0 / 12.0) * r5 - 0.5 * r4 + 0.625 * r3 + (5.0 / 3.0) * r2 - 5.0 * r + 4.0 - (2.0 / 3.0) / r;
 29:   }
 30: }

 32: #define RADIUS_FACTOR 1.1

 34: template <class ViewType>
 35: struct RadiusStatsFunctor {
 36:   ViewType best_dists;
 37:   PetscInt n_obs_vertex;

 39:   struct value_type {
 40:     double sum, sq_sum;
 41:   };

 43:   KOKKOS_INLINE_FUNCTION void operator()(const PetscInt i, value_type &update) const
 44:   {
 45:     PetscReal r2 = best_dists(i, n_obs_vertex - 1);
 46:     PetscReal r  = std::sqrt(r2);
 47:     r *= RADIUS_FACTOR;
 48:     if (r == 0.0) r = 1.0;
 49:     update.sum += r;
 50:     update.sq_sum += r * r;
 51:   }

 53:   KOKKOS_INLINE_FUNCTION void init(value_type &update) const
 54:   {
 55:     update.sum    = 0.0;
 56:     update.sq_sum = 0.0;
 57:   }

 59:   KOKKOS_INLINE_FUNCTION void join(value_type &dest, const value_type &src) const
 60:   {
 61:     dest.sum += src.sum;
 62:     dest.sq_sum += src.sq_sum;
 63:   }
 64: };

 66: /*@
 67:   PetscDALETKFGetLocalizationMatrix - Compute localization weight matrix for LETKF [move to ml/da/interface]

 69:   Collective

 71:   Input Parameters:
 72: + n_obs_vertex - Number of observations to localize to per vertex
 73: . n_dof        - Number of degrees of freedom
 74: . Vecxyz       - Array of vectors containing the vertex coordinates
 75: . bd           - Array of boundary extents per dimension (used for periodicity)
 76: - H            - Observation operator matrix

 78:   Output Parameter:
 79: . Q - Localization weight matrix (sparse, AIJ format)

 81:   Level: intermediate

 83:   Notes:
 84:   The output matrix Q has dimensions (n_vert_global x n_obs_global) where
 85:   n_vert_global is the number of vertices in the DMPlex. Each row contains
 86:   exactly n_obs_vertex non-zero entries corresponding to the nearest
 87:   observations, weighted by the Gaspari-Cohn fifth-order piecewise
 88:   rational function.

 90:   The observation locations are computed as H * V where V is the vector
 91:   of vertex coordinates. The localization weights ensure smooth tapering
 92:   of observation influence with distance.

 94:   Kokkos is required for this routine.

 96: .seealso: [](ch_da), `PetscDALETKFSetLocalization()`
 97: @*/
 98: PetscErrorCode PetscDALETKFGetLocalizationMatrix(const PetscInt n_obs_vertex, const PetscInt n_dof, Vec Vecxyz[3], PetscReal bd[3], Mat H, Mat *Q)
 99: {
100:   PetscInt dim = 0, n_vert_local, d, n_obs_global, n_obs_local;
101:   Vec     *obs_vecs;
102:   MPI_Comm comm;

104:   PetscFunctionBegin;
106:   PetscAssertPointer(Q, 6);

108:   PetscCall(PetscKokkosInitializeCheck());
109:   PetscCall(PetscObjectGetComm((PetscObject)H, &comm));
110:   PetscCall(MatGetLocalSize(H, &n_obs_local, NULL));
111:   PetscCall(MatGetSize(H, &n_obs_global, NULL));
112:   /* Infer dim from the number of vectors in Vecxyz */
113:   for (d = 0; d < 3; ++d) {
114:     if (Vecxyz[d]) dim++;
115:     else break;
116:   }
117:   PetscCall(VecGetLocalSize(Vecxyz[0], &n_vert_local));

119:   /* Check H dimensions */
120:   // If n_obs_global < n_obs_vertex, we will pad with -1 indices and 0.0 weights. ???
121:   // This is not an error condition, but rather a case where we have fewer observations than requested neighbors.
122:   PetscCheck(dim > 0, comm, PETSC_ERR_ARG_WRONG, "Dim must be > 0");
123:   PetscCheck(n_obs_vertex > 0 && n_obs_vertex <= n_obs_global, comm, PETSC_ERR_ARG_WRONG, "n_obs_vertex must be > 0 and <= n_obs_global");

125:   /* Allocate storage for observation locations */
126:   PetscCall(PetscMalloc1(dim, &obs_vecs));

128:   /* Compute observation locations per dimension */
129:   for (d = 0; d < dim; ++d) {
130:     PetscCall(MatCreateVecs(H, NULL, &obs_vecs[d]));
131:     PetscCall(MatMult(H, Vecxyz[d], obs_vecs[d]));
132:   }

134:   /* Create output matrix Q in N/n_dof x P */
135:   PetscCall(MatCreate(comm, Q));
136:   PetscCall(MatSetSizes(*Q, n_vert_local, n_obs_local, PETSC_DETERMINE, n_obs_global));
137:   PetscCall(MatSetType(*Q, MATAIJ));
138:   PetscCall(MatSeqAIJSetPreallocation(*Q, n_obs_vertex, NULL));
139:   PetscCall(MatMPIAIJSetPreallocation(*Q, n_obs_vertex, NULL, n_obs_vertex, NULL));
140:   PetscCall(MatSetFromOptions(*Q));
141:   PetscCall(MatSetUp(*Q));

143:   PetscCall(PetscInfo((PetscObject)*Q, "Computing LETKF localization matrix: %" PetscInt_FMT " vertices, %" PetscInt_FMT " observations, %" PetscInt_FMT " observations per vertex\n", n_vert_local, n_obs_global, n_obs_vertex));

145:   /* Prepare Kokkos Views */
146:   using ExecSpace = Kokkos::DefaultExecutionSpace;
147:   using MemSpace  = ExecSpace::memory_space;

149:   /* Vertex Coordinates */
150:   // Use LayoutLeft for coalesced access on GPU (i is contiguous)
151:   Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, MemSpace> vertex_coords_dev("vertex_coords", n_vert_local, dim);
152:   {
153:     // Host view must match the data layout from VecGetArray (d-major, i-minor implies LayoutLeft for (i,d) view)
154:     Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, Kokkos::HostSpace> vertex_coords_host("vertex_coords_host", n_vert_local, dim);
155:     for (d = 0; d < dim; ++d) {
156:       const PetscScalar *local_coords_array;
157:       PetscCall(VecGetArrayRead(Vecxyz[d], &local_coords_array));
158:       // Copy data. Since vertex_coords_host is LayoutLeft, &vertex_coords_host(0, d) is the start of column d.
159:       for (PetscInt i = 0; i < n_vert_local; ++i) vertex_coords_host(i, d) = local_coords_array[i];
160:       PetscCall(VecRestoreArrayRead(Vecxyz[d], &local_coords_array));
161:     }
162:     Kokkos::deep_copy(vertex_coords_dev, vertex_coords_host);
163:   }

165:   /* Observation Coordinates */
166:   Kokkos::View<PetscReal **, Kokkos::LayoutRight, MemSpace> obs_coords_dev("obs_coords", n_obs_global, dim);
167:   {
168:     Kokkos::View<PetscReal **, Kokkos::LayoutRight, Kokkos::HostSpace> obs_coords_host("obs_coords_host", n_obs_global, dim);
169:     for (d = 0; d < dim; ++d) {
170:       VecScatter         ctx;
171:       Vec                seq_vec;
172:       const PetscScalar *array;

174:       PetscCall(VecScatterCreateToAll(obs_vecs[d], &ctx, &seq_vec));
175:       PetscCall(VecScatterBegin(ctx, obs_vecs[d], seq_vec, INSERT_VALUES, SCATTER_FORWARD));
176:       PetscCall(VecScatterEnd(ctx, obs_vecs[d], seq_vec, INSERT_VALUES, SCATTER_FORWARD));

178:       PetscCall(VecGetArrayRead(seq_vec, &array));
179:       for (PetscInt j = 0; j < n_obs_global; ++j) obs_coords_host(j, d) = PetscRealPart(array[j]);
180:       PetscCall(VecRestoreArrayRead(seq_vec, &array));
181:       PetscCall(VecScatterDestroy(&ctx));
182:       PetscCall(VecDestroy(&seq_vec));
183:     }
184:     Kokkos::deep_copy(obs_coords_dev, obs_coords_host);
185:   }

187:   PetscInt rstart;
188:   PetscCall(VecGetOwnershipRange(Vecxyz[0], &rstart, NULL));

190:   /* Output Views */
191:   // LayoutLeft for coalesced access on GPU
192:   Kokkos::View<PetscInt **, Kokkos::LayoutLeft, MemSpace>    indices_dev("indices", n_vert_local, n_obs_vertex);
193:   Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, MemSpace> values_dev("values", n_vert_local, n_obs_vertex);

195:   /* Temporary storage for top-k per vertex */
196:   // LayoutLeft for coalesced access on GPU.
197:   // Note: For the insertion sort within a thread, LayoutRight would offer better cache locality for the thread's private list.
198:   // However, LayoutLeft is preferred for coalesced access across threads during the final weight computation and initialization.
199:   // Given the random access nature of the sort (divergence), we stick to the default GPU layout (Left).
200:   Kokkos::View<PetscReal **, Kokkos::LayoutLeft, MemSpace> best_dists_dev("best_dists", n_vert_local, n_obs_vertex);
201:   Kokkos::View<PetscInt **, Kokkos::LayoutLeft, MemSpace>  best_idxs_dev("best_idxs", n_vert_local, n_obs_vertex);

203:   /* Copy boundary data to device */
204:   Kokkos::View<PetscReal *, MemSpace> bd_dev("bd_dev", dim);
205:   {
206:     Kokkos::View<PetscReal *, Kokkos::HostSpace> bd_host("bd_host", dim);
207:     for (PetscInt d = 0; d < dim; ++d) bd_host(d) = bd[d];
208:     Kokkos::deep_copy(bd_dev, bd_host);
209:   }

211:   /* Main Kernel */
212:   Kokkos::parallel_for(
213:     "ComputeLocalization", Kokkos::RangePolicy<ExecSpace>(0, n_vert_local), KOKKOS_LAMBDA(const PetscInt i) {
214:       PetscReal current_max_dist = PETSC_MAX_REAL;

216:       // Cache vertex coordinates in registers to avoid repeated global memory access
217:       // dim is small (<= 3), so this fits easily in registers
218:       PetscReal v_coords[3] = {0.0, 0.0, 0.0};
219:       for (PetscInt d = 0; d < dim; ++d) v_coords[d] = PetscRealPart(vertex_coords_dev(i, d));

221:       // Initialize with infinity
222:       for (PetscInt k = 0; k < n_obs_vertex; ++k) {
223:         best_dists_dev(i, k) = PETSC_MAX_REAL;
224:         best_idxs_dev(i, k)  = -1;
225:       }

227:       // Iterate over all observations
228:       for (PetscInt j = 0; j < n_obs_global; ++j) {
229:         PetscReal dist2 = 0.0;
230:         for (PetscInt d = 0; d < dim; ++d) {
231:           PetscReal diff = v_coords[d] - obs_coords_dev(j, d);
232:           if (bd_dev(d) != 0) { // Periodic boundary
233:             PetscReal domain_size = bd_dev(d);
234:             if (diff > 0.5 * domain_size) diff -= domain_size;
235:             else if (diff < -0.5 * domain_size) diff += domain_size;
236:           }
237:           dist2 += diff * diff;
238:         }

240:         // Check if this observation is closer than the furthest stored observation
241:         if (dist2 < current_max_dist) {
242:           // Insert sorted
243:           PetscInt pos = n_obs_vertex - 1;
244:           while (pos > 0 && best_dists_dev(i, pos - 1) > dist2) {
245:             best_dists_dev(i, pos) = best_dists_dev(i, pos - 1);
246:             best_idxs_dev(i, pos)  = best_idxs_dev(i, pos - 1);
247:             pos--;
248:           }
249:           best_dists_dev(i, pos) = dist2;
250:           best_idxs_dev(i, pos)  = j;

252:           // Update current max distance
253:           current_max_dist = best_dists_dev(i, n_obs_vertex - 1);
254:         }
255:       }
256:       // Compute weights
257:       PetscReal radius2 = best_dists_dev(i, n_obs_vertex - 1);
258:       PetscReal radius  = std::sqrt(radius2);
259:       radius *= RADIUS_FACTOR;
260:       if (radius == 0.0) radius = 1.0;

262:       for (PetscInt k = 0; k < n_obs_vertex; ++k) {
263:         if (best_idxs_dev(i, k) != -1) {
264:           PetscReal dist    = std::sqrt(best_dists_dev(i, k));
265:           indices_dev(i, k) = best_idxs_dev(i, k);
266:           values_dev(i, k)  = GaspariCohn(dist, radius);
267:         } else {
268:           indices_dev(i, k) = -1; // Ignore this entry
269:           values_dev(i, k)  = 0.0;
270:         }
271:       }
272:     });

274:   /* Copy back to host and fill matrix */
275:   // Host views must be LayoutRight for MatSetValues (row-major)
276:   Kokkos::View<PetscInt **, Kokkos::LayoutRight, Kokkos::HostSpace>    indices_host("indices_host", n_vert_local, n_obs_vertex);
277:   Kokkos::View<PetscScalar **, Kokkos::LayoutRight, Kokkos::HostSpace> values_host("values_host", n_vert_local, n_obs_vertex);

279:   // Deep copy will handle layout conversion (transpose) if device views are LayoutLeft
280:   // Note: Kokkos::deep_copy cannot copy between different layouts if the memory spaces are different (e.g. GPU to Host).
281:   // We need an intermediate mirror view on the host with the same layout as the device view.
282:   Kokkos::View<PetscInt **, Kokkos::LayoutLeft, Kokkos::HostSpace>    indices_host_left = Kokkos::create_mirror_view(indices_dev);
283:   Kokkos::View<PetscScalar **, Kokkos::LayoutLeft, Kokkos::HostSpace> values_host_left  = Kokkos::create_mirror_view(values_dev);

285:   Kokkos::deep_copy(indices_host_left, indices_dev);
286:   Kokkos::deep_copy(values_host_left, values_dev);

288:   // Now copy from LayoutLeft host view to LayoutRight host view
289:   Kokkos::deep_copy(indices_host, indices_host_left);
290:   Kokkos::deep_copy(values_host, values_host_left);

292:   for (PetscInt i = 0; i < n_vert_local; ++i) {
293:     PetscInt globalRow = rstart + i;
294:     PetscCall(MatSetValues(*Q, 1, &globalRow, n_obs_vertex, &indices_host(i, 0), &values_host(i, 0), INSERT_VALUES));
295:   }

297:   /* Compute mean and std dev of localization radius */
298:   {
299:     using FunctorType = RadiusStatsFunctor<decltype(best_dists_dev)>;
300:     typename FunctorType::value_type result;
301:     Kokkos::parallel_reduce("ComputeRadiusStats", Kokkos::RangePolicy<ExecSpace>(0, n_vert_local), FunctorType{best_dists_dev, n_obs_vertex}, result);

303:     if (n_vert_local > 0) {
304:       double mean   = result.sum / n_vert_local;
305:       double var    = (result.sq_sum / n_vert_local) - (mean * mean);
306:       double stddev = (var > 1e-1 * PETSC_SQRT_MACHINE_EPSILON) ? std::sqrt(var) : 0.0;
307:       PetscCall(PetscInfo((PetscObject)obs_vecs[0], "LETKF localization radius: mean %g, std dev %g\n", mean, stddev));
308:     }
309:   }

311:   /* Cleanup storage */
312:   for (d = 0; d < dim; ++d) PetscCall(VecDestroy(&obs_vecs[d]));
313:   PetscCall(PetscFree(obs_vecs));

315:   /* Assemble matrix */
316:   PetscCall(MatAssemblyBegin(*Q, MAT_FINAL_ASSEMBLY));
317:   PetscCall(MatAssemblyEnd(*Q, MAT_FINAL_ASSEMBLY));
318:   PetscFunctionReturn(PETSC_SUCCESS);
319: }