tdevice.cu - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
HTML git clone git://src.adamsgaard.dk/sphere
DIR Log
DIR Files
DIR Refs
DIR LICENSE
---
tdevice.cu (119471B)
---
1 // device.cu -- GPU specific operations utilizing the CUDA API.
2 #include <iostream>
3 #include <fstream>
4 #include <string>
5 #include <cstdio>
6 #include <cuda.h>
7 #include <helper_math.h>
8 #include <iomanip>
9 #include <time.h>
10
11 #include "vector_arithmetic.h" // for arbitrary prec. vectors
12 //#include <vector_functions.h> // for single prec. vectors
13 #include "thrust/device_ptr.h"
14 #include "thrust/sort.h"
15
16 #include "sphere.h"
17 #include "datatypes.h"
18 #include "utility.h"
19 #include "constants.cuh"
20 #include "debug.h"
21 #include "version.h"
22
23 #include "sorting.cuh"
24 #include "contactmodels.cuh"
25 #include "cohesion.cuh"
26 #include "contactsearch.cuh"
27 #include "integration.cuh"
28 #include "raytracer.cuh"
29 #include "navierstokes.cuh"
30 #include "darcy.cuh"
31
32 // Returns the number of cores per streaming multiprocessor, which is
33 // a function of the device compute capability
34 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
35 int cudaCoresPerSM(int major, int minor)
36 {
37 if (major == 1)
38 return 8;
39 else if (major == 2 && minor == 0)
40 return 32;
41 else if (major == 2 && minor == 1)
42 return 48;
43 else if (major == 3)
44 return 192;
45 else if (major == 4)
46 return 128;
47 else if (major == 5)
48 return 128;
49 else if (major == 6 && minor == 0)
50 return 64;
51 else if (major == 6 && minor == 1)
52 return 128;
53 else if (major == 6 && minor == 2)
54 return 128;
55 else if (major == 7)
56 return 32;
57 else if (major == 8)
58 return 64;
59 else
60 printf("Error in cudaCoresPerSM Device compute capability value "
61 "(%d.%d) not recognized.", major, minor);
62 return -1;
63 }
64
65 // Wrapper function for initializing the CUDA components.
66 // Called from main.cpp
67 void DEM::initializeGPU(void)
68 {
69 using std::cout; // stdout
70
71 // Specify target device
72 int cudadevice = 0;
73
74 // Variables containing device properties
75 cudaDeviceProp prop;
76 int deviceCount;
77 int cudaDriverVersion;
78 int cudaRuntimeVersion;
79
80 checkForCudaErrors("Before initializing CUDA device");
81
82 // Register number of devices
83 cudaGetDeviceCount(&deviceCount);
84 ndevices = deviceCount; // store in DEM class
85
86 if (deviceCount == 0) {
87 std::cerr << "\nERROR: No CUDA-enabled devices availible. Bye."
88 << std::endl;
89 exit(EXIT_FAILURE);
90 } else if (deviceCount == 1) {
91 if (verbose == 1)
92 cout << " System contains 1 CUDA compatible device.\n";
93 } else {
94 if (verbose == 1)
95 cout << " System contains " << deviceCount
96 << " CUDA compatible devices.\n";
97 }
98
99 // Loop through GPU's and choose the one with the most CUDA cores
100 if (device == -1) {
101 int ncudacores;
102 int max_ncudacores = 0;
103 for (int d=0; d<ndevices; d++) {
104 cudaGetDeviceProperties(&prop, d);
105 cudaDriverGetVersion(&cudaDriverVersion);
106 cudaRuntimeGetVersion(&cudaRuntimeVersion);
107
108 ncudacores = prop.multiProcessorCount
109 *cudaCoresPerSM(prop.major, prop.minor);
110 if (ncudacores > max_ncudacores) {
111 max_ncudacores = ncudacores;
112 cudadevice = d;
113 }
114
115 if (verbose == 1) {
116 cout << " CUDA device ID: " << d << "\n";
117 cout << " - Name: " << prop.name << ", compute capability: "
118 << prop.major << "." << prop.minor << ".\n";
119 cout << " - CUDA Driver version: " << cudaDriverVersion/1000
120 << "." << cudaDriverVersion%100
121 << ", runtime version " << cudaRuntimeVersion/1000 << "."
122 << cudaRuntimeVersion%100 << std::endl;
123 }
124 }
125
126 device = cudadevice; // store in DEM class
127 if (verbose == 1) {
128 cout << " Using CUDA device ID " << device << " with "
129 << max_ncudacores << " cores." << std::endl;
130 }
131
132 } else {
133
134 cudaGetDeviceProperties(&prop, device);
135 cudaDriverGetVersion(&cudaDriverVersion);
136 cudaRuntimeGetVersion(&cudaRuntimeVersion);
137
138 int ncudacores = prop.multiProcessorCount
139 *cudaCoresPerSM(prop.major, prop.minor);
140
141 if (verbose == 1) {
142 cout << " CUDA device ID: " << device << "\n";
143 cout << " - Name: " << prop.name << ", compute capability: "
144 << prop.major << "." << prop.minor << ".\n";
145 cout << " - CUDA Driver version: " << cudaDriverVersion/1000
146 << "." << cudaDriverVersion%100
147 << ", runtime version " << cudaRuntimeVersion/1000 << "."
148 << cudaRuntimeVersion%100
149 << "\n - " << ncudacores << " CUDA cores" << std::endl;
150 }
151 }
152
153 // The value of device is now 0 or larger
154 cudaSetDevice(device);
155
156 checkForCudaErrors("While initializing CUDA device");
157 }
158
159 // Start timer for kernel profiling
160 void startTimer(cudaEvent_t* kernel_tic)
161 {
162 cudaEventRecord(*kernel_tic);
163 }
164
165 // Stop timer for kernel profiling and time to function sum
166 void stopTimer(cudaEvent_t *kernel_tic,
167 cudaEvent_t *kernel_toc,
168 float *kernel_elapsed,
169 double* sum)
170 {
171 cudaEventRecord(*kernel_toc, 0);
172 cudaEventSynchronize(*kernel_toc);
173 cudaEventElapsedTime(kernel_elapsed, *kernel_tic, *kernel_toc);
174 *sum += *kernel_elapsed;
175 }
176
177 // Check values of parameters in constant memory
178 __global__ void checkConstantValues(int* dev_equal,
179 Grid* dev_grid,
180 Params* dev_params)
181 {
182 // Values ok (0)
183 *dev_equal = 0;
184
185 // Compare values between global- and constant
186 // memory structures
187 if (dev_grid->origo[0] != devC_grid.origo[0])
188 *dev_equal = 1;
189 if (dev_grid->origo[1] != devC_grid.origo[1])
190 *dev_equal = 2; // Not ok
191 if (dev_grid->origo[2] != devC_grid.origo[2])
192 *dev_equal = 3; // Not ok
193 if (dev_grid->L[0] != devC_grid.L[0])
194 *dev_equal = 4; // Not ok
195 if (dev_grid->L[1] != devC_grid.L[1])
196 *dev_equal = 5; // Not ok
197 if (dev_grid->L[2] != devC_grid.L[2])
198 *dev_equal = 6; // Not ok
199 if (dev_grid->num[0] != devC_grid.num[0])
200 *dev_equal = 7; // Not ok
201 if (dev_grid->num[1] != devC_grid.num[1])
202 *dev_equal = 8; // Not ok
203 if (dev_grid->num[2] != devC_grid.num[2])
204 *dev_equal = 9; // Not ok
205 if (dev_grid->periodic != devC_grid.periodic)
206 *dev_equal = 10; // Not ok
207
208 if (dev_params->g[0] != devC_params.g[0])
209 *dev_equal = 11; // Not ok
210 if (dev_params->g[1] != devC_params.g[1])
211 *dev_equal = 12; // Not ok
212 if (dev_params->g[2] != devC_params.g[2])
213 *dev_equal = 13; // Not ok
214 if (dev_params->k_n != devC_params.k_n)
215 *dev_equal = 14; // Not ok
216 if (dev_params->k_t != devC_params.k_t)
217 *dev_equal = 15; // Not ok
218 if (dev_params->k_r != devC_params.k_r)
219 *dev_equal = 16; // Not ok
220 if (dev_params->gamma_n != devC_params.gamma_n)
221 *dev_equal = 17; // Not ok
222 if (dev_params->gamma_t != devC_params.gamma_t)
223 *dev_equal = 18; // Not ok
224 if (dev_params->gamma_r != devC_params.gamma_r)
225 *dev_equal = 19; // Not ok
226 if (dev_params->mu_s != devC_params.mu_s)
227 *dev_equal = 20; // Not ok
228 if (dev_params->mu_d != devC_params.mu_d)
229 *dev_equal = 21; // Not ok
230 if (dev_params->mu_r != devC_params.mu_r)
231 *dev_equal = 22; // Not ok
232 if (dev_params->rho != devC_params.rho)
233 *dev_equal = 23; // Not ok
234 if (dev_params->contactmodel != devC_params.contactmodel)
235 *dev_equal = 24; // Not ok
236 if (dev_params->kappa != devC_params.kappa)
237 *dev_equal = 25; // Not ok
238 if (dev_params->db != devC_params.db)
239 *dev_equal = 26; // Not ok
240 if (dev_params->V_b != devC_params.V_b)
241 *dev_equal = 27; // Not ok
242 if (dev_params->lambda_bar != devC_params.lambda_bar)
243 *dev_equal = 28; // Not ok
244 if (dev_params->nb0 != devC_params.nb0)
245 *dev_equal = 29; // Not ok
246 if (dev_params->E != devC_params.E)
247 *dev_equal = 30; // Not ok
248 }
249
250 __global__ void checkParticlePositions(
251 const Float4* __restrict__ dev_x)
252 {
253 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
254
255 if (idx < devC_np) { // Condition prevents block size error
256 Float4 x = dev_x[idx];
257
258 // make sure grain doesn't have NaN or Inf position
259 if (!isfinite(x.x) || !isfinite(x.y) || !isfinite(x.z)) {
260 __syncthreads();
261 printf("\nParticle %d has non-finite position: x = %f %f %f",
262 idx, x.x, x.y, x.z);
263 }
264
265 /*__syncthreads();
266 printf("\nParticle %d: x = %f %f %f",
267 idx, x.x, x.y, x.z);*/
268
269 // check that the particle is inside of the simulation domain
270 if (x.x < devC_grid.origo[0] ||
271 x.y < devC_grid.origo[1] ||
272 x.z < devC_grid.origo[2] ||
273 x.x > devC_grid.L[0] ||
274 x.y > devC_grid.L[1] ||
275 x.z > devC_grid.L[2]) {
276 __syncthreads();
277 printf("\nParticle %d is outside the computational domain "
278 "(%f %f %f to %f %f %f): x = %f %f %f",
279 idx,
280 devC_grid.origo[0], devC_grid.origo[1], devC_grid.origo[2],
281 devC_grid.L[0], devC_grid.L[1], devC_grid.L[2],
282 x.x, x.y, x.z);
283 }
284 }
285 }
286
287
288 // Copy the constant data components to device memory,
289 // and check whether the values correspond to the
290 // values in constant memory.
291 void DEM::checkConstantMemory()
292 {
293 // Allocate space in global device memory
294 Grid* dev_grid;
295 Params* dev_params;
296 cudaMalloc((void**)&dev_grid, sizeof(Grid));
297 cudaMalloc((void**)&dev_params, sizeof(Params));
298
299 // Copy structure data from host to global device memory
300 cudaMemcpy(dev_grid, &grid, sizeof(Grid), cudaMemcpyHostToDevice);
301 cudaMemcpy(dev_params, ¶ms, sizeof(Params), cudaMemcpyHostToDevice);
302
303 // Compare values between global and constant memory
304 // structures on the device.
305 int* equal = new int; // The values are equal = 0, if not = 1
306 *equal = 0;
307 int* dev_equal;
308 cudaMalloc((void**)&dev_equal, sizeof(int));
309 checkConstantValues<<<1,1>>>(dev_equal, dev_grid, dev_params);
310 checkForCudaErrors("After constant memory check");
311
312 // Copy result to host
313 cudaMemcpy(equal, dev_equal, sizeof(int), cudaMemcpyDeviceToHost);
314
315 // Free global device memory
316 cudaFree(dev_grid);
317 cudaFree(dev_params);
318 cudaFree(dev_equal);
319
320 // Are the values equal?
321 if (*equal != 0) {
322 std::cerr << "Error! The values in constant memory do not "
323 << "seem to be correct (" << *equal << ")." << std::endl;
324 exit(1);
325 } else {
326 if (verbose == 1)
327 std::cout << " Constant values ok (" << *equal << ")."
328 << std::endl;
329 }
330 }
331
332 // Copy selected constant components to constant device memory.
333 void DEM::transferToConstantDeviceMemory()
334 {
335 using std::cout;
336
337 if (verbose == 1)
338 cout << " Transfering data to constant device memory: ";
339
340 /*for (int d=0; d<ndevices; d++) {
341 cudaSetDevice(d);*/
342 cudaMemcpyToSymbol(devC_nd, &nd, sizeof(nd));
343 cudaMemcpyToSymbol(devC_np, &np, sizeof(np));
344 cudaMemcpyToSymbol(devC_nw, &walls.nw, sizeof(unsigned int));
345 cudaMemcpyToSymbol(devC_nc, &NC, sizeof(int));
346 cudaMemcpyToSymbol(devC_dt, &time.dt, sizeof(Float));
347 cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));
348 cudaMemcpyToSymbol(devC_params, ¶ms, sizeof(Params));
349 /*}
350 cudaSetDevice(device);*/
351
352 checkForCudaErrors("After transferring to device constant memory");
353
354 if (verbose == 1)
355 cout << "Done\n";
356
357 // only for device with most CUDA cores
358 checkConstantMemory();
359 }
360
361 __global__ void printWorldSize(Float4* dev_walls_nx)
362 {
363 printf("\nL = %f, %f, %f\n",
364 devC_grid.L[0], devC_grid.L[1], devC_grid.L[2]);
365 printf("\ndev_walls_nx[0] = %f, %f, %f, %f\n",
366 dev_walls_nx[0].x,
367 dev_walls_nx[0].y,
368 dev_walls_nx[0].z,
369 dev_walls_nx[0].w);
370 }
371
372 void DEM::updateGridSize()
373 {
374 //printf("\nDEM::updateGridSize() start\n");
375 Float* Lz = new Float;
376
377 // Get top wall position from dev_walls_nx[0].z
378 cudaMemcpy(Lz, &dev_walls_nx[0].w, sizeof(Float), cudaMemcpyDeviceToHost);
379 checkForCudaErrors("DEM::updateGridSize(): copying wall position");
380
381 //printWorldSize<<<1,1>>>(dev_walls_nx);
382 //cudaDeviceSynchronize();
383 //checkForCudaErrors("DEM::updateGridSize(): first printWorldSize");
384
385 //printf("\nLz = %f\n", *Lz);
386
387 // Write value to grid.L[2]
388 grid.L[2] = *Lz;
389
390 // Write value to devC_grid.L[2]
391 //cudaMemcpyToSymbol(devC_grid.L[2], &Lz, sizeof(Float));
392 cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));
393
394 checkForCudaErrors("DEM::updateGridSize(): write to devC_grid.L[2]");
395
396 //printWorldSize<<<1,1>>>(dev_walls_nx);
397 //cudaDeviceSynchronize();
398 //checkForCudaErrors("DEM::updateGridSize(): second printWorldSize");
399
400 // check value only during debugging
401 //checkConstantMemory();
402 }
403
404
405 // Allocate device memory for particle variables,
406 // tied to previously declared pointers in structures
407 void DEM::allocateGlobalDeviceMemory(void)
408 {
409 // Particle memory size
410 unsigned int memSizeF = sizeof(Float) * np;
411 unsigned int memSizeF4 = sizeof(Float4) * np;
412
413 if (verbose == 1)
414 std::cout << " Allocating global device memory: ";
415
416 k.acc = new Float4[np];
417 k.angacc = new Float4[np];
418 #pragma omp parallel for if(np>100)
419 for (unsigned int i = 0; i<np; ++i) {
420 k.acc[i] = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
421 k.angacc[i] = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
422 }
423
424 // Kinematics arrays
425 cudaMalloc((void**)&dev_x, memSizeF4);
426 cudaMalloc((void**)&dev_xyzsum, memSizeF4);
427 cudaMalloc((void**)&dev_vel, memSizeF4);
428 cudaMalloc((void**)&dev_vel0, memSizeF4);
429 cudaMalloc((void**)&dev_acc, memSizeF4);
430 cudaMalloc((void**)&dev_force, memSizeF4);
431 cudaMalloc((void**)&dev_angpos, memSizeF4);
432 cudaMalloc((void**)&dev_angvel, memSizeF4);
433 cudaMalloc((void**)&dev_angvel0, memSizeF4);
434 cudaMalloc((void**)&dev_angacc, memSizeF4);
435 cudaMalloc((void**)&dev_torque, memSizeF4);
436
437 // Particle contact bookkeeping arrays
438 cudaMalloc((void**)&dev_contacts,
439 sizeof(unsigned int)*np*NC);
440 cudaMalloc((void**)&dev_distmod, memSizeF4*NC);
441 cudaMalloc((void**)&dev_delta_t, memSizeF4*NC);
442 cudaMalloc((void**)&dev_bonds, sizeof(uint2)*params.nb0);
443 cudaMalloc((void**)&dev_bonds_delta, sizeof(Float4)*params.nb0);
444 cudaMalloc((void**)&dev_bonds_omega, sizeof(Float4)*params.nb0);
445
446 // Sorted arrays
447 cudaMalloc((void**)&dev_x_sorted, memSizeF4);
448 cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
449 cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
450
451 // Energy arrays
452 cudaMalloc((void**)&dev_es_dot, memSizeF);
453 cudaMalloc((void**)&dev_ev_dot, memSizeF);
454 cudaMalloc((void**)&dev_es, memSizeF);
455 cudaMalloc((void**)&dev_ev, memSizeF);
456 cudaMalloc((void**)&dev_p, memSizeF);
457
458 // Cell-related arrays
459 cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*np);
460 cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*np);
461 cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)
462 *grid.num[0]*grid.num[1]*grid.num[2]);
463 cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)
464 *grid.num[0]*grid.num[1]*grid.num[2]);
465
466 // Host contact bookkeeping arrays
467 k.contacts = new unsigned int[np*NC];
468 // Initialize contacts lists to np
469 #pragma omp parallel for if(np>100)
470 for (unsigned int i=0; i<(np*NC); ++i)
471 k.contacts[i] = np;
472 k.distmod = new Float4[np*NC];
473 k.delta_t = new Float4[np*NC];
474
475 // Wall arrays
476 cudaMalloc((void**)&dev_walls_wmode, sizeof(int)*walls.nw);
477 cudaMalloc((void**)&dev_walls_nx, sizeof(Float4)*walls.nw);
478 cudaMalloc((void**)&dev_walls_mvfd, sizeof(Float4)*walls.nw);
479 cudaMalloc((void**)&dev_walls_tau_x, sizeof(Float)*walls.nw);
480 cudaMalloc((void**)&dev_walls_tau_eff_x_pp, sizeof(Float)*walls.nw*np);
481 cudaMalloc((void**)&dev_walls_force_pp, sizeof(Float)*walls.nw*np);
482 cudaMalloc((void**)&dev_walls_acc, sizeof(Float)*walls.nw);
483 // dev_walls_force_partial allocated later
484 // dev_walls_tau_eff_x_partial allocated later
485
486 checkForCudaErrors("End of allocateGlobalDeviceMemory");
487 if (verbose == 1)
488 std::cout << "Done" << std::endl;
489 }
490
491 // Allocate global memory on other devices required for "interact" function.
492 // The values of domain_size[ndevices] must be set beforehand.
493 void DEM::allocateHelperDeviceMemory(void)
494 {
495 // Particle memory size
496 unsigned int memSizeF4 = sizeof(Float4) * np;
497
498 // Initialize pointers to per-GPU arrays
499 hdev_gridParticleIndex = (unsigned**)malloc(ndevices*sizeof(unsigned*));
500 hdev_gridCellStart = (unsigned**)malloc(ndevices*sizeof(unsigned*));
501 hdev_gridCellEnd = (unsigned**)malloc(ndevices*sizeof(unsigned*));
502 hdev_x = (Float4**)malloc(ndevices*sizeof(Float4*));
503 hdev_x_sorted = (Float4**)malloc(ndevices*sizeof(Float4*));
504 hdev_vel = (Float4**)malloc(ndevices*sizeof(Float4*));
505 hdev_vel_sorted = (Float4**)malloc(ndevices*sizeof(Float4*));
506 hdev_angvel = (Float4**)malloc(ndevices*sizeof(Float4*));
507 hdev_angvel_sorted = (Float4**)malloc(ndevices*sizeof(Float4*));
508 hdev_walls_nx = (Float4**)malloc(ndevices*sizeof(Float4*));
509 hdev_walls_mvfd = (Float4**)malloc(ndevices*sizeof(Float4*));
510 hdev_distmod = (Float4**)malloc(ndevices*sizeof(Float4*));
511
512 hdev_force = (Float4**)malloc(ndevices*sizeof(Float4*));
513 hdev_torque = (Float4**)malloc(ndevices*sizeof(Float4*));
514 hdev_delta_t = (Float4**)malloc(ndevices*sizeof(Float4*));
515 hdev_es_dot = (Float**)malloc(ndevices*sizeof(Float*));
516 hdev_es = (Float**)malloc(ndevices*sizeof(Float*));
517 hdev_ev_dot = (Float**)malloc(ndevices*sizeof(Float*));
518 hdev_ev = (Float**)malloc(ndevices*sizeof(Float*));
519 hdev_p = (Float**)malloc(ndevices*sizeof(Float*));
520 hdev_walls_force_pp = (Float**)malloc(ndevices*sizeof(Float*));
521 hdev_contacts = (unsigned**)malloc(ndevices*sizeof(unsigned*));
522
523 for (int d=0; d<ndevices; d++) {
524
525 // do not allocate memory on primary GPU
526 if (d == device)
527 continue;
528
529 cudaSetDevice(d);
530
531 // allocate space for full input arrays for interact()
532 cudaMalloc((void**)&hdev_gridParticleIndex[d], sizeof(unsigned int)*np);
533 cudaMalloc((void**)&hdev_gridCellStart[d], sizeof(unsigned int)
534 *grid.num[0]*grid.num[1]*grid.num[2]);
535 cudaMalloc((void**)&hdev_gridCellEnd[d], sizeof(unsigned int)
536 *grid.num[0]*grid.num[1]*grid.num[2]);
537 cudaMalloc((void**)&hdev_x[d], memSizeF4);
538 cudaMalloc((void**)&hdev_x_sorted[d], memSizeF4);
539 cudaMalloc((void**)&hdev_vel[d], memSizeF4);
540 cudaMalloc((void**)&hdev_vel_sorted[d], memSizeF4);
541 cudaMalloc((void**)&hdev_angvel[d], memSizeF4);
542 cudaMalloc((void**)&hdev_angvel_sorted[d], memSizeF4);
543 cudaMalloc((void**)&hdev_walls_nx[d], sizeof(Float4)*walls.nw);
544 cudaMalloc((void**)&hdev_walls_mvfd[d], sizeof(Float4)*walls.nw);
545 cudaMalloc((void**)&hdev_distmod[d], memSizeF4*NC);
546
547 // allocate space for partial output arrays for interact()
548 cudaMalloc((void**)&hdev_force[d], sizeof(Float4)*domain_size[d]);
549 cudaMalloc((void**)&hdev_torque[d], sizeof(Float4)*domain_size[d]);
550 cudaMalloc((void**)&hdev_es_dot[d], sizeof(Float)*domain_size[d]);
551 cudaMalloc((void**)&hdev_ev_dot[d], sizeof(Float)*domain_size[d]);
552 cudaMalloc((void**)&hdev_es[d], sizeof(Float)*domain_size[d]);
553 cudaMalloc((void**)&hdev_ev[d], sizeof(Float)*domain_size[d]);
554 cudaMalloc((void**)&hdev_p[d], sizeof(Float)*domain_size[d]);
555 cudaMalloc((void**)&hdev_walls_force_pp[d],
556 sizeof(Float)*domain_size[d]*walls.nw);
557 cudaMalloc((void**)&hdev_contacts[d],
558 sizeof(unsigned)*domain_size[d]*NC);
559 cudaMalloc((void**)&hdev_delta_t[d], sizeof(Float4)*domain_size[d]*NC);
560
561 checkForCudaErrors("During allocateGlobalDeviceMemoryOtherDevices");
562 }
563 cudaSetDevice(device); // select main GPU
564 }
565
566 void DEM::freeHelperDeviceMemory()
567 {
568 for (int d=0; d<ndevices; d++) {
569
570 // do not allocate memory on primary GPU
571 if (d == device)
572 continue;
573
574 cudaSetDevice(d);
575
576 cudaFree(hdev_gridParticleIndex[d]);
577 cudaFree(hdev_gridCellStart[d]);
578 cudaFree(hdev_gridCellEnd[d]);
579 cudaFree(hdev_x[d]);
580 cudaFree(hdev_vel[d]);
581 cudaFree(hdev_vel_sorted[d]);
582 cudaFree(hdev_angvel[d]);
583 cudaFree(hdev_angvel_sorted[d]);
584 cudaFree(hdev_walls_nx[d]);
585 cudaFree(hdev_walls_mvfd[d]);
586 cudaFree(hdev_distmod[d]);
587
588 cudaFree(hdev_force[d]);
589 cudaFree(hdev_torque[d]);
590 cudaFree(hdev_es_dot[d]);
591 cudaFree(hdev_ev_dot[d]);
592 cudaFree(hdev_es[d]);
593 cudaFree(hdev_ev[d]);
594 cudaFree(hdev_p[d]);
595 cudaFree(hdev_walls_force_pp[d]);
596 cudaFree(hdev_contacts[d]);
597 cudaFree(hdev_delta_t[d]);
598
599 checkForCudaErrors("During helper device cudaFree calls");
600 }
601 cudaSetDevice(device); // select primary GPU
602 }
603
604 void DEM::freeGlobalDeviceMemory()
605 {
606 if (verbose == 1)
607 printf("\nFreeing device memory: ");
608
609 // Particle arrays
610 cudaFree(dev_x);
611 cudaFree(dev_xyzsum);
612 cudaFree(dev_vel);
613 cudaFree(dev_vel0);
614 cudaFree(dev_acc);
615 cudaFree(dev_force);
616 cudaFree(dev_angpos);
617 cudaFree(dev_angvel);
618 cudaFree(dev_angvel0);
619 cudaFree(dev_angacc);
620 cudaFree(dev_torque);
621
622 cudaFree(dev_contacts);
623 cudaFree(dev_distmod);
624 cudaFree(dev_delta_t);
625 cudaFree(dev_bonds);
626 cudaFree(dev_bonds_delta);
627 cudaFree(dev_bonds_omega);
628
629 cudaFree(dev_es_dot);
630 cudaFree(dev_es);
631 cudaFree(dev_ev_dot);
632 cudaFree(dev_ev);
633 cudaFree(dev_p);
634
635 cudaFree(dev_x_sorted);
636 cudaFree(dev_vel_sorted);
637 cudaFree(dev_angvel_sorted);
638
639 // Cell-related arrays
640 cudaFree(dev_gridParticleIndex);
641 cudaFree(dev_cellStart);
642 cudaFree(dev_cellEnd);
643
644 // Wall arrays
645 cudaFree(dev_walls_nx);
646 cudaFree(dev_walls_mvfd);
647 cudaFree(dev_walls_tau_x);
648 cudaFree(dev_walls_force_partial);
649 cudaFree(dev_walls_force_pp);
650 cudaFree(dev_walls_acc);
651 cudaFree(dev_walls_tau_eff_x_pp);
652 cudaFree(dev_walls_tau_eff_x_partial);
653
654 // Fluid arrays
655 if (fluid == 1 && cfd_solver == 0) {
656 freeNSmemDev();
657 }
658 if (fluid == 1 && cfd_solver == 1) {
659 freeDarcyMemDev();
660 }
661
662 //checkForCudaErrors("During cudaFree calls");
663
664 if (verbose == 1)
665 std::cout << "Done" << std::endl;
666 }
667
668
669 void DEM::transferToGlobalDeviceMemory(int statusmsg)
670 {
671 if (verbose == 1 && statusmsg == 1)
672 std::cout << " Transfering data to the device: ";
673
674 // Commonly-used memory sizes
675 unsigned int memSizeF = sizeof(Float) * np;
676 unsigned int memSizeF4 = sizeof(Float4) * np;
677
678 // Copy static-size structure data from host to global device memory
679 //cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);
680
681 // Kinematic particle values
682 cudaMemcpy( dev_x, k.x,
683 memSizeF4, cudaMemcpyHostToDevice);
684 cudaMemcpy( dev_xyzsum, k.xyzsum,
685 memSizeF4, cudaMemcpyHostToDevice);
686 cudaMemcpy( dev_vel, k.vel,
687 memSizeF4, cudaMemcpyHostToDevice);
688 cudaMemcpy( dev_vel0, k.vel,
689 memSizeF4, cudaMemcpyHostToDevice);
690 cudaMemcpy( dev_acc, k.acc,
691 memSizeF4, cudaMemcpyHostToDevice);
692 cudaMemcpy( dev_force, k.force,
693 memSizeF4, cudaMemcpyHostToDevice);
694 cudaMemcpy( dev_angpos, k.angpos,
695 memSizeF4, cudaMemcpyHostToDevice);
696 cudaMemcpy( dev_angvel, k.angvel,
697 memSizeF4, cudaMemcpyHostToDevice);
698 cudaMemcpy( dev_angvel0, k.angvel,
699 memSizeF4, cudaMemcpyHostToDevice);
700 cudaMemcpy( dev_angacc, k.angacc,
701 memSizeF4, cudaMemcpyHostToDevice);
702 cudaMemcpy( dev_torque, k.torque,
703 memSizeF4, cudaMemcpyHostToDevice);
704 cudaMemcpy( dev_contacts, k.contacts,
705 sizeof(unsigned int)*np*NC, cudaMemcpyHostToDevice);
706 cudaMemcpy( dev_distmod, k.distmod,
707 memSizeF4*NC, cudaMemcpyHostToDevice);
708 cudaMemcpy( dev_delta_t, k.delta_t,
709 memSizeF4*NC, cudaMemcpyHostToDevice);
710 cudaMemcpy( dev_bonds, k.bonds,
711 sizeof(uint2)*params.nb0, cudaMemcpyHostToDevice);
712 cudaMemcpy( dev_bonds_delta, k.bonds_delta,
713 sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);
714 cudaMemcpy( dev_bonds_omega, k.bonds_omega,
715 sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);
716
717 // Individual particle energy values
718 cudaMemcpy( dev_es_dot, e.es_dot,
719 memSizeF, cudaMemcpyHostToDevice);
720 cudaMemcpy( dev_es, e.es,
721 memSizeF, cudaMemcpyHostToDevice);
722 cudaMemcpy( dev_ev_dot, e.ev_dot,
723 memSizeF, cudaMemcpyHostToDevice);
724 cudaMemcpy( dev_ev, e.ev,
725 memSizeF, cudaMemcpyHostToDevice);
726 cudaMemcpy( dev_p, e.p,
727 memSizeF, cudaMemcpyHostToDevice);
728
729 // Wall parameters
730 cudaMemcpy( dev_walls_wmode, walls.wmode,
731 sizeof(int)*walls.nw, cudaMemcpyHostToDevice);
732 cudaMemcpy( dev_walls_nx, walls.nx,
733 sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
734 cudaMemcpy( dev_walls_mvfd, walls.mvfd,
735 sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
736 cudaMemcpy( dev_walls_tau_x, walls.tau_x,
737 sizeof(Float)*walls.nw, cudaMemcpyHostToDevice);
738
739 // Fluid arrays
740 if (fluid == 1) {
741 if (cfd_solver == 0) {
742 transferNStoGlobalDeviceMemory(1);
743 } else if (cfd_solver == 1) {
744 transferDarcyToGlobalDeviceMemory(1);
745 } else {
746 std::cerr << "Error: cfd_solver value not understood ("
747 << cfd_solver << ")" << std::endl;
748 }
749 }
750
751 checkForCudaErrors("End of transferToGlobalDeviceMemory");
752 if (verbose == 1 && statusmsg == 1)
753 std::cout << "Done" << std::endl;
754 }
755
756 void DEM::transferFromGlobalDeviceMemory()
757 {
758 //std::cout << " Transfering data from the device: ";
759
760 // Commonly-used memory sizes
761 unsigned int memSizeF = sizeof(Float) * np;
762 unsigned int memSizeF4 = sizeof(Float4) * np;
763
764 // Copy static-size structure data from host to global device memory
765 //cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);
766
767 // Kinematic particle values
768 cudaMemcpy( k.x, dev_x,
769 memSizeF4, cudaMemcpyDeviceToHost);
770 cudaMemcpy( k.xyzsum, dev_xyzsum,
771 memSizeF4, cudaMemcpyDeviceToHost);
772 cudaMemcpy( k.vel, dev_vel,
773 memSizeF4, cudaMemcpyDeviceToHost);
774 cudaMemcpy( k.acc, dev_acc,
775 memSizeF4, cudaMemcpyDeviceToHost);
776 cudaMemcpy( k.force, dev_force,
777 memSizeF4, cudaMemcpyDeviceToHost);
778 cudaMemcpy( k.angpos, dev_angpos,
779 memSizeF4, cudaMemcpyDeviceToHost);
780 cudaMemcpy( k.angvel, dev_angvel,
781 memSizeF4, cudaMemcpyDeviceToHost);
782 cudaMemcpy( k.angacc, dev_angacc,
783 memSizeF4, cudaMemcpyDeviceToHost);
784 cudaMemcpy( k.torque, dev_torque,
785 memSizeF4, cudaMemcpyDeviceToHost);
786 cudaMemcpy( k.contacts, dev_contacts,
787 sizeof(unsigned int)*np*NC, cudaMemcpyDeviceToHost);
788 cudaMemcpy( k.distmod, dev_distmod,
789 memSizeF4*NC, cudaMemcpyDeviceToHost);
790 cudaMemcpy( k.delta_t, dev_delta_t,
791 memSizeF4*NC, cudaMemcpyDeviceToHost);
792 cudaMemcpy( k.bonds, dev_bonds,
793 sizeof(uint2)*params.nb0, cudaMemcpyDeviceToHost);
794 cudaMemcpy( k.bonds_delta, dev_bonds_delta,
795 sizeof(Float4)*params.nb0, cudaMemcpyDeviceToHost);
796 cudaMemcpy( k.bonds_omega, dev_bonds_omega,
797 sizeof(Float4)*params.nb0, cudaMemcpyDeviceToHost);
798
799 // Individual particle energy values
800 cudaMemcpy( e.es_dot, dev_es_dot,
801 memSizeF, cudaMemcpyDeviceToHost);
802 cudaMemcpy( e.es, dev_es,
803 memSizeF, cudaMemcpyDeviceToHost);
804 cudaMemcpy( e.ev_dot, dev_ev_dot,
805 memSizeF, cudaMemcpyDeviceToHost);
806 cudaMemcpy( e.ev, dev_ev,
807 memSizeF, cudaMemcpyDeviceToHost);
808 cudaMemcpy( e.p, dev_p,
809 memSizeF, cudaMemcpyDeviceToHost);
810
811 // Wall parameters
812 cudaMemcpy( walls.wmode, dev_walls_wmode,
813 sizeof(int)*walls.nw, cudaMemcpyDeviceToHost);
814 cudaMemcpy( walls.nx, dev_walls_nx,
815 sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
816 cudaMemcpy( walls.mvfd, dev_walls_mvfd,
817 sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
818 cudaMemcpy( walls.tau_x, dev_walls_tau_x,
819 sizeof(Float)*walls.nw, cudaMemcpyDeviceToHost);
820
821 // Fluid arrays
822 if (fluid == 1 && cfd_solver == 0) {
823 transferNSfromGlobalDeviceMemory(0);
824 }
825 else if (fluid == 1 && cfd_solver == 1) {
826 transferDarcyFromGlobalDeviceMemory(0);
827 checkDarcyStability();
828 }
829
830 //checkForCudaErrors("End of transferFromGlobalDeviceMemory");
831 }
832
833
834 // Iterate through time by explicit time integration
835 void DEM::startTime()
836 {
837 using std::cout;
838 using std::cerr;
839 using std::endl;
840
841 std::string outfile;
842 char file[200];
843 FILE *fp;
844
845 // Synchronization point
846 cudaDeviceSynchronize();
847 checkForCudaErrors("Start of startTime()");
848
849 // Write initial data to output/<sid>.output00000.bin
850 if (time.step_count == 0)
851 writebin(("output/" + sid + ".output00000.bin").c_str());
852
853 // Time variables
854 clock_t tic, toc;
855 double filetimeclock, time_spent;
856 float dev_time_spent;
857
858 // Start CPU clock
859 tic = clock();
860
861 //// GPU workload configuration
862 unsigned int threadsPerBlock = 256;
863 //unsigned int threadsPerBlock = 512;
864
865 // Create enough blocks to accomodate the particles
866 unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock);
867 dim3 dimGrid(blocksPerGrid, 1, 1); // Blocks arranged in 1D grid
868 dim3 dimBlock(threadsPerBlock, 1, 1); // Threads arranged in 1D block
869
870 unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock);
871 dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid
872
873 // Use 3D block and grid layout for cell-centered fluid calculations
874 dim3 dimBlockFluid(8, 8, 8); // 512 threads per block
875 dim3 dimGridFluid(
876 iDivUp(grid.num[0], dimBlockFluid.x),
877 iDivUp(grid.num[1], dimBlockFluid.y),
878 iDivUp(grid.num[2], dimBlockFluid.z));
879 if (dimGridFluid.z > 64 && fluid == 1) {
880 cerr << "Error: dimGridFluid.z > 64" << endl;
881 exit(1);
882 }
883
884 // Use 3D block and grid layout for cell-face fluid calculations
885 dim3 dimBlockFluidFace(8, 8, 8); // 512 threads per block
886 dim3 dimGridFluidFace(
887 iDivUp(grid.num[0]+1, dimBlockFluidFace.x),
888 iDivUp(grid.num[1]+1, dimBlockFluidFace.y),
889 iDivUp(grid.num[2]+1, dimBlockFluidFace.z));
890 if (dimGridFluidFace.z > 64 && fluid == 1) {
891 cerr << "Error: dimGridFluidFace.z > 64" << endl;
892 exit(1);
893 }
894
895
896 // Shared memory per block
897 unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
898
899 // Pre-sum of force per wall
900 cudaMalloc((void**)&dev_walls_force_partial,
901 sizeof(Float)*dimGrid.x*walls.nw);
902
903 // Pre-sum of shear stress per wall
904 cudaMalloc((void**)&dev_walls_tau_eff_x_partial,
905 sizeof(Float)*dimGrid.x*walls.nw);
906
907 // Report to stdout
908 if (verbose == 1) {
909 cout << "\n Device memory allocation and transfer complete.\n"
910 << " - Blocks per grid: "
911 << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n"
912 << " - Threads per block: "
913 << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n"
914 << " - Shared memory required per block: " << smemSize << " bytes"
915 << endl;
916 if (fluid == 1) {
917 cout << " - Blocks per fluid grid: "
918 << dimGridFluid.x << "*" << dimGridFluid.y << "*" <<
919 dimGridFluid.z << "\n"
920 << " - Threads per fluid block: "
921 << dimBlockFluid.x << "*" << dimBlockFluid.y << "*" <<
922 dimBlockFluid.z << endl;
923 }
924 }
925
926 // Initialize counter variable values
927 filetimeclock = 0.0;
928 long iter = 0;
929 const int stdout_report = 10; // no of steps between reporting to stdout
930
931 // Create first status.dat
932 //sprintf(file,"output/%s.status.dat", sid);
933 outfile = "output/" + sid + ".status.dat";
934 fp = fopen(outfile.c_str(), "w");
935 fprintf(fp,"%2.4e %2.4e %d\n",
936 time.current,
937 100.0*time.current/time.total,
938 time.step_count);
939 fclose(fp);
940
941 if (verbose == 1) {
942 cout << "\n Entering the main calculation time loop...\n\n"
943 << " IMPORTANT: Do not close this terminal, doing so will \n"
944 << " terminate this SPHERE process. Follow the \n"
945 << " progress by executing:\n"
946 << " $ ./sphere_status " << sid << endl << endl;
947 }
948
949
950 // Start GPU clock
951 cudaEvent_t dev_tic, dev_toc;
952 cudaEventCreate(&dev_tic);
953 cudaEventCreate(&dev_toc);
954 cudaEventRecord(dev_tic, 0);
955
956 // If profiling is enabled, initialize timers for each kernel
957 cudaEvent_t kernel_tic, kernel_toc;
958 float kernel_elapsed;
959 double t_calcParticleCellID = 0.0;
960 double t_thrustsort = 0.0;
961 double t_reorderArrays = 0.0;
962 double t_topology = 0.0;
963 double t_interact = 0.0;
964 double t_bondsLinear = 0.0;
965 double t_latticeBoltzmannD3Q19 = 0.0;
966 double t_integrate = 0.0;
967 double t_summation = 0.0;
968 double t_integrateWalls = 0.0;
969
970 double t_findPorositiesDev = 0.0;
971 double t_findNSstressTensor = 0.0;
972 double t_findNSdivphiviv = 0.0;
973 double t_findNSdivtau = 0.0;
974 double t_findPredNSvelocities = 0.0;
975 double t_setNSepsilon = 0.0;
976 double t_setNSdirichlet = 0.0;
977 double t_setNSghostNodesDev = 0.0;
978 double t_findNSforcing = 0.0;
979 double t_jacobiIterationNS = 0.0;
980 double t_updateNSvelocityPressure = 0.0;
981
982 double t_findDarcyPorosities = 0.0;
983 double t_setDarcyGhostNodes = 0.0;
984 double t_findDarcyPressureForce = 0.0;
985 double t_setDarcyTopPressure = 0.0;
986 double t_findDarcyPermeabilities = 0.0;
987 double t_findDarcyPermeabilityGradients = 0.0;
988 //double t_findDarcyPressureChange = 0.0;
989 double t_updateDarcySolution = 0.0;
990 double t_copyValues = 0.0;
991 double t_findDarcyVelocities = 0.0;
992
993 if (PROFILING == 1) {
994 cudaEventCreate(&kernel_tic);
995 cudaEventCreate(&kernel_toc);
996 }
997
998 // The model start time is saved for profiling performance
999 double t_start = time.current;
1000 double t_ratio; // ration between time flow in model vs. reality
1001
1002 // Hard-coded parameters for stepwise velocity change (rate-state exp)
1003 int velocity_state = 1; // 1: v1, 2: v2
1004 int change_velocity_state = 0; // 1: increase velocity, 2: decrease vel.
1005 const Float velocity_factor = 10.0; // v2 = v1*velocity_factor
1006 const Float v2_start = 10.0; // seconds
1007 const Float v2_end = 15.0; // seconds
1008
1009 // Index of dynamic top wall (if it exists)
1010 unsigned int wall0_iz = 10000000;
1011 // weight of fluid between two cells in z direction
1012 Float dp_dz;
1013 if (fluid == 1) {
1014 if (cfd_solver == 0)
1015 dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
1016 else if (cfd_solver == 1) {
1017 dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
1018
1019 // determine pressure at top wall at t=0
1020 darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
1021 - darcy.p_mod_A
1022 *sin(2.0*M_PI*darcy.p_mod_f*time.current
1023 + darcy.p_mod_phi);
1024 }
1025 }
1026 //std::cout << "dp_dz = " << dp_dz << std::endl;
1027
1028 // Write a log file of the number of iterations it took before
1029 // convergence in the fluid solver
1030 std::ofstream convlog;
1031 if (write_conv_log == 1) {
1032 std::string f = "output/" + sid + "-conv.log";
1033 convlog.open(f.c_str());
1034 }
1035
1036 if (verbose == 1)
1037 cout << " Current simulation time: " << time.current << " s.";
1038
1039 // MAIN CALCULATION TIME LOOP
1040 while (time.current <= time.total) {
1041
1042 // Print current step number to terminal
1043 //printf("\n\n@@@ DEM time step: %ld\n", iter);
1044
1045 // Routine check for errors
1046 checkForCudaErrors("Start of main while loop");
1047
1048 if (np > 0) {
1049
1050 // check if particle positions have finite values
1051 #ifdef CHECK_PARTICLES_FINITE
1052 checkParticlePositions<<<dimGrid, dimBlock>>>(dev_x);
1053 cudaDeviceSynchronize();
1054 checkForCudaErrorsIter("Post checkParticlePositions", iter);
1055 #endif
1056
1057 // If the grid is adaptive, readjust the grid height to equal the
1058 // positions of the dynamic walls
1059 if (grid.adaptive == 1 && walls.nw > 0) {
1060 updateGridSize();
1061 }
1062
1063 // For each particle:
1064 // Compute hash key (cell index) from position
1065 // in the fine, uniform and homogenous grid.
1066 if (PROFILING == 1)
1067 startTimer(&kernel_tic);
1068 calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
1069 dev_gridParticleIndex,
1070 dev_x);
1071
1072 // Synchronization point
1073 cudaDeviceSynchronize();
1074 if (PROFILING == 1)
1075 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1076 &t_calcParticleCellID);
1077 checkForCudaErrorsIter("Post calcParticleCellID", iter);
1078
1079
1080 // Sort particle (key, particle ID) pairs by hash key with Thrust
1081 // radix sort
1082 if (PROFILING == 1)
1083 startTimer(&kernel_tic);
1084 thrust::sort_by_key(
1085 thrust::device_ptr<uint>(dev_gridParticleCellID),
1086 thrust::device_ptr<uint>(dev_gridParticleCellID + np),
1087 thrust::device_ptr<uint>(dev_gridParticleIndex));
1088 cudaDeviceSynchronize(); // Maybe Thrust synchronizes implicitly?
1089 if (PROFILING == 1)
1090 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1091 &t_thrustsort);
1092 checkForCudaErrorsIter("Post thrust::sort_by_key", iter);
1093
1094
1095 // Zero cell array values by setting cellStart to its highest
1096 // possible value, specified with pointer value 0xffffffff, which
1097 // for a 32 bit unsigned int is 4294967295.
1098 cudaMemset(dev_cellStart, 0xffffffff,
1099 grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
1100 cudaDeviceSynchronize();
1101 checkForCudaErrorsIter("Post cudaMemset", iter);
1102
1103 // Use sorted order to reorder particle arrays (position,
1104 // velocities, radii) to ensure coherent memory access. Save ordered
1105 // configurations in new arrays (*_sorted).
1106 if (PROFILING == 1)
1107 startTimer(&kernel_tic);
1108 reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
1109 dev_cellEnd,
1110 dev_gridParticleCellID,
1111 dev_gridParticleIndex,
1112 dev_x, dev_vel,
1113 dev_angvel,
1114 dev_x_sorted,
1115 dev_vel_sorted,
1116 dev_angvel_sorted);
1117
1118 // Synchronization point
1119 cudaDeviceSynchronize();
1120 if (PROFILING == 1)
1121 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1122 &t_reorderArrays);
1123 checkForCudaErrorsIter("Post reorderArrays", iter);
1124
1125 // The contact search in topology() is only necessary for
1126 // determining the accumulated shear distance needed in the linear
1127 // elastic and nonlinear contact force model
1128 if (params.contactmodel == 2 || params.contactmodel == 3) {
1129 // For each particle: Search contacts in neighbor cells
1130 if (PROFILING == 1)
1131 startTimer(&kernel_tic);
1132 topology<<<dimGrid, dimBlock>>>(dev_cellStart,
1133 dev_cellEnd,
1134 dev_gridParticleIndex,
1135 dev_x_sorted,
1136 dev_contacts,
1137 dev_distmod);
1138
1139 // Synchronization point
1140 cudaDeviceSynchronize();
1141 if (PROFILING == 1)
1142 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1143 &t_topology);
1144 checkForCudaErrorsIter(
1145 "Post topology: One or more particles moved "
1146 "outside the grid.\nThis could possibly be caused by a "
1147 "numerical instability.\nIs the computational time step"
1148 " too large?", iter);
1149 }
1150
1151 // For each particle process collisions and compute resulting forces
1152 //cudaPrintfInit();
1153 if (PROFILING == 1)
1154 startTimer(&kernel_tic);
1155 interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
1156 dev_cellStart,
1157 dev_cellEnd,
1158 dev_x,
1159 dev_x_sorted,
1160 dev_vel_sorted,
1161 dev_angvel_sorted,
1162 dev_vel,
1163 dev_angvel,
1164 dev_force,
1165 dev_torque,
1166 dev_es_dot,
1167 dev_ev_dot,
1168 dev_es,
1169 dev_ev,
1170 dev_p,
1171 dev_walls_nx,
1172 dev_walls_mvfd,
1173 dev_walls_force_pp,
1174 dev_contacts,
1175 dev_distmod,
1176 dev_delta_t);
1177
1178 // Synchronization point
1179 cudaDeviceSynchronize();
1180 //cudaPrintfDisplay(stdout, true);
1181 if (PROFILING == 1)
1182 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1183 &t_interact);
1184 checkForCudaErrorsIter(
1185 "Post interact - often caused if particles move "
1186 "outside the grid", iter);
1187
1188 // Process particle pairs
1189 if (params.nb0 > 0) {
1190 if (PROFILING == 1)
1191 startTimer(&kernel_tic);
1192 bondsLinear<<<dimGridBonds, dimBlock>>>(
1193 dev_bonds,
1194 dev_bonds_delta,
1195 dev_bonds_omega,
1196 dev_x,
1197 dev_vel,
1198 dev_angvel,
1199 dev_force,
1200 dev_torque);
1201 // Synchronization point
1202 cudaDeviceSynchronize();
1203 //cudaPrintfDisplay(stdout, true);
1204 if (PROFILING == 1)
1205 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1206 &t_bondsLinear);
1207 checkForCudaErrorsIter("Post bondsLinear", iter);
1208 }
1209 }
1210
1211 // Solve fluid flow through the grid
1212 if (fluid == 1) {
1213
1214 // Navier-Stokes solution
1215 if (cfd_solver == 0) {
1216
1217 checkForCudaErrorsIter("Before findPorositiesDev", iter);
1218 // Find cell porosities, average particle velocities, and
1219 // average particle diameters. These are needed for predicting
1220 // the fluid velocities
1221 if (PROFILING == 1)
1222 startTimer(&kernel_tic);
1223 findPorositiesVelocitiesDiametersSpherical
1224 //findPorositiesVelocitiesDiametersSphericalGradient
1225 <<<dimGridFluid, dimBlockFluid>>>(
1226 dev_cellStart,
1227 dev_cellEnd,
1228 dev_x_sorted,
1229 dev_vel_sorted,
1230 dev_ns_phi,
1231 dev_ns_dphi,
1232 dev_ns_vp_avg,
1233 dev_ns_d_avg,
1234 iter,
1235 np,
1236 ns.c_phi);
1237 cudaDeviceSynchronize();
1238 if (PROFILING == 1)
1239 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1240 &t_findPorositiesDev);
1241 checkForCudaErrorsIter("Post findPorositiesDev", iter);
1242
1243 #ifdef CFD_DEM_COUPLING
1244 /*if (params.nu <= 0.0) {
1245 std::cerr << "Error! The fluid needs a positive viscosity "
1246 "value in order to simulate particle-fluid interaction."
1247 << std::endl;
1248 exit(1);
1249 }*/
1250 if (iter == 0) {
1251 // set cell center ghost nodes
1252 setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
1253 dev_ns_v, ns.bc_bot, ns.bc_top);
1254
1255 // find cell face velocities
1256 interpolateCenterToFace
1257 <<<dimGridFluidFace, dimBlockFluidFace>>>(
1258 dev_ns_v,
1259 dev_ns_v_x,
1260 dev_ns_v_y,
1261 dev_ns_v_z);
1262 cudaDeviceSynchronize();
1263 checkForCudaErrors("Post interpolateCenterToFace");
1264 }
1265
1266 setNSghostNodesFace<Float>
1267 <<<dimGridFluidFace, dimBlockFluidFace>>>(
1268 dev_ns_v_x,
1269 dev_ns_v_y,
1270 dev_ns_v_z,
1271 ns.bc_bot,
1272 ns.bc_top);
1273 cudaDeviceSynchronize();
1274 checkForCudaErrorsIter("Post setNSghostNodesFace", iter);
1275
1276 findFaceDivTau<<<dimGridFluidFace, dimBlockFluidFace>>>(
1277 dev_ns_v_x,
1278 dev_ns_v_y,
1279 dev_ns_v_z,
1280 ns.mu,
1281 dev_ns_div_tau_x,
1282 dev_ns_div_tau_y,
1283 dev_ns_div_tau_z);
1284 cudaDeviceSynchronize();
1285 checkForCudaErrorsIter("Post findFaceDivTau", iter);
1286
1287 setNSghostNodesFace<Float>
1288 <<<dimGridFluidFace, dimBlockFluid>>>(
1289 dev_ns_div_tau_x,
1290 dev_ns_div_tau_y,
1291 dev_ns_div_tau_z,
1292 ns.bc_bot,
1293 ns.bc_top);
1294 cudaDeviceSynchronize();
1295 checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_div_tau)",
1296 iter);
1297
1298 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1299 dev_ns_p, ns.bc_bot, ns.bc_top);
1300 cudaDeviceSynchronize();
1301 checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_p)", iter);
1302
1303 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1304 dev_ns_phi, ns.bc_bot, ns.bc_top);
1305 cudaDeviceSynchronize();
1306 checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_p)", iter);
1307
1308
1309 if (np > 0) {
1310
1311 // Per particle, find the fluid-particle interaction force
1312 // f_pf and apply it to the particle
1313 findInteractionForce<<<dimGrid, dimBlock>>>(
1314 dev_x,
1315 dev_vel,
1316 dev_ns_phi,
1317 dev_ns_p,
1318 dev_ns_v_x,
1319 dev_ns_v_y,
1320 dev_ns_v_z,
1321 dev_ns_div_tau_x,
1322 dev_ns_div_tau_y,
1323 dev_ns_div_tau_z,
1324 //ns.c_v,
1325 ns.mu,
1326 ns.rho_f,
1327 dev_ns_f_pf,
1328 dev_force,
1329 dev_ns_f_d,
1330 dev_ns_f_p,
1331 dev_ns_f_v,
1332 dev_ns_f_sum);
1333 cudaDeviceSynchronize();
1334 checkForCudaErrorsIter("Post findInteractionForce", iter);
1335
1336 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1337 dev_ns_p, ns.bc_bot, ns.bc_top);
1338 cudaDeviceSynchronize();
1339 checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_p)",
1340 iter);
1341
1342 // Apply fluid-particle interaction force to the fluid
1343 applyInteractionForceToFluid
1344 <<<dimGridFluid, dimBlockFluid>>>(
1345 dev_gridParticleIndex,
1346 dev_cellStart,
1347 dev_cellEnd,
1348 dev_ns_f_pf,
1349 dev_ns_F_pf);
1350 //dev_ns_F_pf_x,
1351 //dev_ns_F_pf_y,
1352 //dev_ns_F_pf_z);
1353 cudaDeviceSynchronize();
1354 checkForCudaErrorsIter("Post applyInteractionForceToFluid",
1355 iter);
1356
1357 setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
1358 dev_ns_F_pf, ns.bc_bot, ns.bc_top);
1359 cudaDeviceSynchronize();
1360 checkForCudaErrorsIter("Post setNSghostNodes(F_pf)", iter);
1361 }
1362 #endif
1363
1364 if ((iter % ns.ndem) == 0) {
1365 // Initial guess for the top epsilon values. These may be
1366 // changed in setUpperPressureNS
1367 // TODO: Check if this should only be set when top bc=Dirichlet
1368 Float pressure = ns.p[idx(0,0,ns.nz-1)];
1369 Float pressure_new = pressure; // Dirichlet
1370 Float epsilon_value = pressure_new - ns.beta*pressure;
1371 setNSepsilonTop<<<dimGridFluid, dimBlockFluid>>>(
1372 dev_ns_epsilon,
1373 dev_ns_epsilon_new,
1374 epsilon_value);
1375 cudaDeviceSynchronize();
1376 checkForCudaErrorsIter("Post setNSepsilonTop", iter);
1377
1378 #if defined(REPORT_EPSILON) || defined(REPORT_V_P_COMPONENTS) || defined(REPORT_V_C_COMPONENTS)
1379 std::cout
1380 << "\n\n@@@@@@ TIME STEP " << iter << " @@@"
1381 << std::endl;
1382 #endif
1383
1384 // find cell containing top wall
1385 if (walls.nw > 0 &&
1386 (walls.wmode[0] == 1 || walls.wmode[0] == 3)) {
1387 wall0_iz = walls.nx->w/(grid.L[2]/grid.num[2]);
1388 setNSepsilonAtTopWall<<<dimGridFluid, dimBlockFluid>>>(
1389 dev_ns_epsilon,
1390 dev_ns_epsilon_new,
1391 wall0_iz,
1392 epsilon_value,
1393 dp_dz);
1394 cudaDeviceSynchronize();
1395 checkForCudaErrorsIter("Post setNSepsilonAtTopWall",
1396 iter);
1397
1398 #ifdef REPORT_EPSILON
1399 std::cout
1400 << "\n###### EPSILON setNSepsilonAtTopWall "
1401 << "######" << std::endl;
1402 transferNSepsilonFromGlobalDeviceMemory();
1403 printNSarray(stdout, ns.epsilon, "epsilon");
1404 #endif
1405 }
1406
1407 // Modulate the pressures at the upper boundary cells
1408 if ((ns.p_mod_A > 1.0e-5 || ns.p_mod_A < -1.0e-5) &&
1409 ns.p_mod_f > 1.0e-7) {
1410 // original pressure
1411 Float new_pressure = ns.p[idx(0,0,ns.nz-1)]
1412 + ns.p_mod_A*sin(2.0*M_PI*ns.p_mod_f*time.current
1413 + ns.p_mod_phi);
1414 setUpperPressureNS<<<dimGridFluid, dimBlockFluid>>>(
1415 dev_ns_p,
1416 dev_ns_epsilon,
1417 dev_ns_epsilon_new,
1418 ns.beta,
1419 new_pressure);
1420 cudaDeviceSynchronize();
1421 checkForCudaErrorsIter("Post setUpperPressureNS", iter);
1422
1423 #ifdef REPORT_MORE_EPSILON
1424 std::cout
1425 << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
1426 << "\n###### EPSILON AFTER setUpperPressureNS "
1427 << "######" << std::endl;
1428 transferNSepsilonFromGlobalDeviceMemory();
1429 printNSarray(stdout, ns.epsilon, "epsilon");
1430 #endif
1431 }
1432
1433 // Set the values of the ghost nodes in the grid
1434 if (PROFILING == 1)
1435 startTimer(&kernel_tic);
1436
1437 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1438 dev_ns_p, ns.bc_bot, ns.bc_top);
1439
1440 //setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
1441 //dev_ns_v, ns.bc_bot, ns.bc_top);
1442
1443 setNSghostNodesFace<Float>
1444 <<<dimGridFluidFace, dimBlockFluidFace>>>(
1445 dev_ns_v_p_x,
1446 dev_ns_v_p_y,
1447 dev_ns_v_p_z,
1448 ns.bc_bot, ns.bc_top);
1449
1450 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1451 dev_ns_phi, ns.bc_bot, ns.bc_top);
1452
1453 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1454 dev_ns_dphi, ns.bc_bot, ns.bc_top);
1455
1456 cudaDeviceSynchronize();
1457 if (PROFILING == 1)
1458 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1459 &t_setNSghostNodesDev);
1460 checkForCudaErrorsIter("Post setNSghostNodesDev", iter);
1461 /*std::cout
1462 << "\n###### EPSILON AFTER setNSghostNodesDev #####"
1463 << std::endl;
1464 transferNSepsilonFromGlobalDeviceMemory();
1465 printNSarray(stdout, ns.epsilon, "epsilon");*/
1466
1467 // interpolate velocities to cell centers which makes
1468 // velocity prediction easier
1469 interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
1470 dev_ns_v_x,
1471 dev_ns_v_y,
1472 dev_ns_v_z,
1473 dev_ns_v);
1474 cudaDeviceSynchronize();
1475 checkForCudaErrorsIter(
1476 "Post interpolateFaceToCenter", iter);
1477
1478 // Set cell-center velocity ghost nodes
1479 setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
1480 dev_ns_v, ns.bc_bot, ns.bc_top);
1481 cudaDeviceSynchronize();
1482 checkForCudaErrorsIter("Post setNSghostNodes(v)", iter);
1483
1484 // Find the divergence of phi*vi*v, needed for predicting
1485 // the fluid velocities
1486 if (PROFILING == 1)
1487 startTimer(&kernel_tic);
1488 findNSdivphiviv<<<dimGridFluid, dimBlockFluid>>>(
1489 dev_ns_phi,
1490 dev_ns_v,
1491 dev_ns_div_phi_vi_v);
1492 cudaDeviceSynchronize();
1493 if (PROFILING == 1)
1494 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1495 &t_findNSdivphiviv);
1496 checkForCudaErrorsIter("Post findNSdivphiviv", iter);
1497
1498 // Set cell-center ghost nodes
1499 setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
1500 dev_ns_div_phi_vi_v, ns.bc_bot, ns.bc_top);
1501 cudaDeviceSynchronize();
1502 checkForCudaErrorsIter("Post setNSghostNodes(div_phi_vi_v)",
1503 iter);
1504
1505 // Predict the fluid velocities on the base of the old
1506 // pressure field and ignoring the incompressibility
1507 // constraint
1508 if (PROFILING == 1)
1509 startTimer(&kernel_tic);
1510 findPredNSvelocities<<<dimGridFluidFace, dimBlockFluidFace>>>(
1511 dev_ns_p,
1512 dev_ns_v_x,
1513 dev_ns_v_y,
1514 dev_ns_v_z,
1515 dev_ns_phi,
1516 dev_ns_dphi,
1517 dev_ns_div_tau_x,
1518 dev_ns_div_tau_y,
1519 dev_ns_div_tau_z,
1520 dev_ns_div_phi_vi_v,
1521 ns.bc_bot,
1522 ns.bc_top,
1523 ns.beta,
1524 dev_ns_F_pf,
1525 ns.ndem,
1526 wall0_iz,
1527 ns.c_v,
1528 ns.mu,
1529 ns.rho_f,
1530 dev_ns_v_p_x,
1531 dev_ns_v_p_y,
1532 dev_ns_v_p_z);
1533 cudaDeviceSynchronize();
1534 if (PROFILING == 1)
1535 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1536 &t_findPredNSvelocities);
1537 checkForCudaErrorsIter("Post findPredNSvelocities", iter);
1538
1539 setNSghostNodesFace<Float>
1540 <<<dimGridFluidFace, dimBlockFluidFace>>>(
1541 dev_ns_v_p_x,
1542 dev_ns_v_p_y,
1543 dev_ns_v_p_z,
1544 ns.bc_bot, ns.bc_top);
1545 cudaDeviceSynchronize();
1546 checkForCudaErrorsIter(
1547 "Post setNSghostNodesFace(dev_ns_v_p)", iter);
1548
1549 interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
1550 dev_ns_v_p_x,
1551 dev_ns_v_p_y,
1552 dev_ns_v_p_z,
1553 dev_ns_v_p);
1554 cudaDeviceSynchronize();
1555 checkForCudaErrorsIter(
1556 "Post interpolateFaceToCenter", iter);
1557
1558
1559 // In the first iteration of the sphere program, we'll need
1560 // to manually estimate the values of epsilon. In the
1561 // subsequent iterations, the previous values are used.
1562 if (iter == 0) {
1563
1564 // Define the first estimate of the values of epsilon.
1565 // The initial guess depends on the value of ns.beta.
1566 Float pressure = ns.p[idx(2,2,2)];
1567 Float pressure_new = pressure; // Guess p_curr = p_new
1568 Float epsilon_value = pressure_new - ns.beta*pressure;
1569 if (PROFILING == 1)
1570 startTimer(&kernel_tic);
1571 setNSepsilonInterior<<<dimGridFluid, dimBlockFluid>>>(
1572 dev_ns_epsilon, epsilon_value);
1573 cudaDeviceSynchronize();
1574
1575 setNSnormZero<<<dimGridFluid, dimBlockFluid>>>
1576 (dev_ns_norm);
1577 cudaDeviceSynchronize();
1578
1579 if (PROFILING == 1)
1580 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1581 &t_setNSepsilon);
1582 checkForCudaErrorsIter("Post setNSepsilonInterior",
1583 iter);
1584
1585 #ifdef REPORT_MORE_EPSILON
1586 std::cout
1587 << "\n###### EPSILON AFTER setNSepsilonInterior "
1588 << "######" << std::endl;
1589 transferNSepsilonFromGlobalDeviceMemory();
1590 printNSarray(stdout, ns.epsilon, "epsilon");
1591 #endif
1592
1593 // Set the epsilon values at the lower boundary
1594 pressure = ns.p[idx(0,0,0)];
1595 pressure_new = pressure; // Guess p_current = p_new
1596 epsilon_value = pressure_new - ns.beta*pressure;
1597 if (PROFILING == 1)
1598 startTimer(&kernel_tic);
1599 setNSepsilonBottom<<<dimGridFluid, dimBlockFluid>>>(
1600 dev_ns_epsilon,
1601 dev_ns_epsilon_new,
1602 epsilon_value);
1603 cudaDeviceSynchronize();
1604 if (PROFILING == 1)
1605 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1606 &t_setNSdirichlet);
1607 checkForCudaErrorsIter("Post setNSepsilonBottom", iter);
1608
1609 #ifdef REPORT_MORE_EPSILON
1610 std::cout
1611 << "\n###### EPSILON AFTER setNSepsilonBottom "
1612 << "######" << std::endl;
1613 transferNSepsilonFromGlobalDeviceMemory();
1614 printNSarray(stdout, ns.epsilon, "epsilon");
1615 #endif
1616
1617 /*setNSghostNodes<Float>
1618 <<<dimGridFluid, dimBlockFluid>>>(
1619 dev_ns_epsilon);
1620 cudaDeviceSynchronize();
1621 checkForCudaErrors(
1622 "Post setNSghostNodesFloat(dev_ns_epsilon)",
1623 iter);*/
1624 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1625 dev_ns_epsilon,
1626 ns.bc_bot, ns.bc_top);
1627 cudaDeviceSynchronize();
1628 checkForCudaErrorsIter("Post setNSghostNodesEpsilon(1)",
1629 iter);
1630
1631 #ifdef REPORT_MORE_EPSILON
1632 std::cout <<
1633 "\n###### EPSILON AFTER setNSghostNodes(epsilon) "
1634 << "######" << std::endl;
1635 transferNSepsilonFromGlobalDeviceMemory();
1636 printNSarray(stdout, ns.epsilon, "epsilon");
1637 #endif
1638 }
1639
1640 // Solve the system of epsilon using a Jacobi iterative
1641 // solver. The average normalized residual is initialized
1642 // to a large value.
1643 //double avg_norm_res;
1644 double max_norm_res;
1645
1646 // Write a log file of the normalized residuals during the
1647 // Jacobi iterations
1648 std::ofstream reslog;
1649 if (write_res_log == 1)
1650 reslog.open("max_res_norm.dat");
1651
1652 // transfer normalized residuals from GPU to CPU
1653 #ifdef REPORT_MORE_EPSILON
1654 std::cout << "\n###### BEFORE FIRST JACOBI ITERATION ######"
1655 << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
1656 << std::endl;
1657 transferNSepsilonFromGlobalDeviceMemory();
1658 printNSarray(stdout, ns.epsilon, "epsilon");
1659 #endif
1660
1661 for (unsigned int nijac = 0; nijac<ns.maxiter; ++nijac) {
1662
1663 // Only grad(epsilon) changes during the Jacobi
1664 // iterations. The remaining terms of the forcing
1665 // function are only calculated during the first
1666 // iteration.
1667 if (PROFILING == 1)
1668 startTimer(&kernel_tic);
1669 findNSforcing<<<dimGridFluid, dimBlockFluid>>>(
1670 dev_ns_epsilon,
1671 dev_ns_phi,
1672 dev_ns_dphi,
1673 dev_ns_v_p,
1674 dev_ns_v_p_x,
1675 dev_ns_v_p_y,
1676 dev_ns_v_p_z,
1677 nijac,
1678 ns.ndem,
1679 ns.c_v,
1680 ns.rho_f,
1681 dev_ns_f1,
1682 dev_ns_f2,
1683 dev_ns_f);
1684 cudaDeviceSynchronize();
1685 if (PROFILING == 1)
1686 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1687 &t_findNSforcing);
1688 checkForCudaErrorsIter("Post findNSforcing", iter);
1689 /*setNSghostNodesForcing
1690 <<dimGridFluid, dimBlockFluid>>>(
1691 dev_ns_f1,
1692 dev_ns_f2,
1693 dev_ns_f,
1694 nijac);
1695 cudaDeviceSynchronize();
1696 checkForCudaErrors("Post setNSghostNodesForcing",
1697 iter);*/
1698
1699 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1700 dev_ns_epsilon,
1701 ns.bc_bot, ns.bc_top);
1702 cudaDeviceSynchronize();
1703 checkForCudaErrorsIter("Post setNSghostNodesEpsilon(2)",
1704 iter);
1705
1706 #ifdef REPORT_EPSILON
1707 std::cout << "\n###### JACOBI ITERATION "
1708 << nijac
1709 << " after setNSghostNodes(epsilon,2) ######"
1710 << std::endl;
1711 transferNSepsilonFromGlobalDeviceMemory();
1712 printNSarray(stdout, ns.epsilon, "epsilon");
1713 #endif
1714
1715 // Perform a single Jacobi iteration
1716 if (PROFILING == 1)
1717 startTimer(&kernel_tic);
1718 jacobiIterationNS<<<dimGridFluid, dimBlockFluid>>>(
1719 dev_ns_epsilon,
1720 dev_ns_epsilon_new,
1721 dev_ns_norm,
1722 dev_ns_f,
1723 ns.bc_bot,
1724 ns.bc_top,
1725 ns.theta,
1726 wall0_iz,
1727 dp_dz);
1728 cudaDeviceSynchronize();
1729 if (PROFILING == 1)
1730 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1731 &t_jacobiIterationNS);
1732 checkForCudaErrorsIter("Post jacobiIterationNS", iter);
1733
1734 // set Dirichlet and Neumann BC at cells containing top
1735 // wall
1736 /*if (walls.nw > 0 && walls.wmode[0] == 1) {
1737 setNSepsilonAtTopWall
1738 <<<dimGridFluid, dimBlockFluid>>>(
1739 dev_ns_epsilon,
1740 dev_ns_epsilon_new,
1741 wall0_iz,
1742 epsilon_value,
1743 dp_dz);
1744 cudaDeviceSynchronize();
1745 checkForCudaErrorsIter("Post setNSepsilonAtTopWall",
1746 iter);
1747 }*/
1748
1749 // Copy new values to current values
1750 copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
1751 dev_ns_epsilon_new,
1752 dev_ns_epsilon);
1753 cudaDeviceSynchronize();
1754 checkForCudaErrorsIter
1755 ("Post copyValues (epsilon_new->epsilon)", iter);
1756
1757 #ifdef REPORT_EPSILON
1758 std::cout << "\n###### JACOBI ITERATION "
1759 << nijac << " after jacobiIterationNS ######"
1760 << std::endl;
1761 transferNSepsilonFromGlobalDeviceMemory();
1762 printNSarray(stdout, ns.epsilon, "epsilon");
1763 #endif
1764
1765 if (nijac % nijacnorm == 0) {
1766
1767 // Read the normalized residuals from the device
1768 transferNSnormFromGlobalDeviceMemory();
1769
1770 // Write the normalized residuals to the terminal
1771 //printNSarray(stdout, ns.norm, "norm");
1772
1773 // Find the maximum value of the normalized
1774 // residuals
1775 max_norm_res = maxNormResNS();
1776
1777 // Write the Jacobi iteration number and maximum
1778 // value of the normalized residual to the log file
1779 if (write_res_log == 1)
1780 reslog << nijac << '\t' << max_norm_res
1781 << std::endl;
1782 }
1783
1784 if (max_norm_res < ns.tolerance) {
1785
1786 if (write_conv_log == 1
1787 && iter % conv_log_interval == 0)
1788 convlog << iter+1 << '\t' << nijac << std::endl;
1789
1790 setNSghostNodes<Float>
1791 <<<dimGridFluid, dimBlockFluid>>>(
1792 dev_ns_epsilon,
1793 ns.bc_bot, ns.bc_top);
1794 cudaDeviceSynchronize();
1795 checkForCudaErrorsIter
1796 ("Post setNSghostNodesEpsilon(4)", iter);
1797
1798 // Apply smoothing if requested
1799 if (ns.gamma > 0.0) {
1800
1801 smoothing<<<dimGridFluid, dimBlockFluid>>>(
1802 dev_ns_epsilon,
1803 ns.gamma,
1804 ns.bc_bot, ns.bc_top);
1805 cudaDeviceSynchronize();
1806 checkForCudaErrorsIter("Post smoothing", iter);
1807
1808 setNSghostNodes<Float>
1809 <<<dimGridFluid, dimBlockFluid>>>(
1810 dev_ns_epsilon,
1811 ns.bc_bot, ns.bc_top);
1812 cudaDeviceSynchronize();
1813 checkForCudaErrorsIter
1814 ("Post setNSghostNodesEpsilon(4)", iter);
1815 }
1816
1817 #ifdef REPORT_EPSILON
1818 std::cout << "\n###### JACOBI ITERATION "
1819 << nijac << " after smoothing ######"
1820 << std::endl;
1821 transferNSepsilonFromGlobalDeviceMemory();
1822 printNSarray(stdout, ns.epsilon, "epsilon");
1823 #endif
1824
1825 break; // solution has converged, exit Jacobi loop
1826 }
1827
1828 if (nijac >= ns.maxiter-1) {
1829
1830 if (write_conv_log == 1)
1831 convlog << iter+1 << '\t' << nijac << std::endl;
1832
1833 std::cerr << "\nIteration " << iter << ", time "
1834 << iter*time.dt << " s: "
1835 "Error, the epsilon solution in the fluid "
1836 "calculations did not converge. Try increasing "
1837 "the value of 'ns.maxiter' (" << ns.maxiter
1838 << ") or increase 'ns.tolerance' ("
1839 << ns.tolerance << ")." << std::endl;
1840 }
1841 //break; // end after Jacobi first iteration
1842 } // end Jacobi iteration loop
1843
1844 if (write_res_log == 1)
1845 reslog.close();
1846
1847 // Find the new pressures and velocities
1848 if (PROFILING == 1)
1849 startTimer(&kernel_tic);
1850
1851 updateNSpressure<<<dimGridFluid, dimBlockFluid>>>(
1852 dev_ns_epsilon,
1853 ns.beta,
1854 dev_ns_p);
1855 cudaDeviceSynchronize();
1856 checkForCudaErrorsIter("Post updateNSpressure", iter);
1857
1858 updateNSvelocity<<<dimGridFluidFace, dimBlockFluidFace>>>(
1859 dev_ns_v_p_x,
1860 dev_ns_v_p_y,
1861 dev_ns_v_p_z,
1862 dev_ns_phi,
1863 dev_ns_epsilon,
1864 ns.beta,
1865 ns.bc_bot,
1866 ns.bc_top,
1867 ns.ndem,
1868 ns.c_v,
1869 ns.rho_f,
1870 wall0_iz,
1871 iter,
1872 dev_ns_v_x,
1873 dev_ns_v_y,
1874 dev_ns_v_z);
1875 cudaDeviceSynchronize();
1876 if (PROFILING == 1)
1877 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1878 &t_updateNSvelocityPressure);
1879 checkForCudaErrorsIter("Post updateNSvelocity", iter);
1880
1881 setNSghostNodesFace<Float>
1882 <<<dimGridFluidFace, dimBlockFluidFace>>>(
1883 dev_ns_v_p_x,
1884 dev_ns_v_p_y,
1885 dev_ns_v_p_z,
1886 ns.bc_bot, ns.bc_top);
1887 cudaDeviceSynchronize();
1888 checkForCudaErrorsIter(
1889 "Post setNSghostNodesFace(dev_ns_v)", iter);
1890
1891 interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
1892 dev_ns_v_x,
1893 dev_ns_v_y,
1894 dev_ns_v_z,
1895 dev_ns_v);
1896 cudaDeviceSynchronize();
1897 checkForCudaErrorsIter("Post interpolateFaceToCenter",
1898 iter);
1899 } // end iter % ns.dem == 0
1900 } // end cfd_solver == 0
1901
1902 // Darcy solution
1903 else if (cfd_solver == 1) {
1904
1905 #if defined(REPORT_EPSILON) || defined(REPORT_FORCING_TERMS)
1906 std::cout << "\n\n@@@@@@ TIME STEP " << iter << " @@@"
1907 << std::endl;
1908 #endif
1909
1910 if (walls.nw > 0 &&
1911 (walls.wmode[0] == 1 || walls.wmode[0] == 3)) {
1912 wall0_iz = walls.nx->w/(grid.L[2]/grid.num[2]);
1913 }
1914
1915 if (np > 0) {
1916
1917 if (PROFILING == 1)
1918 startTimer(&kernel_tic);
1919 setDarcyGhostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
1920 dev_darcy_p,
1921 darcy.bc_xn, darcy.bc_xp,
1922 darcy.bc_yn, darcy.bc_yp,
1923 darcy.bc_bot, darcy.bc_top);
1924 cudaDeviceSynchronize();
1925 if (PROFILING == 1)
1926 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1927 &t_setDarcyGhostNodes);
1928 checkForCudaErrorsIter("Post setDarcyGhostNodes("
1929 "dev_darcy_p) before findDarcyPressureForce", iter);
1930
1931 if (PROFILING == 1)
1932 startTimer(&kernel_tic);
1933 findDarcyPressureGradient<<<dimGridFluid, dimBlockFluid>>>(
1934 dev_darcy_p,
1935 dev_darcy_grad_p);
1936 cudaDeviceSynchronize();
1937 checkForCudaErrorsIter("After findDarcyPressureGradient",
1938 iter);
1939
1940 setDarcyGhostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
1941 dev_darcy_grad_p,
1942 darcy.bc_xn, darcy.bc_xp,
1943 darcy.bc_yn, darcy.bc_yp,
1944 darcy.bc_bot, darcy.bc_top);
1945 cudaDeviceSynchronize();
1946 if (PROFILING == 1)
1947 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1948 &t_setDarcyGhostNodes);
1949 checkForCudaErrorsIter("Post setDarcyGhostNodes("
1950 "dev_darcy_grad_p)", iter);
1951
1952 /*if (PROFILING == 1)
1953 startTimer(&kernel_tic);
1954 findDarcyPorositiesLinear<<<dimGridFluid, dimBlockFluid>>>(
1955 dev_cellStart,
1956 dev_cellEnd,
1957 dev_x_sorted,
1958 dev_vel_sorted,
1959 iter,
1960 darcy.ndem,
1961 np,
1962 darcy.c_phi,
1963 dev_darcy_phi,
1964 dev_darcy_dphi,
1965 dev_darcy_div_v_p);
1966 cudaDeviceSynchronize();
1967 if (PROFILING == 1)
1968 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1969 &t_findDarcyPorosities);
1970 checkForCudaErrorsIter("Post findDarcyPorosities", iter);*/
1971
1972 /*findDarcyPressureForce<<<dimGrid, dimBlock>>>(
1973 dev_x,
1974 dev_darcy_p,
1975 wall0_iz,
1976 darcy.rho_f,
1977 dev_force,
1978 dev_darcy_f_p);*/
1979 findDarcyPressureForceLinear<<<dimGrid, dimBlock>>>(
1980 dev_x,
1981 dev_darcy_grad_p,
1982 dev_darcy_phi,
1983 wall0_iz,
1984 darcy.rho_f,
1985 darcy.bc_top,
1986 dev_force,
1987 dev_darcy_f_p);
1988 cudaDeviceSynchronize();
1989 if (PROFILING == 1)
1990 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
1991 &t_findDarcyPressureForce);
1992 checkForCudaErrorsIter("Post findDarcyPressureForce",
1993 iter);
1994 }
1995
1996 if ((iter % darcy.ndem) == 0) {
1997
1998 if (PROFILING == 1)
1999 startTimer(&kernel_tic);
2000 /*findDarcyPorosities<<<dimGridFluid, dimBlockFluid>>>(
2001 dev_cellStart,
2002 dev_cellEnd,
2003 dev_x_sorted,
2004 dev_vel_sorted,
2005 iter,
2006 darcy.ndem,
2007 np,
2008 darcy.c_phi,
2009 dev_darcy_phi,
2010 dev_darcy_dphi);*/
2011 findDarcyPorositiesLinear<<<dimGridFluid, dimBlockFluid>>>(
2012 dev_cellStart,
2013 dev_cellEnd,
2014 dev_x_sorted,
2015 dev_vel_sorted,
2016 iter,
2017 darcy.ndem,
2018 np,
2019 darcy.c_phi,
2020 dev_darcy_phi,
2021 dev_darcy_dphi,
2022 dev_darcy_div_v_p,
2023 dev_darcy_vp_avg);
2024 cudaDeviceSynchronize();
2025 if (PROFILING == 1)
2026 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2027 &t_findDarcyPorosities);
2028 checkForCudaErrorsIter("Post findDarcyPorosities", iter);
2029
2030 // copy porosities to the frictionless Y boundaries
2031 if (grid.periodic == 2) {
2032 copyDarcyPorositiesToEdges<<<dimGridFluid,
2033 dimBlockFluid>>>(
2034 dev_darcy_phi,
2035 dev_darcy_dphi,
2036 dev_darcy_div_v_p,
2037 dev_darcy_vp_avg);
2038 cudaDeviceSynchronize();
2039 }
2040
2041 // copy porosities to the frictionless lower Z boundary
2042 if (grid.periodic == 2) {
2043 copyDarcyPorositiesToBottom<<<dimGridFluid,
2044 dimBlockFluid>>>(
2045 dev_darcy_phi,
2046 dev_darcy_dphi,
2047 dev_darcy_div_v_p,
2048 dev_darcy_vp_avg);
2049 cudaDeviceSynchronize();
2050 }
2051
2052 // Modulate the pressures at the upper boundary cells
2053 if ((darcy.p_mod_A > 1.0e-5 || darcy.p_mod_A < -1.0e-5) &&
2054 darcy.p_mod_f > 1.0e-7) {
2055 // original pressure
2056 Float new_pressure =
2057 darcy.p_top_orig + darcy.p_mod_A
2058 *sin(2.0*M_PI*darcy.p_mod_f*time.current
2059 + darcy.p_mod_phi);
2060 if (PROFILING == 1)
2061 startTimer(&kernel_tic);
2062 setDarcyTopPressure<<<dimGridFluid, dimBlockFluid>>>(
2063 new_pressure,
2064 dev_darcy_p,
2065 wall0_iz);
2066 cudaDeviceSynchronize();
2067 checkForCudaErrorsIter("Post setUpperPressureNS", iter);
2068
2069 // Modulate the pressures at the top wall
2070 setDarcyTopWallPressure
2071 <<<dimGridFluid, dimBlockFluid>>>(
2072 new_pressure,
2073 wall0_iz,
2074 dev_darcy_p);
2075 cudaDeviceSynchronize();
2076 checkForCudaErrorsIter("Post setDarcyTopWallPressure",
2077 iter);
2078
2079 if (PROFILING == 1)
2080 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2081 &t_setDarcyTopPressure);
2082 }
2083
2084 if (PROFILING == 1)
2085 startTimer(&kernel_tic);
2086 findDarcyPermeabilities<<<dimGridFluid, dimBlockFluid>>>(
2087 darcy.k_c, dev_darcy_phi, dev_darcy_k);
2088 cudaDeviceSynchronize();
2089 if (PROFILING == 1)
2090 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2091 &t_findDarcyPermeabilities);
2092 checkForCudaErrorsIter("Post findDarcyPermeabilities",
2093 iter);
2094
2095 if (PROFILING == 1)
2096 startTimer(&kernel_tic);
2097 setDarcyGhostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
2098 dev_darcy_phi,
2099 darcy.bc_xn, darcy.bc_xp,
2100 darcy.bc_yn, darcy.bc_yp,
2101 darcy.bc_bot, darcy.bc_top);
2102 cudaDeviceSynchronize();
2103 if (PROFILING == 1)
2104 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2105 &t_setDarcyGhostNodes);
2106 checkForCudaErrorsIter(
2107 "Post setDarcyGhostNodes(dev_darcy_phi)", iter);
2108
2109 if (PROFILING == 1)
2110 startTimer(&kernel_tic);
2111 setDarcyGhostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
2112 dev_darcy_k,
2113 darcy.bc_xn, darcy.bc_xp,
2114 darcy.bc_yn, darcy.bc_yp,
2115 darcy.bc_bot, darcy.bc_top);
2116 cudaDeviceSynchronize();
2117 if (PROFILING == 1)
2118 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2119 &t_setDarcyGhostNodes);
2120 checkForCudaErrorsIter(
2121 "Post setDarcyGhostNodes(dev_darcy_k)", iter);
2122
2123 if (PROFILING == 1)
2124 startTimer(&kernel_tic);
2125 findDarcyPermeabilityGradients
2126 <<<dimGridFluid, dimBlockFluid>>>
2127 (dev_darcy_k, dev_darcy_grad_k);
2128 cudaDeviceSynchronize();
2129 if (PROFILING == 1)
2130 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2131 &t_findDarcyPermeabilityGradients);
2132 checkForCudaErrorsIter(
2133 "Post findDarcyPermeabilityGradients", iter);
2134
2135 if (iter == 0) {
2136 setDarcyNormZero<<<dimGridFluid, dimBlockFluid>>>(
2137 dev_darcy_norm);
2138 cudaDeviceSynchronize();
2139 checkForCudaErrorsIter("Post setDarcyNormZero", iter);
2140
2141 if (PROFILING == 1)
2142 startTimer(&kernel_tic);
2143 copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
2144 dev_darcy_p,
2145 dev_darcy_p_old);
2146 cudaDeviceSynchronize();
2147 if (PROFILING == 1)
2148 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2149 &t_copyValues);
2150 checkForCudaErrorsIter("Post copyValues(p -> p_old)",
2151 iter);
2152 }
2153
2154 if (darcy.bc_bot == 4 || darcy.bc_top == 4) {
2155 if (PROFILING == 1)
2156 startTimer(&kernel_tic);
2157 setDarcyGhostNodesFlux<Float>
2158 <<<dimGridFluid, dimBlockFluid>>>(
2159 dev_darcy_p,
2160 darcy.bc_bot,
2161 darcy.bc_top,
2162 darcy.bc_bot_flux,
2163 darcy.bc_top_flux,
2164 dev_darcy_k,
2165 darcy.mu);
2166 cudaDeviceSynchronize();
2167 if (PROFILING == 1)
2168 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2169 &t_setDarcyGhostNodes);
2170 checkForCudaErrorsIter(
2171 "Post setDarcyGhostNodesFlux", iter);
2172 }
2173
2174 // Solve the system of epsilon using a Jacobi iterative
2175 // solver. The average normalized residual is initialized
2176 // to a large value.
2177 //double avg_norm_res;
2178 double max_norm_res;
2179
2180 // Write a log file of the normalized residuals during the
2181 // Jacobi iterations
2182 std::ofstream reslog;
2183 if (write_res_log == 1)
2184 reslog.open("max_res_norm.dat");
2185
2186 for (unsigned int nijac = 0; nijac<darcy.maxiter; ++nijac) {
2187
2188 #if defined(REPORT_EPSILON) || defined(REPORT_FORCING_TERMS)
2189 std::cout << "\n\n### Jacobi iteration " << nijac << std::endl;
2190 #endif
2191
2192 if (nijac == 0) {
2193 if (PROFILING == 1)
2194 startTimer(&kernel_tic);
2195 copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
2196 dev_darcy_p,
2197 dev_darcy_p_old);
2198 cudaDeviceSynchronize();
2199 if (PROFILING == 1)
2200 stopTimer(&kernel_tic, &kernel_toc,
2201 &kernel_elapsed, &t_copyValues);
2202 checkForCudaErrorsIter(
2203 "Post copyValues(p -> p_old)", iter);
2204 }
2205
2206 if (PROFILING == 1)
2207 startTimer(&kernel_tic);
2208 setDarcyGhostNodes<Float>
2209 <<<dimGridFluid, dimBlockFluid>>>(
2210 dev_darcy_p,
2211 darcy.bc_xn, darcy.bc_xp,
2212 darcy.bc_yn, darcy.bc_yp,
2213 darcy.bc_bot, darcy.bc_top);
2214 cudaDeviceSynchronize();
2215 if (PROFILING == 1)
2216 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2217 &t_setDarcyGhostNodes);
2218 checkForCudaErrorsIter("Post setDarcyGhostNodes("
2219 "dev_darcy_p) in Jacobi loop", iter);
2220
2221 if (nijac == 0) {
2222 if (PROFILING == 1)
2223 startTimer(&kernel_tic);
2224 firstDarcySolution<<<dimGridFluid, dimBlockFluid>>>(
2225 dev_darcy_p,
2226 dev_darcy_k,
2227 dev_darcy_phi,
2228 dev_darcy_dphi,
2229 dev_darcy_div_v_p,
2230 dev_darcy_vp_avg,
2231 dev_darcy_grad_k,
2232 darcy.beta_f,
2233 darcy.mu,
2234 darcy.bc_xn,
2235 darcy.bc_xp,
2236 darcy.bc_yn,
2237 darcy.bc_yp,
2238 darcy.bc_bot,
2239 darcy.bc_top,
2240 darcy.ndem,
2241 wall0_iz,
2242 dev_darcy_p_constant,
2243 dev_darcy_dp_expl);
2244 cudaDeviceSynchronize();
2245 if (PROFILING == 1)
2246 stopTimer(&kernel_tic, &kernel_toc,
2247 &kernel_elapsed,
2248 &t_updateDarcySolution);
2249 checkForCudaErrorsIter("Post updateDarcySolution",
2250 iter);
2251 }
2252
2253 if (PROFILING == 1)
2254 startTimer(&kernel_tic);
2255 updateDarcySolution<<<dimGridFluid, dimBlockFluid>>>(
2256 dev_darcy_p_old,
2257 //dev_darcy_dpdt,
2258 dev_darcy_dp_expl,
2259 dev_darcy_p,
2260 dev_darcy_k,
2261 dev_darcy_phi,
2262 dev_darcy_dphi,
2263 dev_darcy_div_v_p,
2264 dev_darcy_vp_avg,
2265 dev_darcy_grad_k,
2266 darcy.beta_f,
2267 darcy.mu,
2268 darcy.bc_xn,
2269 darcy.bc_xp,
2270 darcy.bc_yn,
2271 darcy.bc_yp,
2272 darcy.bc_bot,
2273 darcy.bc_top,
2274 darcy.ndem,
2275 wall0_iz,
2276 dev_darcy_p_constant,
2277 dev_darcy_p_new,
2278 dev_darcy_norm);
2279 cudaDeviceSynchronize();
2280 if (PROFILING == 1)
2281 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2282 &t_updateDarcySolution);
2283 checkForCudaErrorsIter("Post updateDarcySolution",
2284 iter);
2285
2286 if (darcy.bc_top == 1) {
2287 if (PROFILING == 1)
2288 startTimer(&kernel_tic);
2289 setDarcyTopWallFixedFlow
2290 <<<dimGridFluid, dimBlockFluid>>>
2291 (wall0_iz, dev_darcy_p);
2292 cudaDeviceSynchronize();
2293 if (PROFILING == 1)
2294 stopTimer(&kernel_tic, &kernel_toc,
2295 &kernel_elapsed,
2296 &t_updateDarcySolution);
2297 checkForCudaErrorsIter(
2298 "Post setDarcyTopWallFixedFlow", iter);
2299 }
2300
2301 if (darcy.bc_bot == 4 || darcy.bc_top == 4) {
2302 if (PROFILING == 1)
2303 startTimer(&kernel_tic);
2304 setDarcyGhostNodesFlux<Float>
2305 <<<dimGridFluid, dimBlockFluid>>>(
2306 dev_darcy_p,
2307 darcy.bc_bot,
2308 darcy.bc_top,
2309 darcy.bc_bot_flux,
2310 darcy.bc_top_flux,
2311 dev_darcy_k,
2312 darcy.mu);
2313 cudaDeviceSynchronize();
2314 if (PROFILING == 1)
2315 stopTimer(&kernel_tic, &kernel_toc,
2316 &kernel_elapsed,
2317 &t_setDarcyGhostNodes);
2318 checkForCudaErrorsIter(
2319 "Post setDarcyGhostNodesFlux", iter);
2320 }
2321
2322 // Copy new values to current values
2323 if (PROFILING == 1)
2324 startTimer(&kernel_tic);
2325 copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
2326 dev_darcy_p_new,
2327 dev_darcy_p);
2328 cudaDeviceSynchronize();
2329 if (PROFILING == 1)
2330 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2331 &t_copyValues);
2332 checkForCudaErrorsIter("Post copyValues(p_new -> p)",
2333 iter);
2334
2335 #ifdef REPORT_EPSILON
2336 std::cout << "\n###### JACOBI ITERATION "
2337 << nijac << " after copyValues ######" << std::endl;
2338 transferDarcyPressuresFromGlobalDeviceMemory();
2339 printDarcyArray(stdout, darcy.p, "p");
2340 #endif
2341
2342 if (nijac % nijacnorm == 0) {
2343 // Read the normalized residuals from the device
2344 transferDarcyNormFromGlobalDeviceMemory();
2345
2346 // Write the normalized residuals to the terminal
2347 //printDarcyArray(stdout, darcy.norm, "norm");
2348
2349 // Find the maximum value of the normalized
2350 // residuals
2351 max_norm_res = maxNormResDarcy();
2352
2353 // Write the Jacobi iteration number and maximum
2354 // value of the normalized residual to the log file
2355 if (write_res_log == 1)
2356 reslog << nijac << '\t' << max_norm_res
2357 << std::endl;
2358
2359 if (max_norm_res <= darcy.tolerance) {
2360 if (write_conv_log == 1
2361 && iter % conv_log_interval == 0)
2362 convlog << iter+1 << '\t' << nijac
2363 << std::endl;
2364
2365 break; // solution has converged
2366 }
2367 }
2368
2369 if (nijac == darcy.maxiter-1) {
2370
2371 if (write_conv_log == 1)
2372 convlog << iter+1 << '\t' << nijac << std::endl;
2373
2374 std::cerr << "\nIteration " << iter << ", time "
2375 << iter*time.dt << " s: "
2376 "Error, the pressure solution in the fluid "
2377 "calculations did not converge. Try increasing "
2378 "the value of 'darcy.maxiter' ("
2379 << darcy.maxiter
2380 << ") or increase 'darcy.tolerance' ("
2381 << darcy.tolerance << ")." << std::endl;
2382 }
2383
2384 if (write_res_log == 1)
2385 reslog.close();
2386
2387 //break; // end after first iteration
2388 }
2389
2390 // Zero all dphi values right after they are used in fluid
2391 // solution, unless a file is written in this step.
2392 if (filetimeclock + time.dt < time.file_dt) {
2393 setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
2394 (dev_darcy_dphi);
2395 cudaDeviceSynchronize();
2396 checkForCudaErrorsIter(
2397 "After setDarcyZeros(dev_darcy_dphi)", iter);
2398 }
2399
2400 if (PROFILING == 1)
2401 startTimer(&kernel_tic);
2402 setDarcyGhostNodes<Float> <<<dimGridFluid, dimBlockFluid>>>
2403 (dev_darcy_p,
2404 darcy.bc_xn, darcy.bc_xp,
2405 darcy.bc_yn, darcy.bc_yp,
2406 darcy.bc_bot, darcy.bc_top);
2407 cudaDeviceSynchronize();
2408 if (PROFILING == 1)
2409 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2410 &t_setDarcyGhostNodes);
2411 checkForCudaErrorsIter("Post setDarcyGhostNodes("
2412 "dev_darcy_p) after Jacobi loop", iter);
2413
2414 if (PROFILING == 1)
2415 startTimer(&kernel_tic);
2416 findDarcyVelocities<<<dimGridFluid, dimBlockFluid>>>(
2417 dev_darcy_p,
2418 dev_darcy_phi,
2419 dev_darcy_k,
2420 darcy.mu,
2421 dev_darcy_v);
2422 cudaDeviceSynchronize();
2423 if (PROFILING == 1)
2424 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2425 &t_findDarcyVelocities);
2426 checkForCudaErrorsIter("Post findDarcyVelocities", iter);
2427 }
2428 }
2429 }
2430 //break; // end after first iteration
2431
2432 if (np > 0) {
2433
2434 // Find shear stresses on upper fixed particles if a shear stress BC
2435 // is specified (wmode[0] == 3)
2436 if (walls.nw > 0 && walls.wmode[0] == 3) {
2437
2438 if (PROFILING == 1)
2439 startTimer(&kernel_tic);
2440 findShearStressOnFixedMovingParticles<<<dimGrid, dimBlock>>>
2441 (dev_x,
2442 dev_vel,
2443 dev_force,
2444 dev_walls_tau_eff_x_pp);
2445 cudaDeviceSynchronize();
2446 if (PROFILING == 1)
2447 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2448 &t_summation);
2449 checkForCudaErrorsIter(
2450 "Post findShearStressOnFixedMovingParticles", iter);
2451
2452 if (PROFILING == 1)
2453 startTimer(&kernel_tic);
2454 summation<<<dimGrid, dimBlock>>>(dev_walls_tau_eff_x_pp,
2455 dev_walls_tau_eff_x_partial);
2456 cudaDeviceSynchronize();
2457 if (PROFILING == 1)
2458 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2459 &t_summation);
2460 checkForCudaErrorsIter("Post shear stress summation", iter);
2461 }
2462
2463 // Determine whether it is time to step the velocity
2464 if (time.current >= v2_start && time.current < v2_end &&
2465 velocity_state == 1) {
2466 change_velocity_state = 1.0;
2467 velocity_state = 2;
2468 } else if (time.current >= v2_end && velocity_state == 2) {
2469 change_velocity_state = -1.0;
2470 velocity_state = 1;
2471 }
2472
2473 // Update particle kinematics
2474 if (PROFILING == 1)
2475 startTimer(&kernel_tic);
2476 integrate<<<dimGrid, dimBlock>>>(dev_x_sorted,
2477 dev_vel_sorted,
2478 dev_angvel_sorted,
2479 dev_x,
2480 dev_vel,
2481 dev_angvel,
2482 dev_force,
2483 dev_torque,
2484 dev_angpos,
2485 dev_acc,
2486 dev_angacc,
2487 dev_vel0,
2488 dev_angvel0,
2489 dev_xyzsum,
2490 dev_gridParticleIndex,
2491 iter,
2492 dev_walls_wmode,
2493 dev_walls_mvfd,
2494 dev_walls_tau_eff_x_partial,
2495 dev_walls_tau_x,
2496 walls.tau_x[0],
2497 change_velocity_state,
2498 velocity_factor,
2499 blocksPerGrid);
2500 cudaDeviceSynchronize();
2501 checkForCudaErrorsIter("Post integrate", iter);
2502 if (PROFILING == 1)
2503 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2504 &t_integrate);
2505
2506 if (change_velocity_state != 0)
2507 change_velocity_state = 0;
2508
2509 // Summation of forces on wall
2510 if (PROFILING == 1)
2511 startTimer(&kernel_tic);
2512 if (walls.nw > 0) {
2513 summation<<<dimGrid, dimBlock>>>(dev_walls_force_pp,
2514 dev_walls_force_partial);
2515 }
2516 cudaDeviceSynchronize();
2517 if (PROFILING == 1)
2518 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2519 &t_summation);
2520 checkForCudaErrorsIter("Post wall force summation", iter);
2521
2522 // Update wall kinematics
2523 if (PROFILING == 1)
2524 startTimer(&kernel_tic);
2525 if (walls.nw > 0) {
2526 integrateWalls<<< 1, walls.nw>>>(
2527 dev_walls_nx,
2528 dev_walls_mvfd,
2529 dev_walls_wmode,
2530 dev_walls_force_partial,
2531 dev_walls_acc,
2532 blocksPerGrid,
2533 time.current,
2534 iter);
2535 }
2536 cudaDeviceSynchronize();
2537 if (PROFILING == 1)
2538 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
2539 &t_integrateWalls);
2540 checkForCudaErrorsIter("Post integrateWalls", iter);
2541 }
2542
2543 // Update timers and counters
2544 //time.current = iter*time.dt;
2545 time.current += time.dt;
2546 filetimeclock += time.dt;
2547 ++iter;
2548
2549 // Make sure all preceding tasks are complete
2550 if (cudaDeviceSynchronize() != cudaSuccess) {
2551 cerr << "Error during cudaDeviceSynchronize()" << endl;
2552 }
2553
2554 // Report time to console
2555 if (verbose == 1 && (iter % stdout_report == 0)) {
2556
2557 toc = clock();
2558 time_spent = (toc - tic)/(CLOCKS_PER_SEC); // real time spent
2559
2560 // Real time it takes to compute a second of model time
2561 t_ratio = time_spent/(time.current - t_start);
2562 time_t estimated_seconds_left(t_ratio*(time.total - time.current));
2563 tm *time_eta = gmtime(&estimated_seconds_left);
2564
2565 cout << "\r Current time: " << time.current << "/"
2566 << time.total << " s. ("
2567 << t_ratio << " s_real/s_sim, ETA: "
2568 << time_eta->tm_yday << "d "
2569 << std::setw(2) << std::setfill('0') << time_eta->tm_hour << ":"
2570 << std::setw(2) << std::setfill('0') << time_eta->tm_min << ":"
2571 << std::setw(2) << std::setfill('0') << time_eta->tm_sec
2572 << ") "; // << std::flush;
2573 }
2574
2575
2576 // Produce output binary if the time interval
2577 // between output files has been reached
2578 if (filetimeclock >= time.file_dt) {
2579
2580 // Pause the CPU thread until all CUDA calls previously issued are
2581 // completed
2582 cudaDeviceSynchronize();
2583 checkForCudaErrorsIter("Beginning of file output section", iter);
2584
2585 // v_x, v_y, v_z -> v
2586 if (fluid == 1 && cfd_solver == 0) {
2587 interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
2588 dev_ns_v_x,
2589 dev_ns_v_y,
2590 dev_ns_v_z,
2591 dev_ns_v);
2592 cudaDeviceSynchronize();
2593 checkForCudaErrorsIter("Post interpolateFaceToCenter", iter);
2594 }
2595
2596 //// Copy device data to host memory
2597 transferFromGlobalDeviceMemory();
2598 checkForCudaErrorsIter("After transferFromGlobalDeviceMemory()",
2599 iter);
2600
2601 // Empty the dphi values after device to host transfer
2602 if (fluid == 1) {
2603 if (cfd_solver == 1) {
2604 setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
2605 (dev_darcy_dphi);
2606 cudaDeviceSynchronize();
2607 checkForCudaErrorsIter(
2608 "After setDarcyZeros(dev_darcy_dphi) after transfer",
2609 iter);
2610 }
2611 }
2612
2613 // Pause the CPU thread until all CUDA calls previously issued are
2614 // completed
2615 cudaDeviceSynchronize();
2616
2617 // Check the numerical stability of the NS solver
2618 if (fluid == 1)
2619 if (cfd_solver == 0)
2620 checkNSstability();
2621
2622 // Write binary output file
2623 time.step_count += 1;
2624 snprintf(file, sizeof(file), "output/%s.output%05d.bin",
2625 sid.c_str(), time.step_count);
2626 writebin(file);
2627
2628 /*std::cout
2629 << "\n###### OUTPUT FILE " << time.step_count << " ######"
2630 << std::endl;
2631 transferNSepsilonFromGlobalDeviceMemory();
2632 printNSarray(stdout, ns.epsilon, "epsilon");*/
2633
2634 // Write fluid arrays
2635 /*if (fluid == 1 && cfd_solver == 0) {
2636 sprintf(file,"output/%s.ns_phi.output%05d.bin", sid.c_str(),
2637 time.step_count);
2638 writeNSarray(ns.phi, file);
2639 }*/
2640
2641 if (CONTACTINFO == 1) {
2642 // Write contact information to stdout
2643 cout << "\n\n---------------------------\n"
2644 << "t = " << time.current << " s.\n"
2645 << "---------------------------\n";
2646
2647 for (int n = 0; n < np; ++n) {
2648 cout << "\n## Particle " << n << " ##\n";
2649
2650 cout << "- contacts:\n";
2651 for (int nc = 0; nc < NC; ++nc)
2652 cout << "[" << nc << "]=" << k.contacts[nc+NC*n] <<
2653 '\n';
2654
2655 cout << "\n- delta_t:\n";
2656 for (int nc = 0; nc < NC; ++nc)
2657 cout << k.delta_t[nc+NC*n].x << '\t'
2658 << k.delta_t[nc+NC*n].y << '\t'
2659 << k.delta_t[nc+NC*n].z << '\t'
2660 << k.delta_t[nc+NC*n].w << '\n';
2661
2662 cout << "\n- distmod:\n";
2663 for (int nc = 0; nc < NC; ++nc)
2664 cout << k.distmod[nc+NC*n].x << '\t'
2665 << k.distmod[nc+NC*n].y << '\t'
2666 << k.distmod[nc+NC*n].z << '\t'
2667 << k.distmod[nc+NC*n].w << '\n';
2668 }
2669 cout << '\n';
2670 }
2671
2672 // Update status.dat at the interval of filetime
2673 outfile = "output/" + sid + ".status.dat";
2674 fp = fopen(outfile.c_str(), "w");
2675 fprintf(fp,"%2.4e %2.4e %d\n",
2676 time.current,
2677 100.0*time.current/time.total,
2678 time.step_count);
2679 fclose(fp);
2680
2681 filetimeclock = 0.0;
2682 }
2683
2684 // Uncomment break command to stop after the first iteration
2685 //break;
2686 }
2687
2688 if (write_conv_log == 1)
2689 convlog.close();
2690
2691
2692 // Stop clock and display calculation time spent
2693 toc = clock();
2694 cudaEventRecord(dev_toc, 0);
2695 cudaEventSynchronize(dev_toc);
2696
2697 time_spent = (toc - tic)/(CLOCKS_PER_SEC);
2698 cudaEventElapsedTime(&dev_time_spent, dev_tic, dev_toc);
2699
2700 if (verbose == 1) {
2701 cout << "\nSimulation ended. Statistics:\n"
2702 << " - Last output file number: "
2703 << time.step_count << "\n"
2704 << " - GPU time spent: "
2705 << dev_time_spent/1000.0f << " s\n"
2706 << " - CPU time spent: "
2707 << time_spent << " s\n"
2708 << " - Mean duration of iteration:\n"
2709 << " " << dev_time_spent/((double)iter*1000.0f) << " s"
2710 << std::endl;
2711 }
2712
2713 cudaEventDestroy(dev_tic);
2714 cudaEventDestroy(dev_toc);
2715
2716 cudaEventDestroy(kernel_tic);
2717 cudaEventDestroy(kernel_toc);
2718
2719 // Report time spent on each kernel
2720 if (PROFILING == 1 && verbose == 1) {
2721 double t_sum = t_calcParticleCellID + t_thrustsort + t_reorderArrays +
2722 t_topology + t_interact + t_bondsLinear + t_latticeBoltzmannD3Q19 +
2723 t_integrate + t_summation + t_integrateWalls + t_findPorositiesDev +
2724 t_findNSstressTensor +
2725 t_findNSdivphiviv + t_findNSdivtau + t_findPredNSvelocities +
2726 t_setNSepsilon + t_setNSdirichlet + t_setNSghostNodesDev +
2727 t_findNSforcing + t_jacobiIterationNS + t_updateNSvelocityPressure +
2728 t_findDarcyPorosities + t_setDarcyGhostNodes +
2729 t_findDarcyPressureForce + t_setDarcyTopPressure +
2730 t_findDarcyPermeabilities + t_findDarcyPermeabilityGradients +
2731 //t_findDarcyPressureChange +
2732 t_updateDarcySolution + t_copyValues + t_findDarcyVelocities;
2733
2734 cout << "\nKernel profiling statistics:\n"
2735 << " - calcParticleCellID:\t\t" << t_calcParticleCellID/1000.0
2736 << " s"
2737 << "\t(" << 100.0*t_calcParticleCellID/t_sum << " %)\n"
2738 << " - thrustsort:\t\t\t" << t_thrustsort/1000.0 << " s"
2739 << "\t(" << 100.0*t_thrustsort/t_sum << " %)\n"
2740 << " - reorderArrays:\t\t" << t_reorderArrays/1000.0 << " s"
2741 << "\t(" << 100.0*t_reorderArrays/t_sum << " %)\n";
2742 if (params.contactmodel == 2 || params.contactmodel == 3) {
2743 cout
2744 << " - topology:\t\t\t" << t_topology/1000.0 << " s"
2745 << "\t(" << 100.0*t_topology/t_sum << " %)\n";
2746 }
2747 cout << " - interact:\t\t\t" << t_interact/1000.0 << " s"
2748 << "\t(" << 100.0*t_interact/t_sum << " %)\n";
2749 if (params.nb0 > 0) {
2750 cout << " - bondsLinear:\t\t" << t_bondsLinear/1000.0 << " s"
2751 << "\t(" << 100.0*t_bondsLinear/t_sum << " %)\n";
2752 }
2753 cout << " - integrate:\t\t\t" << t_integrate/1000.0 << " s"
2754 << "\t(" << 100.0*t_integrate/t_sum << " %)\n"
2755 << " - summation:\t\t\t" << t_summation/1000.0 << " s"
2756 << "\t(" << 100.0*t_summation/t_sum << " %)\n"
2757 << " - integrateWalls:\t\t" << t_integrateWalls/1000.0 << " s"
2758 << "\t(" << 100.0*t_integrateWalls/t_sum << " %)\n";
2759 if (fluid == 1 && cfd_solver == 0) {
2760 cout << " - findPorositiesDev:\t\t" << t_findPorositiesDev/1000.0
2761 << " s" << "\t(" << 100.0*t_findPorositiesDev/t_sum << " %)\n"
2762 << " - findNSstressTensor:\t\t" << t_findNSstressTensor/1000.0
2763 << " s" << "\t(" << 100.0*t_findNSstressTensor/t_sum << " %)\n"
2764 << " - findNSdivphiviv:\t\t" << t_findNSdivphiviv/1000.0
2765 << " s" << "\t(" << 100.0*t_findNSdivphiviv/t_sum << " %)\n"
2766 << " - findNSdivtau:\t\t" << t_findNSdivtau/1000.0
2767 << " s" << "\t(" << 100.0*t_findNSdivtau/t_sum << " %)\n"
2768 << " - findPredNSvelocities:\t" <<
2769 t_findPredNSvelocities/1000.0 << " s" << "\t(" <<
2770 100.0*t_findPredNSvelocities/t_sum << " %)\n"
2771 << " - setNSepsilon:\t\t" << t_setNSepsilon/1000.0
2772 << " s" << "\t(" << 100.0*t_setNSepsilon/t_sum << " %)\n"
2773 << " - setNSdirichlet:\t\t" << t_setNSdirichlet/1000.0
2774 << " s" << "\t(" << 100.0*t_setNSdirichlet/t_sum << " %)\n"
2775 << " - setNSghostNodesDev:\t\t" << t_setNSghostNodesDev/1000.0
2776 << " s" << "\t(" << 100.0*t_setNSghostNodesDev/t_sum << " %)\n"
2777 << " - findNSforcing:\t\t" << t_findNSforcing/1000.0 << " s"
2778 << "\t(" << 100.0*t_findNSforcing/t_sum << " %)\n"
2779 << " - jacobiIterationNS:\t\t" << t_jacobiIterationNS/1000.0
2780 << " s"
2781 << "\t(" << 100.0*t_jacobiIterationNS/t_sum << " %)\n"
2782 << " - updateNSvelocityPressure:\t"
2783 << t_updateNSvelocityPressure/1000.0 << " s"
2784 << "\t(" << 100.0*t_updateNSvelocityPressure/t_sum << " %)\n";
2785 } else if (fluid == 1 && cfd_solver == 1) {
2786 cout << " - findDarcyPorosities:\t" <<
2787 t_findDarcyPorosities/1000.0 << " s" << "\t(" <<
2788 100.0*t_findDarcyPorosities/t_sum << " %)\n"
2789 << " - setDarcyGhostNodes:\t\t" <<
2790 t_setDarcyGhostNodes/1000.0 << " s" << "\t(" <<
2791 100.0*t_setDarcyGhostNodes/t_sum << " %)\n"
2792 << " - findDarcyPressureForce:\t" <<
2793 t_findDarcyPressureForce/1000.0 << " s" << "\t(" <<
2794 100.0*t_findDarcyPressureForce/t_sum << " %)\n"
2795 << " - setDarcyTopPressure:\t" <<
2796 t_setDarcyTopPressure/1000.0 << " s" << "\t(" <<
2797 100.0*t_setDarcyTopPressure/t_sum << " %)\n"
2798 << " - findDarcyPermeabilities:\t" <<
2799 t_findDarcyPermeabilities/1000.0 << " s" << "\t(" <<
2800 100.0*t_findDarcyPermeabilities/t_sum << " %)\n"
2801 << " - findDarcyPermeabilityGrads:\t" <<
2802 t_findDarcyPermeabilityGradients/1000.0 << " s" << "\t(" <<
2803 100.0*t_findDarcyPermeabilityGradients/t_sum << " %)\n"
2804 //<< " - findDarcyPressureChange:\t" <<
2805 //t_findDarcyPressureChange/1000.0 << " s" << "\t(" <<
2806 //100.0*t_findDarcyPressureChange/t_sum << " %)\n"
2807 << " - updateDarcySolution:\t" <<
2808 t_updateDarcySolution/1000.0 << " s" << "\t(" <<
2809 100.0*t_updateDarcySolution/t_sum << " %)\n"
2810 << " - copyValues:\t\t\t" <<
2811 t_copyValues/1000.0 << " s" << "\t(" <<
2812 100.0*t_copyValues/t_sum << " %)\n"
2813 << " - findDarcyVelocities:\t" <<
2814 t_findDarcyVelocities/1000.0 << " s" << "\t(" <<
2815 100.0*t_findDarcyVelocities/t_sum << " %)" << std::endl;
2816 }
2817 }
2818
2819 // Free GPU device memory
2820 freeGlobalDeviceMemory();
2821 checkForCudaErrorsIter("After freeGlobalDeviceMemory()", iter);
2822
2823 // Free contact info arrays
2824 delete[] k.contacts;
2825 delete[] k.distmod;
2826 delete[] k.delta_t;
2827
2828 if (fluid == 1) {
2829 if (cfd_solver == 0)
2830 endNS();
2831 else if (cfd_solver == 1)
2832 endDarcy();
2833 }
2834
2835 cudaDeviceReset();
2836 }
2837 // vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4