device.cu - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
git clone git://src.adamsgaard.dk/sphere
       ---
device.cu (119471B)
       ---
// device.cu -- GPU specific operations utilizing the CUDA API.
#include <iostream>
#include <fstream>
#include <string>
#include <cstdio>
#include <cuda.h>
#include <helper_math.h>
#include <iomanip>
#include <time.h>

#include "vector_arithmetic.h"  // for arbitrary prec. vectors
//#include <vector_functions.h> // for single prec. vectors
#include "thrust/device_ptr.h"
#include "thrust/sort.h"

#include "sphere.h"
#include "datatypes.h"
#include "utility.h"
#include "constants.cuh"
#include "debug.h"
#include "version.h"

#include "sorting.cuh"
#include "contactmodels.cuh"
#include "cohesion.cuh"
#include "contactsearch.cuh"
#include "integration.cuh"
#include "raytracer.cuh"
#include "navierstokes.cuh"
#include "darcy.cuh"

// Returns the number of cores per streaming multiprocessor, which is
// a function of the device compute capability
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
int cudaCoresPerSM(int major, int minor)
{
    if (major == 1)
        return 8;
    else if (major == 2 && minor == 0)
        return 32;
    else if (major == 2 && minor == 1)
        return 48;
    else if (major == 3)
        return 192;
    else if (major == 5)
        return 128;
    else if (major == 6 && minor == 0)
        return 64;
    else if (major == 6 && minor == 1)
        return 128;
    else if (major == 6 && minor == 2)
        return 128;
    else if (major == 7)
        return 64;
    else if (major == 8 && minor == 0)
        return 64;
    else if (major == 8)
        return 128;
    else
        printf("Error in cudaCoresPerSM: device compute capability value "
                "(%d.%d) not recognized.\n", major, minor);
    return -1;
}
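
// Illustrative usage sketch (not part of the original control flow): the
// total CUDA core count of a device follows from multiplying the SM count
// reported by the runtime with the per-SM core count above:
//
//     cudaDeviceProp prop;
//     cudaGetDeviceProperties(&prop, 0);
//     int ncudacores = prop.multiProcessorCount
//         * cudaCoresPerSM(prop.major, prop.minor);
//
// This is exactly the pattern used in DEM::initializeGPU() below.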

// Wrapper function for initializing the CUDA components.
// Called from main.cpp
void DEM::initializeGPU(void)
{
    using std::cout; // stdout

    // Specify target device
    int cudadevice = 0;

    // Variables containing device properties
    cudaDeviceProp prop;
    int deviceCount;
    int cudaDriverVersion;
    int cudaRuntimeVersion;

    checkForCudaErrors("Before initializing CUDA device");

    // Register number of devices
    cudaGetDeviceCount(&deviceCount);
    ndevices = deviceCount; // store in DEM class

    if (deviceCount == 0) {
        std::cerr << "\nERROR: No CUDA-enabled devices available. Bye."
            << std::endl;
        exit(EXIT_FAILURE);
    } else if (deviceCount == 1) {
        if (verbose == 1)
            cout << "  System contains 1 CUDA compatible device.\n";
    } else {
        if (verbose == 1)
            cout << "  System contains " << deviceCount
                << " CUDA compatible devices.\n";
    }
    // Loop through the GPUs and choose the one with the most CUDA cores
    if (device == -1) {
        int ncudacores;
        int max_ncudacores = 0;
        for (int d=0; d<ndevices; d++) {
            cudaGetDeviceProperties(&prop, d);
            cudaDriverGetVersion(&cudaDriverVersion);
            cudaRuntimeGetVersion(&cudaRuntimeVersion);

            ncudacores = prop.multiProcessorCount
                *cudaCoresPerSM(prop.major, prop.minor);
            if (ncudacores > max_ncudacores) {
                max_ncudacores = ncudacores;
                cudadevice = d;
            }

            if (verbose == 1) {
                cout << "  CUDA device ID: " << d << "\n";
                cout << "  - Name: " <<  prop.name << ", compute capability: "
                     << prop.major << "." << prop.minor << ".\n";
                cout << "  - CUDA Driver version: " << cudaDriverVersion/1000
                     << "." <<  (cudaDriverVersion%100)/10
                     << ", runtime version " << cudaRuntimeVersion/1000 << "."
                     << (cudaRuntimeVersion%100)/10 << std::endl;
            }
        }

        device = cudadevice; // store in DEM class
        if (verbose == 1) {
            cout << "  Using CUDA device ID " << device << " with "
                 << max_ncudacores << " cores." << std::endl;
        }

    } else {

        cudaGetDeviceProperties(&prop, device);
        cudaDriverGetVersion(&cudaDriverVersion);
        cudaRuntimeGetVersion(&cudaRuntimeVersion);

        int ncudacores = prop.multiProcessorCount
            *cudaCoresPerSM(prop.major, prop.minor);

        if (verbose == 1) {
            cout << "  CUDA device ID: " << device << "\n";
            cout << "  - Name: " <<  prop.name << ", compute capability: "
                 << prop.major << "." << prop.minor << ".\n";
            cout << "  - CUDA Driver version: " << cudaDriverVersion/1000
                 << "." <<  (cudaDriverVersion%100)/10
                 << ", runtime version " << cudaRuntimeVersion/1000 << "."
                 << (cudaRuntimeVersion%100)/10
                 << "\n  - " << ncudacores << " CUDA cores" << std::endl;
        }
    }

    // The value of device is now 0 or larger
    cudaSetDevice(device);

    checkForCudaErrors("While initializing CUDA device");
}
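
// checkForCudaErrors() and checkForCudaErrorsIter() are declared in
// utility.h. A minimal sketch of such a checker, assuming a typical
// implementation (the actual definition may differ):
//
//     void checkForCudaErrors(const char* msg)
//     {
//         cudaError_t err = cudaGetLastError();
//         if (err != cudaSuccess) {
//             std::cerr << msg << ": " << cudaGetErrorString(err)
//                 << std::endl;
//             exit(EXIT_FAILURE);
//         }
//     }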

// Start timer for kernel profiling
void startTimer(cudaEvent_t* kernel_tic)
{
    cudaEventRecord(*kernel_tic);
}

// Stop timer for kernel profiling and add the elapsed time to the
// per-function sum
void stopTimer(cudaEvent_t *kernel_tic,
        cudaEvent_t *kernel_toc,
        float *kernel_elapsed,
        double* sum)
{
    cudaEventRecord(*kernel_toc, 0);
    cudaEventSynchronize(*kernel_toc);
    cudaEventElapsedTime(kernel_elapsed, *kernel_tic, *kernel_toc);
    *sum += *kernel_elapsed;
}
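
// Together, startTimer() and stopTimer() implement a tic/toc pattern around
// a kernel launch. A usage sketch mirroring the calls in DEM::startTime()
// below ("someKernel" is a hypothetical kernel name):
//
//     cudaEvent_t tic, toc;
//     float elapsed_ms;
//     double t_someKernel = 0.0;
//     cudaEventCreate(&tic);
//     cudaEventCreate(&toc);
//     startTimer(&tic);
//     someKernel<<<dimGrid, dimBlock>>>(/* args */);
//     cudaDeviceSynchronize();
//     stopTimer(&tic, &toc, &elapsed_ms, &t_someKernel); // sum is in ms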

// Check values of parameters in constant memory
__global__ void checkConstantValues(int* dev_equal,
        Grid* dev_grid,
        Params* dev_params)
{
    // Values ok (0)
    *dev_equal = 0;

    // Compare values between global- and constant
    // memory structures
    if (dev_grid->origo[0] != devC_grid.origo[0])
        *dev_equal = 1; // Not ok
    if (dev_grid->origo[1] != devC_grid.origo[1])
        *dev_equal = 2; // Not ok
    if (dev_grid->origo[2] != devC_grid.origo[2])
        *dev_equal = 3; // Not ok
    if (dev_grid->L[0] != devC_grid.L[0])
        *dev_equal = 4; // Not ok
    if (dev_grid->L[1] != devC_grid.L[1])
        *dev_equal = 5; // Not ok
    if (dev_grid->L[2] != devC_grid.L[2])
        *dev_equal = 6; // Not ok
    if (dev_grid->num[0] != devC_grid.num[0])
        *dev_equal = 7; // Not ok
    if (dev_grid->num[1] != devC_grid.num[1])
        *dev_equal = 8; // Not ok
    if (dev_grid->num[2] != devC_grid.num[2])
        *dev_equal = 9; // Not ok
    if (dev_grid->periodic != devC_grid.periodic)
        *dev_equal = 10; // Not ok

    if (dev_params->g[0] != devC_params.g[0])
        *dev_equal = 11; // Not ok
    if (dev_params->g[1] != devC_params.g[1])
        *dev_equal = 12; // Not ok
    if (dev_params->g[2] != devC_params.g[2])
        *dev_equal = 13; // Not ok
    if (dev_params->k_n != devC_params.k_n)
        *dev_equal = 14; // Not ok
    if (dev_params->k_t != devC_params.k_t)
        *dev_equal = 15; // Not ok
    if (dev_params->k_r != devC_params.k_r)
        *dev_equal = 16; // Not ok
    if (dev_params->gamma_n != devC_params.gamma_n)
        *dev_equal = 17; // Not ok
    if (dev_params->gamma_t != devC_params.gamma_t)
        *dev_equal = 18; // Not ok
    if (dev_params->gamma_r != devC_params.gamma_r)
        *dev_equal = 19; // Not ok
    if (dev_params->mu_s != devC_params.mu_s)
        *dev_equal = 20; // Not ok
    if (dev_params->mu_d != devC_params.mu_d)
        *dev_equal = 21; // Not ok
    if (dev_params->mu_r != devC_params.mu_r)
        *dev_equal = 22; // Not ok
    if (dev_params->rho != devC_params.rho)
        *dev_equal = 23; // Not ok
    if (dev_params->contactmodel != devC_params.contactmodel)
        *dev_equal = 24; // Not ok
    if (dev_params->kappa != devC_params.kappa)
        *dev_equal = 25; // Not ok
    if (dev_params->db != devC_params.db)
        *dev_equal = 26; // Not ok
    if (dev_params->V_b != devC_params.V_b)
        *dev_equal = 27; // Not ok
    if (dev_params->lambda_bar != devC_params.lambda_bar)
        *dev_equal = 28; // Not ok
    if (dev_params->nb0 != devC_params.nb0)
        *dev_equal = 29; // Not ok
    if (dev_params->E != devC_params.E)
        *dev_equal = 30; // Not ok
}

__global__ void checkParticlePositions(
    const Float4* __restrict__ dev_x)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id

    if (idx < devC_np) { // Condition prevents block size error
        Float4 x = dev_x[idx];

        // make sure grain doesn't have NaN or Inf position.
        // NB: __syncthreads() must not be called in these divergent
        // branches, since not all threads of the block reach them.
        if (!isfinite(x.x) || !isfinite(x.y) || !isfinite(x.z)) {
            printf("\nParticle %d has non-finite position: x = %f %f %f",
                    idx, x.x, x.y, x.z);
        }

        /*printf("\nParticle %d: x = %f %f %f",
                idx, x.x, x.y, x.z);*/

        // check that the particle is inside of the simulation domain
        if (x.x < devC_grid.origo[0] ||
                x.y < devC_grid.origo[1] ||
                x.z < devC_grid.origo[2] ||
                x.x > devC_grid.L[0] ||
                x.y > devC_grid.L[1] ||
                x.z > devC_grid.L[2]) {
            printf("\nParticle %d is outside the computational domain "
                    "(%f %f %f to %f %f %f): x = %f %f %f",
                    idx,
                    devC_grid.origo[0], devC_grid.origo[1], devC_grid.origo[2],
                    devC_grid.L[0], devC_grid.L[1], devC_grid.L[2],
                    x.x, x.y, x.z);
        }
    }
}
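
// This kernel is launched in the main calculation loop below when the
// CHECK_PARTICLES_FINITE macro is defined:
//
//     checkParticlePositions<<<dimGrid, dimBlock>>>(dev_x);
//     cudaDeviceSynchronize();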


// Copy the constant data components to device memory,
// and check whether the values correspond to the
// values in constant memory.
void DEM::checkConstantMemory()
{
    // Allocate space in global device memory
    Grid* dev_grid;
    Params* dev_params;
    cudaMalloc((void**)&dev_grid, sizeof(Grid));
    cudaMalloc((void**)&dev_params, sizeof(Params));

    // Copy structure data from host to global device memory
    cudaMemcpy(dev_grid, &grid, sizeof(Grid), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_params, &params, sizeof(Params), cudaMemcpyHostToDevice);

    // Compare values between global and constant memory
    // structures on the device.
    int equal = 0;  // 0 if equal, else the index of the first mismatch
    int* dev_equal;
    cudaMalloc((void**)&dev_equal, sizeof(int));
    checkConstantValues<<<1,1>>>(dev_equal, dev_grid, dev_params);
    checkForCudaErrors("After constant memory check");

    // Copy result to host
    cudaMemcpy(&equal, dev_equal, sizeof(int), cudaMemcpyDeviceToHost);

    // Free global device memory
    cudaFree(dev_grid);
    cudaFree(dev_params);
    cudaFree(dev_equal);

    // Are the values equal?
    if (equal != 0) {
        std::cerr << "Error! The values in constant memory do not "
            << "seem to be correct (" << equal << ")." << std::endl;
        exit(1);
    } else {
        if (verbose == 1)
            std::cout << "  Constant values ok (" << equal << ")."
                << std::endl;
    }
}

// Copy selected constant components to constant device memory.
void DEM::transferToConstantDeviceMemory()
{
    using std::cout;

    if (verbose == 1)
        cout << "  Transferring data to constant device memory:    ";

    /*for (int d=0; d<ndevices; d++) {
      cudaSetDevice(d);*/
        cudaMemcpyToSymbol(devC_nd, &nd, sizeof(nd));
        cudaMemcpyToSymbol(devC_np, &np, sizeof(np));
        cudaMemcpyToSymbol(devC_nw, &walls.nw, sizeof(unsigned int));
        cudaMemcpyToSymbol(devC_nc, &NC, sizeof(int));
        cudaMemcpyToSymbol(devC_dt, &time.dt, sizeof(Float));
        cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));
        cudaMemcpyToSymbol(devC_params, &params, sizeof(Params));
        /*}
          cudaSetDevice(device);*/

    checkForCudaErrors("After transferring to device constant memory");

    if (verbose == 1)
        cout << "Done\n";

    // only for device with most CUDA cores
    checkConstantMemory();
}
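
// The devC_* symbols written above reside in CUDA constant memory and are
// declared in constants.cuh, presumably along these lines (illustrative
// sketch, not the verbatim header):
//
//     __constant__ unsigned int devC_nd;    // number of dimensions
//     __constant__ unsigned int devC_np;    // number of particles
//     __constant__ unsigned int devC_nw;    // number of dynamic walls
//     __constant__ int devC_nc;             // max. contacts per particle
//     __constant__ Float devC_dt;           // computational time step
//     __constant__ Grid devC_grid;          // grid parameters
//     __constant__ Params devC_params;      // material parameters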

__global__ void printWorldSize(Float4* dev_walls_nx)
{
    printf("\nL = %f, %f, %f\n",
            devC_grid.L[0], devC_grid.L[1], devC_grid.L[2]);
    printf("\ndev_walls_nx[0] = %f, %f, %f, %f\n",
            dev_walls_nx[0].x,
            dev_walls_nx[0].y,
            dev_walls_nx[0].z,
            dev_walls_nx[0].w);
}

void DEM::updateGridSize()
{
    //printf("\nDEM::updateGridSize() start\n");
    Float Lz;

    // Get top wall position from dev_walls_nx[0].w
    cudaMemcpy(&Lz, &dev_walls_nx[0].w, sizeof(Float), cudaMemcpyDeviceToHost);
    checkForCudaErrors("DEM::updateGridSize(): copying wall position");

    //printWorldSize<<<1,1>>>(dev_walls_nx);
    //cudaDeviceSynchronize();
    //checkForCudaErrors("DEM::updateGridSize(): first printWorldSize");

    //printf("\nLz = %f\n", Lz);

    // Write value to grid.L[2]
    grid.L[2] = Lz;

    // Write value to devC_grid.L[2]
    //cudaMemcpyToSymbol(devC_grid.L[2], &Lz, sizeof(Float));
    cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));

    checkForCudaErrors("DEM::updateGridSize(): write to devC_grid.L[2]");

    //printWorldSize<<<1,1>>>(dev_walls_nx);
    //cudaDeviceSynchronize();
    //checkForCudaErrors("DEM::updateGridSize(): second printWorldSize");

    // check value only during debugging
    //checkConstantMemory();
}


// Allocate device memory for particle variables,
// tied to previously declared pointers in structures
void DEM::allocateGlobalDeviceMemory(void)
{
    // Particle memory size
    unsigned int memSizeF  = sizeof(Float) * np;
    unsigned int memSizeF4 = sizeof(Float4) * np;

    if (verbose == 1)
        std::cout << "  Allocating global device memory:                ";

    k.acc = new Float4[np];
    k.angacc = new Float4[np];
#pragma omp parallel for if(np>100)
    for (unsigned int i = 0; i<np; ++i) {
        k.acc[i] = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
        k.angacc[i] = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
    }

    // Kinematics arrays
    cudaMalloc((void**)&dev_x, memSizeF4);
    cudaMalloc((void**)&dev_xyzsum, memSizeF4);
    cudaMalloc((void**)&dev_vel, memSizeF4);
    cudaMalloc((void**)&dev_vel0, memSizeF4);
    cudaMalloc((void**)&dev_acc, memSizeF4);
    cudaMalloc((void**)&dev_force, memSizeF4);
    cudaMalloc((void**)&dev_angpos, memSizeF4);
    cudaMalloc((void**)&dev_angvel, memSizeF4);
    cudaMalloc((void**)&dev_angvel0, memSizeF4);
    cudaMalloc((void**)&dev_angacc, memSizeF4);
    cudaMalloc((void**)&dev_torque, memSizeF4);

    // Particle contact bookkeeping arrays
    cudaMalloc((void**)&dev_contacts,
               sizeof(unsigned int)*np*NC);
    cudaMalloc((void**)&dev_distmod, memSizeF4*NC);
    cudaMalloc((void**)&dev_delta_t, memSizeF4*NC);
    cudaMalloc((void**)&dev_bonds, sizeof(uint2)*params.nb0);
    cudaMalloc((void**)&dev_bonds_delta, sizeof(Float4)*params.nb0);
    cudaMalloc((void**)&dev_bonds_omega, sizeof(Float4)*params.nb0);

    // Sorted arrays
    cudaMalloc((void**)&dev_x_sorted, memSizeF4);
    cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
    cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);

    // Energy arrays
    cudaMalloc((void**)&dev_es_dot, memSizeF);
    cudaMalloc((void**)&dev_ev_dot, memSizeF);
    cudaMalloc((void**)&dev_es, memSizeF);
    cudaMalloc((void**)&dev_ev, memSizeF);
    cudaMalloc((void**)&dev_p, memSizeF);

    // Cell-related arrays
    cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*np);
    cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*np);
    cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)
               *grid.num[0]*grid.num[1]*grid.num[2]);
    cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)
               *grid.num[0]*grid.num[1]*grid.num[2]);

    // Host contact bookkeeping arrays
    k.contacts = new unsigned int[np*NC];
    // Initialize contacts lists to np
#pragma omp parallel for if(np>100)
    for (unsigned int i=0; i<(np*NC); ++i)
        k.contacts[i] = np;
    k.distmod = new Float4[np*NC];
    k.delta_t = new Float4[np*NC];

    // Wall arrays
    cudaMalloc((void**)&dev_walls_wmode, sizeof(int)*walls.nw);
    cudaMalloc((void**)&dev_walls_nx, sizeof(Float4)*walls.nw);
    cudaMalloc((void**)&dev_walls_mvfd, sizeof(Float4)*walls.nw);
    cudaMalloc((void**)&dev_walls_tau_x, sizeof(Float)*walls.nw);
    cudaMalloc((void**)&dev_walls_tau_eff_x_pp, sizeof(Float)*walls.nw*np);
    cudaMalloc((void**)&dev_walls_force_pp, sizeof(Float)*walls.nw*np);
    cudaMalloc((void**)&dev_walls_acc, sizeof(Float)*walls.nw);
    // dev_walls_force_partial allocated later
    // dev_walls_tau_eff_x_partial allocated later

    checkForCudaErrors("End of allocateGlobalDeviceMemory");
    if (verbose == 1)
        std::cout << "Done" << std::endl;
}

// Allocate global memory on other devices required for the "interact"
// function. The values of domain_size[ndevices] must be set beforehand.
void DEM::allocateHelperDeviceMemory(void)
{
    // Particle memory size
    unsigned int memSizeF4 = sizeof(Float4) * np;

    // Initialize pointers to per-GPU arrays
    hdev_gridParticleIndex = (unsigned**)malloc(ndevices*sizeof(unsigned*));
    hdev_gridCellStart     = (unsigned**)malloc(ndevices*sizeof(unsigned*));
    hdev_gridCellEnd       = (unsigned**)malloc(ndevices*sizeof(unsigned*));
    hdev_x                 = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_x_sorted          = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_vel               = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_vel_sorted        = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_angvel            = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_angvel_sorted     = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_walls_nx          = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_walls_mvfd        = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_distmod           = (Float4**)malloc(ndevices*sizeof(Float4*));

    hdev_force             = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_torque            = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_delta_t           = (Float4**)malloc(ndevices*sizeof(Float4*));
    hdev_es_dot            = (Float**)malloc(ndevices*sizeof(Float*));
    hdev_es                = (Float**)malloc(ndevices*sizeof(Float*));
    hdev_ev_dot            = (Float**)malloc(ndevices*sizeof(Float*));
    hdev_ev                = (Float**)malloc(ndevices*sizeof(Float*));
    hdev_p                 = (Float**)malloc(ndevices*sizeof(Float*));
    hdev_walls_force_pp    = (Float**)malloc(ndevices*sizeof(Float*));
    hdev_contacts          = (unsigned**)malloc(ndevices*sizeof(unsigned*));

    for (int d=0; d<ndevices; d++) {

        // do not allocate memory on primary GPU
        if (d == device)
            continue;

        cudaSetDevice(d);

        // allocate space for full input arrays for interact()
        cudaMalloc((void**)&hdev_gridParticleIndex[d], sizeof(unsigned int)*np);
        cudaMalloc((void**)&hdev_gridCellStart[d], sizeof(unsigned int)
                   *grid.num[0]*grid.num[1]*grid.num[2]);
        cudaMalloc((void**)&hdev_gridCellEnd[d], sizeof(unsigned int)
                   *grid.num[0]*grid.num[1]*grid.num[2]);
        cudaMalloc((void**)&hdev_x[d], memSizeF4);
        cudaMalloc((void**)&hdev_x_sorted[d], memSizeF4);
        cudaMalloc((void**)&hdev_vel[d], memSizeF4);
        cudaMalloc((void**)&hdev_vel_sorted[d], memSizeF4);
        cudaMalloc((void**)&hdev_angvel[d], memSizeF4);
        cudaMalloc((void**)&hdev_angvel_sorted[d], memSizeF4);
        cudaMalloc((void**)&hdev_walls_nx[d], sizeof(Float4)*walls.nw);
        cudaMalloc((void**)&hdev_walls_mvfd[d], sizeof(Float4)*walls.nw);
        cudaMalloc((void**)&hdev_distmod[d], memSizeF4*NC);

        // allocate space for partial output arrays for interact()
        cudaMalloc((void**)&hdev_force[d], sizeof(Float4)*domain_size[d]);
        cudaMalloc((void**)&hdev_torque[d], sizeof(Float4)*domain_size[d]);
        cudaMalloc((void**)&hdev_es_dot[d], sizeof(Float)*domain_size[d]);
        cudaMalloc((void**)&hdev_ev_dot[d], sizeof(Float)*domain_size[d]);
        cudaMalloc((void**)&hdev_es[d], sizeof(Float)*domain_size[d]);
        cudaMalloc((void**)&hdev_ev[d], sizeof(Float)*domain_size[d]);
        cudaMalloc((void**)&hdev_p[d], sizeof(Float)*domain_size[d]);
        cudaMalloc((void**)&hdev_walls_force_pp[d],
                   sizeof(Float)*domain_size[d]*walls.nw);
        cudaMalloc((void**)&hdev_contacts[d],
                   sizeof(unsigned)*domain_size[d]*NC);
        cudaMalloc((void**)&hdev_delta_t[d], sizeof(Float4)*domain_size[d]*NC);

        checkForCudaErrors("During allocateHelperDeviceMemory");
    }
    cudaSetDevice(device); // select main GPU
}

void DEM::freeHelperDeviceMemory()
{
    for (int d=0; d<ndevices; d++) {

        // do not free memory on the primary GPU
        if (d == device)
            continue;

        cudaSetDevice(d);

        cudaFree(hdev_gridParticleIndex[d]);
        cudaFree(hdev_gridCellStart[d]);
        cudaFree(hdev_gridCellEnd[d]);
        cudaFree(hdev_x[d]);
        cudaFree(hdev_x_sorted[d]);
        cudaFree(hdev_vel[d]);
        cudaFree(hdev_vel_sorted[d]);
        cudaFree(hdev_angvel[d]);
        cudaFree(hdev_angvel_sorted[d]);
        cudaFree(hdev_walls_nx[d]);
        cudaFree(hdev_walls_mvfd[d]);
        cudaFree(hdev_distmod[d]);

        cudaFree(hdev_force[d]);
        cudaFree(hdev_torque[d]);
        cudaFree(hdev_es_dot[d]);
        cudaFree(hdev_ev_dot[d]);
        cudaFree(hdev_es[d]);
        cudaFree(hdev_ev[d]);
        cudaFree(hdev_p[d]);
        cudaFree(hdev_walls_force_pp[d]);
        cudaFree(hdev_contacts[d]);
        cudaFree(hdev_delta_t[d]);

        checkForCudaErrors("During helper device cudaFree calls");
    }
    cudaSetDevice(device); // select primary GPU
}

void DEM::freeGlobalDeviceMemory()
{
    if (verbose == 1)
        printf("\nFreeing device memory:                           ");

    // Particle arrays
    cudaFree(dev_x);
    cudaFree(dev_xyzsum);
    cudaFree(dev_vel);
    cudaFree(dev_vel0);
    cudaFree(dev_acc);
    cudaFree(dev_force);
    cudaFree(dev_angpos);
    cudaFree(dev_angvel);
    cudaFree(dev_angvel0);
    cudaFree(dev_angacc);
    cudaFree(dev_torque);

    cudaFree(dev_contacts);
    cudaFree(dev_distmod);
    cudaFree(dev_delta_t);
    cudaFree(dev_bonds);
    cudaFree(dev_bonds_delta);
    cudaFree(dev_bonds_omega);

    cudaFree(dev_es_dot);
    cudaFree(dev_es);
    cudaFree(dev_ev_dot);
    cudaFree(dev_ev);
    cudaFree(dev_p);

    cudaFree(dev_x_sorted);
    cudaFree(dev_vel_sorted);
    cudaFree(dev_angvel_sorted);

    // Cell-related arrays
    cudaFree(dev_gridParticleCellID);
    cudaFree(dev_gridParticleIndex);
    cudaFree(dev_cellStart);
    cudaFree(dev_cellEnd);

    // Wall arrays
    cudaFree(dev_walls_wmode);
    cudaFree(dev_walls_nx);
    cudaFree(dev_walls_mvfd);
    cudaFree(dev_walls_tau_x);
    cudaFree(dev_walls_force_partial);
    cudaFree(dev_walls_force_pp);
    cudaFree(dev_walls_acc);
    cudaFree(dev_walls_tau_eff_x_pp);
    cudaFree(dev_walls_tau_eff_x_partial);

    // Fluid arrays
    if (fluid == 1 && cfd_solver == 0) {
        freeNSmemDev();
    }
    if (fluid == 1 && cfd_solver == 1) {
        freeDarcyMemDev();
    }

    //checkForCudaErrors("During cudaFree calls");

    if (verbose == 1)
        std::cout << "Done" << std::endl;
}


void DEM::transferToGlobalDeviceMemory(int statusmsg)
{
    if (verbose == 1 && statusmsg == 1)
        std::cout << "  Transferring data to the device:                ";

    // Commonly-used memory sizes
    unsigned int memSizeF  = sizeof(Float) * np;
    unsigned int memSizeF4 = sizeof(Float4) * np;

    // Copy static-size structure data from host to global device memory
    //cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);

    // Kinematic particle values
    cudaMemcpy( dev_x,        k.x,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_xyzsum,   k.xyzsum,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_vel,      k.vel,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_vel0,     k.vel,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_acc,      k.acc,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_force,    k.force,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_angpos,   k.angpos,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_angvel,   k.angvel,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_angvel0,  k.angvel,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_angacc,   k.angacc,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_torque,   k.torque,
                memSizeF4, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_contacts, k.contacts,
                sizeof(unsigned int)*np*NC, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_distmod, k.distmod,
                memSizeF4*NC, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_delta_t, k.delta_t,
                memSizeF4*NC, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bonds, k.bonds,
                sizeof(uint2)*params.nb0, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bonds_delta, k.bonds_delta,
                sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_bonds_omega, k.bonds_omega,
                sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);

    // Individual particle energy values
    cudaMemcpy( dev_es_dot, e.es_dot,
                memSizeF, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_es,     e.es,
                memSizeF, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_ev_dot, e.ev_dot,
                memSizeF, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_ev,     e.ev,
                memSizeF, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_p, e.p,
                memSizeF, cudaMemcpyHostToDevice);

    // Wall parameters
    cudaMemcpy( dev_walls_wmode, walls.wmode,
                sizeof(int)*walls.nw, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_walls_nx,    walls.nx,
                sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_walls_mvfd,  walls.mvfd,
                sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
    cudaMemcpy( dev_walls_tau_x,  walls.tau_x,
                sizeof(Float)*walls.nw, cudaMemcpyHostToDevice);

    // Fluid arrays
    if (fluid == 1) {
        if (cfd_solver == 0) {
            transferNStoGlobalDeviceMemory(1);
        } else if (cfd_solver == 1) {
            transferDarcyToGlobalDeviceMemory(1);
        } else {
            std::cerr << "Error: cfd_solver value not understood ("
                << cfd_solver << ")" << std::endl;
        }
    }

    checkForCudaErrors("End of transferToGlobalDeviceMemory");
    if (verbose == 1 && statusmsg == 1)
        std::cout << "Done" << std::endl;
}

void DEM::transferFromGlobalDeviceMemory()
{
    //std::cout << "  Transferring data from the device:              ";

    // Commonly-used memory sizes
    unsigned int memSizeF  = sizeof(Float) * np;
    unsigned int memSizeF4 = sizeof(Float4) * np;

    // Copy static-size structure data from global device memory to host
    //cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);

    // Kinematic particle values
    cudaMemcpy( k.x, dev_x,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.xyzsum, dev_xyzsum,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.vel, dev_vel,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.acc, dev_acc,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.force, dev_force,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.angpos, dev_angpos,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.angvel, dev_angvel,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.angacc, dev_angacc,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.torque, dev_torque,
            memSizeF4, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.contacts, dev_contacts,
            sizeof(unsigned int)*np*NC, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.distmod, dev_distmod,
            memSizeF4*NC, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.delta_t, dev_delta_t,
            memSizeF4*NC, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.bonds, dev_bonds,
            sizeof(uint2)*params.nb0, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.bonds_delta, dev_bonds_delta,
            sizeof(Float4)*params.nb0, cudaMemcpyDeviceToHost);
    cudaMemcpy( k.bonds_omega, dev_bonds_omega,
            sizeof(Float4)*params.nb0, cudaMemcpyDeviceToHost);

    // Individual particle energy values
    cudaMemcpy( e.es_dot, dev_es_dot,
            memSizeF, cudaMemcpyDeviceToHost);
    cudaMemcpy( e.es, dev_es,
            memSizeF, cudaMemcpyDeviceToHost);
    cudaMemcpy( e.ev_dot, dev_ev_dot,
            memSizeF, cudaMemcpyDeviceToHost);
    cudaMemcpy( e.ev, dev_ev,
            memSizeF, cudaMemcpyDeviceToHost);
    cudaMemcpy( e.p, dev_p,
            memSizeF, cudaMemcpyDeviceToHost);

    // Wall parameters
    cudaMemcpy( walls.wmode, dev_walls_wmode,
            sizeof(int)*walls.nw, cudaMemcpyDeviceToHost);
    cudaMemcpy( walls.nx, dev_walls_nx,
            sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
    cudaMemcpy( walls.mvfd, dev_walls_mvfd,
            sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
    cudaMemcpy( walls.tau_x, dev_walls_tau_x,
            sizeof(Float)*walls.nw, cudaMemcpyDeviceToHost);

    // Fluid arrays
    if (fluid == 1 && cfd_solver == 0) {
        transferNSfromGlobalDeviceMemory(0);
    }
    else if (fluid == 1 && cfd_solver == 1) {
        transferDarcyFromGlobalDeviceMemory(0);
        checkDarcyStability();
    }

    //checkForCudaErrors("End of transferFromGlobalDeviceMemory");
}


// Iterate through time by explicit time integration
void DEM::startTime()
{
    using std::cout;
    using std::cerr;
    using std::endl;

    std::string outfile;
    char file[200];
    FILE *fp;

    // Synchronization point
    cudaDeviceSynchronize();
    checkForCudaErrors("Start of startTime()");

    // Write initial data to output/<sid>.output00000.bin
    if (time.step_count == 0)
        writebin(("output/" + sid + ".output00000.bin").c_str());

    // Time variables
    clock_t tic, toc;
    double filetimeclock, time_spent;
    float dev_time_spent;

    // Start CPU clock
    tic = clock();

    //// GPU workload configuration
    unsigned int threadsPerBlock = 256;
    //unsigned int threadsPerBlock = 512;

    // Create enough blocks to accommodate the particles
    unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock);
    dim3 dimGrid(blocksPerGrid, 1, 1); // Blocks arranged in 1D grid
    dim3 dimBlock(threadsPerBlock, 1, 1); // Threads arranged in 1D block
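
    // iDivUp() from utility.h is assumed to perform round-up integer
    // division, e.g. equivalent to:
    //
    //     unsigned int iDivUp(unsigned int a, unsigned int b)
    //     {
    //         return (a + b - 1)/b;  // e.g. iDivUp(1000, 256) == 4
    //     }
    //
    // which guarantees that the grid contains at least np threads.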

    unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock);
    dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid

    // Use 3D block and grid layout for cell-centered fluid calculations
    dim3 dimBlockFluid(8, 8, 8);    // 512 threads per block
    dim3 dimGridFluid(
            iDivUp(grid.num[0], dimBlockFluid.x),
            iDivUp(grid.num[1], dimBlockFluid.y),
            iDivUp(grid.num[2], dimBlockFluid.z));
    if (dimGridFluid.z > 64 && fluid == 1) {
        cerr << "Error: dimGridFluid.z > 64" << endl;
        exit(1);
    }

    // Use 3D block and grid layout for cell-face fluid calculations
    dim3 dimBlockFluidFace(8, 8, 8);    // 512 threads per block
    dim3 dimGridFluidFace(
            iDivUp(grid.num[0]+1, dimBlockFluidFace.x),
            iDivUp(grid.num[1]+1, dimBlockFluidFace.y),
            iDivUp(grid.num[2]+1, dimBlockFluidFace.z));
    if (dimGridFluidFace.z > 64 && fluid == 1) {
        cerr << "Error: dimGridFluidFace.z > 64" << endl;
        exit(1);
    }


    // Shared memory per block
    unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
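
    // With threadsPerBlock == 256 and 4-byte unsigned ints, smemSize
    // amounts to 4*(256 + 1) = 1028 bytes per block, well within the
    // per-block shared memory capacity of all supported devices.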

    // Pre-sum of force per wall
    cudaMalloc((void**)&dev_walls_force_partial,
            sizeof(Float)*dimGrid.x*walls.nw);

    // Pre-sum of shear stress per wall
    cudaMalloc((void**)&dev_walls_tau_eff_x_partial,
            sizeof(Float)*dimGrid.x*walls.nw);

    // Report to stdout
    if (verbose == 1) {
        cout << "\n  Device memory allocation and transfer complete.\n"
            << "  - Blocks per grid: "
            << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n"
            << "  - Threads per block: "
            << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n"
            << "  - Shared memory required per block: " << smemSize << " bytes"
            << endl;
        if (fluid == 1) {
            cout << "  - Blocks per fluid grid: "
                << dimGridFluid.x << "*" << dimGridFluid.y << "*" <<
                dimGridFluid.z << "\n"
                << "  - Threads per fluid block: "
                << dimBlockFluid.x << "*" << dimBlockFluid.y << "*" <<
                dimBlockFluid.z << endl;
        }
    }

    // Initialize counter variable values
    filetimeclock = 0.0;
    long iter = 0;
    const int stdout_report = 10; // no of steps between reporting to stdout

    // Create first status.dat
    //sprintf(file,"output/%s.status.dat", sid);
    outfile = "output/" + sid + ".status.dat";
    fp = fopen(outfile.c_str(), "w");
    fprintf(fp,"%2.4e %2.4e %d\n",
            time.current,
            100.0*time.current/time.total,
            time.step_count);
    fclose(fp);

    if (verbose == 1) {
        cout << "\n  Entering the main calculation time loop...\n\n"
            << "  IMPORTANT: Do not close this terminal, doing so will \n"
            << "             terminate this SPHERE process. Follow the \n"
            << "             progress by executing:\n"
            << "                $ ./sphere_status " << sid << endl << endl;
    }


    // Start GPU clock
    cudaEvent_t dev_tic, dev_toc;
    cudaEventCreate(&dev_tic);
    cudaEventCreate(&dev_toc);
    cudaEventRecord(dev_tic, 0);

    // If profiling is enabled, initialize timers for each kernel
    cudaEvent_t kernel_tic, kernel_toc;
    float kernel_elapsed;
    double t_calcParticleCellID = 0.0;
    double t_thrustsort = 0.0;
    double t_reorderArrays = 0.0;
    double t_topology = 0.0;
    double t_interact = 0.0;
    double t_bondsLinear = 0.0;
    double t_latticeBoltzmannD3Q19 = 0.0;
    double t_integrate = 0.0;
    double t_summation = 0.0;
    double t_integrateWalls = 0.0;

    double t_findPorositiesDev = 0.0;
    double t_findNSstressTensor = 0.0;
    double t_findNSdivphiviv = 0.0;
    double t_findNSdivtau = 0.0;
    double t_findPredNSvelocities = 0.0;
    double t_setNSepsilon = 0.0;
    double t_setNSdirichlet = 0.0;
    double t_setNSghostNodesDev = 0.0;
    double t_findNSforcing = 0.0;
    double t_jacobiIterationNS = 0.0;
    double t_updateNSvelocityPressure = 0.0;

    double t_findDarcyPorosities = 0.0;
    double t_setDarcyGhostNodes = 0.0;
    double t_findDarcyPressureForce = 0.0;
    double t_setDarcyTopPressure = 0.0;
    double t_findDarcyPermeabilities = 0.0;
    double t_findDarcyPermeabilityGradients = 0.0;
    //double t_findDarcyPressureChange = 0.0;
    double t_updateDarcySolution = 0.0;
    double t_copyValues = 0.0;
    double t_findDarcyVelocities = 0.0;

    if (PROFILING == 1) {
        cudaEventCreate(&kernel_tic);
        cudaEventCreate(&kernel_toc);
    }

    // The model start time is saved for profiling performance
    double t_start = time.current;
    double t_ratio;     // ratio between time flow in model vs. reality

    // Hard-coded parameters for stepwise velocity change (rate-state
    // experiments)
    int velocity_state = 1;  // 1: v1, 2: v2
    int change_velocity_state = 0;  // 1: increase velocity, 2: decrease vel.
    const Float velocity_factor = 10.0;  // v2 = v1*velocity_factor
    const Float v2_start = 10.0; // seconds
    const Float v2_end = 15.0;  // seconds

    // Index of dynamic top wall (if it exists)
    unsigned int wall0_iz = 10000000;
    // hydrostatic pressure difference of the fluid between two cells in the
    // z direction
    Float dp_dz;
    if (fluid == 1) {
        if (cfd_solver == 0)
            dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
        else if (cfd_solver == 1) {
            dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);

            // determine pressure at top wall at t=0
            darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
                                - darcy.p_mod_A
                                *sin(2.0*M_PI*darcy.p_mod_f*time.current
                                        + darcy.p_mod_phi);
        }
    }
    //std::cout << "dp_dz = " << dp_dz << std::endl;

    // Write a log file of the number of iterations it took before
    // convergence in the fluid solver
    std::ofstream convlog;
    if (write_conv_log == 1) {
        std::string f = "output/" + sid + "-conv.log";
        convlog.open(f.c_str());
    }

    if (verbose == 1)
        cout << "  Current simulation time: " << time.current << " s.";

    // MAIN CALCULATION TIME LOOP
    while (time.current <= time.total) {

        // Print current step number to terminal
        //printf("\n\n@@@ DEM time step: %ld\n", iter);

        // Routine check for errors
        checkForCudaErrors("Start of main while loop");

        if (np > 0) {

            // check if particle positions have finite values
#ifdef CHECK_PARTICLES_FINITE
            checkParticlePositions<<<dimGrid, dimBlock>>>(dev_x);
            cudaDeviceSynchronize();
            checkForCudaErrorsIter("Post checkParticlePositions", iter);
#endif

            // If the grid is adaptive, readjust the grid height to equal the
            // positions of the dynamic walls
            if (grid.adaptive == 1 && walls.nw > 0) {
                updateGridSize();
            }

            // For each particle:
            // Compute hash key (cell index) from position
            // in the fine, uniform and homogeneous grid.
            if (PROFILING == 1)
                startTimer(&kernel_tic);
            calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
                    dev_gridParticleIndex,
                    dev_x);

            // Synchronization point
            cudaDeviceSynchronize();
            if (PROFILING == 1)
                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
                        &t_calcParticleCellID);
            checkForCudaErrorsIter("Post calcParticleCellID", iter);


            // Sort particle (key, particle ID) pairs by hash key with Thrust
            // radix sort
            if (PROFILING == 1)
                startTimer(&kernel_tic);
            thrust::sort_by_key(
                    thrust::device_ptr<uint>(dev_gridParticleCellID),
                    thrust::device_ptr<uint>(dev_gridParticleCellID + np),
                    thrust::device_ptr<uint>(dev_gridParticleIndex));
            cudaDeviceSynchronize(); // Thrust sorts are blocking, but
                                     // synchronize explicitly to be safe
            if (PROFILING == 1)
                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
                        &t_thrustsort);
            checkForCudaErrorsIter("Post thrust::sort_by_key", iter);


            // Reset the cell array by setting each byte to 0xff, so that
            // cellStart holds the highest possible 32 bit unsigned int value,
            // 0xffffffff = 4294967295 (cudaMemset only uses the lowest byte
            // of its value argument).
            cudaMemset(dev_cellStart, 0xffffffff,
                    grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
            cudaDeviceSynchronize();
            checkForCudaErrorsIter("Post cudaMemset", iter);

         1103             // Use sorted order to reorder particle arrays (position,
         1104             // velocities, radii) to ensure coherent memory access. Save ordered
         1105             // configurations in new arrays (*_sorted).
         1106             if (PROFILING == 1)
         1107                 startTimer(&kernel_tic);
         1108             reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
         1109                     dev_cellEnd,
         1110                     dev_gridParticleCellID,
         1111                     dev_gridParticleIndex,
         1112                     dev_x, dev_vel,
         1113                     dev_angvel,
         1114                     dev_x_sorted,
         1115                     dev_vel_sorted,
         1116                     dev_angvel_sorted);
         1117 
         1118             // Synchronization point
         1119             cudaDeviceSynchronize();
         1120             if (PROFILING == 1)
         1121                 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1122                         &t_reorderArrays);
         1123             checkForCudaErrorsIter("Post reorderArrays", iter);
         1124 
         1125             // The contact search in topology() is only necessary for
         1126             // determining the accumulated shear distance needed in the
         1127             // linear elastic and nonlinear contact force models
         1128             if (params.contactmodel == 2 || params.contactmodel == 3) {
         1129                 // For each particle: Search contacts in neighbor cells
         1130                 if (PROFILING == 1)
         1131                     startTimer(&kernel_tic);
         1132                 topology<<<dimGrid, dimBlock>>>(dev_cellStart,
         1133                         dev_cellEnd,
         1134                         dev_gridParticleIndex,
         1135                         dev_x_sorted,
         1136                         dev_contacts,
         1137                         dev_distmod);
         1138 
         1139                 // Synchronization point
         1140                 cudaDeviceSynchronize();
         1141                 if (PROFILING == 1)
         1142                     stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1143                             &t_topology);
         1144                 checkForCudaErrorsIter(
         1145                         "Post topology: One or more particles moved "
         1146                         "outside the grid.\nThis could possibly be caused by a "
         1147                         "numerical instability.\nIs the computational time step"
         1148                         " too large?", iter);
         1149             }
         1150 
         1151             // For each particle process collisions and compute resulting forces
         1152             //cudaPrintfInit();
         1153             if (PROFILING == 1)
         1154                 startTimer(&kernel_tic);
         1155             interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
         1156                     dev_cellStart,
         1157                     dev_cellEnd,
         1158                     dev_x,
         1159                     dev_x_sorted,
         1160                     dev_vel_sorted,
         1161                     dev_angvel_sorted,
         1162                     dev_vel,
         1163                     dev_angvel,
         1164                     dev_force,
         1165                     dev_torque,
         1166                     dev_es_dot,
         1167                     dev_ev_dot,
         1168                     dev_es,
         1169                     dev_ev,
         1170                     dev_p,
         1171                     dev_walls_nx,
         1172                     dev_walls_mvfd,
         1173                     dev_walls_force_pp,
         1174                     dev_contacts,
         1175                     dev_distmod,
         1176                     dev_delta_t);
         1177 
         1178             // Synchronization point
         1179             cudaDeviceSynchronize();
         1180             //cudaPrintfDisplay(stdout, true);
         1181             if (PROFILING == 1)
         1182                 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1183                         &t_interact);
         1184             checkForCudaErrorsIter(
         1185                     "Post interact - errors here are often caused by "
         1186                     "particles moving outside the grid", iter);
         1187 
         1188             // Process particle pairs
         1189             if (params.nb0 > 0) {
         1190                 if (PROFILING == 1)
         1191                     startTimer(&kernel_tic);
         1192                 bondsLinear<<<dimGridBonds, dimBlock>>>(
         1193                         dev_bonds,
         1194                         dev_bonds_delta,
         1195                         dev_bonds_omega,
         1196                         dev_x,
         1197                         dev_vel,
         1198                         dev_angvel,
         1199                         dev_force,
         1200                         dev_torque);
         1201                 // Synchronization point
         1202                 cudaDeviceSynchronize();
         1203                 //cudaPrintfDisplay(stdout, true);
         1204                 if (PROFILING == 1)
         1205                     stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1206                             &t_bondsLinear);
         1207                 checkForCudaErrorsIter("Post bondsLinear", iter);
         1208             }
         1209         }
         1210 
         1211         // Solve fluid flow through the grid
         1212         if (fluid == 1) {
         1213 
         1214             // Navier-Stokes solution
         1215             if (cfd_solver == 0) {
         1216 
         1217                 checkForCudaErrorsIter("Before findPorositiesDev", iter);
         1218                 // Find cell porosities, average particle velocities, and
         1219                 // average particle diameters. These are needed for predicting
         1220                 // the fluid velocities
         1221                 if (PROFILING == 1)
         1222                     startTimer(&kernel_tic);
         1223                 findPorositiesVelocitiesDiametersSpherical
         1224                 //findPorositiesVelocitiesDiametersSphericalGradient
         1225                     <<<dimGridFluid, dimBlockFluid>>>(
         1226                             dev_cellStart,
         1227                             dev_cellEnd,
         1228                             dev_x_sorted,
         1229                             dev_vel_sorted,
         1230                             dev_ns_phi,
         1231                             dev_ns_dphi,
         1232                             dev_ns_vp_avg,
         1233                             dev_ns_d_avg,
         1234                             iter,
         1235                             np,
         1236                             ns.c_phi);
         1237                 cudaDeviceSynchronize();
         1238                 if (PROFILING == 1)
         1239                     stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1240                             &t_findPorositiesDev);
         1241                 checkForCudaErrorsIter("Post findPorositiesDev", iter);
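                              // Porosity sketch (assumed form): with a spherical
                              // sampling volume V_s centred in each fluid cell,
                              //   phi = 1 - (sum of particle volumes inside V_s)/V_s
                              // scaled by ns.c_phi; dphi is the change in phi over
                              // the time step.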
         1242 
         1243 #ifdef CFD_DEM_COUPLING
         1244                 /*if (params.nu <= 0.0) {
         1245                   std::cerr << "Error! The fluid needs a positive viscosity "
         1246                   "value in order to simulate particle-fluid interaction."
         1247                   << std::endl;
         1248                   exit(1);
         1249                   }*/
         1250                 if (iter == 0) {
         1251                     // set cell center ghost nodes
         1252                     setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
         1253                             dev_ns_v, ns.bc_bot, ns.bc_top);
         1254 
         1255                     // find cell face velocities
         1256                     interpolateCenterToFace
         1257                         <<<dimGridFluidFace, dimBlockFluidFace>>>(
         1258                                 dev_ns_v,
         1259                                 dev_ns_v_x,
         1260                                 dev_ns_v_y,
         1261                                 dev_ns_v_z);
         1262                     cudaDeviceSynchronize();
         1263                     checkForCudaErrors("Post interpolateCenterToFace");
         1264                 }
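                              // Face-interpolation sketch: each face value is
                              // assumed to be the mean of the two adjacent cell
                              // centres, e.g.
                              //   v_x(i-1/2,j,k) = 0.5*(v(i-1,j,k).x + v(i,j,k).x)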
         1265 
         1266                 setNSghostNodesFace<Float>
         1267                     <<<dimGridFluidFace, dimBlockFluidFace>>>(
         1268                             dev_ns_v_x,
         1269                             dev_ns_v_y,
         1270                             dev_ns_v_z,
         1271                             ns.bc_bot,
         1272                             ns.bc_top);
         1273                 cudaDeviceSynchronize();
         1274                 checkForCudaErrorsIter("Post setNSghostNodesFace", iter);
         1275 
         1276                 findFaceDivTau<<<dimGridFluidFace, dimBlockFluidFace>>>(
         1277                         dev_ns_v_x,
         1278                         dev_ns_v_y,
         1279                         dev_ns_v_z,
         1280                         ns.mu,
         1281                         dev_ns_div_tau_x,
         1282                         dev_ns_div_tau_y,
         1283                         dev_ns_div_tau_z);
         1284                 cudaDeviceSynchronize();
         1285                 checkForCudaErrorsIter("Post findFaceDivTau", iter);
         1286 
         1287                 setNSghostNodesFace<Float>
         1288                     <<<dimGridFluidFace, dimBlockFluidFace>>>(
         1289                             dev_ns_div_tau_x,
         1290                             dev_ns_div_tau_y,
         1291                             dev_ns_div_tau_z,
         1292                             ns.bc_bot,
         1293                             ns.bc_top);
         1294                 cudaDeviceSynchronize();
         1295                 checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_div_tau)",
         1296                         iter);
         1297 
         1298                 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1299                         dev_ns_p, ns.bc_bot, ns.bc_top);
         1300                 cudaDeviceSynchronize();
         1301                 checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_p)", iter);
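                              // Ghost-node sketch: the fluid grid is padded by one
                              // cell on each side, and setNSghostNodes fills the
                              // pads so interior stencils need no boundary
                              // branches; e.g. for a periodic x axis (illustrative
                              // indexing):
                              //   p(-1,j,k) <- p(nx-1,j,k),  p(nx,j,k) <- p(0,j,k)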
         1302 
         1303                 setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1304                         dev_ns_phi, ns.bc_bot, ns.bc_top);
         1305                 cudaDeviceSynchronize();
         1306                 checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_phi)", iter);
         1307 
         1308 
         1309                 if (np > 0) {
         1310 
         1311                     // Per particle, find the fluid-particle interaction force
         1312                     // f_pf and apply it to the particle
         1313                     findInteractionForce<<<dimGrid, dimBlock>>>(
         1314                             dev_x,
         1315                             dev_vel,
         1316                             dev_ns_phi,
         1317                             dev_ns_p,
         1318                             dev_ns_v_x,
         1319                             dev_ns_v_y,
         1320                             dev_ns_v_z,
         1321                             dev_ns_div_tau_x,
         1322                             dev_ns_div_tau_y,
         1323                             dev_ns_div_tau_z,
         1324                             //ns.c_v,
         1325                             ns.mu,
         1326                             ns.rho_f,
         1327                             dev_ns_f_pf,
         1328                             dev_force,
         1329                             dev_ns_f_d,
         1330                             dev_ns_f_p,
         1331                             dev_ns_f_v,
         1332                             dev_ns_f_sum);
         1333                     cudaDeviceSynchronize();
         1334                     checkForCudaErrorsIter("Post findInteractionForce", iter);
         1335 
         1336                     setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1337                             dev_ns_p, ns.bc_bot, ns.bc_top);
         1338                     cudaDeviceSynchronize();
         1339                     checkForCudaErrorsIter("Post setNSghostNodes(dev_ns_p)",
         1340                             iter);
         1341 
         1342                     // Apply fluid-particle interaction force to the fluid
         1343                     applyInteractionForceToFluid
         1344                         <<<dimGridFluid, dimBlockFluid>>>(
         1345                             dev_gridParticleIndex,
         1346                             dev_cellStart,
         1347                             dev_cellEnd,
         1348                             dev_ns_f_pf,
         1349                             dev_ns_F_pf);
         1353                     cudaDeviceSynchronize();
         1354                     checkForCudaErrorsIter("Post applyInteractionForceToFluid",
         1355                             iter);
         1356 
         1357                     setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
         1358                             dev_ns_F_pf, ns.bc_bot, ns.bc_top);
         1359                     cudaDeviceSynchronize();
         1360                     checkForCudaErrorsIter("Post setNSghostNodes(F_pf)", iter);
         1361                 }
         1362 #endif
         1363 
         1364                 if ((iter % ns.ndem) == 0) {
         1365                     // Initial guess for the top epsilon values. These may be
         1366                     // changed in setUpperPressureNS
         1367                     // TODO: Check if this should only be set when top bc=Dirichlet
         1368                     Float pressure = ns.p[idx(0,0,ns.nz-1)];
         1369                     Float pressure_new = pressure; // Dirichlet
         1370                     Float epsilon_value = pressure_new - ns.beta*pressure;
         1371                     setNSepsilonTop<<<dimGridFluid, dimBlockFluid>>>(
         1372                             dev_ns_epsilon,
         1373                             dev_ns_epsilon_new,
         1374                             epsilon_value);
         1375                     cudaDeviceSynchronize();
         1376                     checkForCudaErrorsIter("Post setNSepsilonTop", iter);
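                                  // epsilon acts as the pressure correction: with
                                  // relaxation beta, updateNSpressure below applies
                                  //   p_new = beta*p_old + epsilon
                                  // so the guess epsilon = p_new - beta*p_old leaves
                                  // the top (Dirichlet) pressure unchanged.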
         1377 
         1378 #if defined(REPORT_EPSILON) || defined(REPORT_V_P_COMPONENTS) || defined(REPORT_V_C_COMPONENTS)
         1379                     std::cout
         1380                         << "\n\n@@@@@@ TIME STEP " << iter << " @@@@@@"
         1381                         << std::endl;
         1382 #endif
         1383 
         1384                     // find index of the cell containing the top wall: iz = w/dz
         1385                     if (walls.nw > 0 &&
         1386                             (walls.wmode[0] == 1 || walls.wmode[0] == 3)) {
         1387                         wall0_iz = walls.nx->w/(grid.L[2]/grid.num[2]);
         1388                         setNSepsilonAtTopWall<<<dimGridFluid, dimBlockFluid>>>(
         1389                                 dev_ns_epsilon,
         1390                                 dev_ns_epsilon_new,
         1391                                 wall0_iz,
         1392                                 epsilon_value,
         1393                                 dp_dz);
         1394                         cudaDeviceSynchronize();
         1395                         checkForCudaErrorsIter("Post setNSepsilonAtTopWall",
         1396                                 iter);
         1397 
         1398 #ifdef REPORT_EPSILON
         1399                         std::cout
         1400                             << "\n###### EPSILON setNSepsilonAtTopWall "
         1401                             << "######" << std::endl;
         1402                         transferNSepsilonFromGlobalDeviceMemory();
         1403                         printNSarray(stdout, ns.epsilon, "epsilon");
         1404 #endif
         1405                     }
         1406 
         1407                     // Modulate the pressures at the upper boundary cells
         1408                     if ((ns.p_mod_A > 1.0e-5 || ns.p_mod_A < -1.0e-5) &&
         1409                             ns.p_mod_f > 1.0e-7) {
         1410                         // new pressure = original pressure + sinusoidal modulation
         1411                         Float new_pressure = ns.p[idx(0,0,ns.nz-1)]
         1412                             + ns.p_mod_A*sin(2.0*M_PI*ns.p_mod_f*time.current
         1413                                     + ns.p_mod_phi);
         1414                         setUpperPressureNS<<<dimGridFluid, dimBlockFluid>>>(
         1415                                 dev_ns_p,
         1416                                 dev_ns_epsilon,
         1417                                 dev_ns_epsilon_new,
         1418                                 ns.beta,
         1419                                 new_pressure);
         1420                         cudaDeviceSynchronize();
         1421                         checkForCudaErrorsIter("Post setUpperPressureNS", iter);
         1422 
         1423 #ifdef REPORT_MORE_EPSILON
         1424                         std::cout
         1425                             << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
         1426                             << "\n###### EPSILON AFTER setUpperPressureNS "
         1427                             << "######" << std::endl;
         1428                         transferNSepsilonFromGlobalDeviceMemory();
         1429                         printNSarray(stdout, ns.epsilon, "epsilon");
         1430 #endif
         1431                     }
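                                  // Pressure modulation, as coded above:
                                  //   p_top(t) = p_orig + A*sin(2*pi*f*t + phi)
                                  // with A = ns.p_mod_A, f = ns.p_mod_f and
                                  // phi = ns.p_mod_phi.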
         1432 
         1433                     // Set the values of the ghost nodes in the grid
         1434                     if (PROFILING == 1)
         1435                         startTimer(&kernel_tic);
         1436 
         1437                     setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1438                             dev_ns_p, ns.bc_bot, ns.bc_top);
         1439 
         1440                     //setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
         1441                     //dev_ns_v, ns.bc_bot, ns.bc_top);
         1442 
         1443                     setNSghostNodesFace<Float>
         1444                         <<<dimGridFluidFace, dimBlockFluidFace>>>(
         1445                                 dev_ns_v_p_x,
         1446                                 dev_ns_v_p_y,
         1447                                 dev_ns_v_p_z,
         1448                                 ns.bc_bot, ns.bc_top);
         1449 
         1450                     setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1451                             dev_ns_phi, ns.bc_bot, ns.bc_top);
         1452 
         1453                     setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1454                             dev_ns_dphi, ns.bc_bot, ns.bc_top);
         1455 
         1456                     cudaDeviceSynchronize();
         1457                     if (PROFILING == 1)
         1458                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1459                                 &t_setNSghostNodesDev);
         1460                     checkForCudaErrorsIter("Post setNSghostNodesDev", iter);
         1461                     /*std::cout
         1462                       << "\n###### EPSILON AFTER setNSghostNodesDev #####"
         1463                       << std::endl;
         1464                       transferNSepsilonFromGlobalDeviceMemory();
         1465                       printNSarray(stdout, ns.epsilon, "epsilon");*/
         1466 
         1467                     // interpolate velocities to the cell centers, which
         1468                     // makes velocity prediction easier
         1469                     interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
         1470                             dev_ns_v_x,
         1471                             dev_ns_v_y,
         1472                             dev_ns_v_z,
         1473                             dev_ns_v);
         1474                     cudaDeviceSynchronize();
         1475                     checkForCudaErrorsIter(
         1476                             "Post interpolateFaceToCenter", iter);
         1477 
         1478                     // Set cell-center velocity ghost nodes
         1479                     setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
         1480                             dev_ns_v, ns.bc_bot, ns.bc_top);
         1481                     cudaDeviceSynchronize();
         1482                     checkForCudaErrorsIter("Post setNSghostNodes(v)", iter);
         1483 
         1484                     // Find the divergence of phi*vi*v, needed for predicting
         1485                     // the fluid velocities
         1486                     if (PROFILING == 1)
         1487                         startTimer(&kernel_tic);
         1488                     findNSdivphiviv<<<dimGridFluid, dimBlockFluid>>>(
         1489                             dev_ns_phi,
         1490                             dev_ns_v,
         1491                             dev_ns_div_phi_vi_v);
         1492                     cudaDeviceSynchronize();
         1493                     if (PROFILING == 1)
         1494                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1495                                 &t_findNSdivphiviv);
         1496                     checkForCudaErrorsIter("Post findNSdivphiviv", iter);
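                                  // div(phi*v_i*v) is the advective term of the
                                  // volume-averaged momentum equation, schematically
                                  //   d(phi*v_i)/dt = -div(phi*v_i*v) + ...
                                  // for each velocity component v_i.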
         1497 
         1498                     // Set cell-center ghost nodes
         1499                     setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
         1500                             dev_ns_div_phi_vi_v, ns.bc_bot, ns.bc_top);
         1501                     cudaDeviceSynchronize();
         1502                     checkForCudaErrorsIter("Post setNSghostNodes(div_phi_vi_v)",
         1503                             iter);
         1504 
         1505                     // Predict the fluid velocities on the basis of the old
         1506                     // pressure field, ignoring the incompressibility
         1507                     // constraint
         1508                     if (PROFILING == 1)
         1509                         startTimer(&kernel_tic);
         1510                     findPredNSvelocities<<<dimGridFluidFace, dimBlockFluidFace>>>(
         1511                             dev_ns_p,
         1512                             dev_ns_v_x,
         1513                             dev_ns_v_y,
         1514                             dev_ns_v_z,
         1515                             dev_ns_phi,
         1516                             dev_ns_dphi,
         1517                             dev_ns_div_tau_x,
         1518                             dev_ns_div_tau_y,
         1519                             dev_ns_div_tau_z,
         1520                             dev_ns_div_phi_vi_v,
         1521                             ns.bc_bot,
         1522                             ns.bc_top,
         1523                             ns.beta,
         1524                             dev_ns_F_pf,
         1525                             ns.ndem,
         1526                             wall0_iz,
         1527                             ns.c_v,
         1528                             ns.mu,
         1529                             ns.rho_f,
         1530                             dev_ns_v_p_x,
         1531                             dev_ns_v_p_y,
         1532                             dev_ns_v_p_z);
         1533                     cudaDeviceSynchronize();
         1534                     if (PROFILING == 1)
         1535                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1536                                 &t_findPredNSvelocities);
         1537                     checkForCudaErrorsIter("Post findPredNSvelocities", iter);
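                                  // This is the predictor step of the projection
                                  // method: v* (dev_ns_v_p_*) satisfies momentum
                                  // with the old pressure only; the Jacobi solve
                                  // below finds the epsilon that restores
                                  // continuity in the corrected field.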
         1538 
         1539                     setNSghostNodesFace<Float>
         1540                         <<<dimGridFluidFace, dimBlockFluidFace>>>(
         1541                                 dev_ns_v_p_x,
         1542                                 dev_ns_v_p_y,
         1543                                 dev_ns_v_p_z,
         1544                                 ns.bc_bot, ns.bc_top);
         1545                     cudaDeviceSynchronize();
         1546                     checkForCudaErrorsIter(
         1547                             "Post setNSghostNodesFace(dev_ns_v_p)", iter);
         1548 
         1549                     interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
         1550                             dev_ns_v_p_x,
         1551                             dev_ns_v_p_y,
         1552                             dev_ns_v_p_z,
         1553                             dev_ns_v_p);
         1554                     cudaDeviceSynchronize();
         1555                     checkForCudaErrorsIter(
         1556                             "Post interpolateFaceToCenter", iter);
         1557 
         1558 
         1559                     // In the first iteration of the sphere program the
         1560                     // values of epsilon must be estimated manually; in
         1561                     // subsequent iterations the previous values are used.
         1562                     if (iter == 0) {
         1563 
         1564                         // Define the first estimate of the values of epsilon.
         1565                         // The initial guess depends on the value of ns.beta.
         1566                         Float pressure = ns.p[idx(2,2,2)];
         1567                         Float pressure_new = pressure; // Guess p_curr = p_new
         1568                         Float epsilon_value = pressure_new - ns.beta*pressure;
         1569                         if (PROFILING == 1)
         1570                             startTimer(&kernel_tic);
         1571                         setNSepsilonInterior<<<dimGridFluid, dimBlockFluid>>>(
         1572                                 dev_ns_epsilon, epsilon_value);
         1573                         cudaDeviceSynchronize();
         1574 
         1575                         setNSnormZero<<<dimGridFluid, dimBlockFluid>>>
         1576                             (dev_ns_norm);
         1577                         cudaDeviceSynchronize();
         1578 
         1579                         if (PROFILING == 1)
         1580                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1581                                     &t_setNSepsilon);
         1582                         checkForCudaErrorsIter("Post setNSepsilonInterior",
         1583                                 iter);
         1584 
         1585 #ifdef REPORT_MORE_EPSILON
         1586                         std::cout
         1587                             << "\n###### EPSILON AFTER setNSepsilonInterior "
         1588                             << "######" << std::endl;
         1589                         transferNSepsilonFromGlobalDeviceMemory();
         1590                         printNSarray(stdout, ns.epsilon, "epsilon");
         1591 #endif
         1592 
         1593                         // Set the epsilon values at the lower boundary
         1594                         pressure = ns.p[idx(0,0,0)];
         1595                         pressure_new = pressure; // Guess p_current = p_new
         1596                         epsilon_value = pressure_new - ns.beta*pressure;
         1597                         if (PROFILING == 1)
         1598                             startTimer(&kernel_tic);
         1599                         setNSepsilonBottom<<<dimGridFluid, dimBlockFluid>>>(
         1600                                 dev_ns_epsilon,
         1601                                 dev_ns_epsilon_new,
         1602                                 epsilon_value);
         1603                         cudaDeviceSynchronize();
         1604                         if (PROFILING == 1)
         1605                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1606                                     &t_setNSdirichlet);
         1607                         checkForCudaErrorsIter("Post setNSepsilonBottom", iter);
         1608 
         1609 #ifdef REPORT_MORE_EPSILON
         1610                         std::cout
         1611                             << "\n###### EPSILON AFTER setNSepsilonBottom "
         1612                             << "######" << std::endl;
         1613                         transferNSepsilonFromGlobalDeviceMemory();
         1614                         printNSarray(stdout, ns.epsilon, "epsilon");
         1615 #endif
         1616 
         1624                         setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1625                                 dev_ns_epsilon,
         1626                                 ns.bc_bot, ns.bc_top);
         1627                         cudaDeviceSynchronize();
         1628                         checkForCudaErrorsIter("Post setNSghostNodesEpsilon(1)",
         1629                                 iter);
         1630 
         1631 #ifdef REPORT_MORE_EPSILON
         1632                         std::cout <<
         1633                             "\n###### EPSILON AFTER setNSghostNodes(epsilon) "
         1634                             << "######" << std::endl;
         1635                         transferNSepsilonFromGlobalDeviceMemory();
         1636                         printNSarray(stdout, ns.epsilon, "epsilon");
         1637 #endif
         1638                     }
         1639 
         1640                     // Solve the system for epsilon using a Jacobi
         1641                     // iterative solver. The maximum normalized residual
         1642                     // is initialized to a large value so the convergence
         1643                     // test is well defined in the first iteration.
         1644                     double max_norm_res = 1.0e9;
         1645 
         1646                     // Write a log file of the normalized residuals during the
         1647                     // Jacobi iterations
         1648                     std::ofstream reslog;
         1649                     if (write_res_log == 1)
         1650                         reslog.open("max_res_norm.dat");
         1651 
         1652                     // optionally report epsilon before the first Jacobi iteration
         1653 #ifdef REPORT_MORE_EPSILON
         1654                     std::cout << "\n###### BEFORE FIRST JACOBI ITERATION ######"
         1655                         << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
         1656                         << std::endl;
         1657                     transferNSepsilonFromGlobalDeviceMemory();
         1658                     printNSarray(stdout, ns.epsilon, "epsilon");
         1659 #endif
         1660 
         1661                     for (unsigned int nijac = 0; nijac < ns.maxiter; ++nijac) {
         1662 
         1663                         // Only grad(epsilon) changes during the Jacobi
         1664                         // iterations.  The remaining terms of the forcing
         1665                         // function are only calculated during the first
         1666                         // iteration.
         1667                         if (PROFILING == 1)
         1668                             startTimer(&kernel_tic);
         1669                         findNSforcing<<<dimGridFluid, dimBlockFluid>>>(
         1670                                 dev_ns_epsilon,
         1671                                 dev_ns_phi,
         1672                                 dev_ns_dphi,
         1673                                 dev_ns_v_p,
         1674                                 dev_ns_v_p_x,
         1675                                 dev_ns_v_p_y,
         1676                                 dev_ns_v_p_z,
         1677                                 nijac,
         1678                                 ns.ndem,
         1679                                 ns.c_v,
         1680                                 ns.rho_f,
         1681                                 dev_ns_f1,
         1682                                 dev_ns_f2,
         1683                                 dev_ns_f);
         1684                         cudaDeviceSynchronize();
         1685                         if (PROFILING == 1)
         1686                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1687                                     &t_findNSforcing);
         1688                         checkForCudaErrorsIter("Post findNSforcing", iter);
         1689                         /*setNSghostNodesForcing
         1690                           <<dimGridFluid, dimBlockFluid>>>(
         1691                           dev_ns_f1,
         1692                           dev_ns_f2,
         1693                           dev_ns_f,
         1694                           nijac);
         1695                           cudaDeviceSynchronize();
         1696                           checkForCudaErrors("Post setNSghostNodesForcing",
         1697                           iter);*/
         1698 
         1699                         setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1700                                 dev_ns_epsilon,
         1701                                 ns.bc_bot, ns.bc_top);
         1702                         cudaDeviceSynchronize();
         1703                         checkForCudaErrorsIter("Post setNSghostNodesEpsilon(2)",
         1704                                 iter);
         1705 
         1706 #ifdef REPORT_EPSILON
         1707                         std::cout << "\n###### JACOBI ITERATION "
         1708                             << nijac
         1709                             << " after setNSghostNodes(epsilon,2) ######"
         1710                             << std::endl;
         1711                         transferNSepsilonFromGlobalDeviceMemory();
         1712                         printNSarray(stdout, ns.epsilon, "epsilon");
         1713 #endif
         1714 
         1715                         // Perform a single Jacobi iteration
         1716                         if (PROFILING == 1)
         1717                             startTimer(&kernel_tic);
         1718                         jacobiIterationNS<<<dimGridFluid, dimBlockFluid>>>(
         1719                                 dev_ns_epsilon,
         1720                                 dev_ns_epsilon_new,
         1721                                 dev_ns_norm,
         1722                                 dev_ns_f,
         1723                                 ns.bc_bot,
         1724                                 ns.bc_top,
         1725                                 ns.theta,
         1726                                 wall0_iz,
         1727                                 dp_dz);
         1728                         cudaDeviceSynchronize();
         1729                         if (PROFILING == 1)
         1730                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1731                                     &t_jacobiIterationNS);
         1732                         checkForCudaErrorsIter("Post jacobiIterationNS", iter);
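                                      // Jacobi sketch for the Poisson-type system
                                      // (illustrative: 7-point stencil, constant
                                      // coefficients, uniform spacing h):
                                      //   e_new = (e_xm + e_xp + e_ym + e_yp
                                      //            + e_zm + e_zp - h*h*f)/6.0
                                      // with ns.theta blending e_new and the old value.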
         1733 
         1734                         // set Dirichlet and Neumann BC at cells containing top
         1735                         // wall
         1736                         /*if (walls.nw > 0 && walls.wmode[0] == 1) {
         1737                           setNSepsilonAtTopWall
         1738                           <<<dimGridFluid, dimBlockFluid>>>(
         1739                           dev_ns_epsilon,
         1740                           dev_ns_epsilon_new,
         1741                           wall0_iz,
         1742                           epsilon_value,
         1743                           dp_dz);
         1744                           cudaDeviceSynchronize();
         1745                           checkForCudaErrorsIter("Post setNSepsilonAtTopWall",
         1746                           iter);
         1747                           }*/
         1748 
         1749                         // Copy new values to current values
         1750                         copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
         1751                                 dev_ns_epsilon_new,
         1752                                 dev_ns_epsilon);
         1753                         cudaDeviceSynchronize();
         1754                         checkForCudaErrorsIter
         1755                             ("Post copyValues (epsilon_new->epsilon)", iter);
         1756 
         1757 #ifdef REPORT_EPSILON
         1758                         std::cout << "\n###### JACOBI ITERATION "
         1759                             << nijac << " after jacobiIterationNS ######"
         1760                             << std::endl;
         1761                         transferNSepsilonFromGlobalDeviceMemory();
         1762                         printNSarray(stdout, ns.epsilon, "epsilon");
         1763 #endif
         1764 
         1765                         if (nijac % nijacnorm == 0) {
         1766 
         1767                             // Read the normalized residuals from the device
         1768                             transferNSnormFromGlobalDeviceMemory();
         1769 
         1770                             // Write the normalized residuals to the terminal
         1771                             //printNSarray(stdout, ns.norm, "norm");
         1772 
         1773                             // Find the maximum value of the normalized
         1774                             // residuals
         1775                             max_norm_res = maxNormResNS();
         1776 
         1777                             // Write the Jacobi iteration number and maximum
         1778                             // value of the normalized residual to the log file
         1779                             if (write_res_log == 1)
         1780                                 reslog << nijac << '\t' << max_norm_res
         1781                                     << std::endl;
         1782                         }
         1783 
         1784                         if (max_norm_res < ns.tolerance) {
         1785 
         1786                             if (write_conv_log == 1
         1787                                     && iter % conv_log_interval == 0)
         1788                                 convlog << iter+1 << '\t' << nijac << std::endl;
         1789 
         1790                             setNSghostNodes<Float>
         1791                                 <<<dimGridFluid, dimBlockFluid>>>(
         1792                                         dev_ns_epsilon,
         1793                                         ns.bc_bot, ns.bc_top);
         1794                             cudaDeviceSynchronize();
         1795                             checkForCudaErrorsIter
         1796                                 ("Post setNSghostNodesEpsilon(4)", iter);
         1797 
         1798                             // Apply smoothing if requested
         1799                             if (ns.gamma > 0.0) {
         1800 
         1801                                 smoothing<<<dimGridFluid, dimBlockFluid>>>(
         1802                                         dev_ns_epsilon,
         1803                                         ns.gamma,
         1804                                         ns.bc_bot, ns.bc_top);
         1805                                 cudaDeviceSynchronize();
         1806                                 checkForCudaErrorsIter("Post smoothing", iter);
         1807 
         1808                                 setNSghostNodes<Float>
         1809                                     <<<dimGridFluid, dimBlockFluid>>>(
         1810                                             dev_ns_epsilon,
         1811                                             ns.bc_bot, ns.bc_top);
         1812                                 cudaDeviceSynchronize();
         1813                                 checkForCudaErrorsIter
         1814                                     ("Post setNSghostNodesEpsilon(5)", iter);
         1815                             }
         1816 
         1817 #ifdef REPORT_EPSILON
         1818                             std::cout << "\n###### JACOBI ITERATION "
         1819                                 << nijac << " after smoothing ######"
         1820                                 << std::endl;
         1821                             transferNSepsilonFromGlobalDeviceMemory();
         1822                             printNSarray(stdout, ns.epsilon, "epsilon");
         1823 #endif
         1824 
         1825                             break;  // solution has converged, exit Jacobi loop
         1826                         }
         1827 
         1828                         if (nijac >= ns.maxiter-1) {
         1829 
         1830                             if (write_conv_log == 1)
         1831                                 convlog << iter+1 << '\t' << nijac << std::endl;
         1832 
         1833                             std::cerr << "\nIteration " << iter << ", time "
         1834                                 << iter*time.dt << " s: "
         1835                                 "Error, the epsilon solution in the fluid "
         1836                                 "calculations did not converge. Try increasing "
         1837                                 "the value of 'ns.maxiter' (" << ns.maxiter
         1838                                 << ") or increasing 'ns.tolerance' ("
         1839                                 << ns.tolerance << ")." << std::endl;
         1840                         }
         1841                         //break; // end after Jacobi first iteration
         1842                     } // end Jacobi iteration loop
         1843 
         1844                     if (write_res_log == 1)
         1845                         reslog.close();
         1846 
         1847                     // Find the new pressures and velocities
         1848                     if (PROFILING == 1)
         1849                         startTimer(&kernel_tic);
         1850 
         1851                     updateNSpressure<<<dimGridFluid, dimBlockFluid>>>(
         1852                             dev_ns_epsilon,
         1853                             ns.beta,
         1854                             dev_ns_p);
         1855                     cudaDeviceSynchronize();
         1856                     checkForCudaErrorsIter("Post updateNSpressure", iter);
         1857 
         1858                     updateNSvelocity<<<dimGridFluidFace, dimBlockFluidFace>>>(
         1859                             dev_ns_v_p_x,
         1860                             dev_ns_v_p_y,
         1861                             dev_ns_v_p_z,
         1862                             dev_ns_phi,
         1863                             dev_ns_epsilon,
         1864                             ns.beta,
         1865                             ns.bc_bot,
         1866                             ns.bc_top,
         1867                             ns.ndem,
         1868                             ns.c_v,
         1869                             ns.rho_f,
         1870                             wall0_iz,
         1871                             iter,
         1872                             dev_ns_v_x,
         1873                             dev_ns_v_y,
         1874                             dev_ns_v_z);
         1875                     cudaDeviceSynchronize();
         1876                     if (PROFILING == 1)
         1877                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1878                                 &t_updateNSvelocityPressure);
         1879                     checkForCudaErrorsIter("Post updateNSvelocity", iter);
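                                  // Corrector step (schematic form): the predicted
                                  // face velocities are projected with the converged
                                  // epsilon,
                                  //   v = v* - dt_f/(c_v*rho_f*phi)*grad(epsilon)
                                  // with dt_f = ns.ndem*dt, yielding an approximately
                                  // divergence-free field.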
         1880 
         1881                     setNSghostNodesFace<Float>
         1882                         <<<dimGridFluidFace, dimBlockFluidFace>>>(
         1883                                 dev_ns_v_x,
         1884                                 dev_ns_v_y,
         1885                                 dev_ns_v_z,
         1886                                 ns.bc_bot, ns.bc_top);
         1887                     cudaDeviceSynchronize();
         1888                     checkForCudaErrorsIter(
         1889                             "Post setNSghostNodesFace(dev_ns_v)", iter);
         1890 
         1891                     interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
         1892                             dev_ns_v_x,
         1893                             dev_ns_v_y,
         1894                             dev_ns_v_z,
         1895                             dev_ns_v);
         1896                     cudaDeviceSynchronize();
         1897                     checkForCudaErrorsIter("Post interpolateFaceToCenter",
         1898                             iter);
         1899                 } // end iter % ns.ndem == 0
         1900             } // end cfd_solver == 0
         1901 
         1902             // Darcy solution
         1903             else if (cfd_solver == 1) {
         1904 
         1905 #if defined(REPORT_EPSILON) || defined(REPORT_FORCING_TERMS)
         1906                 std::cout << "\n\n@@@@@@ TIME STEP " << iter << " @@@@@@"
         1907                         << std::endl;
         1908 #endif
         1909 
         1910                 if (walls.nw > 0 &&
         1911                         (walls.wmode[0] == 1 || walls.wmode[0] == 3)) {
         1912                     wall0_iz = walls.nx->w/(grid.L[2]/grid.num[2]);
         1913                 }
         1914 
         1915                 if (np > 0) {
         1916 
         1917                     if (PROFILING == 1)
         1918                         startTimer(&kernel_tic);
         1919                     setDarcyGhostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         1920                             dev_darcy_p,
         1921                             darcy.bc_xn, darcy.bc_xp,
         1922                             darcy.bc_yn, darcy.bc_yp,
         1923                             darcy.bc_bot, darcy.bc_top);
         1924                     cudaDeviceSynchronize();
         1925                     if (PROFILING == 1)
         1926                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1927                                 &t_setDarcyGhostNodes);
         1928                     checkForCudaErrorsIter("Post setDarcyGhostNodes("
         1929                             "dev_darcy_p) before findDarcyPressureForce", iter);
         1930 
         1931                     if (PROFILING == 1)
         1932                         startTimer(&kernel_tic);
         1933                     findDarcyPressureGradient<<<dimGridFluid, dimBlockFluid>>>(
         1934                             dev_darcy_p,
         1935                             dev_darcy_grad_p);
         1936                     cudaDeviceSynchronize();
         1937                     checkForCudaErrorsIter("After findDarcyPressureGradient",
         1938                             iter);
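                                  // Gradient sketch (assumed central differences):
                                  //   grad_p.x(i,j,k) = (p(i+1,j,k) - p(i-1,j,k))/(2*dx)
                                  // and likewise for the y and z components.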
         1939 
         1940                     setDarcyGhostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
         1941                             dev_darcy_grad_p,
         1942                             darcy.bc_xn, darcy.bc_xp,
         1943                             darcy.bc_yn, darcy.bc_yp,
         1944                             darcy.bc_bot, darcy.bc_top);
         1945                     cudaDeviceSynchronize();
         1946                     if (PROFILING == 1)
         1947                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1948                                 &t_setDarcyGhostNodes);
         1949                     checkForCudaErrorsIter("Post setDarcyGhostNodes("
         1950                             "dev_darcy_grad_p)", iter);
         1951 
         1972                     // Find the fluid-pressure force on each particle.
         1973                     // Restart the profiling timer here: the previous
         1974                     // kernel_tic was consumed by the ghost-node timing
         1975                     // above, so without a new startTimer the stopTimer
         1976                     // below would credit that work to this kernel.
         1977                     if (PROFILING == 1)
         1978                         startTimer(&kernel_tic);
         1979                     findDarcyPressureForceLinear<<<dimGrid, dimBlock>>>(
         1980                             dev_x,
         1981                             dev_darcy_grad_p,
         1982                             dev_darcy_phi,
         1983                             wall0_iz,
         1984                             darcy.rho_f,
         1985                             darcy.bc_top,
         1986                             dev_force,
         1987                             dev_darcy_f_p);
         1988                     cudaDeviceSynchronize();
         1989                     if (PROFILING == 1)
         1990                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         1991                                 &t_findDarcyPressureForce);
         1992                     checkForCudaErrorsIter("Post findDarcyPressureForce",
         1993                             iter);
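                                  // Pressure-force sketch (one plausible form; the
                                  // actual expression lives in darcy.cuh and may
                                  // rescale by porosity):
                                  //   f_p = -V_p*grad(p)
                                  // a seepage force on a particle of volume V_p.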
         1994                 }
         1995 
         1996                 if ((iter % darcy.ndem) == 0) {
         1997 
         1998                     if (PROFILING == 1)
         1999                         startTimer(&kernel_tic);
         2011                     findDarcyPorositiesLinear<<<dimGridFluid, dimBlockFluid>>>(
         2012                             dev_cellStart,
         2013                             dev_cellEnd,
         2014                             dev_x_sorted,
         2015                             dev_vel_sorted,
         2016                             iter,
         2017                             darcy.ndem,
         2018                             np,
         2019                             darcy.c_phi,
         2020                             dev_darcy_phi,
         2021                             dev_darcy_dphi,
         2022                             dev_darcy_div_v_p,
         2023                             dev_darcy_vp_avg);
         2024                     cudaDeviceSynchronize();
         2025                     if (PROFILING == 1)
         2026                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2027                                 &t_findDarcyPorosities);
         2028                     checkForCudaErrorsIter("Post findDarcyPorosities", iter);
         2029 
         2030                     // copy porosities to the frictionless Y boundaries and
         2031                     // to the frictionless lower Z boundary
         2032                     if (grid.periodic == 2) {
         2033                         copyDarcyPorositiesToEdges<<<dimGridFluid,
         2034                             dimBlockFluid>>>(
         2035                                 dev_darcy_phi,
         2036                                 dev_darcy_dphi,
         2037                                 dev_darcy_div_v_p,
         2038                                 dev_darcy_vp_avg);
         2039                         copyDarcyPorositiesToBottom<<<dimGridFluid,
         2040                             dimBlockFluid>>>(
         2041                                 dev_darcy_phi,
         2042                                 dev_darcy_dphi,
         2043                                 dev_darcy_div_v_p,
         2044                                 dev_darcy_vp_avg);
         2045                         cudaDeviceSynchronize();
         2046                         checkForCudaErrorsIter(
         2047                                 "Post copyDarcyPorositiesTo*", iter);
         2048                     }
         2051 
         2052                     // Modulate the pressures at the upper boundary cells
         2053                     if ((darcy.p_mod_A > 1.0e-5 || darcy.p_mod_A < -1.0e-5) &&
         2054                             darcy.p_mod_f > 1.0e-7) {
         2055                         // new pressure = original pressure + sinusoidal modulation
         2056                         Float new_pressure =
         2057                             darcy.p_top_orig + darcy.p_mod_A
         2058                             *sin(2.0*M_PI*darcy.p_mod_f*time.current
         2059                                     + darcy.p_mod_phi);
         2060                         if (PROFILING == 1)
         2061                             startTimer(&kernel_tic);
         2062                         setDarcyTopPressure<<<dimGridFluid, dimBlockFluid>>>(
         2063                                 new_pressure,
         2064                                 dev_darcy_p,
         2065                                 wall0_iz);
         2066                         cudaDeviceSynchronize();
         2067                         checkForCudaErrorsIter("Post setDarcyTopPressure", iter);
         2068 
         2069                         // Modulate the pressures at the top wall
         2070                         setDarcyTopWallPressure
         2071                             <<<dimGridFluid, dimBlockFluid>>>(
         2072                                     new_pressure,
         2073                                     wall0_iz,
         2074                                     dev_darcy_p);
         2075                         cudaDeviceSynchronize();
         2076                         checkForCudaErrorsIter("Post setDarcyTopWallPressure",
         2077                                 iter);
         2078 
         2079                         if (PROFILING == 1)
         2080                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2081                                     &t_setDarcyTopPressure);
         2082                     }
         2083 
         2084                     if (PROFILING == 1)
         2085                         startTimer(&kernel_tic);
         2086                     findDarcyPermeabilities<<<dimGridFluid, dimBlockFluid>>>(
         2087                             darcy.k_c, dev_darcy_phi, dev_darcy_k);
         2088                     cudaDeviceSynchronize();
         2089                     if (PROFILING == 1)
         2090                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2091                                 &t_findDarcyPermeabilities);
         2092                     checkForCudaErrorsIter("Post findDarcyPermeabilities",
         2093                             iter);
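
                    // Illustrative sketch of the porosity-permeability
                    // closure applied above, assuming a Kozeny-Carman-type
                    // relation scaled by the prefactor darcy.k_c (see
                    // darcy.cuh for the authoritative form):
                    //
                    //   __device__ Float kozenyCarman(Float k_c, Float phi)
                    //   {
                    //       // k vanishes as phi -> 0 and grows steeply as
                    //       // phi -> 1
                    //       return k_c*phi*phi*phi/((1.0-phi)*(1.0-phi));
                    //   }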
         2094 
         2095                     if (PROFILING == 1)
         2096                         startTimer(&kernel_tic);
         2097                     setDarcyGhostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         2098                             dev_darcy_phi,
         2099                             darcy.bc_xn, darcy.bc_xp,
         2100                             darcy.bc_yn, darcy.bc_yp,
         2101                             darcy.bc_bot, darcy.bc_top);
         2102                     cudaDeviceSynchronize();
         2103                     if (PROFILING == 1)
         2104                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2105                                 &t_setDarcyGhostNodes);
         2106                     checkForCudaErrorsIter(
         2107                             "Post setDarcyGhostNodes(dev_darcy_phi)", iter);
         2108 
         2109                     if (PROFILING == 1)
         2110                         startTimer(&kernel_tic);
         2111                     setDarcyGhostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
         2112                             dev_darcy_k,
         2113                             darcy.bc_xn, darcy.bc_xp,
         2114                             darcy.bc_yn, darcy.bc_yp,
         2115                             darcy.bc_bot, darcy.bc_top);
         2116                     cudaDeviceSynchronize();
         2117                     if (PROFILING == 1)
         2118                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2119                                 &t_setDarcyGhostNodes);
         2120                     checkForCudaErrorsIter(
         2121                             "Post setDarcyGhostNodes(dev_darcy_k)", iter);
         2122 
         2123                     if (PROFILING == 1)
         2124                         startTimer(&kernel_tic);
         2125                     findDarcyPermeabilityGradients
         2126                         <<<dimGridFluid, dimBlockFluid>>>
         2127                         (dev_darcy_k, dev_darcy_grad_k);
         2128                     cudaDeviceSynchronize();
         2129                     if (PROFILING == 1)
         2130                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2131                                 &t_findDarcyPermeabilityGradients);
         2132                     checkForCudaErrorsIter(
         2133                             "Post findDarcyPermeabilityGradients", iter);
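
                    // Schematically, the permeability gradients found above
                    // amount to centered differences on the regular fluid
                    // grid (illustrative only; dx, dy, dz are the cell
                    // dimensions and idx() the linear cell index):
                    //
                    //   grad_k = MAKE_FLOAT3(
                    //       (k[idx(x+1,y,z)] - k[idx(x-1,y,z)])/(2.0*dx),
                    //       (k[idx(x,y+1,z)] - k[idx(x,y-1,z)])/(2.0*dy),
                    //       (k[idx(x,y,z+1)] - k[idx(x,y,z-1)])/(2.0*dz));
                    //
                    // The preceding ghost-node update makes the +/-1 accesses
                    // safe at the domain edges.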
         2134 
         2135                     if (iter == 0) {
         2136                         setDarcyNormZero<<<dimGridFluid, dimBlockFluid>>>(
         2137                                 dev_darcy_norm);
         2138                         cudaDeviceSynchronize();
         2139                         checkForCudaErrorsIter("Post setDarcyNormZero", iter);
         2140 
         2141                         if (PROFILING == 1)
         2142                             startTimer(&kernel_tic);
         2143                         copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
         2144                                 dev_darcy_p,
         2145                                 dev_darcy_p_old);
         2146                         cudaDeviceSynchronize();
         2147                         if (PROFILING == 1)
         2148                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2149                                     &t_copyValues);
         2150                         checkForCudaErrorsIter("Post copyValues(p -> p_old)",
         2151                                 iter);
         2152                     }
         2153 
         2154                     if (darcy.bc_bot == 4 || darcy.bc_top == 4) {
         2155                         if (PROFILING == 1)
         2156                             startTimer(&kernel_tic);
         2157                         setDarcyGhostNodesFlux<Float>
         2158                             <<<dimGridFluid, dimBlockFluid>>>(
         2159                                 dev_darcy_p,
         2160                                 darcy.bc_bot,
         2161                                 darcy.bc_top,
         2162                                 darcy.bc_bot_flux,
         2163                                 darcy.bc_top_flux,
         2164                                 dev_darcy_k,
         2165                                 darcy.mu);
         2166                         cudaDeviceSynchronize();
         2167                         if (PROFILING == 1)
         2168                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2169                                     &t_setDarcyGhostNodes);
         2170                         checkForCudaErrorsIter(
         2171                                 "Post setDarcyGhostNodesFlux", iter);
         2172                     }
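
                    // For the fixed-flux condition (bc == 4) the ghost-node
                    // pressure is presumably set so that Darcy's law across
                    // the boundary face reproduces the prescribed flux.
                    // Illustrative one-dimensional form at the bottom face:
                    //
                    //   // q_z = -(k/mu)*dp/dz
                    //   //   =>  p_ghost = p_inside + bc_bot_flux*mu*dz/k
                    //
                    // See setDarcyGhostNodesFlux in darcy.cuh for the actual
                    // stencil.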
         2173 
          2174                     // Solve the pressure system using a Jacobi
          2175                     // iterative solver. max_norm_res holds the largest
          2176                     // normalized residual from the latest convergence check.
          2177                     //double avg_norm_res;
          2178                     double max_norm_res;
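
                    // Schematically, each Jacobi sweep below re-evaluates the
                    // discretized pressure equation using neighbour values
                    // from the previous sweep and writes the relaxed result
                    // to dev_darcy_p_new along with a per-cell normalized
                    // residual (a sketch, not the darcy.cuh stencil):
                    //
                    //   p_new[c] = p[c] + dt/(beta_f*phi[c]*mu)
                    //              * div(k grad p)[c] - forcing(dphi[c]);
                    //   norm[c]  = fabs(p_new[c] - p[c])
                    //              / (fabs(p[c]) + 1.0e-16);
                    //
                    // Sweeps continue until the maximum residual drops below
                    // darcy.tolerance, or darcy.maxiter is reached.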
         2179 
         2180                     // Write a log file of the normalized residuals during the
         2181                     // Jacobi iterations
         2182                     std::ofstream reslog;
         2183                     if (write_res_log == 1)
         2184                         reslog.open("max_res_norm.dat");
         2185 
         2186                     for (unsigned int nijac = 0; nijac<darcy.maxiter; ++nijac) {
         2187 
         2188 #if defined(REPORT_EPSILON) || defined(REPORT_FORCING_TERMS)
         2189                 std::cout << "\n\n### Jacobi iteration " << nijac << std::endl;
         2190 #endif
         2191 
         2192                         if (nijac == 0) {
         2193                             if (PROFILING == 1)
         2194                                 startTimer(&kernel_tic);
         2195                             copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
         2196                                     dev_darcy_p,
         2197                                     dev_darcy_p_old);
         2198                             cudaDeviceSynchronize();
         2199                             if (PROFILING == 1)
         2200                                 stopTimer(&kernel_tic, &kernel_toc,
         2201                                         &kernel_elapsed, &t_copyValues);
         2202                             checkForCudaErrorsIter(
         2203                                     "Post copyValues(p -> p_old)", iter);
         2204                         }
         2205 
         2206                         if (PROFILING == 1)
         2207                             startTimer(&kernel_tic);
         2208                         setDarcyGhostNodes<Float>
         2209                             <<<dimGridFluid, dimBlockFluid>>>(
         2210                                 dev_darcy_p,
         2211                                 darcy.bc_xn, darcy.bc_xp,
         2212                                 darcy.bc_yn, darcy.bc_yp,
         2213                                 darcy.bc_bot, darcy.bc_top);
         2214                         cudaDeviceSynchronize();
         2215                         if (PROFILING == 1)
         2216                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2217                                     &t_setDarcyGhostNodes);
         2218                         checkForCudaErrorsIter("Post setDarcyGhostNodes("
         2219                                 "dev_darcy_p) in Jacobi loop", iter);
         2220 
         2221                         if (nijac == 0) {
         2222                             if (PROFILING == 1)
         2223                                 startTimer(&kernel_tic);
         2224                             firstDarcySolution<<<dimGridFluid, dimBlockFluid>>>(
         2225                                     dev_darcy_p,
         2226                                     dev_darcy_k,
         2227                                     dev_darcy_phi,
         2228                                     dev_darcy_dphi,
         2229                                     dev_darcy_div_v_p,
         2230                                     dev_darcy_vp_avg,
         2231                                     dev_darcy_grad_k,
         2232                                     darcy.beta_f,
         2233                                     darcy.mu,
         2234                                     darcy.bc_xn,
         2235                                     darcy.bc_xp,
         2236                                     darcy.bc_yn,
         2237                                     darcy.bc_yp,
         2238                                     darcy.bc_bot,
         2239                                     darcy.bc_top,
         2240                                     darcy.ndem,
         2241                                     wall0_iz,
         2242                                     dev_darcy_p_constant,
         2243                                     dev_darcy_dp_expl);
         2244                             cudaDeviceSynchronize();
         2245                             if (PROFILING == 1)
         2246                                 stopTimer(&kernel_tic, &kernel_toc,
         2247                                         &kernel_elapsed,
         2248                                         &t_updateDarcySolution);
          2249                             checkForCudaErrorsIter("Post firstDarcySolution",
          2250                                     iter);
         2251                         }
         2252 
         2253                         if (PROFILING == 1)
         2254                             startTimer(&kernel_tic);
         2255                         updateDarcySolution<<<dimGridFluid, dimBlockFluid>>>(
         2256                                 dev_darcy_p_old,
         2257                                 //dev_darcy_dpdt,
         2258                                 dev_darcy_dp_expl,
         2259                                 dev_darcy_p,
         2260                                 dev_darcy_k,
         2261                                 dev_darcy_phi,
         2262                                 dev_darcy_dphi,
         2263                                 dev_darcy_div_v_p,
         2264                                 dev_darcy_vp_avg,
         2265                                 dev_darcy_grad_k,
         2266                                 darcy.beta_f,
         2267                                 darcy.mu,
         2268                                 darcy.bc_xn,
         2269                                 darcy.bc_xp,
         2270                                 darcy.bc_yn,
         2271                                 darcy.bc_yp,
         2272                                 darcy.bc_bot,
         2273                                 darcy.bc_top,
         2274                                 darcy.ndem,
         2275                                 wall0_iz,
         2276                                 dev_darcy_p_constant,
         2277                                 dev_darcy_p_new,
         2278                                 dev_darcy_norm);
         2279                         cudaDeviceSynchronize();
         2280                         if (PROFILING == 1)
         2281                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2282                                     &t_updateDarcySolution);
         2283                         checkForCudaErrorsIter("Post updateDarcySolution",
         2284                                 iter);
         2285 
         2286                         if (darcy.bc_top == 1) {
         2287                             if (PROFILING == 1)
         2288                                 startTimer(&kernel_tic);
         2289                             setDarcyTopWallFixedFlow
         2290                                 <<<dimGridFluid, dimBlockFluid>>>
         2291                                 (wall0_iz, dev_darcy_p);
         2292                             cudaDeviceSynchronize();
         2293                             if (PROFILING == 1)
         2294                                 stopTimer(&kernel_tic, &kernel_toc,
         2295                                         &kernel_elapsed,
         2296                                         &t_updateDarcySolution);
         2297                             checkForCudaErrorsIter(
         2298                                     "Post setDarcyTopWallFixedFlow", iter);
         2299                         }
         2300 
         2301                         if (darcy.bc_bot == 4 || darcy.bc_top == 4) {
         2302                             if (PROFILING == 1)
         2303                                 startTimer(&kernel_tic);
         2304                             setDarcyGhostNodesFlux<Float>
         2305                                 <<<dimGridFluid, dimBlockFluid>>>(
         2306                                         dev_darcy_p,
         2307                                         darcy.bc_bot,
         2308                                         darcy.bc_top,
         2309                                         darcy.bc_bot_flux,
         2310                                         darcy.bc_top_flux,
         2311                                         dev_darcy_k,
         2312                                         darcy.mu);
         2313                             cudaDeviceSynchronize();
         2314                             if (PROFILING == 1)
         2315                                 stopTimer(&kernel_tic, &kernel_toc,
         2316                                         &kernel_elapsed,
         2317                                         &t_setDarcyGhostNodes);
         2318                             checkForCudaErrorsIter(
         2319                                     "Post setDarcyGhostNodesFlux", iter);
         2320                         }
         2321 
         2322                         // Copy new values to current values
         2323                         if (PROFILING == 1)
         2324                             startTimer(&kernel_tic);
         2325                         copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
         2326                                 dev_darcy_p_new,
         2327                                 dev_darcy_p);
         2328                         cudaDeviceSynchronize();
         2329                         if (PROFILING == 1)
         2330                             stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2331                                     &t_copyValues);
         2332                         checkForCudaErrorsIter("Post copyValues(p_new -> p)",
         2333                                 iter);
         2334 
         2335 #ifdef REPORT_EPSILON
         2336                         std::cout << "\n###### JACOBI ITERATION "
         2337                             << nijac << " after copyValues ######" << std::endl;
         2338                         transferDarcyPressuresFromGlobalDeviceMemory();
         2339                         printDarcyArray(stdout, darcy.p, "p");
         2340 #endif
         2341 
         2342                         if (nijac % nijacnorm == 0) {
         2343                             // Read the normalized residuals from the device
         2344                             transferDarcyNormFromGlobalDeviceMemory();
         2345 
         2346                             // Write the normalized residuals to the terminal
         2347                             //printDarcyArray(stdout, darcy.norm, "norm");
         2348 
         2349                             // Find the maximum value of the normalized
         2350                             // residuals
         2351                             max_norm_res = maxNormResDarcy();
         2352 
         2353                             // Write the Jacobi iteration number and maximum
         2354                             // value of the normalized residual to the log file
         2355                             if (write_res_log == 1)
         2356                                 reslog << nijac << '\t' << max_norm_res
         2357                                     << std::endl;
         2358 
         2359                             if (max_norm_res <= darcy.tolerance) {
         2360                                 if (write_conv_log == 1
         2361                                         && iter % conv_log_interval == 0)
         2362                                     convlog << iter+1 << '\t' << nijac
         2363                                         << std::endl;
         2364 
         2365                                 break;  // solution has converged
         2366                             }
         2367                         }
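
                        // maxNormResDarcy() presumably reduces the per-cell
                        // residuals on the host after the transfer above; a
                        // minimal equivalent, assuming darcy.norm holds one
                        // value per fluid cell:
                        //
                        //   double m = 0.0;
                        //   for (unsigned int i = 0; i < ncells; ++i)
                        //       m = fmax(m, fabs(darcy.norm[i]));
                        //
                        // Checking only every nijacnorm sweeps amortizes the
                        // cost of the device-to-host copy.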
         2368 
         2369                         if (nijac == darcy.maxiter-1) {
         2370 
         2371                             if (write_conv_log == 1)
         2372                                 convlog << iter+1 << '\t' << nijac << std::endl;
         2373 
         2374                             std::cerr << "\nIteration " << iter << ", time "
         2375                                 << iter*time.dt << " s: "
         2376                                 "Error, the pressure solution in the fluid "
         2377                                 "calculations did not converge. Try increasing "
         2378                                 "the value of 'darcy.maxiter' ("
         2379                                 << darcy.maxiter
          2380                                 << ") or increasing 'darcy.tolerance' ("
         2381                                 << darcy.tolerance << ")." << std::endl;
         2382                         }
         2383 
          2384                         //break; // end after first iteration
          2385                     }
          2386 
          2387                     if (write_res_log == 1)
          2388                         reslog.close();
         2389 
         2390                     // Zero all dphi values right after they are used in fluid
         2391                     // solution, unless a file is written in this step.
         2392                     if (filetimeclock + time.dt < time.file_dt) {
         2393                         setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
         2394                             (dev_darcy_dphi);
         2395                         cudaDeviceSynchronize();
         2396                         checkForCudaErrorsIter(
         2397                                 "After setDarcyZeros(dev_darcy_dphi)", iter);
         2398                     }
         2399 
         2400                     if (PROFILING == 1)
         2401                         startTimer(&kernel_tic);
         2402                     setDarcyGhostNodes<Float> <<<dimGridFluid, dimBlockFluid>>>
         2403                         (dev_darcy_p,
         2404                          darcy.bc_xn, darcy.bc_xp,
         2405                          darcy.bc_yn, darcy.bc_yp,
         2406                          darcy.bc_bot, darcy.bc_top);
         2407                     cudaDeviceSynchronize();
         2408                     if (PROFILING == 1)
         2409                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2410                                 &t_setDarcyGhostNodes);
         2411                     checkForCudaErrorsIter("Post setDarcyGhostNodes("
         2412                             "dev_darcy_p) after Jacobi loop", iter);
         2413 
         2414                     if (PROFILING == 1)
         2415                         startTimer(&kernel_tic);
         2416                     findDarcyVelocities<<<dimGridFluid, dimBlockFluid>>>(
         2417                             dev_darcy_p,
         2418                             dev_darcy_phi,
         2419                             dev_darcy_k,
         2420                             darcy.mu,
         2421                             dev_darcy_v);
         2422                     cudaDeviceSynchronize();
         2423                     if (PROFILING == 1)
         2424                         stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2425                                 &t_findDarcyVelocities);
         2426                     checkForCudaErrorsIter("Post findDarcyVelocities", iter);
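
                    // findDarcyVelocities presumably evaluates the
                    // cell-centered pore-fluid velocity from Darcy's law,
                    // schematically:
                    //
                    //   v[c] = -k[c]/(phi[c]*mu) * grad(p)[c];
                    //
                    // with grad(p) from centered differences as for the
                    // permeability gradients above.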
         2427                 }
         2428             }
         2429         }
         2430         //break; // end after first iteration
         2431 
         2432         if (np > 0) {
         2433 
         2434             // Find shear stresses on upper fixed particles if a shear stress BC
         2435             // is specified (wmode[0] == 3)
         2436             if (walls.nw > 0 && walls.wmode[0] == 3) {
         2437 
         2438                 if (PROFILING == 1)
         2439                     startTimer(&kernel_tic);
         2440                 findShearStressOnFixedMovingParticles<<<dimGrid, dimBlock>>>
         2441                     (dev_x,
         2442                      dev_vel,
         2443                      dev_force,
         2444                      dev_walls_tau_eff_x_pp);
         2445                 cudaDeviceSynchronize();
         2446                 if (PROFILING == 1)
         2447                     stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2448                             &t_summation);
         2449                 checkForCudaErrorsIter(
         2450                         "Post findShearStressOnFixedMovingParticles", iter);
         2451 
         2452                 if (PROFILING == 1)
         2453                     startTimer(&kernel_tic);
         2454                 summation<<<dimGrid, dimBlock>>>(dev_walls_tau_eff_x_pp,
         2455                         dev_walls_tau_eff_x_partial);
         2456                 cudaDeviceSynchronize();
         2457                 if (PROFILING == 1)
         2458                     stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2459                             &t_summation);
         2460                 checkForCudaErrorsIter("Post shear stress summation", iter);
         2461             }
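
            // summation() is a block-wise parallel reduction: each block
            // writes one partial sum of the per-particle shear-stress
            // contributions, and integrate() below receives the partials
            // together with blocksPerGrid, presumably folding them into the
            // applied wall stress. A serial equivalent of the final fold,
            // for illustration:
            //
            //   Float tau_eff_x = 0.0;
            //   for (unsigned int b = 0; b < blocksPerGrid; ++b)
            //       tau_eff_x += tau_eff_x_partial[b];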
         2462 
         2463             // Determine whether it is time to step the velocity
         2464             if (time.current >= v2_start && time.current < v2_end &&
         2465                     velocity_state == 1) {
         2466                 change_velocity_state = 1.0;
         2467                 velocity_state = 2;
         2468             } else if (time.current >= v2_end && velocity_state == 2) {
         2469                 change_velocity_state = -1.0;
         2470                 velocity_state = 1;
         2471             }
         2472 
         2473             // Update particle kinematics
         2474             if (PROFILING == 1)
         2475                 startTimer(&kernel_tic);
         2476             integrate<<<dimGrid, dimBlock>>>(dev_x_sorted,
         2477                     dev_vel_sorted,
         2478                     dev_angvel_sorted,
         2479                     dev_x,
         2480                     dev_vel,
         2481                     dev_angvel,
         2482                     dev_force,
         2483                     dev_torque,
         2484                     dev_angpos,
         2485                     dev_acc,
         2486                     dev_angacc,
         2487                     dev_vel0,
         2488                     dev_angvel0,
         2489                     dev_xyzsum,
         2490                     dev_gridParticleIndex,
         2491                     iter,
         2492                     dev_walls_wmode,
         2493                     dev_walls_mvfd,
         2494                     dev_walls_tau_eff_x_partial,
         2495                     dev_walls_tau_x,
         2496                     walls.tau_x[0],
         2497                     change_velocity_state,
         2498                     velocity_factor,
         2499                     blocksPerGrid);
         2500             cudaDeviceSynchronize();
         2501             checkForCudaErrorsIter("Post integrate", iter);
         2502             if (PROFILING == 1)
         2503                 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2504                         &t_integrate);
         2505 
         2506             if (change_velocity_state != 0)
         2507                 change_velocity_state = 0;
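
            // The time-integration scheme itself is defined in
            // integration.cuh; schematically, each particle is advanced
            // explicitly once per time step (a sketch, not the exact
            // scheme):
            //
            //   acc  = force/mass + g;
            //   vel += acc*dt;      // update velocity first ...
            //   x   += vel*dt;      // ... then position (semi-implicit)
            //
            // with matching updates of angacc, angvel and angpos.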
         2508 
         2509             // Summation of forces on wall
         2510             if (PROFILING == 1)
         2511                 startTimer(&kernel_tic);
         2512             if (walls.nw > 0) {
         2513                 summation<<<dimGrid, dimBlock>>>(dev_walls_force_pp,
         2514                         dev_walls_force_partial);
         2515             }
         2516             cudaDeviceSynchronize();
         2517             if (PROFILING == 1)
         2518                 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2519                         &t_summation);
         2520             checkForCudaErrorsIter("Post wall force summation", iter);
         2521 
         2522             // Update wall kinematics
         2523             if (PROFILING == 1)
         2524                 startTimer(&kernel_tic);
         2525             if (walls.nw > 0) {
         2526                 integrateWalls<<< 1, walls.nw>>>(
         2527                         dev_walls_nx,
         2528                         dev_walls_mvfd,
         2529                         dev_walls_wmode,
         2530                         dev_walls_force_partial,
         2531                         dev_walls_acc,
         2532                         blocksPerGrid,
         2533                         time.current,
         2534                         iter);
         2535             }
         2536             cudaDeviceSynchronize();
         2537             if (PROFILING == 1)
         2538                 stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
         2539                         &t_integrateWalls);
         2540             checkForCudaErrorsIter("Post integrateWalls", iter);
         2541         }
         2542 
         2543         // Update timers and counters
         2544         //time.current  = iter*time.dt;
         2545         time.current  += time.dt;
         2546         filetimeclock += time.dt;
         2547         ++iter;
         2548 
         2549         // Make sure all preceding tasks are complete
         2550         if (cudaDeviceSynchronize() != cudaSuccess) {
         2551             cerr << "Error during cudaDeviceSynchronize()" << endl;
         2552         }
         2553 
         2554         // Report time to console
         2555         if (verbose == 1 && (iter % stdout_report == 0)) {
         2556 
         2557             toc = clock();
          2558             time_spent = (toc - tic)/(double)CLOCKS_PER_SEC; // real time spent
         2559 
         2560             // Real time it takes to compute a second of model time
         2561             t_ratio = time_spent/(time.current - t_start);
         2562             time_t estimated_seconds_left(t_ratio*(time.total - time.current));
         2563             tm *time_eta = gmtime(&estimated_seconds_left);
         2564 
         2565             cout << "\r  Current time: " << time.current << "/"
         2566                 << time.total << " s. ("
         2567                 << t_ratio << " s_real/s_sim, ETA: "
         2568                 << time_eta->tm_yday << "d "
         2569                 << std::setw(2) << std::setfill('0') << time_eta->tm_hour << ":"
         2570                 << std::setw(2) << std::setfill('0') << time_eta->tm_min << ":"
         2571                 << std::setw(2) << std::setfill('0') << time_eta->tm_sec
         2572                 << ")       "; // << std::flush;
         2573         }
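
        // Worked example of the ETA arithmetic above: if 10 s of wall time
        // covered 2 s of model time, t_ratio = 5 s_real/s_sim, and 100 s of
        // remaining model time gives an ETA of 500 s, which gmtime() splits
        // into days, hours, minutes and seconds.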
         2574 
         2575 
         2576         // Produce output binary if the time interval
         2577         // between output files has been reached
         2578         if (filetimeclock >= time.file_dt) {
         2579 
         2580             // Pause the CPU thread until all CUDA calls previously issued are
         2581             // completed
         2582             cudaDeviceSynchronize();
         2583             checkForCudaErrorsIter("Beginning of file output section", iter);
         2584 
         2585             // v_x, v_y, v_z -> v
         2586             if (fluid == 1 && cfd_solver == 0) {
         2587                 interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
         2588                         dev_ns_v_x,
         2589                         dev_ns_v_y,
         2590                         dev_ns_v_z,
         2591                         dev_ns_v);
         2592                 cudaDeviceSynchronize();
         2593                 checkForCudaErrorsIter("Post interpolateFaceToCenter", iter);
         2594             }
         2595 
          2596             // Copy device data to host memory
         2597             transferFromGlobalDeviceMemory();
         2598             checkForCudaErrorsIter("After transferFromGlobalDeviceMemory()",
         2599                     iter);
         2600 
         2601             // Empty the dphi values after device to host transfer
         2602             if (fluid == 1) {
         2603                 if (cfd_solver == 1) {
         2604                     setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
         2605                         (dev_darcy_dphi);
         2606                     cudaDeviceSynchronize();
         2607                     checkForCudaErrorsIter(
         2608                             "After setDarcyZeros(dev_darcy_dphi) after transfer",
         2609                             iter);
         2610                 }
         2611             }
         2612 
         2613             // Pause the CPU thread until all CUDA calls previously issued are
         2614             // completed
         2615             cudaDeviceSynchronize();
         2616 
         2617             // Check the numerical stability of the NS solver
         2618             if (fluid == 1)
         2619                 if (cfd_solver == 0)
         2620                     checkNSstability();
         2621 
         2622             // Write binary output file
         2623             time.step_count += 1;
         2624             snprintf(file, sizeof(file), "output/%s.output%05d.bin",
         2625                      sid.c_str(), time.step_count);
         2626             writebin(file);
         2627 
         2628             /*std::cout
         2629               << "\n###### OUTPUT FILE " << time.step_count << " ######"
         2630                 << std::endl;
         2631             transferNSepsilonFromGlobalDeviceMemory();
         2632             printNSarray(stdout, ns.epsilon, "epsilon");*/
         2633 
         2634             // Write fluid arrays
         2635             /*if (fluid == 1 && cfd_solver == 0) {
         2636                 sprintf(file,"output/%s.ns_phi.output%05d.bin", sid.c_str(),
         2637                     time.step_count);
         2638                 writeNSarray(ns.phi, file);
         2639             }*/
         2640 
         2641             if (CONTACTINFO == 1) {
         2642                 // Write contact information to stdout
         2643                 cout << "\n\n---------------------------\n"
         2644                     << "t = " << time.current << " s.\n"
         2645                     << "---------------------------\n";
         2646 
         2647                 for (int n = 0; n < np; ++n) {
         2648                     cout << "\n## Particle " << n << " ##\n";
         2649 
         2650                     cout  << "- contacts:\n";
         2651                     for (int nc = 0; nc < NC; ++nc)
         2652                         cout << "[" << nc << "]=" << k.contacts[nc+NC*n] <<
         2653                             '\n';
         2654 
         2655                     cout << "\n- delta_t:\n";
         2656                     for (int nc = 0; nc < NC; ++nc)
         2657                         cout << k.delta_t[nc+NC*n].x << '\t'
         2658                             << k.delta_t[nc+NC*n].y << '\t'
         2659                             << k.delta_t[nc+NC*n].z << '\t'
         2660                             << k.delta_t[nc+NC*n].w << '\n';
         2661 
         2662                     cout << "\n- distmod:\n";
         2663                     for (int nc = 0; nc < NC; ++nc)
         2664                         cout << k.distmod[nc+NC*n].x << '\t'
         2665                             << k.distmod[nc+NC*n].y << '\t'
         2666                             << k.distmod[nc+NC*n].z << '\t'
         2667                             << k.distmod[nc+NC*n].w << '\n';
         2668                 }
         2669                 cout << '\n';
         2670             }
         2671 
         2672             // Update status.dat at the interval of filetime
         2673             outfile = "output/" + sid + ".status.dat";
         2674             fp = fopen(outfile.c_str(), "w");
         2675             fprintf(fp,"%2.4e %2.4e %d\n",
         2676                     time.current,
         2677                     100.0*time.current/time.total,
         2678                     time.step_count);
         2679             fclose(fp);
         2680 
         2681             filetimeclock = 0.0;
         2682         }
         2683 
         2684         // Uncomment break command to stop after the first iteration
         2685         //break;
         2686     }
         2687 
         2688     if (write_conv_log == 1)
         2689         convlog.close();
         2690 
         2691 
         2692     // Stop clock and display calculation time spent
         2693     toc = clock();
         2694     cudaEventRecord(dev_toc, 0);
         2695     cudaEventSynchronize(dev_toc);
         2696 
          2697     time_spent = (toc - tic)/(double)CLOCKS_PER_SEC;
         2698     cudaEventElapsedTime(&dev_time_spent, dev_tic, dev_toc);
         2699 
         2700     if (verbose == 1) {
         2701         cout << "\nSimulation ended. Statistics:\n"
         2702             << "  - Last output file number: "
         2703             << time.step_count << "\n"
         2704             << "  - GPU time spent: "
         2705             << dev_time_spent/1000.0f << " s\n"
         2706             << "  - CPU time spent: "
         2707             << time_spent << " s\n"
         2708             << "  - Mean duration of iteration:\n"
         2709             << "      " << dev_time_spent/((double)iter*1000.0f) << " s"
         2710             << std::endl;
         2711     }
         2712 
         2713     cudaEventDestroy(dev_tic);
         2714     cudaEventDestroy(dev_toc);
         2715 
         2716     cudaEventDestroy(kernel_tic);
         2717     cudaEventDestroy(kernel_toc);
         2718 
         2719     // Report time spent on each kernel
         2720     if (PROFILING == 1 && verbose == 1) {
         2721         double t_sum = t_calcParticleCellID + t_thrustsort + t_reorderArrays +
         2722             t_topology + t_interact + t_bondsLinear + t_latticeBoltzmannD3Q19 +
         2723             t_integrate + t_summation + t_integrateWalls + t_findPorositiesDev +
         2724             t_findNSstressTensor +
         2725             t_findNSdivphiviv + t_findNSdivtau + t_findPredNSvelocities +
         2726             t_setNSepsilon + t_setNSdirichlet + t_setNSghostNodesDev +
         2727             t_findNSforcing + t_jacobiIterationNS + t_updateNSvelocityPressure +
         2728             t_findDarcyPorosities + t_setDarcyGhostNodes +
         2729             t_findDarcyPressureForce + t_setDarcyTopPressure +
         2730             t_findDarcyPermeabilities + t_findDarcyPermeabilityGradients +
         2731             //t_findDarcyPressureChange +
         2732             t_updateDarcySolution + t_copyValues + t_findDarcyVelocities;
         2733 
         2734         cout << "\nKernel profiling statistics:\n"
         2735             << "  - calcParticleCellID:\t\t" << t_calcParticleCellID/1000.0
         2736             << " s"
         2737             << "\t(" << 100.0*t_calcParticleCellID/t_sum << " %)\n"
         2738             << "  - thrustsort:\t\t\t" << t_thrustsort/1000.0 << " s"
         2739             << "\t(" << 100.0*t_thrustsort/t_sum << " %)\n"
         2740             << "  - reorderArrays:\t\t" << t_reorderArrays/1000.0 << " s"
         2741             << "\t(" << 100.0*t_reorderArrays/t_sum << " %)\n";
         2742         if (params.contactmodel == 2 || params.contactmodel == 3) {
         2743             cout
         2744             << "  - topology:\t\t\t" << t_topology/1000.0 << " s"
         2745             << "\t(" << 100.0*t_topology/t_sum << " %)\n";
         2746         }
         2747         cout << "  - interact:\t\t\t" << t_interact/1000.0 << " s"
         2748             << "\t(" << 100.0*t_interact/t_sum << " %)\n";
         2749         if (params.nb0 > 0) {
         2750             cout << "  - bondsLinear:\t\t" << t_bondsLinear/1000.0 << " s"
         2751             << "\t(" << 100.0*t_bondsLinear/t_sum << " %)\n";
         2752         }
         2753         cout << "  - integrate:\t\t\t" << t_integrate/1000.0 << " s"
         2754             << "\t(" << 100.0*t_integrate/t_sum << " %)\n"
         2755             << "  - summation:\t\t\t" << t_summation/1000.0 << " s"
         2756             << "\t(" << 100.0*t_summation/t_sum << " %)\n"
         2757             << "  - integrateWalls:\t\t" << t_integrateWalls/1000.0 << " s"
         2758             << "\t(" << 100.0*t_integrateWalls/t_sum << " %)\n";
         2759         if (fluid == 1 && cfd_solver == 0) {
         2760             cout << "  - findPorositiesDev:\t\t" << t_findPorositiesDev/1000.0
         2761                 << " s" << "\t(" << 100.0*t_findPorositiesDev/t_sum << " %)\n"
         2762                 << "  - findNSstressTensor:\t\t" << t_findNSstressTensor/1000.0
         2763                 << " s" << "\t(" << 100.0*t_findNSstressTensor/t_sum << " %)\n"
         2764                 << "  - findNSdivphiviv:\t\t" << t_findNSdivphiviv/1000.0
         2765                 << " s" << "\t(" << 100.0*t_findNSdivphiviv/t_sum << " %)\n"
         2766                 << "  - findNSdivtau:\t\t" << t_findNSdivtau/1000.0
         2767                 << " s" << "\t(" << 100.0*t_findNSdivtau/t_sum << " %)\n"
         2768                 << "  - findPredNSvelocities:\t" <<
         2769                 t_findPredNSvelocities/1000.0 << " s" << "\t(" <<
         2770                 100.0*t_findPredNSvelocities/t_sum << " %)\n"
         2771                 << "  - setNSepsilon:\t\t" << t_setNSepsilon/1000.0
         2772                 << " s" << "\t(" << 100.0*t_setNSepsilon/t_sum << " %)\n"
         2773                 << "  - setNSdirichlet:\t\t" << t_setNSdirichlet/1000.0
         2774                 << " s" << "\t(" << 100.0*t_setNSdirichlet/t_sum << " %)\n"
         2775                 << "  - setNSghostNodesDev:\t\t" << t_setNSghostNodesDev/1000.0
         2776                 << " s" << "\t(" << 100.0*t_setNSghostNodesDev/t_sum << " %)\n"
         2777                 << "  - findNSforcing:\t\t" << t_findNSforcing/1000.0 << " s"
         2778                 << "\t(" << 100.0*t_findNSforcing/t_sum << " %)\n"
         2779                 << "  - jacobiIterationNS:\t\t" << t_jacobiIterationNS/1000.0
         2780                 << " s"
         2781                 << "\t(" << 100.0*t_jacobiIterationNS/t_sum << " %)\n"
         2782                 << "  - updateNSvelocityPressure:\t"
         2783                 << t_updateNSvelocityPressure/1000.0 << " s"
         2784                 << "\t(" << 100.0*t_updateNSvelocityPressure/t_sum << " %)\n";
         2785         } else if (fluid == 1 && cfd_solver == 1) {
         2786             cout << "  - findDarcyPorosities:\t" <<
         2787                 t_findDarcyPorosities/1000.0 << " s" << "\t(" <<
         2788                 100.0*t_findDarcyPorosities/t_sum << " %)\n"
         2789                 << "  - setDarcyGhostNodes:\t\t" <<
         2790                 t_setDarcyGhostNodes/1000.0 << " s" << "\t(" <<
         2791                 100.0*t_setDarcyGhostNodes/t_sum << " %)\n"
         2792                 << "  - findDarcyPressureForce:\t" <<
         2793                 t_findDarcyPressureForce/1000.0 << " s" << "\t(" <<
         2794                 100.0*t_findDarcyPressureForce/t_sum << " %)\n"
         2795                 << "  - setDarcyTopPressure:\t" <<
         2796                 t_setDarcyTopPressure/1000.0 << " s" << "\t(" <<
         2797                 100.0*t_setDarcyTopPressure/t_sum << " %)\n"
         2798                 << "  - findDarcyPermeabilities:\t" <<
         2799                 t_findDarcyPermeabilities/1000.0 << " s" << "\t(" <<
         2800                 100.0*t_findDarcyPermeabilities/t_sum << " %)\n"
         2801                 << "  - findDarcyPermeabilityGrads:\t" <<
         2802                 t_findDarcyPermeabilityGradients/1000.0 << " s" << "\t(" <<
         2803                 100.0*t_findDarcyPermeabilityGradients/t_sum << " %)\n"
         2804                 //<< "  - findDarcyPressureChange:\t" <<
         2805                 //t_findDarcyPressureChange/1000.0 << " s" << "\t(" <<
         2806                 //100.0*t_findDarcyPressureChange/t_sum << " %)\n"
         2807                 << "  - updateDarcySolution:\t" <<
         2808                 t_updateDarcySolution/1000.0 << " s" << "\t(" <<
         2809                 100.0*t_updateDarcySolution/t_sum << " %)\n"
         2810                 << "  - copyValues:\t\t\t" <<
         2811                 t_copyValues/1000.0 << " s" << "\t(" <<
         2812                 100.0*t_copyValues/t_sum << " %)\n"
         2813                 << "  - findDarcyVelocities:\t" <<
         2814                 t_findDarcyVelocities/1000.0 << " s" << "\t(" <<
         2815                 100.0*t_findDarcyVelocities/t_sum << " %)" << std::endl;
         2816         }
         2817     }
         2818 
         2819     // Free GPU device memory
         2820     freeGlobalDeviceMemory();
         2821     checkForCudaErrorsIter("After freeGlobalDeviceMemory()", iter);
         2822 
         2823     // Free contact info arrays
         2824     delete[] k.contacts;
         2825     delete[] k.distmod;
         2826     delete[] k.delta_t;
         2827 
         2828     if (fluid == 1) {
         2829         if (cfd_solver == 0)
         2830             endNS();
         2831         else if (cfd_solver == 1)
         2832             endDarcy();
         2833     }
         2834 
         2835     cudaDeviceReset();
         2836 }
         2837 // vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4