mx1.adamsgaard.dk/1/src/sphere/commit/18da38f2435da18e108b208f2054eb4c11107207.gph

       tAdded LBM fluid simulation, values off - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
  HTML git clone git://src.adamsgaard.dk/sphere
   DIR Log
   DIR Files
   DIR Refs
   DIR LICENSE
       ---
   DIR commit 18da38f2435da18e108b208f2054eb4c11107207
   DIR parent d456b79cf318820c11ea7fbf63ea807591a44cc4
  HTML Author: Anders Damsgaard <adc@geo.au.dk>
       Date:   Wed,  1 May 2013 17:07:48 +0200
       
       Added LBM fluid simulation, values off
       
       Diffstat:
         M CMakeLists.txt                      |       7 ++++++-
         M python/sphere.py                    |     110 ++++++++++++++++++++++---------
         M src/datatypes.h                     |       1 +
         M src/device.cu                       |     139 ++++++++++++++++++++++++++-----
         M src/file_io.cpp                     |      24 ++++++++++++++++++++++++
         M src/latticeboltzmann.cuh            |       5 ++---
         M src/sorting.cuh                     |       5 ++++-
         M src/sphere.h                        |      11 ++++++++++-
         M tests/bond_tests.py                 |       5 +++--
         M tests/io_tests.py                   |       1 -
       
       10 files changed, 247 insertions(+), 61 deletions(-)
       ---
   DIR diff --git a/CMakeLists.txt b/CMakeLists.txt
       t@@ -23,8 +23,13 @@ find_package(OpenMP)
        # Uncomment to enable testing
        enable_testing()
        
       -# Set build type
       +# Set build type = Debug
        #set(CMAKE_BUILD_TYPE Debug)
       +#if (CUDA_FOUND)
       +#    set(CUDA_NVCC_FLAGS -g;-G)
       +#endif()
       +
       +# Set build type = Release
        set(CMAKE_BUILD_TYPE Release)
        
        # Add source directory to project.
   DIR diff --git a/python/sphere.py b/python/sphere.py
       t@@ -108,9 +108,18 @@ class Spherebin:
                self.tau_b = numpy.ones(1, dtype=numpy.uint32) * numpy.infty
                self.bonds = numpy.zeros((self.nb0, 2), dtype=numpy.uint32)
                self.bonds_delta_n = numpy.zeros(self.nb0, dtype=numpy.float64)
       -        self.bonds_delta_t = numpy.zeros((self.nb0, self.nd), dtype=numpy.float64)
       +        self.bonds_delta_t = numpy.zeros((self.nb0, self.nd),
       +                                         dtype=numpy.float64)
                self.bonds_omega_n = numpy.zeros(self.nb0, dtype=numpy.float64)
       -        self.bonds_omega_t = numpy.zeros((self.nb0, self.nd), dtype=numpy.float64)
       +        self.bonds_omega_t = numpy.zeros((self.nb0, self.nd),
       +                                         dtype=numpy.float64)
       +
       +        self.nu = numpy.zeros(1, dtype=numpy.float64)
       +        self.f_v = numpy.zeros(
       +            (self.num[0] * self.num[1] * self.num[2], self.nd),
       +            dtype=numpy.float64)
       +        self.f_rho = numpy.zeros(self.num[0] * self.num[1] * self.num[2],
       +                               dtype=numpy.float64)
        
            def __cmp__(self, other):
                """ Called when to Spherebin objects are compared.
       t@@ -176,13 +185,16 @@ class Spherebin:
                        self.bonds_delta_n == other.bonds_delta_n and\
                        self.bonds_delta_t == other.bonds_delta_t and\
                        self.bonds_omega_n == other.bonds_omega_n and\
       -                self.bonds_omega_t == other.bonds_omega_t\
       +                self.bonds_omega_t == other.bonds_omega_t and\
       +                self.nu == other.nu and\
       +                (self.f_v == other.f_v).all() and\
       +                (self.f_rho == other.f_rho).all()\
                        ).all() == True):
                            return 0 # All equal
       -        else :
       +        else:
                    return 1
        
       -    def readbin(self, targetbin, verbose = True, bonds = True, devsmod = True):
       +    def readbin(self, targetbin, verbose = True, bonds = True, devsmod = True, fluid = True):
                'Reads a target SPHERE binary file'
        
                fh = None
       t@@ -203,20 +215,16 @@ class Spherebin:
                    self.time_step_count = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
        
                    # Allocate array memory for particles
       -            self.x       = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
       -            self.radius  = numpy.zeros(self.np, dtype=numpy.float64)
       -            self.xysum   = numpy.zeros((self.np, 2), dtype=numpy.float64)
       -            self.vel     = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
       -            self.fixvel  = numpy.zeros(self.np, dtype=numpy.float64)
       -            #self.force   = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
       -            #self.angpos  = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
       -            #self.angvel  = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
       -            #self.torque  = numpy.zeros((self.np, self.nd), dtype=numpy.float64)
       -            self.es_dot  = numpy.zeros(self.np, dtype=numpy.float64)
       -            self.es      = numpy.zeros(self.np, dtype=numpy.float64)
       -            self.ev_dot  = numpy.zeros(self.np, dtype=numpy.float64)
       -            self.ev      = numpy.zeros(self.np, dtype=numpy.float64)
       -            self.p       = numpy.zeros(self.np, dtype=numpy.float64)
       +            self.x       = numpy.empty((self.np, self.nd), dtype=numpy.float64)
       +            self.radius  = numpy.empty(self.np, dtype=numpy.float64)
       +            self.xysum   = numpy.empty((self.np, 2), dtype=numpy.float64)
       +            self.vel     = numpy.empty((self.np, self.nd), dtype=numpy.float64)
       +            self.fixvel  = numpy.empty(self.np, dtype=numpy.float64)
       +            self.es_dot  = numpy.empty(self.np, dtype=numpy.float64)
       +            self.es      = numpy.empty(self.np, dtype=numpy.float64)
       +            self.ev_dot  = numpy.empty(self.np, dtype=numpy.float64)
       +            self.ev      = numpy.empty(self.np, dtype=numpy.float64)
       +            self.p       = numpy.empty(self.np, dtype=numpy.float64)
        
                    # Read remaining data from binary
                    self.origo    = numpy.fromfile(fh, dtype=numpy.float64, count=self.nd)
       t@@ -271,13 +279,13 @@ class Spherebin:
        
                    # Wall data
                    self.nw      = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
       -            self.wmode   = numpy.zeros(self.nw, dtype=numpy.int32)
       -            self.w_n     = numpy.zeros(self.nw*self.nd, dtype=numpy.float64).reshape(self.nw,self.nd)
       -            self.w_x     = numpy.zeros(self.nw, dtype=numpy.float64)
       -            self.w_m     = numpy.zeros(self.nw, dtype=numpy.float64)
       -            self.w_vel   = numpy.zeros(self.nw, dtype=numpy.float64)
       -            self.w_force = numpy.zeros(self.nw, dtype=numpy.float64)
       -            self.w_devs  = numpy.zeros(self.nw, dtype=numpy.float64)
       +            self.wmode   = numpy.empty(self.nw, dtype=numpy.int32)
       +            self.w_n     = numpy.empty(self.nw*self.nd, dtype=numpy.float64).reshape(self.nw,self.nd)
       +            self.w_x     = numpy.empty(self.nw, dtype=numpy.float64)
       +            self.w_m     = numpy.empty(self.nw, dtype=numpy.float64)
       +            self.w_vel   = numpy.empty(self.nw, dtype=numpy.float64)
       +            self.w_force = numpy.empty(self.nw, dtype=numpy.float64)
       +            self.w_devs  = numpy.empty(self.nw, dtype=numpy.float64)
        
                    self.wmode   = numpy.fromfile(fh, dtype=numpy.int32, count=self.nw)
                    for i in range(self.nw):
       t@@ -298,17 +306,25 @@ class Spherebin:
                        self.nb0 = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
                        self.sigma_b = numpy.fromfile(fh, dtype=numpy.float64, count=1)
                        self.tau_b = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       -                self.bonds = numpy.zeros((self.nb0, 2), dtype=numpy.uint32)
       +                self.bonds = numpy.empty((self.nb0, 2), dtype=numpy.uint32)
                        for i in range(self.nb0):
                            self.bonds[i,0] = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
                            self.bonds[i,1] = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
                        self.bonds_delta_n = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0)
       -                self.bonds_delta_t = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0*self.nd).reshape(self.nb0*self.nd)
       +                self.bonds_delta_t = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0*self.nd).reshape(self.nb0, self.nd)
                        self.bonds_omega_n = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0)
       -                self.bonds_omega_t = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0*self.nd).reshape(self.nb0*self.nd)
       +                self.bonds_omega_t = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0*self.nd).reshape(self.nb0, self.nd)
                    else:
                        self.nb0 = numpy.zeros(1, dtype=numpy.uint32)
        
       +            if (fluid == True):
       +                ncells = self.num[0]*self.num[1]*self.num[2]
       +                self.nu = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       +                self.f_v = numpy.empty(ncells*self.nd, dtype=numpy.float64)
       +                self.f_rho = numpy.empty(ncells, dtype=numpy.float64)
       +                self.f_v = numpy.fromfile(fh, dtype=numpy.float64, count=ncells*self.nd)
       +                self.f_rho = numpy.fromfile(fh, dtype=numpy.float64, count=ncells)
       +
                finally:
                    if fh is not None:
                        fh.close()
       t@@ -412,11 +428,18 @@ class Spherebin:
                    fh.write(self.bonds_omega_n.astype(numpy.float64))
                    fh.write(self.bonds_omega_t.astype(numpy.float64))
        
       +            fh.write(self.nu.astype(numpy.float64))
       +            fh.write(self.f_v.astype(numpy.float64))
       +            fh.write(self.f_rho.astype(numpy.float64))
        
                finally:
                    if fh is not None:
                        fh.close()
        
       +    def readfirst(self, verbose=True):
       +        fn = "../output/{0}.output00000.bin".format(self.sid)
       +        self.readbin(fn, verbose)
       +
            def readlast(self, verbose=True):
                lastfile = status(self.sid)
                fn = "../output/{0}.output{1:0=5}.bin".format(self.sid, lastfile)
       t@@ -508,6 +531,11 @@ class Spherebin:
                cellsize = 2 * r_max
                self.L = self.num * cellsize
        
       +        # Init fluid arrays
       +        self.f_v = numpy.zeros((self.num[0]*self.num[1]*self.num[2],self.nd), dtype=numpy.float64)
       +        self.f_rho = numpy.ones(self.num[0]*self.num[1]*self.num[2], dtype=numpy.float64)
       +
       +
                # Particle positions randomly distributed without overlap
                for i in range(self.np):
                    overlaps = True
       t@@ -551,6 +579,11 @@ class Spherebin:
                    print("Error: The grid must be at least 3 cells in each direction")
                    print(" Grid: x={}, y={}, z={}".format(self.num[0], self.num[1], self.num[2]))
        
       +        # Init fluid arrays
       +        self.f_v = numpy.zeros((self.num[0]*self.num[1]*self.num[2],self.nd), dtype=numpy.float64)
       +        self.f_rho = numpy.ones(self.num[0]*self.num[1]*self.num[2], dtype=numpy.float64)
       +
       +
                # Put upper wall at top boundary
                if (self.nw > 0):
                    self.w_x[0] = self.L[0]
       t@@ -591,6 +624,11 @@ class Spherebin:
                    print("Error: The grid must be at least 3 cells in each direction")
                    print(self.num)
        
       +        # Init fluid arrays
       +        self.f_v = numpy.zeros((self.num[0]*self.num[1]*self.num[2],self.nd), dtype=numpy.float64)
       +        self.f_rho = numpy.ones(self.num[0]*self.num[1]*self.num[2], dtype=numpy.float64)
       +
       +
                self.contactmodel[0] = contactmodel
        
                # Put upper wall at top boundary
       t@@ -656,6 +694,10 @@ class Spherebin:
                            self.x[i,0] += 0.5*cellsize
                            self.x[i,1] += 0.5*cellsize
        
       +        # Init fluid arrays
       +        self.f_v = numpy.zeros((self.num[0]*self.num[1]*self.num[2],self.nd), dtype=numpy.float64)
       +        self.f_rho = numpy.ones(self.num[0]*self.num[1]*self.num[2], dtype=numpy.float64)
       +
                self.contactmodel[0] = contactmodel
        
                # Readjust grid to correct size
       t@@ -717,6 +759,10 @@ class Spherebin:
                self.num[2] = numpy.ceil(z_max/cellsize)
                self.L = self.num * cellsize
        
       +        # Init fluid arrays
       +        self.f_v = numpy.zeros((self.num[0]*self.num[1]*self.num[2],self.nd), dtype=numpy.float64)
       +        self.f_rho = numpy.ones(self.num[0]*self.num[1]*self.num[2], dtype=numpy.float64)
       +
            def createBondPair(self, i, j, spacing=-0.1):
                """ Bond particles i and j. Particle j is moved adjacent to particle i,
                and oriented randomly.
       t@@ -940,7 +986,8 @@ class Spherebin:
                    gamma_r = 0,
                    gamma_wn = 1e4,
                    gamma_wt = 1e4,
       -            capillaryCohesion = 0):
       +            capillaryCohesion = 0,
       +            nu = 0.0):
                """ Initialize particle parameters to default values.
                    Radii must be set prior to calling this function.
                """
       t@@ -1007,6 +1054,9 @@ class Spherebin:
                # Debonding distance
                self.db[0] = (1.0 + theta/2.0) * self.V_b**(1.0/3.0)
        
       +        # Fluid dynamic viscosity
       +        self.nu[0] = nu
       +
        
            def bond(self, i, j):
                """ Create a bond between particles i and j """
   DIR diff --git a/src/datatypes.h b/src/datatypes.h
       t@@ -96,6 +96,7 @@ struct Params {
            Float tau_b;          // Bond shear strength
            Float devs_A;         // Amplitude of modulations in deviatoric normal stress
            Float devs_f;         // Frequency of modulations in deviatoric normal stress
       +    Float nu;             // Fluid dynamic viscosity
        };
        
        // Structure containing wall parameters
   DIR diff --git a/src/device.cu b/src/device.cu
       t@@ -25,6 +25,7 @@
        #include "contactsearch.cuh"
        #include "integration.cuh"
        #include "raytracer.cuh"
       +#include "latticeboltzmann.cuh"
        
        
        // Wrapper function for initializing the CUDA components.
       t@@ -140,7 +141,8 @@ __global__ void checkConstantValues(int* dev_equal,
                    dev_params->db != devC_params.db ||
                    dev_params->V_b != devC_params.V_b ||
                    dev_params->lambda_bar != devC_params.lambda_bar ||
       -            dev_params->nb0 != devC_params.nb0)
       +            dev_params->nb0 != devC_params.nb0 ||
       +            dev_params->nu != devC_params.nu)
                *dev_equal = 2; // Not ok
        
        }
       t@@ -184,11 +186,12 @@ __host__ void DEM::checkConstantMemory()
            // Are the values equal?
            if (*equal != 0) {
                std::cerr << "Error! The values in constant memory do not "
       -            << "seem to be correct (" << *equal << ").\n";
       +            << "seem to be correct (" << *equal << ")." << std::endl;
                exit(1);
            } else {
                if (verbose == 1)
       -            std::cout << "  Constant values ok (" << *equal << ").\n";
       +            std::cout << "  Constant values ok (" << *equal << ")."
       +                << std::endl;
            }
        }
        
       t@@ -256,7 +259,8 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
            cudaMalloc((void**)&dev_torque, memSizeF4);
        
            // Particle contact bookkeeping arrays
       -    cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
       +    cudaMalloc((void**)&dev_contacts,
       +            sizeof(unsigned int)*np*NC); // Max NC contacts per particle
            cudaMalloc((void**)&dev_distmod, memSizeF4*NC);
            cudaMalloc((void**)&dev_delta_t, memSizeF4*NC);
            cudaMalloc((void**)&dev_bonds, sizeof(uint2)*params.nb0);
       t@@ -278,8 +282,10 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
            // Cell-related arrays
            cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*np);
            cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*np);
       -    cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       -    cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       +    cudaMalloc((void**)&dev_cellStart, 
       +            sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       +    cudaMalloc((void**)&dev_cellEnd,
       +            sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
        
            // Host contact bookkeeping arrays
            k.contacts = new unsigned int[np*NC];
       t@@ -298,9 +304,17 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
            cudaMalloc((void**)&dev_walls_vel0, sizeof(Float)*walls.nw);
            // dev_walls_force_partial allocated later
        
       +    // Fluid arrays
       +    cudaMalloc((void**)&dev_f,
       +            sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2]*19);
       +    cudaMalloc((void**)&dev_f_new,
       +            sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2]*19);
       +    cudaMalloc((void**)&dev_v_rho,
       +            sizeof(Float4)*grid.num[0]*grid.num[1]*grid.num[2]);
       +
            checkForCudaErrors("End of allocateGlobalDeviceMemory");
            if (verbose == 1)
       -        std::cout << "Done\n";
       +        std::cout << "Done" << std::endl;
        }
        
        __host__ void DEM::freeGlobalDeviceMemory()
       t@@ -348,10 +362,16 @@ __host__ void DEM::freeGlobalDeviceMemory()
            cudaFree(dev_walls_force_partial);
            cudaFree(dev_walls_force_pp);
            cudaFree(dev_walls_vel0);
       +
       +    // Fluid arrays
       +    cudaFree(dev_f);
       +    cudaFree(dev_f_new);
       +    cudaFree(dev_v_rho);
       +
            checkForCudaErrors("During cudaFree calls");
        
            if (verbose == 1)
       -        printf("Done\n");
       +        std::cout << "Done" << std::endl;
        }
        
        
       t@@ -427,9 +447,18 @@ __host__ void DEM::transferToGlobalDeviceMemory()
                        sizeof(Float), cudaMemcpyHostToDevice);
            }
        
       +    // Fluid arrays
       +    if (params.nu > 0.0) {
       +        cudaMemcpy( dev_f, f, sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2]*19,
       +                cudaMemcpyHostToDevice);
       +        cudaMemcpy( dev_v_rho, v_rho,
       +                sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2],
       +                cudaMemcpyHostToDevice);
       +    }
       +
            checkForCudaErrors("End of transferToGlobalDeviceMemory");
            if (verbose == 1)
       -        std::cout << "Done\n";
       +        std::cout << "Done" << std::endl;
        }
        
        __host__ void DEM::transferFromGlobalDeviceMemory()
       t@@ -495,8 +524,15 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
            cudaMemcpy( walls.mvfd, dev_walls_mvfd,
                    sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
        
       -    // Bond parameters
       -
       +    // Fluid arrays
       +    if (params.nu > 0.0) {
       +        cudaMemcpy( f, dev_f,
       +                sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2]*19,
       +                cudaMemcpyDeviceToHost);
       +        cudaMemcpy( v_rho, dev_v_rho,
       +                sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2],
       +                cudaMemcpyDeviceToHost);
       +    }
        
            checkForCudaErrors("End of transferFromGlobalDeviceMemory");
        }
       t@@ -506,6 +542,8 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
        __host__ void DEM::startTime()
        {
            using std::cout; // Namespace directive
       +    using std::cerr; // Namespace directive
       +    using std::endl; // Namespace directive
            std::string outfile;
            char file[200];
            FILE *fp;
       t@@ -523,14 +561,34 @@ __host__ void DEM::startTime()
            // Start CPU clock
            tic = clock();
        
       -    // GPU workload configuration
       -    unsigned int threadsPerBlock = 256; 
       +    //// GPU workload configuration
       +    //unsigned int threadsPerBlock = 256; 
       +    unsigned int threadsPerBlock = 512; 
       +
            // Create enough blocks to accomodate the particles
            unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock); 
            dim3 dimGrid(blocksPerGrid, 1, 1); // Blocks arranged in 1D grid
            dim3 dimBlock(threadsPerBlock, 1, 1); // Threads arranged in 1D block
       +
            unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock); 
            dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid
       +
       +    // Use 3D block and grid layout for Lattice-Boltzmann fluid calculations
       +    dim3 dimBlockFluid(8, 8, 8);    // 512 threads per block
       +    //dim3 dimGridFluid(
       +            //(grid.num[2]+8-1)/8,    // Use x as the z
       +            //(grid.num[1]+8-1)/8,
       +            //(grid.num[0]+8-1)/8);
       +    dim3 dimGridFluid(
       +            iDivUp(grid.num[2],dimBlockFluid.x),    // Use z as x
       +            iDivUp(grid.num[1],dimBlockFluid.y),
       +            iDivUp(grid.num[0],dimBlockFluid.z));   // Use x as z
       +    if (dimGridFluid.z > 64) {
       +        cerr << "Error: dimGridFluid.z > 64" << endl;
       +        exit(1);
       +    }
       +
       +
            // Shared memory per block
            unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
        
       t@@ -544,7 +602,16 @@ __host__ void DEM::startTime()
                    << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n"
                    << "  - Threads per block: "
                    << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n"
       -            << "  - Shared memory required per block: " << smemSize << " bytes\n";
       +            << "  - Shared memory required per block: " << smemSize << " bytes"
       +            << endl;
       +        if (params.nu > 0.0) {
       +            cout << "  - Blocks per fluid grid: "
       +                << dimGridFluid.x << "*" << dimGridFluid.y << "*" <<
       +                dimGridFluid.z << "\n"
       +                << "  - Threads per fluid block: "
       +                << dimBlockFluid.x << "*" << dimBlockFluid.y << "*" <<
       +                dimBlockFluid.z << endl;
       +        }
            }
        
            // Initialize counter variable values
       t@@ -561,17 +628,17 @@ __host__ void DEM::startTime()
                    time.step_count);
            fclose(fp);
        
       -    // Write first output data file: output0.bin, thus testing writing of bin files
       -    //outfile = "output/" + sid + ".output0.bin";
       -    //sprintf(file,"output/%s.output0.bin", sid);
       -    //writebin(outfile.c_str());
       +    // Initialize fluid distribution array
       +    initfluid<<< dimGridFluid, dimBlockFluid >>>(dev_v_rho, dev_f);
       +    cudaThreadSynchronize();
       +    checkForCudaErrors("Post initfluid");
        
            if (verbose == 1) {
                cout << "\n  Entering the main calculation time loop...\n\n"
                    << "  IMPORTANT: Do not close this terminal, doing so will \n"
                    << "             terminate this SPHERE process. Follow the \n"
                    << "             progress by executing:\n"
       -            << "                $ ./sphere_status " << sid << "\n\n";
       +            << "                $ ./sphere_status " << sid << endl << endl;
            }
        
        
       t@@ -590,6 +657,7 @@ __host__ void DEM::startTime()
            double t_topology = 0.0;
            double t_interact = 0.0;
            double t_bondsLinear = 0.0;
       +    double t_latticeBoltzmannD3Q19 = 0.0;
            double t_integrate = 0.0;
            double t_summation = 0.0;
            double t_integrateWalls = 0.0;
       t@@ -606,8 +674,6 @@ __host__ void DEM::startTime()
            // MAIN CALCULATION TIME LOOP
            while (time.current <= time.total) {
        
       -        // Increment iteration counter
       -        ++iter;
        
                // Print current step number to terminal
                //printf("Step: %d\n", time.step_count);
       t@@ -733,7 +799,8 @@ __host__ void DEM::startTime()
        
                // Process particle pairs
                if (params.nb0 > 0) {
       -            //bondsLinear<<< 1, params.nb0 >>>(
       +            if (PROFILING == 1)
       +                startTimer(&kernel_tic);
                    bondsLinear<<<dimGridBonds, dimBlock>>>(
                            dev_bonds,
                            dev_bonds_delta,
       t@@ -751,6 +818,31 @@ __host__ void DEM::startTime()
                    checkForCudaErrors("Post bondsLinear", iter);
                }
        
       +        // Process fluid and particle interaction in each cell
       +        if (params.nu > 0.0) {
       +            if (PROFILING == 1)
       +                startTimer(&kernel_tic);
       +            latticeBoltzmannD3Q19<<< dimGridFluid, dimBlockFluid>>> (
       +                    dev_f,
       +                    dev_f_new,
       +                    dev_v_rho,
       +                    dev_cellStart,
       +                    dev_cellEnd,
       +                    dev_x_sorted,
       +                    dev_vel_sorted,
       +                    dev_force,
       +                    dev_gridParticleIndex);
       +            cudaThreadSynchronize();
       +            if (PROFILING == 1)
       +                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                        &t_latticeBoltzmannD3Q19);
       +            checkForCudaErrors("Post latticeBoltzmannD3Q19", iter);
       +
       +            // Flip flop
       +            swapFloatArrays(dev_f, dev_f_new);
       +        }
       +
       +
        
                // Update particle kinematics
                if (PROFILING == 1)
       t@@ -822,6 +914,7 @@ __host__ void DEM::startTime()
                // Update timers and counters
                time.current  += time.dt;
                filetimeclock += time.dt;
       +        ++iter;
        
                // Report time to console
                if (verbose == 1) {
       t@@ -938,6 +1031,8 @@ __host__ void DEM::startTime()
                    << "\t(" << 100.0*t_interact/t_sum << " %)\n"
                    << "  - bondsLinear:\t" << t_bondsLinear/1000.0 << " s"
                    << "\t(" << 100.0*t_bondsLinear/t_sum << " %)\n"
       +            << "  - latticeBoltzmann:\t" << t_latticeBoltzmannD3Q19/1000.0 << " s"
       +            << "\t(" << 100.0*t_latticeBoltzmannD3Q19/t_sum << " %)\n"
                    << "  - integrate:\t\t" << t_integrate/1000.0 << " s"
                    << "\t(" << 100.0*t_integrate/t_sum << " %)\n"
                    << "  - summation:\t\t" << t_summation/1000.0 << " s"
   DIR diff --git a/src/file_io.cpp b/src/file_io.cpp
       t@@ -236,6 +236,21 @@ void DEM::readbin(const char *target)
                ifs.read(as_bytes(k.bonds_omega[i].z), sizeof(Float));
            }
        
       +    // Read fluid parameters
       +    ifs.read(as_bytes(params.nu), sizeof(params.nu));
       +    v_rho = new Float4[grid.num[0]*grid.num[1]*grid.num[2]];
       +    f = new Float[grid.num[0]*grid.num[1]*grid.num[2]*19];
       +    if (params.nu > 0.0) {
       +        for (i = 0; i<grid.num[0]*grid.num[1]*grid.num[2]; ++i) {
       +            ifs.read(as_bytes(v_rho[i].x), sizeof(Float));
       +            ifs.read(as_bytes(v_rho[i].y), sizeof(Float));
       +            ifs.read(as_bytes(v_rho[i].z), sizeof(Float));
       +        }
       +        for (i = 0; i<grid.num[0]*grid.num[1]*grid.num[2]; ++i)
       +            ifs.read(as_bytes(v_rho[i].w), sizeof(Float));
       +    }
       +
       +
            // Close file if it is still open
            if (ifs.is_open())
                ifs.close();
       t@@ -394,6 +409,15 @@ void DEM::writebin(const char *target)
                    ofs.write(as_bytes(k.bonds_omega[i].z), sizeof(Float));
                }
        
       +        ofs.write(as_bytes(params.nu), sizeof(params.nu));
       +        for (i = 0; i<grid.num[0]*grid.num[1]*grid.num[2]; ++i) {
       +            ofs.write(as_bytes(v_rho[i].x), sizeof(Float));
       +            ofs.write(as_bytes(v_rho[i].y), sizeof(Float));
       +            ofs.write(as_bytes(v_rho[i].z), sizeof(Float));
       +        }
       +        for (i = 0; i<grid.num[0]*grid.num[1]*grid.num[2]; ++i)
       +            ofs.write(as_bytes(v_rho[i].w), sizeof(Float));
       +
                // Close file if it is still open
                if (ofs.is_open())
                    ofs.close();
   DIR diff --git a/src/latticeboltzmann.cuh b/src/latticeboltzmann.cuh
       t@@ -360,7 +360,7 @@ __global__ void latticeBoltzmannD3Q19(
                    dev_f_new[grid2index(nx-1,   0,   z,  9)] = fmax(0.0, f_9);
        
                // Edge 10 (+x,-y): Periodic
       -        /*if (x < nx-1 && y > 0)  // not at boundary
       +        if (x < nx-1 && y > 0)  // not at boundary
                    dev_f_new[grid2index( x+1, y-1,   z, 10)] = fmax(0.0, f_10);
                else if (x < nx-1)  // at -y boundary
                    dev_f_new[grid2index( x+1,ny-1,   z, 10)] = fmax(0.0, f_10);
       t@@ -445,11 +445,10 @@ __global__ void latticeBoltzmannD3Q19(
                else if (y < ny-1)  // at -z boundary
                    dev_f_new[grid2index(   x, y+1,   0, 17)] = fmax(0.0, f_18);
                else if (z > 0)     // at +y boundary
       -            dev_f_new[grid2index(   x,   0, z+1, 18)] = fmax(0.0, f_18);
       +            dev_f_new[grid2index(   x,   0, z-1, 18)] = fmax(0.0, f_18);
                else    // at +y-z boundary
                    dev_f_new[grid2index(   x,   0,   0, 17)] = fmax(0.0, f_18);
        
       -        // */
            }
        }
        
   DIR diff --git a/src/sorting.cuh b/src/sorting.cuh
       t@@ -1,3 +1,6 @@
       +#ifndef SORTING_CUH_
       +#define SORTING_CUH_
       +
        // Returns the cellID containing the particle, based cubic grid
        // See Bayraktar et al. 2009
        // Kernel is executed on the device, and is callable from the device only
       t@@ -118,5 +121,5 @@ __global__ void reorderArrays(unsigned int* dev_cellStart,
        } // End of reorderArrays(...)
        
        
       -
       +#endif
        // vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
   DIR diff --git a/src/sphere.h b/src/sphere.h
       t@@ -134,9 +134,16 @@ class DEM {
                        const Float *z_pos,
                        const Float *porosity);
        
       +        // Lattice-Boltzmann data arrays (D3Q19)
       +        Float  *f;          // Fluid distribution (f0..f18)
       +        Float  *dev_f;      // Device equivalent
       +        Float  *dev_f_new;  // Device equivalent
       +        Float4 *v_rho;      // Fluid velocity v (xyz), and pressure rho (w) 
       +        Float4 *dev_v_rho;  // Device equivalent
       +
        
       -        // Values and functions accessible from the outside
            public:
       +        // Values and functions accessible from the outside
        
                // Constructor, some parameters with default values
                DEM(std::string inputbin, 
       t@@ -201,5 +208,7 @@ class DEM {
        
        };
        
       +
       +
        #endif
        // vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
   DIR diff --git a/tests/bond_tests.py b/tests/bond_tests.py
       t@@ -45,10 +45,10 @@ for d in distances:
            sb.bond(0, 1)
            sb.defaultParams(gamma_n = 0.0, gamma_t = 0.0)
            #sb.initTemporal(total=0.5, file_dt=0.01)
       -
            #sb.render(verbose=False)
            #visualize(sb.sid, "energy")
        
       +
            print("# Stability test")
            sb.x[0,:] = numpy.array((2.0, 2.0, 2.0))
            sb.x[1,:] = numpy.array((2.0+2.0*radii+d, 2.0, 2.0))
       t@@ -85,8 +85,9 @@ for d in distances:
                print(passed())
            compareNumpyArrays(sb.vel[:,1:2], z2_2, "vel[:,1:2]")
            compareNumpyArrays(sb.angvel, z2_3, "angvel\t")
       -    #printKinematics(sb)
       +    printKinematics(sb)
            #visualize(sb.sid, "energy")
       +    continue
        
            print("# Normal contraction")
            sb.zeroKinematics()
   DIR diff --git a/tests/io_tests.py b/tests/io_tests.py
       t@@ -21,7 +21,6 @@ compare(orig, py, "Python IO:")
        
        # Test C++ IO routines
        orig.run(verbose=False, hideinputfile=True)
       -#orig.run()
        cpp = Spherebin()
        cpp.readbin("../output/" + orig.sid + ".output00000.bin", verbose=False)
        compare(orig, cpp, "C++ IO:   ")