URI:
       tMajor revision: Double precision, kernels split into separate sources - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
  HTML git clone git://src.adamsgaard.dk/sphere
   DIR Log
   DIR Files
   DIR Refs
   DIR LICENSE
       ---
   DIR commit cd41737e0b69a87e1dbca387031c53793b51ab56
   DIR parent d818c179c8ce377aa14d4a4398ff6e6f9907fe75
  HTML Author: Anders Damsgaard <adc@geo.au.dk>
       Date:   Mon, 20 Aug 2012 16:36:05 +0200
       
       Major revision: Double precision, kernels split into separate sources
       
       Diffstat:
         A img_out/about.txt                   |       2 ++
         A input/1e3-test-shear.bin            |       0 
         D mfiles/freadbin.m                   |     118 -------------------------------
         D mfiles/fwritebin.m                  |      98 -------------------------------
         D mfiles/initsetup.m                  |     180 -------------------------------
         D mfiles/plotspheres.m                |      99 -------------------------------
         D mfiles/status.m                     |      11 -----------
         D mfiles/useoutput.m                  |     112 -------------------------------
         D mfiles/visualize.m                  |     379 -------------------------------
         A python/elastic_collision.py         |      46 +++++++++++++++++++++++++++++++
         A python/inelastic_collision.py       |      49 +++++++++++++++++++++++++++++++
         A raytracer/colorbar.h                |      22 ++++++++++++++++++++++
         A raytracer/render_all_outputs_CPU.sh |      27 +++++++++++++++++++++++++++
         A raytracer/render_all_outputs_GPU.sh |      29 +++++++++++++++++++++++++++++
         A raytracer/rt-kernel.cu              |     442 +++++++++++++++++++++++++++++++
         A raytracer/rt_GPU_init_pres.sh       |      36 +++++++++++++++++++++++++++++++
         A raytracer/rt_GPU_pres.sh            |      36 +++++++++++++++++++++++++++++++
         A raytracer/rt_GPU_tall_pres.sh       |      36 +++++++++++++++++++++++++++++++
         A src/cohesion.cuh                    |     126 +++++++++++++++++++++++++++++++
         A src/contactmodels.cuh               |     411 ++++++++++++++++++++++++++++++
         A src/contactsearch.cuh               |     625 +++++++++++++++++++++++++++++++
         A src/integration.cuh                 |     207 ++++++++++++++++++++++++++++++
         A src/sorting.cuh                     |     127 +++++++++++++++++++++++++++++++
         A src/vector_arithmetic.h             |     994 +++++++++++++++++++++++++++++++
       
       24 files changed, 3215 insertions(+), 997 deletions(-)
       ---
   DIR diff --git a/img_out/about.txt b/img_out/about.txt
       t@@ -0,0 +1,2 @@
       +This folder will contain images rendered from ../output binaries, and
       +graphs plotted with ../python/sphere.py.
   DIR diff --git a/input/1e3-test-shear.bin b/input/1e3-test-shear.bin
       Binary files differ.
   DIR diff --git a/mfiles/freadbin.m b/mfiles/freadbin.m
       t@@ -1,118 +0,0 @@
       -function [p, grids, time, params, walls] = freadbin(path, fn)
       -    %path = '../output/'; %Target directory
       -    %fn   = 'output0.bin'; %Target binary
       -    
       -    %Open binary file for reading with little endian ordering and unicode char encoding.
       -    %fid = fopen([path fn],'rb','ieee-le','UTF-8'); 
       -    fid = fopen([path fn], 'rb', 'ieee-le');
       -
       -    % Read the number of dimensions and particles
       -    grids.nd = fread(fid, 1, 'int');
       -    p.np     = fread(fid, 1, 'uint32');
       -    
       -    % Read the time variables
       -    time.dt         = fread(fid, 1, 'float');
       -    time.current    = fread(fid, 1, 'double');
       -    time.total      = fread(fid, 1, 'double');
       -    time.file_dt    = fread(fid, 1, 'float');
       -    time.step_count = fread(fid, 1, 'uint32');
       -    
       -    %Initiate variables for faster execution
       -    p.x       = zeros(p.np, grids.nd); % Coordinate vector
       -    p.vel     = zeros(p.np, grids.nd); % Velocity vector
       -    p.angvel  = zeros(p.np, grids.nd); % Acceleration vector
       -    p.force   = zeros(p.np, grids.nd); % Linear force vector
       -    p.torque  = zeros(p.np, grids.nd); % Rotational torque vector
       -    p.fixvel  = zeros(p.np, 1);               % Fix horizontal particle velocity: 1: yes, 0: no
       -    p.xsum    = zeros(p.np, 1);               % Total particle displacement along x-axis
       -    p.radius  = zeros(p.np, 1);               % Particle radius
       -    p.rho     = zeros(p.np, 1);        % Density
       -    p.k_n     = zeros(p.np, 1);        % Normal stiffness
       -    p.k_s     = zeros(p.np, 1);               % Shear stiffness
       -    p.k_r     = zeros(p.np, 1);               % Rolling stiffness
       -    p.gamma_s = zeros(p.np, 1);        % Shear viscosity
       -    p.gamma_r = zeros(p.np, 1);               % Rolling viscosity
       -    p.mu_s    = zeros(p.np, 1);        % Inter-particle contact static shear friction coefficient
       -    p.mu_d    = zeros(p.np, 1);        % Inter-particle contact dynamic shear friction coefficient
       -    p.mu_r    = zeros(p.np, 1);        % Inter-particle contact rolling friction coefficient
       -    p.C       = zeros(p.np, 1);        % Inter-particle cohesion
       -    p.E       = zeros(p.np, 1);        % Young's modulus
       -    p.K       = zeros(p.np, 1);        % Bulk modulus
       -    p.nu      = zeros(p.np, 1);        % Poisson's ratio
       -    p.es_dot  = zeros(p.np, 1);        % Current shear energy dissipation
       -    p.es      = zeros(p.np, 1);        % Total shear energy dissipation
       -    p.p       = zeros(p.np, 1);        % Pressure on particle
       -    p.bonds   = zeros(p.np, 2);               % Inter-particle bonds
       -
       -    % Read remaining data from MATLAB binary
       -    grids.origo = fread(fid, [1, grids.nd], 'float');
       -    grids.L     = fread(fid, [1, grids.nd], 'float');
       -    grids.num   = fread(fid, [1, grids.nd], 'uint32');
       -
       -    for j=1:p.np
       -        %p.id = fread(fid, 1, 'uint32');
       -        
       -        for i=1:grids.nd %Coordinates, velocity, acceleration, pre-velocity    
       -            p.x(j,i)      = fread(fid, 1, 'float');
       -            p.vel(j,i)    = fread(fid, 1, 'float');
       -            p.angvel(j,i) = fread(fid, 1, 'float');
       -            p.force(j,i)  = fread(fid, 1, 'float');
       -            p.torque(j,i) = fread(fid, 1, 'float');
       -        end
       -    end
       -    
       -    for j=1:p.np %Parameters with one value per particle
       -        p.fixvel(j)  = fread(fid, 1, 'float');
       -        p.xsum(j)    = fread(fid, 1, 'float');
       -        p.radius(j)  = fread(fid, 1, 'float');
       -        p.rho(j)     = fread(fid, 1, 'float');
       -        p.k_n(j)     = fread(fid, 1, 'float');
       -        p.k_s(j)     = fread(fid, 1, 'float');
       -        p.k_r(j)     = fread(fid, 1, 'float');
       -        p.gamma_s(j) = fread(fid, 1, 'float');
       -        p.gamma_r(j) = fread(fid, 1, 'float');
       -        p.mu_s(j)    = fread(fid, 1, 'float');
       -        p.mu_d(j)    = fread(fid, 1, 'float');
       -        p.mu_r(j)    = fread(fid, 1, 'float');
       -        p.C(j)       = fread(fid, 1, 'float');
       -        p.E(j)       = fread(fid, 1, 'float');
       -        p.K(j)       = fread(fid, 1, 'float');
       -        p.nu(j)      = fread(fid, 1, 'float');
       -        p.es_dot(j)  = fread(fid, 1, 'float');
       -        p.es(j)      = fread(fid, 1, 'float');
       -        p.p(j)       = fread(fid, 1, 'float');
       -    end
       -    
       -    %Physical, constant parameters
       -    %params.global = fread(fid, 1, 'ubit1');
       -    params.global     = fread(fid, 1, 'int');
       -    params.g          = fread(fid, [1, grids.nd], 'float');
       -    params.kappa      = fread(fid, 1, 'float');
       -    params.db                = fread(fid, 1, 'float');
       -    params.V_b              = fread(fid, 1, 'float');
       -    params.shearmodel = fread(fid, 1, 'uint32');
       -    
       -    % Wall data
       -    walls.nw = fread(fid, 1, 'uint32');
       -    for j=1:walls.nw
       -        for i=1:grids.nd
       -            walls.n(j,i)     = fread(fid, 1, 'float');
       -        end
       -        walls.x(j)     = fread(fid, 1, 'float');
       -        walls.m(j)     = fread(fid, 1, 'float');
       -        walls.vel(j)   = fread(fid, 1, 'float');
       -        walls.force(j) = fread(fid, 1, 'float');
       -        walls.devs(j)  = fread(fid, 1, 'float');
       -    end
       -    params.periodic = fread(fid, 1, 'int');
       -
       -    for j=1:p.np
       -      p.bonds(j,1) = fread(fid, 1, 'uint32');
       -      p.bonds(j,2) = fread(fid, 1, 'uint32');
       -      p.bonds(j,3) = fread(fid, 1, 'uint32');
       -      p.bonds(j,4) = fread(fid, 1, 'uint32');
       -    end
       -    
       -    fclose(fid);
       -
       -end
   DIR diff --git a/mfiles/fwritebin.m b/mfiles/fwritebin.m
       t@@ -1,98 +0,0 @@
       -function fwritebin(path, fn, p, grids, time, params, walls)
       -    %path = '../input/';  %Target directory
       -    %fn   = '3dtest.bin'; %Target binary
       -    
       -    %Open binary file for writing, little endian ordering, unicode char encoding.
       -    %fid = fopen([path fn], 'wb','ieee-le','UTF-8'); 
       -    fid = fopen([path fn], 'wb', 'ieee-le'); 
       -
       -    %fwrite(fid, grids.nd, 'ushort'); %Unsigned short int
       -    %fwrite(fid, p.np, 'ulong'); %Unsigned long int
       -    fwrite(fid, grids.nd, 'int'); 
       -    fwrite(fid, p.np, 'uint32'); 
       -    
       -    %Time parameters
       -    fwrite(fid, time.dt, 'float');
       -    fwrite(fid, time.current, 'double');
       -    fwrite(fid, time.total, 'double');
       -    fwrite(fid, time.file_dt, 'float');
       -    fwrite(fid, time.step_count, 'uint32');  
       -
       -    for i=1:grids.nd %Grid origo
       -        fwrite(fid, grids.origo(i), 'float');
       -    end
       -
       -    for i=1:grids.nd %Grid dimensions
       -        fwrite(fid, grids.L(i), 'float');
       -    end
       -    
       -    for i=1:grids.nd %Grid dimensions
       -        fwrite(fid, grids.num(i), 'uint32');
       -    end
       -
       -    for j=1:p.np
       -        for i=1:grids.nd %Coordinates, velocity, acceleration, pre-velocity
       -            fwrite(fid, p.x(j,i), 'float');
       -            fwrite(fid, p.vel(j,i), 'float');
       -            fwrite(fid, p.angvel(j,i), 'float');
       -            fwrite(fid, p.force(j,i), 'float');
       -            fwrite(fid, p.torque(j,i), 'float');
       -        end
       -    end
       -
       -    for j=1:p.np %Parameters with one value per particle
       -        fwrite(fid, p.fixvel(j), 'float');
       -        fwrite(fid, p.xsum(j), 'float');
       -        fwrite(fid, p.radius(j), 'float');
       -        fwrite(fid, p.rho(j), 'float');
       -        fwrite(fid, p.k_n(j), 'float');
       -        fwrite(fid, p.k_s(j), 'float');
       -        fwrite(fid, p.k_r(j), 'float');
       -        fwrite(fid, p.gamma_s(j), 'float');
       -        fwrite(fid, p.gamma_r(j), 'float');
       -        fwrite(fid, p.mu_s(j), 'float');
       -        fwrite(fid, p.mu_d(j), 'float');
       -        fwrite(fid, p.mu_r(j), 'float');
       -        fwrite(fid, p.C(j), 'float');
       -        fwrite(fid, p.E(j), 'float');
       -        fwrite(fid, p.K(j), 'float');
       -        fwrite(fid, p.nu(j), 'float');
       -        fwrite(fid, p.es_dot(j), 'float');
       -        fwrite(fid, p.es(j), 'float');
       -        fwrite(fid, p.p(j), 'float');
       -    end
       -    
       -    %Physical, constant parameters
       -    %fwrite(fid, params.global, 'ubit1');
       -    fwrite(fid, params.global, 'int');
       -    for i=1:grids.nd
       -      fwrite(fid, params.g(i), 'float');
       -    end
       -    fwrite(fid, params.kappa, 'float');
       -    fwrite(fid, params.db, 'float');
       -    fwrite(fid, params.V_b, 'float');
       -    fwrite(fid, params.shearmodel, 'uint32');
       -    
       -    fwrite(fid, walls.nw, 'uint32');
       -    for j=1:walls.nw
       -        for i=1:grids.nd
       -            fwrite(fid, walls.n(j,i), 'float'); % Wall normal
       -        end
       -        fwrite(fid, walls.x(j), 'float'); % Wall pos. on axis parallel to wall normal
       -        fwrite(fid, walls.m(j), 'float'); % Wall mass
       -        fwrite(fid, walls.vel(j), 'float'); % Wall vel. on axis parallel to wall normal
       -        fwrite(fid, walls.force(j), 'float'); % Wall force on axis parallel to wall normal
       -        fwrite(fid, walls.devs(j), 'float'); % Deviatoric stress on wall normal
       -    end
       -    fwrite(fid, params.periodic, 'int');
       -
       -    for j=1:p.np
       -      fwrite(fid, p.bonds(i,1), 'uint32');
       -      fwrite(fid, p.bonds(i,2), 'uint32');
       -      fwrite(fid, p.bonds(i,3), 'uint32');
       -      fwrite(fid, p.bonds(i,4), 'uint32');
       -    end
       -
       -    fclose(fid);
       -
       -end
   DIR diff --git a/mfiles/initsetup.m b/mfiles/initsetup.m
       t@@ -1,180 +0,0 @@
       - function initsetup(plotfigure)
       -% initsetup() creates a model setup of particles in 3D with cubic
       -% packing. Specify PSD and desired number of particles, and the 
       -% function will determine the model dimensions, and fill the space 
       -% with a number of particles from a specified particle size distribution.
       -close all;
       -
       -% Simulation project name
       -simulation_name = '1e3-init-visc';
       -
       -% Plot particle assembly in MATLAB after initialization
       -if exist('plotfigure','var')
       -    if plotfigure == 1
       -        visualize = 1; 
       -    else
       -        visualize = 0;
       -    end
       -else
       -    visualize = 0;
       -end
       - 
       -% Physical, constant parameters
       -params.g = [0.0, 0.0, -9.80665]; %standard gravity, by definition 9.80665 m/s^2
       -%params.g = 0.98;
       -
       -% No. of dimensions
       -grids.nd = 3;
       -grids.origo = [0 0 0]; %Coordinate system origo
       -
       -% Number of particles
       -p.np = 1e3;
       -
       -% Create grid
       -form = 4; % For a cubic grid, use 3.
       -          % For a higher grid, use 4 or more (for cubic end config. use 5).
       -          % For a flatter grid, use 1 or 2.
       -grids.num(1) = ceil(nthroot(p.np, form)); % Particles in x direction
       -grids.num(2) = grids.num(1);  % Particles in y direction
       -grids.num(3) = ceil(p.np/(grids.num(1)*grids.num(2))); % Particles in z direction
       -
       -disp(['Grid dimensions: x=' num2str(grids.num(1)) ...
       -      ', y=' num2str(grids.num(2)) ...
       -      ', z=' num2str(grids.num(3))])
       -
       -%grids.num = ceil(nthroot(p.np,grids.nd)) * ones(grids.nd, 1); %Square/cubic
       -p.np = grids.num(1)*grids.num(2)*grids.num(3);
       -
       -% Particle size distribution
       -p.psd = 'logn'; %PSD: logn or uni
       -
       -p.m = 440e-6;       %Mean size
       -p.v = p.m*0.00002;  %Variance
       -
       -p.radius = zeros(p.np, 1);
       -p.x      = zeros(p.np, grids.nd);
       -
       -p.bonds  = ones(p.np, 4) * p.np; % No bonds when the values equal the no. of particles
       -
       -if strcmp(p.psd,'logn') %Log-normal PSD. Note: Keep variance small.
       -    mu = log((p.m^2)/sqrt(p.v+p.m^2));
       -    sigma = sqrt(log(p.v/(p.m^2)+1));
       -    p.radius = lognrnd(mu,sigma,1,p.np); %Array of particle radii 
       -elseif strcmp(p.psd,'uni') %Uniform PSD between rmin and rmax
       -    rmin = p.m - p.v*1e5;
       -    rmax = p.m + p.v*1e5;
       -    %rmin = 0.1*dd; rmax = 0.4*dd;
       -    p.radius = (rmax-rmin)*rand(p.np,1)+rmin; %Array of particle radii
       -end
       -
       -% Display PSD
       -if visualize == 1
       -    figure;
       -    hist(p.radius);
       -    title(['PSD: ' num2str(p.np) ' particles, m = ' num2str(p.m) ' m']); 
       -end
       -
       -% Friction angles
       -ang_s = 30; % Angle of static shear resistance
       -ang_d = 25; % Angle of dynamic shear resistance
       -ang_r = 35; % Angle of rolling resistance
       -
       -% Other particle parameters
       -p.vel    = zeros(p.np, grids.nd); % Velocity vector
       -p.angvel = zeros(p.np, grids.nd); % Angular velocity
       -p.fixvel = zeros(p.np, 1);           % 0: horizontal particle velocity free, 1: hotiz. particle velocity fixed
       -p.xsum   = zeros(p.np, 1);          % Total displacement along x-axis
       -p.force  = zeros(p.np, grids.nd); % Force vector
       -p.torque = zeros(p.np, grids.nd); % Torque vector
       -
       -params.global = 1; % 1: Physical parameters global, 0: individual per particle
       -p.rho     = 3600*ones(p.np,1);    % Density
       -p.k_n     = 4e5*ones(p.np,1);     % Normal stiffness [N/m]
       -p.k_s          = p.k_n(:);                  % Shear stiffness [N/m]
       -p.k_r          = p.k_s(:).*(10);          % Rolling stiffness
       -params.shearmodel = 1;                   % Contact model. 1=frictional, viscous, 2=frictional, elastic
       -p.gamma_s = p.k_n./1e3;            % Shear viscosity [Ns/m]
       -p.gamma_r = p.gamma_s(:);          % Rolling viscosity [?]
       -p.mu_s    = tand(ang_s)*ones(p.np,1);     % Inter-particle shear contact friction coefficient [0;1[
       -p.mu_r    = tand(ang_r)*ones(p.np,1);     % Inter-particle rolling contact friction coefficient [0;1[
       -p.C       = 0*zeros(p.np,1);      % Inter-particle cohesion
       -p.E       = 10e9*ones(p.np,1);    % Young's modulus
       -p.K       = 38e9*ones(p.np,1);    % Bulk modulus
       -p.nu      = 0.1*2.0*sqrt(min(4/3*pi.*p.radius(:).*p.radius(:).*p.radius(:).*p.rho(1))*p.k_n(1))*ones(p.np,1);      % Poisson's ratio (critical damping: 2*sqrt(m*k_n)). Normal component elastic if nu=0
       -p.es_dot  = zeros(p.np,1);        % Current shear energy dissipation
       -p.es      = zeros(p.np,1);        % Total shear energy dissipation
       -p.p       = zeros(p.np,1);        % Pressure on particle
       -
       -% Parameters related to capillary bonds
       -% Disable capillary cohesion by setting kappa to zero.
       -enableCapillaryCohesion = 0;
       -theta = 0.0;             % Wettability (0: perfect)
       -if (enableCapillaryCohesion == 1) 
       -    params.kappa = 2*pi*p.gamma_s(1) * cos(theta); % Prefactor
       -    params.V_b = 1e-12;  % Liquid volume at bond
       -else   
       -    params.kappa = 0;    % Zero capillary force
       -    params.V_b = 0;      % Zero liquid volume at bond
       -end
       -params.db = (1.0 + theta/2.0) * params.V_b^(1.0/3.0); % Debonding distance
       -
       -% Time parameters
       -%time.dt         = 0.2*min(sqrt((p.rho(:).*p.radius(:).^2)./p.K(:))); % Computational delta t
       -%time.dt         = time.dt*1e1; % Speedup factor
       -time.dt         = 0.17*sqrt(min(4/3*pi.*p.radius(:).*p.radius(:).*p.radius(:).*p.rho(1))/p.k_n(1)); % Computational time step (O'Sullivan et al. 2003)
       -time.current    = 0.0;
       -time.total      = 1.500+2.0*time.dt; % Total simulation time [s]
       -time.file_dt    = 0.0010; % Interval between output#.bin generation [s]
       -time.step_count = 0;
       -
       -% Calculate particle coordinates
       -%  Grid unit length. Maximum particle diameter determines grid size
       -GU = 2*max(p.radius)*1.40; % Forty percent margin
       -grids.L = [GU*grids.num(1) GU*grids.num(2) GU*grids.num(3)];
       -
       -% Particle coordinates by filling grid.
       -x = linspace(GU/2, grids.L(1)-GU, grids.num(1));
       -y = linspace(GU/2, grids.L(2)-GU, grids.num(2));
       -z = linspace(GU/2, grids.L(3)-GU, grids.num(3));
       -
       -[X Y Z] = meshgrid(x,y,z);
       -X=X(:); Y=Y(:); Z=Z(:);
       -
       -p.x = [X Y Z]; 
       -
       -%Particle positions randomly modified by + 10 percent of a grid unit
       -p.x = p.x + (rand(p.np, grids.nd)*0.49*GU);
       -
       -% Walls with friction
       -%   Note that the world boundaries already act as fricionless boundaries
       -% Upper wall
       -wno = 1;
       -walls.n(wno,:)     = [0.0, 0.0, -1.0];   % Normal along negative z-axis
       -walls.x(wno)       = grids.L(3);  % Positioned at upper boundary
       -walls.m(wno)       = p.rho(1)*p.np*pi*max(p.radius)*max(p.radius)*max(p.radius); % Wall mass
       -walls.vel(wno)     = 0.0;   % Wall velocity
       -walls.force(wno)   = 0.0;   % Wall force
       -walls.devs(wno)    = 0.0;  % Deviatoric stress
       -walls.nw = wno;
       -
       -% Define behavior of x- and y boundaries.
       -% Uncomment only one!
       -%params.periodic = 0; % Behave as frictional walls
       -params.periodic = 1; % Behave as periodic boundaries
       -%params.periodic = 2; % x: periodic, y: frictional walls
       -
       -% Write output binary
       -fwritebin('../input/', [simulation_name '.bin'], p, grids, time, params, walls);
       -disp('Writing of binary file complete.');
       -
       -% Plot particles in bubble plot
       -if visualize == 1
       -    disp('Waiting for visualization.');
       -    plotspheres(p, grids, 5, 0, 1);
       -end
       -
       -disp(['Project "' simulation_name '" is now ready for processing with SPHERE.']);
       -disp(['Call "' pwd '/sphere ' simulation_name '" from the system terminal ']);
       -disp(['to initiate, and check progress here in MATLAB using "status(''' simulation_name ''')"']);
       -
       -end
   DIR diff --git a/mfiles/plotspheres.m b/mfiles/plotspheres.m
       t@@ -1,98 +0,0 @@
       -function plotspheres(p, grids, n, quiver, visible)
       -
       -if exist('visible','var')
       -    if visible == 0
       -        % Create a new invisible figure window
       -        figure('visible','off');
       -    else
       -        % Create a new figure window
       -        figure('Renderer','OpenGL')
       -    end
       -else
       -    % Create a new figure window
       -    figure('Renderer','OpenGL')
       -end
       -
       -% Generate unit sphere, consisting of n-by-n faces (i.e. the resolution)
       -if exist('n', 'var')
       -    [x,y,z] = sphere(n);
       -else
       -    [x,y,z] = sphere(10);
       -end
       -
       -% Iterate through particle data
       -hold on
       -for i=1:p.np
       -    spheresurf = surf(x*p.radius(i)+p.x(i,1), ...
       -                      y*p.radius(i)+p.x(i,2), ...
       -                      z*p.radius(i)+p.x(i,3));
       -    set(spheresurf,'EdgeColor','none', ...
       -                   'FaceColor',[0.96 0.64 0.38], ...  % RGB
       -                   'FaceLighting','phong', ... 
       -                   'AmbientStrength',0.3, ... 
       -                   'DiffuseStrength',0.8, ... 
       -                   'SpecularStrength',0.9, ... 
       -                   'SpecularExponent',25, ... 
       -                   'BackFaceLighting','lit'); 
       -end
       -camlight left;
       -hidden off
       -
       -% Optional quiver3 plot (velocity vectors)
       -if exist('quiver','var')
       -    if quiver == 1
       -        quiver3(p.x(:,1),    p.x(:,2)  , p.x(:,3), ...
       -                p.vel(:,1), p.vel(:,2), p.vel(:,3), 3);
       -    end
       -end
       -
       -% Draw walls from 8 vertices and connect these to 6 faces
       -vertices = [grids.origo(1) grids.origo(2) grids.origo(3); ... % 1
       -            grids.origo(1) grids.L(2)     grids.origo(3); ... % 2
       -            grids.L(1)     grids.L(2)     grids.origo(3); ... % 3
       -            grids.L(1)     grids.origo(2) grids.origo(3); ... % 4
       -            grids.origo(1) grids.origo(2) grids.L(3);     ... % 5
       -            grids.origo(1) grids.L(2)     grids.L(3);     ... % 6
       -            grids.L(1)     grids.L(2)     grids.L(3);     ... % 7
       -            grids.L(1)     grids.origo(2) grids.L(3)];        % 8
       -            
       -faces = [1 2 3 4; ... %  (observing along pos. y axis direction)
       -         2 6 7 3; ... % 
       -         4 3 7 8; ... % 
       -         1 5 8 4; ... % 
       -         1 2 6 5; ... % 
       -         5 6 7 8];    % 
       -
       -patch('Faces', faces, 'Vertices', vertices, ...
       -      'FaceColor','none', 'EdgeColor','black','LineWidth',2);
       -
       -% View specifications
       -%daspect([1 1 1])
       -
       -view([grids.L(1), -2*grids.L(2), grids.L(3)])
       -grid on
       -axis equal
       -maxr = max(p.radius);
       -axis([grids.origo(1)-maxr grids.L(1)+maxr ... 
       -      grids.origo(2)-maxr grids.L(2)+maxr ...
       -      grids.origo(3)-maxr grids.L(3)+maxr]);
       -
       -light('Position', [grids.L(1), -grids.L(2), grids.L(3)]);
       -material shiny; %shiny dull metal
       -camlight(45,45);
       -lighting gouraud; %flat gouraud phone none
       -
       -% Remove hidden lines from mesh plot (hidden on)
       -hidden off
       -
       -% Labels
       -title([num2str(p.np) ' particles']);
       -xlabel('x [m]');
       -ylabel('y [m]');
       -zlabel('z [m]');
       -
       -% Add delaunay triangulation
       -%dt = DelaunayTri(p.x(:,1), p.x(:,2), p.x(:,3));
       -%tetramesh(dt,'FaceAlpha',0.02);
       -
       -end
       -\ No newline at end of file
   DIR diff --git a/mfiles/status.m b/mfiles/status.m
       t@@ -1,11 +0,0 @@
       -function lastfile = status(project)
       -
       -%  status() writes status of current model run
       -
       -st = load(['../output/' project '.status.dat']);
       -
       -disp(['Status: time = ',num2str(st(1)),', ',num2str(st(2)),'% done, filenr = ',num2str(st(3))]);
       -
       -lastfile = st(3);
       -
       -end
   DIR diff --git a/mfiles/useoutput.m b/mfiles/useoutput.m
       t@@ -1,112 +0,0 @@
       -%function useoutput(fn, plotfigure)
       -% Use particle data from target binary from output directory,
       -% create a fitting grid, and zero time variables.
       -% Final configuration is saved in a new filename in input/ directory.
       -
       -% Plot particle assembly in MATLAB after initialization
       -visualize = 0;  % No
       -%visualize = 1;  % Yes
       -
       -% Input binary file
       -directory = '../input/';
       -inputbin  = '5e4-cons_10kPa.bin';
       -
       -% New configuration name
       -simulation_name = '5e4-shear_10kPa';
       -
       -% Read data
       -[p, grids, time, params, walls] = freadbin(directory, inputbin);
       -%[p, grids, time, params, walls] = freadbinold(directory, inputbin);
       -
       -% Change temporal parameters (comment out to use previous values"
       -time.current    = 0.0;         % New starting time
       -time.step_count = 0;           % First output file number
       -time.total      = 0.25+2*time.dt; % Total simulation time [s]
       -time.file_dt    = 0.01; %Interval between output#.bin generation [s]
       -time.dt         = 0.17*sqrt(min(4/3*pi.*p.radius(:).*p.radius(:).*p.radius(:).*p.rho(1))/p.k_n(1)); % Computational time step (O'Sullivan et al. 2003)
       -
       -% Particle stiffness
       -p.k_n = 4e5*ones(p.np,1); % Normal stiffness [N/m]
       -p.gamma_s = p.k_n./1e3;          % Shear viscosity [Ns/m]
       -p.gamma_r = p.gamma_s(:); % Rolling viscosity
       -
       -% Friction angles
       -ang_s = 25; % Angle of shear resistance
       -ang_r = 35; % Angle of rolling resistance
       -p.mu_s = tand(ang_s)*ones(p.np,1); % Inter-particle shear contact friction coefficient [0;1[
       -p.mu_r = tand(ang_r)*ones(p.np,1); % Inter-particle rolling contact friction coefficient [0;1[
       -
       -% Compute new grid, scaled to fit max.- & min. particle positions
       -GU = 2*max(p.radius); % Cell size
       -x_min = min(p.x(:,1));% - p.radius(:));
       -x_max = max(p.x(:,1));% + p.radius(:));%*3.0;
       -%y_min = min(p.x(:,2) - p.radius(:));
       -%y_max = max(p.x(:,2) + p.radius(:));
       -z_min = min(p.x(:,3) - p.radius(:));
       -z_max = max(p.x(:,3) + p.radius(:));
       -z_adjust = 1.2; % Specify overheightening of world to allow for shear dilatancy
       -%grids.num(1) = ceil((x_max-x_min)/GU);
       -%grids.num(2) = ceil((y_max-y_min)/GU);
       -grids.num(3) = ceil((z_max-z_min)*z_adjust/GU);
       -%grids.L = [(x_max-x_min) (y_max-y_min) (z_max-z_min)*z_adjust];
       -grids.L(3) = (z_max-z_min)*z_adjust;
       -
       -% Parameters related to capillary bonds
       -% Disable capillary cohesion by setting kappa to zero.
       -enableCapillaryCohesion = 0;
       -theta = 0.0;             % Wettability (0: perfect)
       -if (enableCapillaryCohesion == 1) 
       -    params.kappa = 2*pi*p.gamma_s(1) * cos(theta); % Prefactor
       -    params.V_b = 1e-12;  % Liquid volume at bond
       -else   
       -    params.kappa = 0;    % Zero capillary force
       -    params.V_b = 0;      % Zero liquid volume at bond
       -end
       -params.db = (1.0 + theta/2.0) * params.V_b^(1.0/3.0); % Debonding distance
       -
       -% Move mobile upper wall to top of domain
       -walls.x(1) = max(p.x(:,3)+p.radius(:));
       -
       -% Define new deviatoric stress [Pa]
       -%walls.devs(1) = 0.0;
       -walls.devs(1) = 10.0e3;
       -
       -% Let x- and y boundaries be periodic
       -params.periodic = 1; % x- and y boundaries periodic
       -%params.periodic = 2; % Only x-boundaries periodic
       -
       -% By default, all particles are free to move horizontally
       -p.fixvel = zeros(p.np, 1); % Start off by defining all particles as free
       -shearing = 1; % 1: true, 0: false
       -
       -if shearing == 1
       -  % Fix horizontal velocity to 0.0 of lowermost particles
       -  I = find(p.x(:,3) < (z_max-z_min)*0.1); % Find the lower 10%
       -  p.fixvel(I) = 1;  % Fix horiz. velocity
       -  p.vel(I,1) = 0.0; % x-value
       -  p.vel(I,2) = 0.0; % y-value
       -
       -  % Fix horizontal velocity to 0.0 of uppermost particles
       -  I = find(p.x(:,3) > (z_max-z_min)*0.9); % Find the upper 10%
       -  p.fixvel(I) = 1;  % Fix horiz. velocity
       -  p.vel(I,1) = (x_max-x_min)*1.0; % x-value: One grid length per second
       -  p.vel(I,2) = 0.0; % y-value
       -end
       -
       -% Zero x-axis displacement
       -p.xsum = zeros(p.np, 1);
       -
       -% Write output binary
       -fwritebin('../input/', [simulation_name '.bin'], p, grids, time, params, walls);
       -disp('Writing of binary file complete.');
       -
       -% Plot particles in bubble plot
       -if visualize == 1
       -    disp('Waiting for visualization.');
       -    plotspheres(p, grids, 5, 0, 1);
       -end
       -
       -disp(['Project "' simulation_name '" is now ready for processing with SPHERE.']);
       -disp(['Call "' pwd '/sphere ' simulation_name '" from the system terminal ']);
       -disp(['to initiate, and check progress here in MATLAB using "status(''' simulation_name ''')"']);
       -
   DIR diff --git a/mfiles/visualize.m b/mfiles/visualize.m
       t@@ -1,379 +0,0 @@
       -function visualize(project, method, file_nr)
       -
       -  lastfile = status(project);
       -
       -  if exist('file_nr','var')
       -    filenr = file_nr;
       -  else
       -    filenr = lastfile;
       -  end
       -
       -  figure;
       -
       -  % Plot sum of kinetic energy vs. time
       -  if strcmpi(method, 'energy')
       -
       -    disp('Energy')
       -
       -    Epot   = zeros(1,lastfile);
       -    Ekin   = zeros(1,lastfile);
       -    Erot   = zeros(1,lastfile);
       -    Es     = zeros(1,lastfile);
       -    Es_dot = zeros(1,lastfile);
       -    Esum   = zeros(1,lastfile);
       -
       -    % Load data from all output files
       -    for i = 1:lastfile
       -      fn = [project '.output' num2str(i) '.bin'];
       -      disp(fn);
       -      [p, ~, time, params, ~] = freadbin('../output/', fn);
       -      for j = 1:p.np
       -        r = p.radius(j);
       -        m = (4/3*pi*r*r*r*p.rho(j));
       -        Epot(i) = Epot(i) + m * norm(params.g) * p.x(j,3);
       -        Ekin(i) = Ekin(i) + 0.5 * m ...
       -          * norm(p.vel(j,:)) * norm(p.vel(j,:));
       -        Erot(i) = Erot(i) + 0.5 * (2/5 * m * r*r) ...
       -          * norm(p.angvel(j,:)) * norm(p.angvel(j,:));
       -      end
       -      Es(i)     = sum(p.es);
       -      Es_dot(i) = sum(p.es_dot);
       -
       -      Esum(i) = Epot(i) + Ekin(i) + Erot(i) + Es(i);
       -      %Esum(i) = Epot(i) + Ekin(i) + Es(i);
       -    end
       -
       -    % Time axis
       -    t = linspace(time.file_dt, time.current, length(Ekin));
       -
       -    %disp(num2str(Ekin(:)));
       -
       -    % Visualization, E_pot
       -    subplot(2,3,1);
       -    plot(t, Epot, '+-');
       -    title('Potential energy');
       -    xlabel('Time [s]');
       -    ylabel('Total potential energy [J]');
       -    box on;
       -    grid on;
       -
       -    % Visualization, E_kin
       -    subplot(2,3,2);
       -    plot(t, Ekin, '+-');
       -    title('Kinetic energy');
       -    xlabel('Time [s]');
       -    ylabel('Total kinetic energy [J]');
       -    box on;
       -    grid on;
       -
       -    % Visualization, E_rot
       -    subplot(2,3,3);
       -    plot(t, Erot, '+-');
       -    title('Rotational energy');
       -    xlabel('Time [s]');
       -    ylabel('Total rotational energy [J]');
       -    box on;
       -    grid on;
       -
       -    % Visualizaiton, E_s_dot (current shear energy)
       -    subplot(2,3,4);
       -    plot(t, Es_dot, '+-');
       -    title('Shear energy rate');
       -    xlabel('Time [s]');
       -    ylabel('Shear energy rate [W]');
       -    box on;
       -    grid on;
       -
       -    % Visualizaiton, E_s (total shear energy)
       -    subplot(2,3,5);
       -    plot(t, Es, '+-');
       -    title('Total shear energy');
       -    xlabel('Time [s]');
       -    ylabel('Total shear energy [J]');
       -    box on;
       -    grid on;
       -
       -    % Total energy, Esum
       -    subplot(2,3,6);
       -    plot(t, Esum, '+-');
       -    title('Total energy: Pot + Kin + Rot + Shear');
       -    xlabel('Time [s]');
       -    ylabel('Total energy [J]');
       -    box on;
       -    grid on;
       -
       -  end % End visualize energy
       -
       -
       -  % Visualize wall kinematics and force
       -  if strcmpi(method, 'walls')
       -    disp('Walls')
       -
       -    [~, ~, ~, ~, walls] = freadbin('../output/', [project '.output0.bin']);
       -
       -    wforce = zeros(lastfile, walls.nw);
       -    wvel   = zeros(lastfile, walls.nw);
       -    wpos   = zeros(lastfile, walls.nw);
       -    wdevs  = zeros(lastfile, walls.nw);
       -
       -    % Load data from all output files
       -    for i = 1:lastfile
       -      fn = [project '.output' num2str(i) '.bin'];
       -      disp(fn);
       -      [~, ~, time, ~, walls] = freadbin('../output/', fn);
       -
       -      for j = 1:walls.nw
       -        wforce(i,j) = walls.force(j);
       -        wvel(i,j)   = walls.vel(j);
       -        wpos(i,j)   = walls.x(j);
       -        wdevs(i,j)  = walls.devs(j);
       -      end
       -    end
       -
       -    % Time axis
       -    t = linspace(time.file_dt, time.current, lastfile);
       -
       -    % Visualization, wall position
       -    subplot(2,2,1);
       -    hold on
       -    for i=1:walls.nw
       -      p = plot(t, wpos(:,i), '+-');
       -      if i==2
       -        set(p,'Color','red')
       -      end
       -    end
       -    hold off
       -    title('Wall positions');
       -    xlabel('Time [s]');
       -    ylabel('Position [m]');
       -    box on;
       -    grid on;
       -
       -    % Visualization, wall force
       -    subplot(2,2,2);
       -    hold on
       -    for i=1:walls.nw
       -      p = plot(t, wforce(:,i), '+-');
       -      if i==2
       -        set(p,'Color','red')
       -      end
       -    end
       -    hold off
       -    title('Wall forces');
       -    xlabel('Time [s]');
       -    ylabel('Force [N]');
       -    box on;
       -    grid on;
       -
       -    % Visualization, wall velocity
       -    subplot(2,2,3);
       -    hold on
       -    for i=1:walls.nw
       -      p = plot(t, wvel(:,i), '+-');
       -      if i==2
       -        set(p,'Color','red')
       -      end
       -    end
       -    hold off
       -    title('Wall velocities');
       -    xlabel('Time [s]');
       -    ylabel('Velocity [m/s]');
       -    box on;
       -    grid on;
       -
       -    % Visualization, wall deviatoric stresses
       -    subplot(2,2,4);
       -    hold on
       -    for i=1:walls.nw
       -      p = plot(t, wdevs(:,i), '+-');
       -      if i==2
       -        set(p,'Color','red')
       -      end
       -    end
       -    hold off
       -    title('Wall deviatoric stresses');
       -    xlabel('Time [s]');
       -    ylabel('Stress [Pa]');
       -    box on;
       -    grid on;
       -
       -  end % End visualize walls
       -
       -
       -  % Visualize first output file with plotspheres
       -  if strcmpi(method, 'first')
       -    disp('Visualizing first output file')
       -    fn = [project '.output0.bin'];
       -    [p, grids, ~, ~] = freadbin('../output/', fn);
       -    plotspheres(p, grids, 5, 0, 1);
       -  end % End visualize last file
       -
       -
       -  % Visualize last output file with plotspheres
       -  if strcmpi(method, 'last')
       -    disp('Visualizing last output file')
       -    fn = [project '.output' num2str(lastfile) '.bin'];
       -    [p, grids, ~, ~] = freadbin('../output/', fn);
       -    plotspheres(p, grids, 5, 0, 1);
       -  end % End visualize last file
       -
       -
       -  % Visualize horizontal velocities (velocity profile)
       -  if strcmpi(method, 'veloprofile')
       -
       -    disp('Visualizing velocity profile');
       -
       -    % Read data
       -    fn = [project '.output' num2str(filenr) '.bin'];
       -    disp(fn);
       -    [p, ~, time, params, ~] = freadbin('../output/', fn);
       -    
       -    horiz_velo = zeros(2, p.np); % x,y velocities for each particle
       -
       -    for j = 1:p.np
       -      horiz_velo(1, j) = p.vel(j, 1); % x-velocity
       -      horiz_velo(2, j) = p.vel(j, 2); % y-velocity
       -    end
       -    
       -    % Find shear velocity (for scaling first axis)
       -    fixidx = find(p.fixvel > 0.0);
       -    shearvel = max(p.vel(fixidx,1));
       -
       -    % Plot x- and y velocities vs. z position
       -    hold on;
       -    plot(horiz_velo(2,:), p.x(:,3), '.b'); % y-velocity (blue)
       -    plot(horiz_velo(1,:), p.x(:,3), '.r'); % x-velocity (red)
       -    axis([-0.5*shearvel, shearvel*1.5, min(p.x(:,3)), max(p.x(:,3))]);
       -    title(['Velocity profile, t = ' num2str(time.current) ' s']);
       -    xlabel('Horizontal velocity [m/s]');
       -    ylabel('Vertical position [m]');
       -    legend('y', 'x', 'Location', 'SouthEast');
       -    box on;
       -    grid on;
       -    hold off;
       -
       -  end % End visualize velocity profile
       -
       -  if strcmpi(method, 'sheardisp')
       -    disp('Visualizing shear displacement, 1D');
       -
       -    % Read first datafile
       -    [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output0.bin']);
       -
       -    % Read original z-position at t = 0 s.
       -    zpos = p.x(:,3);
       -
       -    % Read last datafile
       -    [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfile) '.bin']);
       -
       -    % Plot
       -    plot(p.xsum(:), zpos(:), 'o');
       -    title(project);
       -    xlabel('Shear displacement [m]');
       -    ylabel('Initial vertical position [m]');
       -    box on;
       -    grid on;
       -
       -  end % End visualize shear displacement
       -
       -  if strcmpi(method, 'sheardisp2d')
       -    disp('Visualizing shear displacement, 2D');
       -
       -    % Read first datafile
       -    [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output0.bin']);
       -
       -    % Read original z-position at t = 0 s.
       -    zpos = p.x(:,3);
       -
       -    % Read last datafile
       -    [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfile) '.bin']);
       -
       -
       -
       -    % Plot
       -    colormap(jet)
       -    title(project);
       -    xlabel('Shear displacement [m]');
       -    ylabel('y [m]');
       -    zlabel('z [m]');
       -    box on;
       -    grid on;
       -    axis equal;
       -    axis vis3d;
       -    rotate3d on;
       -
       -  end % End visualize shear displacement 2D
       -
       -  if strcmpi(method, 'sheardisp3d')
       -    disp('Visualizing shear displacement, 3D');
       -
       -    % Read last datafile
       -    [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfile) '.bin']);
       -
       -    % Plot
       -    scatter3(p.xsum(:), p.x(:,2), p.x(:,3), 30, p.x(:,3), 'filled');
       -    colormap(jet)
       -    title(project);
       -    xlabel('Shear displacement [m]');
       -    ylabel('y [m]');
       -    zlabel('z [m]');
       -    box on;
       -    grid on;
       -    axis equal;
       -    axis vis3d;
       -    rotate3d on;
       -
       -  end % End visualize shear displacement 3D
       -
       -  % Visualize shear stresses
       -  if strcmpi(method, 'shear')
       -
       -    [p, grids, time, params, walls] = freadbin('../output/',[project '.output0.bin']);
       -
       -    disp('Visualizing shear stress dynamics')
       -    xdisp    = zeros(1, lastfile+1); % Shear displacement
       -    sigma    = zeros(1, lastfile+1); % Effective normal stress
       -    tau      = zeros(1, lastfile+1); % Shear stress
       -    dilation = zeros(1, lastfile+1); % Dilation
       -
       -    % Calculate the shear velocity
       -    fixedidx_all = find(p.fixvel(:) > 0.0); % All particles with a fixed horiz. vel.
       -    shearvel = max(p.vel(fixedidx_all,1)); 
       -
       -    % Surface area
       -    A = grids.L(1) * grids.L(2); % x-length * y-length
       -
       -    % Load data from all output files
       -    for i = 0:lastfile
       -      fn = [project '.output' num2str(i) '.bin'];
       -      disp(fn);
       -      [p, ~, time, ~, walls] = freadbin('../output/', fn);
       -
       -      xdisp(i+1)    = time.current * shearvel;
       -      sigma(i+1)    = walls.force(1) / A;
       -      tau(i+1)      = shearstress(p, A);
       -      dilation(i+1) = walls.x;
       -    end
       -
       -    % Plot stresses
       -    subplot(2,1,1);
       -    plot(xdisp, sigma, 'o-g',  xdisp, tau, '+-r');
       -    title('Stress dynamics');
       -    xlabel('Shear distance [m]');
       -    ylabel('Stress [Pa]');
       -    legend('\sigma`','\tau');
       -    box on;
       -    grid on;
       -
       -    % Plot dilation
       -    subplot(2,1,2);
       -    plot(xdisp, dilation, '+-b');
       -    title('Dilation');
       -    xlabel('Shear distance [m]');
       -    ylabel('Upper wall pos. [m]');
       -    box on;
       -    grid on;
       -    
       -  end
       -
       -end % End function
   DIR diff --git a/python/elastic_collision.py b/python/elastic_collision.py
       t@@ -0,0 +1,46 @@
       +#!/usr/bin/env python
       +import subprocess
       +import numpy
       +import matplotlib.pyplot as plt
       +
       +# Import sphere functionality
       +from sphere import *
       +
       +# New class
       +init = Spherebin(np = 2, nd = 3)
       +
       +#init.generateRadii(radius_mean = 0.5, radius_variance = 1e-15, histogram = 0)
       +init.radius[0] = numpy.ones(1, dtype=numpy.float64) * 0.5;
       +init.radius[1] = numpy.ones(1, dtype=numpy.float64) * 0.52;
       +
       +init.defaultparams()
       +
       +# The world should never be less than 3 cells in ANY direction, due to contact searcg algorithm
       +init.initsetup(gridnum = numpy.array([12,4,4]), periodic = 1, shearmodel = 2, g = numpy.array([0.0, 0.0, 0.0]))
       +
       +init.x[0] = numpy.array([1, 2, 2])
       +init.x[1] = numpy.array([6, 2, 2])
       +#init.x[2] = numpy.array([7, 2, 2])
       +#init.x[3] = numpy.array([8, 2, 2])
       +
       +init.nu = numpy.zeros(init.np, dtype=numpy.float64)
       +
       +#for i in range(init.np):
       +#  init.x[i] = numpy.array([4+i*init.radius[i]*2, 2, 2])
       +
       +init.vel[0] = numpy.array([10.0, 0.0, 0.0])
       +
       +
       +init.initTemporal(total = 1.0)
       +
       +init.writebin("../input/nc-test.bin")
       +#render("../input/nc-test.bin", out = "~/Desktop/nc-test")
       +
       +subprocess.call("cd ..; rm output/*; ./sphere_darwin_X86_64 nc-test", shell=True)
       +
       +visualize("nc-test", "energy", savefig=True, outformat='png')
       +#visualize("nc-test", "walls", savefig=True)
       +
       +subprocess.call("rm ../img_out/*; cd ../raytracer; ./render_all_outputs_GPU_clever.sh", shell=True)
       +
       +
   DIR diff --git a/python/inelastic_collision.py b/python/inelastic_collision.py
       t@@ -0,0 +1,49 @@
       +#!/usr/bin/env python
       +import subprocess
       +import numpy
       +import matplotlib.pyplot as plt
       +
       +# Import sphere functionality
       +from sphere import *
       +
       +# New class
       +init = Spherebin(np = 2, nd = 3)
       +
       +init.radius = numpy.ones(init.np, dtype=numpy.float64) * 0.5;
       +
       +init.defaultparams()
       +
       +# The world should never be less than 3 cells in ANY direction, due to contact searcg algorithm
       +init.initsetup(gridnum = numpy.array([12,4,4]), periodic = 1, shearmodel = 2, g = numpy.array([0.0, 0.0, 0.0]))
       +
       +init.x[0] = numpy.array([1, 2, 2])
       +init.x[1] = numpy.array([6, 2, 2])
       +#init.x[2] = numpy.array([7, 2, 2])
       +#init.x[3] = numpy.array([8, 2, 2])
       +
       +# Set fraction of critical damping (0 = elastic, 1 = completely inelastic)
       +damping_fraction = 1.0
       +init.nu = numpy.ones(init.np, dtype=numpy.float64) \
       +          * damping_fraction * 2.0 * math.sqrt(4.0/3.0 * math.pi * init.radius.min()**3 \
       +          * init.rho[0] * init.k_n[0])
       +
       +
       +#for i in range(init.np):
       +#  init.x[i] = numpy.array([4+i*init.radius[i]*2, 2, 2])
       +
       +init.vel[0] = numpy.array([10.0, 0.0, 0.0])
       +
       +
       +init.initTemporal(total = 1.0)
       +
       +init.writebin("../input/nc-test.bin")
       +#render("../input/nc-test.bin", out = "~/Desktop/nc-test")
       +
       +subprocess.call("cd ..; rm output/*; ./sphere_darwin_X86_64 nc-test", shell=True)
       +
       +visualize("nc-test", "energy", savefig=True, outformat='png')
       +#visualize("nc-test", "walls", savefig=True)
       +
       +subprocess.call("rm ../img_out/*; cd ../raytracer; ./render_all_outputs_GPU_clever.sh", shell=True)
       +
       +
   DIR diff --git a/raytracer/colorbar.h b/raytracer/colorbar.h
       t@@ -0,0 +1,22 @@
       +#ifndef COLORBAR_H_
       +#define COLORBAR_H_
       +
       +// Functions that determine red-, green- and blue color components
       +// in a blue-white-red colormap. Ratio should be between 0.0-1.0.
       +
       +__inline__ __host__ __device__ float red(float ratio)
       +{
       +  return fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio + 0.0109f);
       +};
       +
       +__inline__ __host__ __device__ float green(float ratio)
       +{
       +  return fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f);
       +};
       +
       +__inline__ __host__ __device__ float blue(float ratio)
       +{
       +  return fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f);
       +};
       +
       +#endif
   DIR diff --git a/raytracer/render_all_outputs_CPU.sh b/raytracer/render_all_outputs_CPU.sh
       t@@ -0,0 +1,27 @@
       +#!/bin/bash
       +
       +FILES=`ls ../output/*.bin`
       +
       +XRES=800
       +YRES=800
       +
       +WORKHORSE=CPU
       +
       +echo "# Rendering PPM images and converting to JPG"
       +for F in ../output/*.bin
       +do
       +  (BASE=`basename $F`;
       +  ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm;
       +  convert ../img_out/$BASE.ppm ../img_out/$BASE.jpg;)
       +  #rm ../img_out/$BASE.ppm &)
       +done
       +
       +#echo "# Converting PPM files to JPG using ImageMagick in parallel"
       +#for F in ../img_out/*.ppm
       +#do
       +#  (BASE=`basename $F`; convert $F $F.jpg &)
       +#done
       +
       +#echo "# Removed temporary files"
       +#rm ../img_out/*.ppm
       +
   DIR diff --git a/raytracer/render_all_outputs_GPU.sh b/raytracer/render_all_outputs_GPU.sh
       t@@ -0,0 +1,29 @@
       +#!/bin/bash
       +
       +FILES=`ls ../output/*.bin`
       +
       +XRES=800
       +YRES=800
       +
       +WORKHORSE=GPU
       +
       +echo "# Rendering PPM images"
       +for F in ../output/*.bin
       +do
       +  BASE=`basename $F`
       +  ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
       +  if [ $? -ne 0 ] ; then
       +    echo "Error rendering $F, trying again..."
       +    ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
       +  fi
       +done
       +
       +echo "# Converting PPM files to JPEG using ImageMagick in parallel"
       +for F in ../img_out/*.ppm
       +do
       +  (BASE=`basename $F`; convert $F $F.jpg &)
       +done
       +
       +#echo "# Removed temporary files"
       +#rm ../img_out/*.ppm
       +
   DIR diff --git a/raytracer/rt-kernel.cu b/raytracer/rt-kernel.cu
       t@@ -0,0 +1,442 @@
       +#include <iostream>
       +#include <cutil_math.h>
       +#include "header.h"
       +#include "rt-kernel.h"
       +#include "colorbar.h"
       +
       +unsigned int iDivUp (unsigned int a, unsigned int b) {
       +  return (a % b != 0) ? (a / b + 1) : (a / b);
       +}
       +
       +__inline__ __host__ __device__ float3 f4_to_f3(float4 in)
       +{
       +  return make_float3(in.x, in.y, in.z);
       +}
       +
       +__inline__ __host__ __device__ float4 f3_to_f4(float3 in)
       +{
       +  return make_float4(in.x, in.y, in.z, 0.0f);
       +}
       +
       +// Kernel for initializing image data
       +__global__ void imageInit(unsigned char* _img, unsigned int pixels)
       +{
       +  // Compute pixel position from threadIdx/blockIdx
       +  unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       +  if (mempos > pixels)
       +    return;
       +
       +  _img[mempos*4]     = 255;        // Red channel
       +  _img[mempos*4 + 1] = 255;        // Green channel
       +  _img[mempos*4 + 2] = 255;        // Blue channel
       +}
       +
       +// Calculate ray origins and directions
       +__global__ void rayInitPerspective(float4* _ray_origo, 
       +                                       float4* _ray_direction, 
       +                                   float4 eye, 
       +                                   unsigned int width,
       +                                   unsigned int height)
       +{
       +  // Compute pixel position from threadIdx/blockIdx
       +  unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       +  if (mempos > width*height) 
       +    return;
       +
       +  // Calculate 2D position from linear index
       +  unsigned int i = mempos % width;
       +  unsigned int j = (int)floor((float)mempos/width) % width;
       + 
       +  // Calculate pixel coordinates in image plane
       +  float p_u = const_imgplane.x + (const_imgplane.y - const_imgplane.x)
       +              * (i + 0.5f) / width;
       +  float p_v = const_imgplane.z + (const_imgplane.w - const_imgplane.z)
       +              * (j + 0.5f) / height;
       +
       +  // Write ray origo and direction to global memory
       +  _ray_origo[mempos]     = const_eye;
       +  _ray_direction[mempos] = -const_d*const_w + p_u*const_u + p_v*const_v;
       +}
       +
       +// Check wether the pixel's viewing ray intersects with the spheres,
       +// and shade the pixel correspondingly
       +__global__ void rayIntersectSpheres(float4* _ray_origo, 
       +                                    float4* _ray_direction,
       +                                    float4* _p, 
       +                                    unsigned char* _img)
       +{
       +  // Compute pixel position from threadIdx/blockIdx
       +  unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       +  if (mempos > const_pixels)
       +    return;
       +
       +  // Read ray data from global memory
       +  float3 e = f4_to_f3(_ray_origo[mempos]);
       +  float3 d = f4_to_f3(_ray_direction[mempos]);
       +  //float  step = length(d);
       +
       +  // Distance, in ray steps, between object and eye initialized with a large value
       +  float tdist = 1e10f;
       +
       +  // Surface normal at closest sphere intersection
       +  float3 n;
       +
       +  // Intersection point coordinates
       +  float3 p;
       +
       +  // Iterate through all particles
       +  for (Inttype i=0; i<const_np; ++i) {
       +
       +    // Read sphere coordinate and radius
       +    float3 c     = f4_to_f3(_p[i]);
       +    float  R     = _p[i].w;
       +
       +    // Calculate the discriminant: d = B^2 - 4AC
       +    float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c)))  // B^2
       +                  - 4.0f*dot(d,d)        // -4*A
       +                  * (dot((e-c),(e-c)) - R*R);  // C
       +
       +    // If the determinant is positive, there are two solutions
       +    // One where the line enters the sphere, and one where it exits
       +    if (Delta > 0.0f) { 
       +      
       +      // Calculate roots, Shirley 2009 p. 77
       +      float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
       +                      * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
       +
       +      // Check wether intersection is closer than previous values
       +      if (fabs(t_minus) < tdist) {
       +        p = e + t_minus*d;
       +        tdist = fabs(t_minus);
       +        n = normalize(2.0f * (p - c));   // Surface normal
       +      }
       +
       +    } // End of solution branch
       +
       +  } // End of particle loop
       +
       +  // Write pixel color
       +  if (tdist < 1e10f) {
       +
       +    // Lambertian shading parameters
       +    float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
       +    float I_d = 40.0f;  // Light intensity
       +    float k_d = 5.0f;  // Diffuse coefficient
       +    
       +    // Ambient shading
       +    float k_a = 10.0f;
       +    float I_a = 5.0f; // 5.0 for black background
       +
       +    // Write shading model values to pixel color channels
       +    _img[mempos*4]     = (unsigned char) ((k_d * I_d * dotprod 
       +                       + k_a * I_a)*0.48f);
       +    _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
       +                              + k_a * I_a)*0.41f);
       +    _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
       +                             + k_a * I_a)*0.27f);
       +
       +  }
       +}
       +
       +// Check wether the pixel's viewing ray intersects with the spheres,
       +// and shade the pixel correspondingly using a colormap
       +__global__ void rayIntersectSpheresColormap(float4* _ray_origo, 
       +                                            float4* _ray_direction,
       +                                            float4* _p, 
       +                                            float*  _fixvel,
       +                                            float*  _linarr,
       +                                            float max_val,
       +                                            unsigned char* _img)
       +{
       +  // Compute pixel position from threadIdx/blockIdx
       +  unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       +  if (mempos > const_pixels)
       +    return;
       +
       +  // Read ray data from global memory
       +  float3 e = f4_to_f3(_ray_origo[mempos]);
       +  float3 d = f4_to_f3(_ray_direction[mempos]);
       +
       +  // Distance, in ray steps, between object and eye initialized with a large value
       +  float tdist = 1e10f;
       +
       +  // Surface normal at closest sphere intersection
       +  float3 n;
       +
       +  // Intersection point coordinates
       +  float3 p;
       +
       +  unsigned int hitidx;
       +
       +  // Iterate through all particles
       +  for (Inttype i=0; i<const_np; ++i) {
       +
       +    // Read sphere coordinate and radius
       +    float3 c     = f4_to_f3(_p[i]);
       +    float  R     = _p[i].w;
       +
       +    // Calculate the discriminant: d = B^2 - 4AC
       +    float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c)))  // B^2
       +                  - 4.0f*dot(d,d)        // -4*A
       +                  * (dot((e-c),(e-c)) - R*R);  // C
       +
       +    // If the determinant is positive, there are two solutions
       +    // One where the line enters the sphere, and one where it exits
       +    if (Delta > 0.0f) {
       +      
       +      // Calculate roots, Shirley 2009 p. 77
       +      float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
       +                      * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
       +
       +      // Check wether intersection is closer than previous values
       +      if (fabs(t_minus) < tdist) {
       +        p = e + t_minus*d;
       +        tdist = fabs(t_minus);
       +        n = normalize(2.0f * (p - c));   // Surface normal
       +        hitidx = i;
       +      }
       +
       +    } // End of solution branch
       +
       +  } // End of particle loop
       +
       +  // Write pixel color
       +  if (tdist < 1e10) {
       +
       +    // Fetch particle data used for color
       +    float ratio = _linarr[hitidx] / max_val;
       +    float fixvel   = _fixvel[hitidx];
       +
       +    // Make sure the ratio doesn't exceed the 0.0-1.0 interval
       +    if (ratio < 0.01f)
       +      ratio = 0.01f;
       +    if (ratio > 0.99f)
       +      ratio = 0.99f;
       +
       +    // Lambertian shading parameters
       +    float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
       +    float I_d = 40.0f;  // Light intensity
       +    float k_d = 5.0f;  // Diffuse coefficient
       +    
       +    // Ambient shading
       +    float k_a = 10.0f;
       +    float I_a = 5.0f;
       +
       +    float redv   = red(ratio);
       +    float greenv = green(ratio);
       +    float bluev  = blue(ratio);
       +
       +    // Make particle dark grey if the horizontal velocity is fixed
       +    if (fixvel > 0.f) {
       +      redv = 0.5;
       +      greenv = 0.5;
       +      bluev = 0.5;
       +    }
       +
       +    // Write shading model values to pixel color channels
       +    _img[mempos*4]     = (unsigned char) ((k_d * I_d * dotprod 
       +                       + k_a * I_a)*redv);
       +    _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
       +                              + k_a * I_a)*greenv);
       +    _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
       +                             + k_a * I_a)*bluev);
       +  }
       +}
       +
       +extern "C"
       +__host__ void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
       +                             unsigned int pixels, Inttype np)
       +{
       +  // Image dimensions in world space (l, r, b, t)
       +  float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.5f*imgw*hw_ratio);
       +
       +  // The view vector
       +  float4 view = eye - lookat;
       +
       +  // Construct the camera view orthonormal base
       +  //float4 up = make_float4(0.0f, 1.0f, 0.0f, 0.0f);  // Pointing upward along +y
       +  float4 up = make_float4(0.0f, 0.0f, 1.0f, 0.0f);  // Pointing upward along +z
       +  float4 w = -view/length(view);                   // w: Pointing backwards
       +  float4 u = make_float4(cross(make_float3(up.x, up.y, up.z),
       +                               make_float3(w.x, w.y, w.z)), 0.0f)
       +            / length(cross(make_float3(up.x, up.y, up.z), make_float3(w.x, w.y, w.z)));
       +  float4 v = make_float4(cross(make_float3(w.x, w.y, w.z), make_float3(u.x, u.y, u.z)), 0.0f);
       +
       +  // Focal length 20% of eye vector length
       +  float d = length(view)*0.8f;
       +
       +  // Light direction (points towards light source)
       +  float4 light = normalize(-1.0f*eye*make_float4(1.0f, 0.2f, 0.6f, 0.0f));
       +
       +  std::cout << "  Transfering camera values to constant memory\n";
       +
       +  cudaMemcpyToSymbol("const_u", &u, sizeof(u));
       +  cudaMemcpyToSymbol("const_v", &v, sizeof(v));
       +  cudaMemcpyToSymbol("const_w", &w, sizeof(w));
       +  cudaMemcpyToSymbol("const_eye", &eye, sizeof(eye));
       +  cudaMemcpyToSymbol("const_imgplane", &imgplane, sizeof(imgplane));
       +  cudaMemcpyToSymbol("const_d", &d, sizeof(d));
       +  cudaMemcpyToSymbol("const_light", &light, sizeof(light));
       +  cudaMemcpyToSymbol("const_pixels", &pixels, sizeof(pixels));
       +  cudaMemcpyToSymbol("const_np", &np, sizeof(np));
       +}
       +
       +// Check for CUDA errors
       +extern "C"
       +__host__ void checkForCudaErrors(const char* checkpoint_description)
       +{
       +  cudaError_t err = cudaGetLastError();
       +  if (err != cudaSuccess) {
       +    std::cout << "\nCuda error detected, checkpoint: " << checkpoint_description
       +              << "\nError string: " << cudaGetErrorString(err) << "\n";
       +    exit(EXIT_FAILURE);
       +  }
       +}
       +
       +
       +// Wrapper for the rt kernel
       +extern "C"
       +__host__ int rt(float4* p, Inttype np,
       +                rgb* img, unsigned int width, unsigned int height,
       +                f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
       +                int visualize, float max_val,
       +                float* fixvel, float* pres, float* es_dot, float* es, float* vel)
       +{
       +  using std::cout;
       +
       +  cout << "Initializing CUDA:\n";
       +
       +  // Initialize GPU timestamp recorders
       +  float t1, t2;
       +  cudaEvent_t t1_go, t2_go, t1_stop, t2_stop;
       +  cudaEventCreate(&t1_go);
       +  cudaEventCreate(&t2_go);
       +  cudaEventCreate(&t2_stop);
       +  cudaEventCreate(&t1_stop);
       +
       +  // Start timer 1
       +  cudaEventRecord(t1_go, 0);
       +
       +  // Allocate memory
       +  cout << "  Allocating device memory\n";
       +  static float4 *_p;                        // Particle positions (x,y,z) and radius (w)
       +  static float  *_fixvel;               // Indicates whether a particle has a fixed horizontal velocity
       +  static float  *_linarr;                // Array for linear values to color the particles after
       +  static unsigned char *_img;                 // RGBw values in image
       +  static float4 *_ray_origo;                // Ray origo (x,y,z)
       +  static float4 *_ray_direction;         // Ray direction (x,y,z)
       +  cudaMalloc((void**)&_p, np*sizeof(float4));
       +  cudaMalloc((void**)&_fixvel, np*sizeof(float));
       +  cudaMalloc((void**)&_linarr, np*sizeof(float)); // 0 size if visualize = 0;
       +  cudaMalloc((void**)&_img, width*height*4*sizeof(unsigned char));
       +  cudaMalloc((void**)&_ray_origo, width*height*sizeof(float4));
       +  cudaMalloc((void**)&_ray_direction, width*height*sizeof(float4));
       +
       +  // Transfer particle data
       +  cout << "  Transfering particle data: host -> device\n";
       +  cudaMemcpy(_p, p, np*sizeof(float4), cudaMemcpyHostToDevice);
       +  cudaMemcpy(_fixvel, fixvel, np*sizeof(float), cudaMemcpyHostToDevice);
       +  if (visualize == 1 || visualize == 4)
       +    cudaMemcpy(_linarr, pres, np*sizeof(float), cudaMemcpyHostToDevice);
       +  if (visualize == 2)
       +    cudaMemcpy(_linarr, es_dot, np*sizeof(float), cudaMemcpyHostToDevice);
       +  if (visualize == 3)
       +    cudaMemcpy(_linarr, es, np*sizeof(float), cudaMemcpyHostToDevice);
       +  if (visualize == 4)
       +    cudaMemcpy(_linarr, vel, np*sizeof(float), cudaMemcpyHostToDevice);
       +
       +  // Check for errors after memory allocation
       +  checkForCudaErrors("CUDA error after memory allocation"); 
       +
       +  // Arrange thread/block structure
       +  unsigned int pixels = width*height;
       +  float hw_ratio = (float)height/(float)width;
       +  const unsigned int threadsPerBlock = 256;
       +  const unsigned int blocksPerGrid = iDivUp(pixels, threadsPerBlock);
       +
       +  // Start timer 2
       +  cudaEventRecord(t2_go, 0);
       +  
       +  // Initialize image to background color
       +  imageInit<<< blocksPerGrid, threadsPerBlock >>>(_img, pixels);
       +
       +  // Initialize camera
       +  cameraInit(make_float4(eye.x, eye.y, eye.z, 0.0f), 
       +                   make_float4(lookat.x, lookat.y, lookat.z, 0.0f),
       +             imgw, hw_ratio, pixels, np);
       +  checkForCudaErrors("CUDA error after cameraInit");
       +
       +  // Construct rays for perspective projection
       +  rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
       +      _ray_origo, _ray_direction, 
       +      make_float4(eye.x, eye.y, eye.z, 0.0f), 
       +      width, height);
       +
       +  cudaThreadSynchronize();
       +  
       +  // Find closest intersection between rays and spheres
       +  if (visualize == 1) { // Visualize pressure
       +    cout << "  Pressure color map range: [0, " << max_val << "] Pa\n"; 
       +    rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
       +        _ray_origo, _ray_direction,
       +        _p, _fixvel, _linarr, max_val, _img);
       +  } else if (visualize == 2) { // es_dot visualization
       +    cout << "  Shear heat production rate color map range: [0, " << max_val << "] W\n";
       +    rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
       +        _ray_origo, _ray_direction,
       +        _p, _fixvel, _linarr, max_val, _img);
       +  } else if (visualize == 3) { // es visualization
       +    cout << "  Total shear heat color map range: [0, " << max_val << "] J\n";
       +    rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
       +        _ray_origo, _ray_direction,
       +        _p, _fixvel, _linarr, max_val, _img);
       +  } else if (visualize == 4) { // velocity visualization
       +    cout << "  Velocity color map range: [0, " << max_val << "] m/s\n";
       +    rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
       +        _ray_origo, _ray_direction,
       +        _p, _fixvel, _linarr, max_val, _img);
       +  } else { // Normal visualization
       +    rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
       +        _ray_origo, _ray_direction,
       +        _p, _img);
       +  }
       +
       +  // Make sure all threads are done before continuing CPU control sequence
       +  cudaThreadSynchronize();
       +  
       +  // Check for errors
       +  checkForCudaErrors("CUDA error after kernel execution");
       +
       +  // Stop timer 2
       +  cudaEventRecord(t2_stop, 0);
       +  cudaEventSynchronize(t2_stop);
       +
       +  // Transfer image data from device to host
       +  cout << "  Transfering image data: device -> host\n";
       +  cudaMemcpy(img, _img, width*height*4*sizeof(unsigned char), cudaMemcpyDeviceToHost);
       +
       +  // Free dynamically allocated device memory
       +  cudaFree(_p);
       +  cudaFree(_fixvel);
       +  cudaFree(_linarr);
       +  cudaFree(_img);
       +  cudaFree(_ray_origo);
       +  cudaFree(_ray_direction);
       +
       +  // Stop timer 1
       +  cudaEventRecord(t1_stop, 0);
       +  cudaEventSynchronize(t1_stop);
       +  
       +  // Calculate time spent in t1 and t2
       +  cudaEventElapsedTime(&t1, t1_go, t1_stop);
       +  cudaEventElapsedTime(&t2, t2_go, t2_stop);
       +
       +  // Report time spent
       +  cout << "  Time spent on entire GPU routine: "
       +       << t1 << " ms\n";
       +  cout << "  - Kernels: " << t2 << " ms\n"
       +       << "  - Memory alloc. and transfer: " << t1-t2 << " ms\n";
       +
       +  // Return successfully
       +  return 0;
       +}
   DIR diff --git a/raytracer/rt_GPU_init_pres.sh b/raytracer/rt_GPU_init_pres.sh
       t@@ -0,0 +1,36 @@
       +#!/bin/bash
       +
       +# Pass max. pressure as command line argument
       +
       +XRES=400
       +YRES=1600
       +
       +WORKHORSE=GPU
       +
       +echo "# Rendering PPM images"
       +for F in ../output/*init*.bin
       +do
       +  BASE=`basename $F`
       +  OUTFILE=$BASE.ppm.jpg
       +  if [ -e ../img_out/$OUTFILE ]
       +  then
       +    echo "$OUTFILE exists." > /dev/null
       +  else
       +    ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
       +    if [ $? -ne 0 ] ; then
       +      echo "Error rendering $F, trying again..."
       +      ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
       +    fi
       +  fi
       +done
       +
       +echo "# Converting PPM files to JPEG using ImageMagick in parallel"
       +for F in ../img_out/*.ppm
       +do
       +  (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
       +done
       +
       +sleep 5
       +echo "# Removing temporary PPM files"
       +rm ../img_out/*.ppm
       +
   DIR diff --git a/raytracer/rt_GPU_pres.sh b/raytracer/rt_GPU_pres.sh
       t@@ -0,0 +1,36 @@
       +#!/bin/bash
       +
       +# Pass max. pressure as command line argument
       +
       +XRES=800
       +YRES=800
       +
       +WORKHORSE=GPU
       +
       +echo "# Rendering PPM images"
       +for F in ../output/*.bin
       +do
       +  BASE=`basename $F`
       +  OUTFILE=$BASE.ppm.jpg
       +  if [ -e ../img_out/$OUTFILE ]
       +  then
       +    echo "$OUTFILE exists." > /dev/null
       +  else
       +    ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
       +    if [ $? -ne 0 ] ; then
       +      echo "Error rendering $F, trying again..."
       +      ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
       +    fi
       +  fi
       +done
       +
       +echo "# Converting PPM files to JPEG using ImageMagick in parallel"
       +for F in ../img_out/*.ppm
       +do
       +  (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
       +done
       +
       +sleep 5
       +echo "# Removing temporary PPM files"
       +rm ../img_out/*.ppm
       +
   DIR diff --git a/raytracer/rt_GPU_tall_pres.sh b/raytracer/rt_GPU_tall_pres.sh
       t@@ -0,0 +1,36 @@
       +#!/bin/bash
       +
       +# Pass max. pressure as command line argument
       +
       +XRES=800
       +YRES=1200
       +
       +WORKHORSE=GPU
       +
       +echo "# Rendering PPM images"
       +for F in ../output/*.bin
       +do
       +  BASE=`basename $F`
       +  OUTFILE=$BASE.ppm.jpg
       +  if [ -e ../img_out/$OUTFILE ]
       +  then
       +    echo "$OUTFILE exists." > /dev/null
       +  else
       +    ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
       +    if [ $? -ne 0 ] ; then
       +      echo "Error rendering $F, trying again..."
       +      ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
       +    fi
       +  fi
       +done
       +
       +echo "# Converting PPM files to JPEG using ImageMagick in parallel"
       +for F in ../img_out/*.ppm
       +do
       +  (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
       +done
       +
       +sleep 5
       +echo "# Removing temporary PPM files"
       +rm ../img_out/*.ppm
       +
   DIR diff --git a/src/cohesion.cuh b/src/cohesion.cuh
       t@@ -0,0 +1,126 @@
       +#ifndef COHESION_CUH_
       +#define COHESION_CUH_
       +
       +// cohesion.cuh
       +// Functions governing attractive forces between contacts
       +
       +
       +// Linear-elastic bond: Attractive force with normal- and shear components
       +// acting upon particle A in a bonded particle pair
       +__device__ void bondLinear(Float3* N, Float3* T, Float* es_dot, Float* p,
       +                           unsigned int idx_a, unsigned int idx_b, 
       +                           Float4* dev_x_sorted, Float4* dev_vel_sorted, 
       +                           Float4* dev_angvel_sorted,
       +                           Float radius_a, Float radius_b, 
       +                           Float3 x_ab, Float x_ab_length, 
       +                           Float delta_ab) 
       +{
       +
       +  // If particles are not overlapping, apply bond force
       +  if (delta_ab > 0.0f) {
       +
       +    // Allocate variables and fetch missing time=t values for particle A and B
       +    Float4 vel_a     = dev_vel_sorted[idx_a];
       +    Float4 vel_b     = dev_vel_sorted[idx_b];
       +    Float4 angvel4_a = dev_angvel_sorted[idx_a];
       +    Float4 angvel4_b = dev_angvel_sorted[idx_b];
       +
       +    // Convert to Float3's
       +    Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
       +    Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       +
       +    // Normal vector of contact
       +    Float3 n_ab = x_ab/x_ab_length;
       +
       +    // Relative contact interface velocity, w/o rolling
       +    Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, 
       +                                          vel_a.y - vel_b.y, 
       +                                          vel_a.z - vel_b.z);
       +
       +    // Relative contact interface velocity of particle surfaces at
       +    // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       +    Float3 vel_ab = vel_ab_linear
       +                    + radius_a * cross(n_ab, angvel_a)
       +                    + radius_b * cross(n_ab, angvel_b);
       +
       +    // Relative contact interface rolling velocity
       +    //Float3 angvel_ab = angvel_a - angvel_b;
       +    //Float  angvel_ab_length = length(angvel_ab);
       +
       +    // Normal component of the relative contact interface velocity
       +    //Float vel_n_ab = dot(vel_ab_linear, n_ab);
       +
       +    // Tangential component of the relative contact interface velocity
       +    // Hinrichsen and Wolf 2004, eq. 13.9
       +    Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
       +    //Float  vel_t_ab_length = length(vel_t_ab);
       +
       +    Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +    Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +    // Mean radius
       +    Float R_bar = (radius_a + radius_b)/2.0f;
       +
       +    // Normal force component: Elastic
       +    f_n = devC_k_n * delta_ab * n_ab;
       +
       +    if (length(vel_t_ab) > 0.f) {
       +      // Shear force component: Viscous
       +      f_t = -1.0f * devC_gamma_s * vel_t_ab;
       +
       +      // Shear friction production rate [W]
       +      //*es_dot += -dot(vel_t_ab, f_t);
       +    }
       +
       +    // Add force components from this bond to total force for particle
       +    *N += f_n + f_t;
       +    *T += -R_bar * cross(n_ab, f_t);
       +
       +    // Pressure excerted onto the particle from this bond
       +    *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
       +
       +  }
       +} // End of bondLinear()
       +
       +
       +// Capillary cohesion after Richefeu et al. (2006)
       +__device__ void capillaryCohesion_exp(Float3* N, Float radius_a, 
       +                                          Float radius_b, Float delta_ab,
       +                                          Float3 x_ab, Float x_ab_length, 
       +                                      Float kappa)
       +{
       +
       +  // Normal vector 
       +  Float3 n_ab = x_ab/x_ab_length;
       +
       +  Float3 f_c;
       +  Float lambda, R_geo, R_har, r, h;
       +
       +  // Determine the ratio; r = max{Ri/Rj;Rj/Ri}
       +  if ((radius_a/radius_b) > (radius_b/radius_a))
       +    r = radius_a/radius_b;
       +  else
       +    r = radius_b/radius_a;
       +
       +  // Exponential decay function
       +  h = -sqrtf(r);
       +
       +  // The harmonic mean
       +  R_har = (2.0f * radius_a * radius_b) / (radius_a + radius_b);
       +
       +  // The geometrical mean
       +  R_geo = sqrtf(radius_a * radius_b);
       +
       +  // The exponential falloff of the capillary force with distance
       +  lambda = 0.9f * h * sqrtf(devC_V_b/R_har);
       +
       +  // Calculate cohesional force
       +  f_c = -kappa * R_geo * expf(-delta_ab/lambda) * n_ab;
       +
       +  // Add force components from this collision to total force for particle
       +  *N += f_c;
       +
       +} // End of capillaryCohesion_exp
       +
       +
       +#endif
   DIR diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
       t@@ -0,0 +1,411 @@
       +#ifndef CONTACTMODELS_CUH_
       +#define CONTACTMODELS_CUH_
       +
       +// contactmodels.cuh
       +// Functions governing repulsive forces between contacts
       +
       +
       +// Linear viscoelastic contact model for particle-wall interactions
       +// with tangential friction and rolling resistance
       +__device__ Float contactLinear_wall(Float3* N, Float3* T, Float* es_dot, Float* p,
       +                                    unsigned int idx_a, Float radius_a,
       +                                    Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
       +                                    Float3 n, Float delta, Float wvel)
       +{
       +  // Fetch particle velocities from global memory
       +  Float4 linvel_tmp = dev_vel_sorted[idx_a];
       +  Float4 angvel_tmp = dev_angvel_sorted[idx_a];
       +
       +  // Convert velocities to three-component vectors
       +  Float3 linvel = MAKE_FLOAT3(linvel_tmp.x,
       +                                             linvel_tmp.y,
       +                                       linvel_tmp.z);
       +  Float3 angvel = MAKE_FLOAT3(angvel_tmp.x,
       +                                             angvel_tmp.y,
       +                                       angvel_tmp.z);
       +
       +  // Store the length of the angular velocity for later use
       +  Float angvel_length = length(angvel);
       +
       +  // Contact velocity is the sum of the linear and
       +  // rotational components
       +  Float3 vel = linvel + radius_a * cross(n, angvel) + wvel;
       +
       +  // Normal component of the contact velocity
       +  Float vel_n = dot(vel, n);
       +
       +  // The tangential velocity is the contact velocity
       +  // with the normal component subtracted
       +  Float3 vel_t = vel - n * (dot(vel, n));
       +  Float  vel_t_length = length(vel_t);
       +
       +  // Calculate elastic normal component
       +  //Float3 f_n = -devC_k_n * delta * n;
       +
       +  // Normal force component: Elastic - viscous damping
       +  Float3 f_n = (-devC_k_n * delta - devC_gamma_wn * vel_n) * n;
       +
       +  // Make sure the viscous damping doesn't exceed the elastic component,
       +  // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
       +  if (dot(f_n, n) < 0.0f)
       +    f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  Float  f_n_length = length(f_n); // Save length for later use
       +
       +  // Initialize vectors
       +  Float3 f_t   = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +  Float3 T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  // Check that the tangential velocity is high enough to avoid
       +  // divide by zero (producing a NaN)
       +  if (vel_t_length > 0.f) {
       +
       +    Float f_t_visc  = devC_gamma_ws * vel_t_length; // Tangential force by viscous model
       +    Float f_t_limit = devC_mu_s * f_n_length;      // Max. friction
       +
       +    // If the shear force component exceeds the friction,
       +    // the particle slips and energy is dissipated
       +    if (f_t_visc < f_t_limit) {
       +      f_t = -1.0f * f_t_visc * vel_t/vel_t_length;
       +
       +    } else { // Dynamic friction, friction failure
       +      f_t = -1.0f * f_t_limit * vel_t/vel_t_length;
       +      
       +      // Shear energy production rate [W]
       +      //*es_dot += -dot(vel_t, f_t);
       +    }
       +  }
       +
       +/*  if (angvel_length > 0.f) {
       +    // Apply rolling resistance (Zhou et al. 1999)
       +    //T_res = -angvel_a/angvel_length * devC_mu_r * radius_a * f_n_length;
       +
       +    // New rolling resistance model
       +    T_res = -1.0f * fmin(devC_gamma_r * radius_a * angvel_length,
       +                         devC_mu_r * radius_a * f_n_length)
       +            * angvel_a/angvel_length;
       +  }*/
       +
       +  // Total force from wall
       +  *N += f_n + f_t;
       +
       +  // Total torque from wall
       +  *T += -radius_a * cross(n, f_t) + T_res;
       +
       +  // Pressure excerted onto particle from this contact
       +  *p += f_n_length / (4.0f * PI * radius_a*radius_a);
       +
       +  // Return force excerted onto the wall
       +  //return -dot(*N, n);
       +  return dot(f_n, n);
       +}
       +
       +
       +// Linear vicoelastic contact model for particle-particle interactions
       +// with tangential friction and rolling resistance
       +__device__ void contactLinearViscous(Float3* N, Float3* T, Float* es_dot, Float* p,
       +                                               unsigned int idx_a, unsigned int idx_b, 
       +                                     Float4* dev_vel_sorted, 
       +                                     Float4* dev_angvel_sorted,
       +                                     Float radius_a, Float radius_b, 
       +                                     Float3 x_ab, Float x_ab_length, 
       +                                     Float delta_ab, Float kappa) 
       +{
       +
       +  // Allocate variables and fetch missing time=t values for particle A and B
       +  Float4 vel_a     = dev_vel_sorted[idx_a];
       +  Float4 vel_b     = dev_vel_sorted[idx_b];
       +  Float4 angvel4_a = dev_angvel_sorted[idx_a];
       +  Float4 angvel4_b = dev_angvel_sorted[idx_b];
       +
       +  // Convert to Float3's
       +  Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
       +  Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       +
       +  // Force between grain pair decomposed into normal- and tangential part
       +  Float3 f_n, f_t, f_c, T_res;
       +
       +  // Normal vector of contact
       +  Float3 n_ab = x_ab/x_ab_length;
       +
       +  // Relative contact interface velocity, w/o rolling
       +  Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, 
       +                                              vel_a.y - vel_b.y, 
       +                                              vel_a.z - vel_b.z);
       +
       +  // Relative contact interface velocity of particle surfaces at
       +  // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       +  Float3 vel_ab = vel_ab_linear
       +                        + radius_a * cross(n_ab, angvel_a)
       +                        + radius_b * cross(n_ab, angvel_b);
       +
       +  // Relative contact interface rolling velocity
       +  Float3 angvel_ab = angvel_a - angvel_b;
       +  Float  angvel_ab_length = length(angvel_ab);
       +
       +  // Normal component of the relative contact interface velocity
       +  Float vel_n_ab = dot(vel_ab_linear, n_ab);
       +
       +  // Tangential component of the relative contact interface velocity
       +  // Hinrichsen and Wolf 2004, eq. 13.9
       +  Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
       +  Float  vel_t_ab_length = length(vel_t_ab);
       +
       +  // Compute the normal stiffness of the contact
       +  //Float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
       +
       +  // Calculate rolling radius
       +  Float R_bar = (radius_a + radius_b) / 2.0f;
       +
       +  // Normal force component: linear-elastic approximation (Augier 2009, eq. 3)
       +  // with velocity dependant damping
       +  //   Damping coefficient: alpha = 0.8
       +  //f_n = (-k_n_ab * delta_ab + 2.0f * 0.8f * sqrtf(m_eff*k_n_ab) * vel_ab) * n_ab;
       +
       +  // Linear spring for normal component (Renzo 2004, eq. 35)
       +  // Dissipation due to  plastic deformation is modelled by using a different
       +  // unloading spring constant (Walton and Braun 1986)
       +  // Here the factor in the second term determines the relative strength of the
       +  // unloading spring relative to the loading spring.
       +  /*  if (vel_n_ab > 0.0f) {        // Loading
       +      f_n = (-k_n_ab * delta_ab) * n_ab;
       +      } else {                        // Unloading
       +      f_n = (-k_n_ab * 0.90f * delta_ab) * n_ab;
       +      } // f_n is OK! */
       +
       +  // Normal force component: Elastic
       +  //f_n = -k_n_ab * delta_ab * n_ab;
       +
       +  // Normal force component: Elastic - viscous damping
       +  f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
       +
       +  // Make sure the viscous damping doesn't exceed the elastic component,
       +  // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
       +  if (dot(f_n, n_ab) < 0.0f)
       +    f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  Float f_n_length = length(f_n);
       +
       +  // Add max. capillary force
       +  f_c = -kappa * sqrtf(radius_a * radius_b) * n_ab;
       +
       +  // Initialize force vectors to zero
       +  f_t   = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +  T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  // Shear force component: Nonlinear relation
       +  // Coulomb's law of friction limits the tangential force to less or equal
       +  // to the normal force
       +  if (vel_t_ab_length > 0.f) {
       +
       +    // Tangential force by viscous model
       +    Float f_t_visc  = devC_gamma_s * vel_t_ab_length;
       +
       +    // Determine max. friction
       +    Float f_t_limit;
       +    if (vel_t_ab_length > 0.001f) { // Dynamic
       +      f_t_limit = devC_mu_d * length(f_n-f_c);
       +    } else { // Static
       +      f_t_limit = devC_mu_s * length(f_n-f_c);
       +    }
       +
       +    // If the shear force component exceeds the friction,
       +    // the particle slips and energy is dissipated
       +    if (f_t_visc < f_t_limit) { // Static
       +      f_t = -1.0f * f_t_visc * vel_t_ab/vel_t_ab_length;
       +
       +    } else { // Dynamic, friction failure
       +      f_t = -1.0f * f_t_limit * vel_t_ab/vel_t_ab_length;
       +
       +      // Shear friction production rate [W]
       +      //*es_dot += -dot(vel_t_ab, f_t);
       +    }
       +  }
       +
       +/*  if (angvel_ab_length > 0.f) {
       +    // Apply rolling resistance (Zhou et al. 1999)
       +    //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
       +
       +    // New rolling resistance model
       +    T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
       +                         devC_mu_r * R_bar * f_n_length)
       +            * angvel_ab/angvel_ab_length;
       +  }
       +*/
       +
       +  // Add force components from this collision to total force for particle
       +  *N += f_n + f_t + f_c; 
       +  *T += -R_bar * cross(n_ab, f_t) + T_res;
       +
       +  // Pressure excerted onto the particle from this contact
       +  *p += f_n_length / (4.0f * PI * radius_a*radius_a);
       +
       +} // End of contactLinearViscous()
       +
       +
       +// Linear elastic contact model for particle-particle interactions
       +__device__ void contactLinear(Float3* N, Float3* T, 
       +                                  Float* es_dot, Float* p,
       +                              unsigned int idx_a_orig,
       +                              unsigned int idx_b_orig, 
       +                              Float4  vel_a, 
       +                              Float4* dev_vel,
       +                              Float3  angvel_a,
       +                              Float4* dev_angvel,
       +                              Float radius_a, Float radius_b, 
       +                              Float3 x_ab, Float x_ab_length, 
       +                              Float delta_ab, Float4* dev_delta_t,
       +                              unsigned int mempos) 
       +{
       +
       +  // Allocate variables and fetch missing time=t values for particle A and B
       +  Float4 vel_b     = dev_vel[idx_b_orig];
       +  Float4 angvel4_b = dev_angvel[idx_b_orig];
       +
       +  // Fetch previous sum of shear displacement for the contact pair
       +  Float4 delta_t0_4 = dev_delta_t[mempos];
       +
       +  Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x,
       +                                               delta_t0_4.y,
       +                                               delta_t0_4.z);
       +
       +  // Convert to Float3
       +  Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       +
       +  // Force between grain pair decomposed into normal- and tangential part
       +  Float3 f_n, f_t, f_c, T_res;
       +
       +  // Normal vector of contact
       +  Float3 n_ab = x_ab/x_ab_length;
       +
       +  // Relative contact interface velocity, w/o rolling
       +  Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, 
       +                                              vel_a.y - vel_b.y, 
       +                                              vel_a.z - vel_b.z);
       +
       +  // Relative contact interface velocity of particle surfaces at
       +  // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       +  Float3 vel_ab = vel_ab_linear
       +                        + radius_a * cross(n_ab, angvel_a)
       +                        + radius_b * cross(n_ab, angvel_b);
       +
       +  // Relative contact interface rolling velocity
       +  Float3 angvel_ab = angvel_a - angvel_b;
       +  Float  angvel_ab_length = length(angvel_ab);
       +
       +  // Normal component of the relative contact interface velocity
       +  Float vel_n_ab = dot(vel_ab_linear, n_ab);
       +
       +  // Tangential component of the relative contact interface velocity
       +  // Hinrichsen and Wolf 2004, eq. 13.9
       +  Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
       +  Float  vel_t_ab_length = length(vel_t_ab);
       +
       +  // Correct tangential displacement vector, which is
       +  // necessary if the tangential plane rotated
       +  Float3 delta_t0 = delta_t0_uncor - (n_ab * dot(delta_t0_uncor, n_ab));
       +  Float  delta_t0_length = length(delta_t0);
       +
       +  // New tangential displacement vector
       +  Float3 delta_t;
       +
       +  // Compute the normal stiffness of the contact
       +  //Float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
       +
       +  // Calculate rolling radius
       +  Float R_bar = (radius_a + radius_b) / 2.0f;
       +
       +  // Normal force component: Elastic
       +  //f_n = -devC_k_n * delta_ab * n_ab;
       +
       +  // Normal force component: Elastic - viscous damping
       +  f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
       +
       +  // Make sure the viscous damping doesn't exceed the elastic component,
       +  // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
       +  if (dot(f_n, n_ab) < 0.0f)
       +    f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  Float f_n_length = length(f_n);
       +
       +  // Add max. capillary force
       +  f_c = -devC_kappa * sqrtf(radius_a * radius_b) * n_ab;
       +
       +  // Initialize force vectors to zero
       +  f_t   = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +  T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  // Apply a tangential force if the previous tangential displacement
       +  // is non-zero, or the current sliding velocity is non-zero.
       +  if (delta_t0_length > 0.f || vel_t_ab_length > 0.f) {
       +
       +    // Shear force: Visco-Elastic, limited by Coulomb friction
       +    Float3 f_t_elast = -1.0f * devC_k_s * delta_t0;
       +    Float3 f_t_visc  = -1.0f * devC_gamma_s * vel_t_ab;
       +
       +    Float f_t_limit;
       +    
       +    if (vel_t_ab_length > 0.001f) { // Dynamic friciton
       +      f_t_limit = devC_mu_d * length(f_n-f_c);
       +    } else { // Static friction
       +      f_t_limit = devC_mu_s * length(f_n-f_c);
       +    }
       +
       +    // Tangential force before friction limit correction
       +    f_t = f_t_elast + f_t_visc;
       +    Float f_t_length = length(f_t);
       +
       +    // If failure criterion is not met, contact is viscous-linear elastic.
       +    // If failure criterion is met, contact force is limited, 
       +    // resulting in a slip and energy dissipation
       +    if (f_t_length > f_t_limit) { // Dynamic case
       +      
       +      // Frictional force is reduced to equal the limit
       +      f_t *= f_t_limit/f_t_length;
       +
       +      // A slip event zeros the displacement vector
       +      //delta_t = make_Float3(0.0f, 0.0f, 0.0f);
       +
       +      // In a slip event, the tangential spring is adjusted to a 
       +      // length which is consistent with Coulomb's equation
       +      // (Hinrichsen and Wolf, 2004)
       +      delta_t = -1.0f/devC_k_s * f_t + devC_gamma_s * vel_t_ab;
       +
       +      // Shear friction heat production rate: 
       +      // The energy lost from the tangential spring is dissipated as heat
       +      //*es_dot += -dot(vel_t_ab, f_t);
       +      *es_dot += length(delta_t0 - delta_t) * devC_k_s / devC_dt; // Seen in EsyS-Particle
       +
       +    } else { // Static case
       +
       +      // No correction of f_t is required
       +
       +      // Add tangential displacement to total tangential displacement
       +      delta_t = delta_t0 + vel_t_ab * devC_dt;
       +    }
       +  }
       +
       +  if (angvel_ab_length > 0.f) {
       +    // Apply rolling resistance (Zhou et al. 1999)
       +    //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
       +
       +    // New rolling resistance model
       +    T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
       +                         devC_mu_r * R_bar * f_n_length)
       +            * angvel_ab/angvel_ab_length;
       +  }
       +
       +  // Add force components from this collision to total force for particle
       +  *N += f_n + f_t + f_c; 
       +  *T += -R_bar * cross(n_ab, f_t) + T_res;
       +
       +  // Pressure excerted onto the particle from this contact
       +  *p += f_n_length / (4.0f * PI * radius_a*radius_a);
       +
       +  // Store sum of tangential displacements
       +  dev_delta_t[mempos] = MAKE_FLOAT4(delta_t.x, delta_t.y, delta_t.z, 0.0f);
       +
       +} // End of contactLinear()
       +
       +
       +#endif
   DIR diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
       t@@ -0,0 +1,625 @@
       +#ifndef CONTACTSEARCH_CUH_
       +#define CONTACTSEARCH_CUH_
       +
       +// contactsearch.cuh
       +// Functions for identifying contacts and processing boundaries
       +
       +
       +// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
       +// Kernel executed on device, and callable from device only.
       +__device__ void overlapsInCell(int3 targetCell, 
       +                                   unsigned int idx_a, 
       +                               Float4 x_a, Float radius_a,
       +                               Float3* N, Float3* T, 
       +                               Float* es_dot, Float* p,
       +                               Float4* dev_x_sorted, 
       +                               Float* dev_radius_sorted,
       +                               Float4* dev_vel_sorted, 
       +                               Float4* dev_angvel_sorted,
       +                               unsigned int* dev_cellStart, 
       +                               unsigned int* dev_cellEnd,
       +                               Float4* dev_w_nx, 
       +                               Float4* dev_w_mvfd)
       +                               //uint4 bonds)
       +{
       +
       +  // Variable containing modifier for interparticle
       +  // vector, if it crosses a periodic boundary
       +  Float3 distmod = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  // Check whether x- and y boundaries are to be treated as periodic
       +  // 1: x- and y boundaries periodic
       +  // 2: x boundaries periodic
       +  if (devC_periodic == 1) {
       +
       +    // Periodic x-boundary
       +    if (targetCell.x < 0) {
       +      targetCell.x = devC_num[0] - 1;
       +      distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +    if (targetCell.x == devC_num[0]) {
       +      targetCell.x = 0;
       +      distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +
       +    // Periodic y-boundary
       +    if (targetCell.y < 0) {
       +      targetCell.y = devC_num[1] - 1;
       +      distmod += MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
       +    }
       +    if (targetCell.y == devC_num[1]) {
       +      targetCell.y = 0;
       +      distmod -= MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
       +    }
       +
       +  } else if (devC_periodic == 2) {
       +
       +    // Periodic x-boundary
       +    if (targetCell.x < 0) {
       +      targetCell.x = devC_num[0] - 1;
       +      distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +    if (targetCell.x == devC_num[0]) {
       +      targetCell.x = 0;
       +      distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +
       +    // Hande out-of grid cases on y-axis
       +    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       +      return;
       +
       +  } else {
       +
       +    // Hande out-of grid cases on x- and y-axes
       +    if (targetCell.x < 0 || targetCell.x == devC_num[0])
       +      return;
       +    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       +      return;
       +  }
       +
       +  // Handle out-of-grid cases on z-axis
       +  if (targetCell.z < 0 || targetCell.z == devC_num[2])
       +    return;
       +
       +
       +  //// Check and process particle-particle collisions
       +
       +  // Calculate linear cell ID
       +  unsigned int cellID = targetCell.x  
       +                            + __umul24(targetCell.y, devC_num[0]) 
       +                        + __umul24(__umul24(devC_num[0], 
       +                                                  devC_num[1]), 
       +                                    targetCell.z); 
       +
       +  // Lowest particle index in cell
       +  unsigned int startIdx = dev_cellStart[cellID];
       +
       +  // Make sure cell is not empty
       +  if (startIdx != 0xffffffff) {
       +
       +    // Highest particle index in cell + 1
       +    unsigned int endIdx = dev_cellEnd[cellID];
       +
       +    // Iterate over cell particles
       +    for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
       +      if (idx_b != idx_a) { // Do not collide particle with itself
       +
       +        // Fetch position and velocity of particle B.
       +        Float4 x_b      = dev_x_sorted[idx_b];
       +        Float  radius_b = dev_radius_sorted[idx_b];
       +        Float  kappa         = devC_kappa;
       +
       +        // Distance between particle centers (Float4 -> Float3)
       +        Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x, 
       +                                         x_a.y - x_b.y, 
       +                                     x_a.z - x_b.z);
       +
       +        // Adjust interparticle vector if periodic boundary/boundaries
       +        // are crossed
       +        x_ab += distmod;
       +
       +        Float x_ab_length = length(x_ab);
       +
       +        // Distance between particle perimeters
       +        Float delta_ab = x_ab_length - (radius_a + radius_b); 
       +
       +        // Check for particle overlap
       +        if (delta_ab < 0.0f) {
       +                  contactLinearViscous(N, T, es_dot, p, 
       +                                     idx_a, idx_b,
       +                               dev_vel_sorted, 
       +                               dev_angvel_sorted,
       +                               radius_a, radius_b, 
       +                               x_ab, x_ab_length,
       +                               delta_ab, kappa);
       +        } else if (delta_ab < devC_db) { 
       +          // Check wether particle distance satisfies the capillary bond distance
       +          capillaryCohesion_exp(N, radius_a, radius_b, delta_ab, 
       +                                      x_ab, x_ab_length, kappa);
       +        }
       +
       +        // Check wether particles are bonded together
       +        /*if (bonds.x == idx_b || bonds.y == idx_b ||
       +            bonds.z == idx_b || bonds.w == idx_b) {
       +          bondLinear(N, T, es_dot, p, 
       +                           idx_a, idx_b,
       +                        dev_x_sorted, dev_vel_sorted,
       +                     dev_angvel_sorted,
       +                     radius_a, radius_b,
       +                     x_ab, x_ab_length,
       +                     delta_ab);
       +        }*/
       +
       +      } // Do not collide particle with itself end
       +    } // Iterate over cell particles end
       +  } // Check wether cell is empty end
       +} // End of overlapsInCell(...)
       +
       +// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
       +// Write the indexes of the overlaps in array contacts[]
       +// Kernel executed on device, and callable from device only.
       +__device__ void findContactsInCell(int3 targetCell, 
       +                                       unsigned int idx_a, 
       +                                   Float4 x_a, Float radius_a,
       +                                   Float4* dev_x_sorted, 
       +                                   Float* dev_radius_sorted,
       +                                   unsigned int* dev_cellStart, 
       +                                   unsigned int* dev_cellEnd,
       +                                   unsigned int* dev_gridParticleIndex,
       +                                   int* nc,
       +                                   unsigned int* dev_contacts,
       +                                   Float4* dev_distmod)
       +{
       +  // Variable containing modifier for interparticle
       +  // vector, if it crosses a periodic boundary
       +  Float3 distmod = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +  // Check whether x- and y boundaries are to be treated as periodic
       +  // 1: x- and y boundaries periodic
       +  // 2: x boundaries periodic
       +  if (devC_periodic == 1) {
       +
       +    // Periodic x-boundary
       +    if (targetCell.x < 0) {
       +      targetCell.x = devC_num[0] - 1;
       +      distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +    if (targetCell.x == devC_num[0]) {
       +      targetCell.x = 0;
       +      distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +
       +    // Periodic y-boundary
       +    if (targetCell.y < 0) {
       +      targetCell.y = devC_num[1] - 1;
       +      distmod += MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
       +    }
       +    if (targetCell.y == devC_num[1]) {
       +      targetCell.y = 0;
       +      distmod -= MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
       +    }
       +
       +  } else if (devC_periodic == 2) {
       +
       +    // Periodic x-boundary
       +    if (targetCell.x < 0) {
       +      targetCell.x = devC_num[0] - 1;
       +      distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +    if (targetCell.x == devC_num[0]) {
       +      targetCell.x = 0;
       +      distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
       +    }
       +
       +    // Hande out-of grid cases on y-axis
       +    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       +      return;
       +
       +  } else {
       +
       +    // Hande out-of grid cases on x- and y-axes
       +    if (targetCell.x < 0 || targetCell.x == devC_num[0])
       +      return;
       +    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       +      return;
       +  }
       +
       +  // Handle out-of-grid cases on z-axis
       +  if (targetCell.z < 0 || targetCell.z == devC_num[2])
       +    return;
       +
       +
       +  //// Check and process particle-particle collisions
       +
       +  // Calculate linear cell ID
       +  unsigned int cellID = targetCell.x  
       +                            + __umul24(targetCell.y, devC_num[0]) 
       +                        + __umul24(__umul24(devC_num[0], 
       +                                                  devC_num[1]), 
       +                                    targetCell.z); 
       +
       +  // Lowest particle index in cell
       +  unsigned int startIdx = dev_cellStart[cellID];
       +
       +  // Make sure cell is not empty
       +  if (startIdx != 0xffffffff) {
       +
       +    // Highest particle index in cell + 1
       +    unsigned int endIdx = dev_cellEnd[cellID];
       +
       +    // Read the original index of particle A
       +    unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
       +
       +    // Iterate over cell particles
       +    for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
       +      if (idx_b != idx_a) { // Do not collide particle with itself
       +
       +        // Fetch position and radius of particle B.
       +        Float4 x_b      = dev_x_sorted[idx_b];
       +        Float  radius_b = dev_radius_sorted[idx_b];
       +
       +        // Read the original index of particle B
       +        unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
       +
       +        __syncthreads();
       +
       +        // Distance between particle centers (Float4 -> Float3)
       +        Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x, 
       +                                      x_a.y - x_b.y, 
       +                                  x_a.z - x_b.z);
       +
       +        // Adjust interparticle vector if periodic boundary/boundaries
       +        // are crossed
       +        x_ab += distmod;
       +
       +        Float x_ab_length = length(x_ab);
       +
       +        // Distance between particle perimeters
       +        Float delta_ab = x_ab_length - (radius_a + radius_b); 
       +
       +        // Check for particle overlap
       +        if (delta_ab < 0.0f) {
       +          
       +          // If the particle is not yet registered in the contact list,
       +          // use the next position in the array
       +          int cpos = *nc;
       +          unsigned int cidx;
       +
       +          // Find out, if particle is already registered in contact list
       +          for (int i=0; i<devC_nc; ++i) {
       +            __syncthreads();
       +            cidx = dev_contacts[(unsigned int)(idx_a_orig*devC_nc+i)];
       +            if (cidx == idx_b_orig)
       +              cpos = i;
       +          }
       +
       +          __syncthreads();
       +
       +          // Write the particle index to the relevant position,
       +          // no matter if it already is there or not (concurrency of write)
       +          dev_contacts[(unsigned int)(idx_a_orig*devC_nc+cpos)] = idx_b_orig;
       +
       +
       +          // Write the interparticle vector and radius of particle B
       +         //dev_x_ab_r_b[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_Float4(x_ab, radius_b);
       +          dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = MAKE_FLOAT4(distmod.x, distmod.y, distmod.z, radius_b);
       +          
       +          // Increment contact counter
       +          ++*nc;
       +        }
       +
       +        // Write the inter-particle position vector correction and radius of particle B
       +        //dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_Float4(distmod, radius_b);
       +
       +        // Check wether particles are bonded together
       +        /*if (bonds.x == idx_b || bonds.y == idx_b ||
       +            bonds.z == idx_b || bonds.w == idx_b) {
       +          bondLinear(N, T, es_dot, p, 
       +                           idx_a, idx_b,
       +                        dev_x_sorted, dev_vel_sorted,
       +                     dev_angvel_sorted,
       +                     radius_a, radius_b,
       +                     x_ab, x_ab_length,
       +                     delta_ab);
       +        }*/
       +
       +      } // Do not collide particle with itself end
       +    } // Iterate over cell particles end
       +  } // Check wether cell is empty end
       +} // End of findContactsInCell(...)
       +
       +
       +// For a single particle:
       +// Search for neighbors to particle 'idx' inside the 27 closest cells, 
       +// and save the contact pairs in global memory.
       +__global__ void topology(unsigned int* dev_cellStart, 
       +                             unsigned int* dev_cellEnd, // Input: Particles in cell 
       +                         unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
       +                         Float4* dev_x_sorted, Float* dev_radius_sorted, 
       +                         unsigned int* dev_contacts,
       +                         Float4* dev_distmod)
       +{
       +  // Thread index equals index of particle A
       +  unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       +  if (idx_a < devC_np) {
       +    // Fetch particle data in global read
       +    Float4 x_a      = dev_x_sorted[idx_a];
       +    Float  radius_a = dev_radius_sorted[idx_a];
       +
       +    // Count the number of contacts in this time step
       +    int nc = 0;
       +
       +    // Grid position of host and neighbor cells in uniform, cubic grid
       +    int3 gridPos;
       +    int3 targetPos;
       +
       +    // Calculate cell address in grid from position of particle
       +    gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
       +    gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
       +    gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
       +
       +    // Find overlaps between particle no. idx and all particles
       +    // from its own cell + 26 neighbor cells
       +    for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
       +      for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
       +        for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
       +          targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
       +          findContactsInCell(targetPos, idx_a, x_a, radius_a,
       +                                    dev_x_sorted, dev_radius_sorted,
       +                             dev_cellStart, dev_cellEnd,
       +                             dev_gridParticleIndex,
       +                                 &nc, dev_contacts, dev_distmod);
       +        }
       +      }
       +    }
       +  }
       +} // End of topology(...)
       +
       +
       +// For a single particle:
       +// Search for neighbors to particle 'idx' inside the 27 closest cells, 
       +// and compute the resulting normal- and shear force on it.
       +// Collide with top- and bottom walls, save resulting force on upper wall.
       +// Kernel is executed on device, and is callable from host only.
       +__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
       +                         unsigned int* dev_cellStart,
       +                         unsigned int* dev_cellEnd,
       +                         Float4* dev_x, Float* dev_radius,
       +                             Float4* dev_x_sorted, Float* dev_radius_sorted, 
       +                         Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
       +                         Float4* dev_vel, Float4* dev_angvel,
       +                         Float4* dev_force, Float4* dev_torque,
       +                         Float* dev_es_dot, Float* dev_es, Float* dev_p,
       +                         Float4* dev_w_nx, Float4* dev_w_mvfd, 
       +                         Float* dev_w_force, //uint4* dev_bonds_sorted,
       +                         unsigned int* dev_contacts, 
       +                         Float4* dev_distmod,
       +                         Float4* dev_delta_t)
       +{
       +  // Thread index equals index of particle A
       +  unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       +
       +  if (idx_a < devC_np) {
       +
       +    // Fetch particle data in global read
       +    unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
       +    Float4 x_a      = dev_x_sorted[idx_a];
       +    Float  radius_a = dev_radius_sorted[idx_a];
       +
       +    // Fetch wall data in global read
       +    Float4 w_up_nx   = dev_w_nx[0];
       +    Float4 w_up_mvfd = dev_w_mvfd[0];
       +
       +    // Fetch world dimensions in constant memory read
       +    Float3 origo = MAKE_FLOAT3(devC_origo[0], 
       +                               devC_origo[1], 
       +                               devC_origo[2]); 
       +    Float3 L = MAKE_FLOAT3(devC_L[0], 
       +                           devC_L[1], 
       +                           devC_L[2]);
       +
       +    // Index of particle which is bonded to particle A.
       +    // The index is equal to the particle no (p.np)
       +    // if particle A is bond-less.
       +    //uint4 bonds = dev_bonds_sorted[idx_a];
       +
       +    // Initiate shear friction loss rate at 0.0
       +    Float es_dot = 0.0f; 
       +
       +    // Initiate pressure on particle at 0.0
       +    Float p = 0.0f;
       +
       +    // Allocate memory for temporal force/torque vector values
       +    Float3 N = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +    Float3 T = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       +
       +    // Apply linear elastic, frictional contact model to registered contacts
       +    if (devC_shearmodel == 2) {
       +      unsigned int idx_b_orig, mempos;
       +      Float delta_n, x_ab_length, radius_b;
       +      Float3 x_ab;
       +      Float4 x_b, distmod;
       +      Float4 vel_a     = dev_vel_sorted[idx_a];
       +      Float4 angvel4_a = dev_angvel_sorted[idx_a];
       +      Float3 angvel_a  = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
       +
       +      // Loop over all possible contacts, and remove contacts
       +      // that no longer are valid (delta_n > 0.0)
       +      for (int i = 0; i<(int)devC_nc; ++i) {
       +        mempos = (unsigned int)(idx_a_orig * devC_nc + i);
       +        __syncthreads();
       +        idx_b_orig = dev_contacts[mempos];
       +        distmod    = dev_distmod[mempos];
       +        x_b        = dev_x[idx_b_orig];
       +        //radius_b   = dev_radius[idx_b_orig];
       +        radius_b   = distmod.w;
       +
       +        // Inter-particle vector, corrected for periodic boundaries
       +        x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x,
       +                                  x_a.y - x_b.y + distmod.y,
       +                              x_a.z - x_b.z + distmod.z);
       +
       +        x_ab_length = length(x_ab);
       +        delta_n = x_ab_length - (radius_a + radius_b);
       +
       +
       +        if (idx_b_orig != (unsigned int)devC_np) {
       +
       +          // Process collision if the particles are overlapping
       +          if (delta_n < 0.0f) {
       +            //cuPrintf("\nProcessing contact, idx_a_orig = %u, idx_b_orig = %u, contact = %d, delta_n = %f\n",
       +            //  idx_a_orig, idx_b_orig, i, delta_n);
       +            contactLinear(&N, &T, &es_dot, &p, 
       +                          idx_a_orig,
       +                          idx_b_orig,
       +                          vel_a,
       +                          dev_vel,
       +                          angvel_a,
       +                          dev_angvel,
       +                          radius_a, radius_b, 
       +                          x_ab, x_ab_length,
       +                          delta_n, dev_delta_t, 
       +                          mempos);
       +          } else {
       +            __syncthreads();
       +            // Remove this contact (there is no particle with index=np)
       +            dev_contacts[mempos] = devC_np; 
       +            // Zero sum of shear displacement in this position
       +            dev_delta_t[mempos]  = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       +          }
       +        } else {
       +          __syncthreads();
       +          dev_delta_t[mempos] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       +        }
       +      } // Contact loop end
       +
       +    } else if (devC_shearmodel == 1) {
       +
       +      int3 gridPos;
       +      int3 targetPos;
       +
       +      // Calculate address in grid from position
       +      gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
       +      gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
       +      gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
       +
       +      // Find overlaps between particle no. idx and all particles
       +      // from its own cell + 26 neighbor cells.
       +      // Calculate resulting normal- and shear-force components and
       +      // torque for the particle on the base of contactLinearViscous()
       +      for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
       +        for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
       +          for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
       +            targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
       +            overlapsInCell(targetPos, idx_a, x_a, radius_a,
       +                           &N, &T, &es_dot, &p,
       +                           dev_x_sorted, dev_radius_sorted, 
       +                           dev_vel_sorted, dev_angvel_sorted,
       +                           dev_cellStart, dev_cellEnd,
       +                           dev_w_nx, dev_w_mvfd);
       +          }
       +        }
       +      }
       +
       +    }
       +
       +    //// Interact with walls
       +    Float delta_w; // Overlap distance
       +    Float3 w_n;    // Wall surface normal
       +    Float w_force = 0.0f; // Force on wall from particle A
       +
       +    // Upper wall (idx 0)
       +    delta_w = w_up_nx.w - (x_a.z + radius_a);
       +    w_n = MAKE_FLOAT3(0.0f, 0.0f, -1.0f);
       +    if (delta_w < 0.0f) {
       +      w_force = contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                     dev_vel_sorted, dev_angvel_sorted,
       +                                   w_n, delta_w, w_up_mvfd.y);
       +    }
       +
       +    // Lower wall (force on wall not stored)
       +    delta_w = x_a.z - radius_a - origo.z;
       +    w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f);
       +    if (delta_w < 0.0f) {
       +      (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                 dev_vel_sorted, dev_angvel_sorted,
       +                                 w_n, delta_w, 0.0f);
       +    }
       +
       +
       +    if (devC_periodic == 0) {
       +
       +      // Left wall
       +      delta_w = x_a.x - radius_a - origo.x;
       +      w_n = MAKE_FLOAT3(1.0f, 0.0f, 0.0f);
       +      if (delta_w < 0.0f) {
       +        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                     dev_vel_sorted, dev_angvel_sorted,
       +                                 w_n, delta_w, 0.0f);
       +      }
       +
       +      // Right wall
       +      delta_w = L.x - (x_a.x + radius_a);
       +      w_n = MAKE_FLOAT3(-1.0f, 0.0f, 0.0f);
       +      if (delta_w < 0.0f) {
       +        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                     dev_vel_sorted, dev_angvel_sorted,
       +                                 w_n, delta_w, 0.0f);
       +      }
       +
       +      // Front wall
       +      delta_w = x_a.y - radius_a - origo.y;
       +      w_n = MAKE_FLOAT3(0.0f, 1.0f, 0.0f);
       +      if (delta_w < 0.0f) {
       +        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                     dev_vel_sorted, dev_angvel_sorted,
       +                                 w_n, delta_w, 0.0f);
       +      }
       +
       +      // Back wall
       +      delta_w = L.y - (x_a.y + radius_a);
       +      w_n = MAKE_FLOAT3(0.0f, -1.0f, 0.0f);
       +      if (delta_w < 0.0f) {
       +        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                     dev_vel_sorted, dev_angvel_sorted,
       +                                 w_n, delta_w, 0.0f);
       +      }
       +
       +    } else if (devC_periodic == 2) {
       +
       +      // Front wall
       +      delta_w = x_a.y - radius_a - origo.y;
       +      w_n = MAKE_FLOAT3(0.0f, 1.0f, 0.0f);
       +      if (delta_w < 0.0f) {
       +        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                     dev_vel_sorted, dev_angvel_sorted,
       +                                 w_n, delta_w, 0.0f);
       +      }
       +
       +      // Back wall
       +      delta_w = L.y - (x_a.y + radius_a);
       +      w_n = MAKE_FLOAT3(0.0f, -1.0f, 0.0f);
       +      if (delta_w < 0.0f) {
       +        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       +                                     dev_vel_sorted, dev_angvel_sorted,
       +                                 w_n, delta_w, 0.0f);
       +      }
       +    }
       +
       +
       +    // Hold threads for coalesced write
       +    __syncthreads();
       +
       +    // Write force to unsorted position
       +    unsigned int orig_idx = dev_gridParticleIndex[idx_a];
       +    dev_force[orig_idx]   = MAKE_FLOAT4(N.x, N.y, N.z, 0.0f);
       +    dev_torque[orig_idx]  = MAKE_FLOAT4(T.x, T.y, T.z, 0.0f);
       +    dev_es_dot[orig_idx]  = es_dot;
       +    dev_es[orig_idx]     += es_dot * devC_dt;
       +    dev_p[orig_idx]       = p;
       +    dev_w_force[orig_idx] = w_force;
       +  }
       +} // End of interact(...)
       +
       +
       +#endif
   DIR diff --git a/src/integration.cuh b/src/integration.cuh
       t@@ -0,0 +1,207 @@
       +#ifndef INTEGRATION_CUH_
       +#define INTEGRATION_CUH_
       +
       +// integration.cuh
       +// Functions responsible for temporal integration
       +
       +
       +// Second order integration scheme based on Taylor expansion of particle kinematics. 
       +// Kernel executed on device, and callable from host only.
       +__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
       +                          Float4* dev_angvel_sorted, Float* dev_radius_sorted, // Input
       +                          Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
       +                          Float4* dev_force, Float4* dev_torque, // Input
       +                          unsigned int* dev_gridParticleIndex) // Input: Sorted-Unsorted key
       +{
       +  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
       +
       +  if (idx < devC_np) { // Condition prevents block size error
       +
       +    // Copy data to temporary arrays to avoid any potential read-after-write, 
       +    // write-after-read, or write-after-write hazards. 
       +    unsigned int orig_idx = dev_gridParticleIndex[idx];
       +    Float4 force  = dev_force[orig_idx];
       +    Float4 torque = dev_torque[orig_idx];
       +
       +    // Initialize acceleration vectors to zero
       +    Float4 acc    = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       +    Float4 angacc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       +
       +    // Fetch particle position and velocity values from global read
       +    Float4 x      = dev_x_sorted[idx];
       +    Float4 vel    = dev_vel_sorted[idx];
       +    Float4 angvel = dev_angvel_sorted[idx];
       +    Float  radius = dev_radius_sorted[idx];
       +
       +    // Coherent read from constant memory to registers
       +    Float  dt    = devC_dt;
       +    Float3 origo = MAKE_FLOAT3(devC_origo[0], devC_origo[1], devC_origo[2]); 
       +    Float3 L     = MAKE_FLOAT3(devC_L[0], devC_L[1], devC_L[2]);
       +    Float  rho   = devC_rho;
       +
       +    // Particle mass
       +    Float m = 4.0f/3.0f * PI * radius*radius*radius * rho;
       +
       +    // Update linear acceleration of particle
       +    acc.x = force.x / m;
       +    acc.y = force.y / m;
       +    acc.z = force.z / m;
       +
       +    // Update angular acceleration of particle 
       +    // (angacc = (total moment)/Intertia, intertia = 2/5*m*r^2)
       +    angacc.x = torque.x * 1.0f / (2.0f/5.0f * m * radius*radius);
       +    angacc.y = torque.y * 1.0f / (2.0f/5.0f * m * radius*radius);
       +    angacc.z = torque.z * 1.0f / (2.0f/5.0f * m * radius*radius);
       +
       +    // Add gravity
       +    acc.x += devC_g[0];
       +    acc.y += devC_g[1];
       +    acc.z += devC_g[2];
       +
       +    // Check if particle has a fixed horizontal velocity
       +    if (vel.w > 0.0f) {
       +
       +      // Zero horizontal acceleration and disable
       +      // gravity to counteract segregation.
       +      // Particles may move in the z-dimension,
       +      // to allow for dilation.
       +      acc.x = 0.0f;
       +      acc.y = 0.0f;
       +      acc.z -= devC_g[2];
       +
       +      // Zero the angular acceleration
       +      angacc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       +    } 
       +
       +    // Update angular velocity
       +    angvel.x += angacc.x * dt;
       +    angvel.y += angacc.y * dt;
       +    angvel.z += angacc.z * dt;
       +
       +    // Update linear velocity
       +    vel.x += acc.x * dt;
       +    vel.y += acc.y * dt;
       +    vel.z += acc.z * dt;
       +
       +    // Update position. First-order Euler's scheme:
       +    //x.x += vel.x * dt;
       +    //x.y += vel.y * dt;
       +    //x.z += vel.z * dt;
       +
       +    // Update position. Second-order scheme based on Taylor expansion 
       +    // (greater accuracy than the first-order Euler's scheme)
       +    x.x += vel.x * dt + (acc.x * dt*dt)/2.0f;
       +    x.y += vel.y * dt + (acc.y * dt*dt)/2.0f;
       +    x.z += vel.z * dt + (acc.z * dt*dt)/2.0f;
       +
       +    // Add x-displacement for this time step to 
       +    // sum of x-displacements
       +    x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
       +
       +    // Move particle across boundary if it is periodic
       +    if (devC_periodic == 1) {
       +      if (x.x < origo.x)
       +        x.x += L.x;
       +      if (x.x > L.x)
       +        x.x -= L.x;
       +      if (x.y < origo.y)
       +        x.y += L.y;
       +      if (x.y > L.y)
       +        x.y -= L.y;
       +    } else if (devC_periodic == 2) {
       +      if (x.x < origo.x)
       +        x.x += L.x;
       +      if (x.x > L.x)
       +        x.x -= L.x;
       +    }
       +
       +    // Hold threads for coalesced write
       +    __syncthreads();
       +
       +    // Store data in global memory at original, pre-sort positions
       +    dev_angvel[orig_idx] = angvel;
       +    dev_vel[orig_idx]    = vel;
       +    dev_x[orig_idx]      = x;
       +  } 
       +} // End of integrate(...)
       +
       +
       +// Reduce wall force contributions from particles to a single value per wall
       +__global__ void summation(Float* in, Float *out)
       +{
       +  __shared__ Float cache[256];
       +  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
       +  unsigned int cacheIdx = threadIdx.x;
       +
       +  Float temp = 0.0f;
       +  while (idx < devC_np) {
       +    temp += in[idx];
       +    idx += blockDim.x * gridDim.x;
       +  }
       +
       +  // Set the cache values
       +  cache[cacheIdx] = temp;
       +
       +  __syncthreads();
       +
       +  // For reductions, threadsPerBlock must be a power of two
       +  // because of the following code
       +  unsigned int i = blockDim.x/2;
       +  while (i != 0) {
       +    if (cacheIdx < i)
       +      cache[cacheIdx] += cache[cacheIdx + i];
       +    __syncthreads();
       +    i /= 2;
       +  }
       +
       +  // Write sum for block to global memory
       +  if (cacheIdx == 0)
       +    out[blockIdx.x] = cache[0];
       +}
       +
       +// Update wall positions
       +__global__ void integrateWalls(Float4* dev_w_nx, 
       +                                   Float4* dev_w_mvfd,
       +                               Float* dev_w_force_partial,
       +                               unsigned int blocksPerGrid)
       +{
       +  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
       +
       +  if (idx < devC_nw) { // Condition prevents block size error
       +
       +    // Copy data to temporary arrays to avoid any potential read-after-write, 
       +    // write-after-read, or write-after-write hazards. 
       +    Float4 w_nx   = dev_w_nx[idx];
       +    Float4 w_mvfd = dev_w_mvfd[idx];
       +
       +    // Find the final sum of forces on wall
       +    w_mvfd.z = 0.0f;
       +    for (int i=0; i<blocksPerGrid; ++i) {
       +      w_mvfd.z += dev_w_force_partial[i];
       +    }
       +
       +    Float dt = devC_dt;
       +
       +    // Normal load = Deviatoric stress times wall surface area,
       +    // directed downwards.
       +    Float N = -w_mvfd.w*devC_L[0]*devC_L[1];
       +
       +    // Calculate resulting acceleration of wall
       +    // (Wall mass is stored in w component of position Float4)
       +    Float acc = (w_mvfd.z+N)/w_mvfd.x;
       +
       +    // Update linear velocity
       +    w_mvfd.y += acc * dt;
       +
       +    // Update position. Second-order scheme based on Taylor expansion 
       +    // (greater accuracy than the first-order Euler's scheme)
       +    w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
       +
       +    // Store data in global memory
       +    dev_w_nx[idx]   = w_nx;
       +    dev_w_mvfd[idx] = w_mvfd;
       +  }
       +} // End of integrateWalls(...)
       +
       +
       +#endif
   DIR diff --git a/src/sorting.cuh b/src/sorting.cuh
       t@@ -0,0 +1,127 @@
       +// Returns the cellID containing the particle, based cubic grid
       +// See Bayraktar et al. 2009
       +// Kernel is executed on the device, and is callable from the device only
       +__device__ int calcCellID(Float3 x) 
       +{ 
       +  unsigned int i_x, i_y, i_z;
       +
       +  // Calculate integral coordinates:
       +  i_x = floor((x.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
       +  i_y = floor((x.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
       +  i_z = floor((x.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
       +
       +  // Integral coordinates are converted to 1D coordinate:
       +  return __umul24(__umul24(i_z, devC_num[1]),
       +                        devC_num[0]) 
       +             + __umul24(i_y, devC_num[0]) + i_x;
       +
       +} // End of calcCellID(...)
       +
       +
       +// Calculate hash value for each particle, based on position in grid.
       +// Kernel executed on device, and callable from host only.
       +__global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID, 
       +                                       unsigned int* dev_gridParticleIndex, 
       +                                   Float4* dev_x) 
       +{
       +  //unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
       +  unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       +
       +  if (idx < devC_np) { // Condition prevents block size error
       +
       +    //volatile Float4 x = dev_x[idx]; // Ensure coalesced read
       +    Float4 x = dev_x[idx]; // Ensure coalesced read
       +
       +    unsigned int cellID = calcCellID(MAKE_FLOAT3(x.x, x.y, x.z));
       +
       +    // Store values    
       +    dev_gridParticleCellID[idx] = cellID;
       +    dev_gridParticleIndex[idx]  = idx;
       +
       +  }
       +} // End of calcParticleCellID(...)
       +
       +
       +// Reorder particle data into sorted order, and find the start and end particle indexes
       +// of each cell in the sorted hash array.
       +// Kernel executed on device, and callable from host only.
       +__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cellEnd,
       +                              unsigned int* dev_gridParticleCellID, 
       +                              unsigned int* dev_gridParticleIndex,
       +                              Float4* dev_x, Float4* dev_vel, 
       +                              Float4* dev_angvel, Float* dev_radius,
       +                              //uint4* dev_bonds,
       +                              Float4* dev_x_sorted, Float4* dev_vel_sorted,
       +                              Float4* dev_angvel_sorted, Float* dev_radius_sorted)
       +                              //uint4* dev_bonds_sorted)
       +{ 
       +
       +  // Create hash array in shared on-chip memory. The size of the array 
       +  // (threadsPerBlock + 1) is determined at launch time (extern notation).
       +  extern __shared__ unsigned int shared_data[]; 
       +
       +  // Thread index in block
       +  unsigned int tidx = threadIdx.x;
       +
       +  // Thread index in grid
       +  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; 
       +  //unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       +
       +  // CellID hash value of particle idx
       +  unsigned int cellID;
       +
       +  // Read cellID data and store it in shared memory (shared_data)
       +  if (idx < devC_np) { // Condition prevents block size error
       +    cellID = dev_gridParticleCellID[idx];
       +
       +    // Load hash data into shared memory, allowing access to neighbor particle cellID values
       +    shared_data[tidx+1] = cellID; 
       +
       +    if (idx > 0 && tidx == 0) {
       +      // First thread in block must load neighbor particle hash
       +      shared_data[0] = dev_gridParticleCellID[idx-1];
       +    }
       +  }
       +
       +  // Pause completed threads in this block, until all 
       +  // threads are done loading data into shared memory
       +  __syncthreads();
       +
       +  // Find lowest and highest particle index in each cell
       +  if (idx < devC_np) { // Condition prevents block size error
       +    // If this particle has a different cell index to the previous particle, it's the first
       +    // particle in the cell -> Store the index of this particle in the cell.
       +    // The previous particle must be the last particle in the previous cell.
       +    if (idx == 0 || cellID != shared_data[tidx]) {
       +      dev_cellStart[cellID] = idx;
       +      if (idx > 0)
       +        dev_cellEnd[shared_data[tidx]] = idx;
       +    }
       +
       +    // Check wether the thread is the last one
       +    if (idx == (devC_np - 1)) 
       +      dev_cellEnd[cellID] = idx + 1;
       +
       +
       +    // Use the sorted index to reorder the position and velocity data
       +    unsigned int sortedIndex = dev_gridParticleIndex[idx];
       +
       +    // Fetch from global read
       +    Float4 x      = dev_x[sortedIndex];
       +    Float4 vel    = dev_vel[sortedIndex];
       +    Float4 angvel = dev_angvel[sortedIndex];
       +    Float  radius = dev_radius[sortedIndex];
       +    //uint4  bonds  = dev_bonds[sortedIndex];
       +
       +    __syncthreads();
       +    // Write sorted data to global memory
       +    dev_x_sorted[idx]      = x;
       +    dev_vel_sorted[idx]    = vel;
       +    dev_angvel_sorted[idx] = angvel;
       +    dev_radius_sorted[idx] = radius;
       +    //dev_bonds_sorted[idx]  = bonds;
       +  }
       +} // End of reorderArrays(...)
       +
       +
       +
   DIR diff --git a/src/vector_arithmetic.h b/src/vector_arithmetic.h
       t@@ -0,0 +1,994 @@
       +#ifndef VECTOR_ARITHMETIC_H_
       +#define VECTOR_ARITHMETIC_H_
       +
       +#include "cuda_runtime.h"
       +#include "datatypes.h"
       +
       +typedef unsigned int uint;
       +typedef unsigned short ushort;
       +
       +#ifndef __CUDACC__
       +#include <math.h>
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// host implementations of CUDA functions
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline Float fminf(Float a, Float b)
       +{
       +  return a < b ? a : b;
       +}
       +
       +inline Float fmaxf(Float a, Float b)
       +{
       +  return a > b ? a : b;
       +}
       +
       +inline int max(int a, int b)
       +{
       +  return a > b ? a : b;
       +}
       +
       +inline int min(int a, int b)
       +{
       +  return a < b ? a : b;
       +}
       +
       +inline Float rsqrtf(Float x)
       +{
       +    return 1.0f / sqrtf(x);
       +}
       +#endif
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// constructors
       +////////////////////////////////////////////////////////////////////////////////
       +/*inline __host__ __device__ Float3 MAKE_FLOAT3(Float s)
       +{
       +    return MAKE_FLOAT3(s, s, s);
       +}
       +inline __host__ __device__ Float3 MAKE_FLOAT3(Float4 a)
       +{
       +    return MAKE_FLOAT3(a.x, a.y, a.z);
       +}
       +inline __host__ __device__ Float3 MAKE_FLOAT3(int3 a)
       +{
       +    return MAKE_FLOAT3(Float(a.x), Float(a.y), Float(a.z));
       +}
       +inline __host__ __device__ Float3 MAKE_FLOAT3(uint3 a)
       +{
       +    return MAKE_FLOAT3(Float(a.x), Float(a.y), Float(a.z));
       +}
       +
       +inline __host__ __device__ Float4 MAKE_FLOAT4(Float s)
       +{
       +    return MAKE_FLOAT4(s, s, s, s);
       +}
       +inline __host__ __device__ Float4 MAKE_FLOAT4(Float3 a)
       +{
       +    return MAKE_FLOAT4(a.x, a.y, a.z, 0.0f);
       +}
       +inline __host__ __device__ Float4 MAKE_FLOAT4(Float3 a, Float w)
       +{
       +    return MAKE_FLOAT4(a.x, a.y, a.z, w);
       +}
       +inline __host__ __device__ Float4 MAKE_FLOAT4(int4 a)
       +{
       +    return MAKE_FLOAT4(Float(a.x), Float(a.y), Float(a.z), Float(a.w));
       +}
       +inline __host__ __device__ Float4 MAKE_FLOAT4(uint4 a)
       +{
       +    return MAKE_FLOAT4(Float(a.x), Float(a.y), Float(a.z), Float(a.w));
       +}
       +*/
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// negate
       +////////////////////////////////////////////////////////////////////////////////
       +inline __host__ __device__ Float3 operator-(Float3 &a)
       +{
       +    return MAKE_FLOAT3(-a.x, -a.y, -a.z);
       +}
       +inline __host__ __device__ Float4 operator-(Float4 &a)
       +{
       +    return MAKE_FLOAT4(-a.x, -a.y, -a.z, -a.w);
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// addition
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ int2 operator+(int2 a, int2 b)
       +{
       +    return make_int2(a.x + b.x, a.y + b.y);
       +}
       +inline __host__ __device__ void operator+=(int2 &a, int2 b)
       +{
       +    a.x += b.x; a.y += b.y;
       +}
       +inline __host__ __device__ int2 operator+(int2 a, int b)
       +{
       +    return make_int2(a.x + b, a.y + b);
       +}
       +inline __host__ __device__ int2 operator+(int b, int2 a)
       +{
       +    return make_int2(a.x + b, a.y + b);
       +}
       +inline __host__ __device__ void operator+=(int2 &a, int b)
       +{
       +    a.x += b; a.y += b;
       +}
       +
       +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
       +{
       +    return make_uint2(a.x + b.x, a.y + b.y);
       +}
       +inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
       +{
       +    a.x += b.x; a.y += b.y;
       +}
       +inline __host__ __device__ uint2 operator+(uint2 a, uint b)
       +{
       +    return make_uint2(a.x + b, a.y + b);
       +}
       +inline __host__ __device__ uint2 operator+(uint b, uint2 a)
       +{
       +    return make_uint2(a.x + b, a.y + b);
       +}
       +inline __host__ __device__ void operator+=(uint2 &a, uint b)
       +{
       +    a.x += b; a.y += b;
       +}
       +
       +
       +inline __host__ __device__ Float3 operator+(Float3 a, Float3 b)
       +{
       +    return MAKE_FLOAT3(a.x + b.x, a.y + b.y, a.z + b.z);
       +}
       +inline __host__ __device__ void operator+=(Float3 &a, Float3 b)
       +{
       +    a.x += b.x; a.y += b.y; a.z += b.z;
       +}
       +inline __host__ __device__ Float3 operator+(Float3 a, Float b)
       +{
       +    return MAKE_FLOAT3(a.x + b, a.y + b, a.z + b);
       +}
       +inline __host__ __device__ void operator+=(Float3 &a, Float b)
       +{
       +    a.x += b; a.y += b; a.z += b;
       +}
       +
       +inline __host__ __device__ int3 operator+(int3 a, int3 b)
       +{
       +    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
       +}
       +inline __host__ __device__ void operator+=(int3 &a, int3 b)
       +{
       +    a.x += b.x; a.y += b.y; a.z += b.z;
       +}
       +inline __host__ __device__ int3 operator+(int3 a, int b)
       +{
       +    return make_int3(a.x + b, a.y + b, a.z + b);
       +}
       +inline __host__ __device__ void operator+=(int3 &a, int b)
       +{
       +    a.x += b; a.y += b; a.z += b;
       +}
       +
       +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
       +{
       +    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
       +}
       +inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
       +{
       +    a.x += b.x; a.y += b.y; a.z += b.z;
       +}
       +inline __host__ __device__ uint3 operator+(uint3 a, uint b)
       +{
       +    return make_uint3(a.x + b, a.y + b, a.z + b);
       +}
       +inline __host__ __device__ void operator+=(uint3 &a, uint b)
       +{
       +    a.x += b; a.y += b; a.z += b;
       +}
       +
       +inline __host__ __device__ int3 operator+(int b, int3 a)
       +{
       +    return make_int3(a.x + b, a.y + b, a.z + b);
       +}
       +inline __host__ __device__ uint3 operator+(uint b, uint3 a)
       +{
       +    return make_uint3(a.x + b, a.y + b, a.z + b);
       +}
       +inline __host__ __device__ Float3 operator+(Float b, Float3 a)
       +{
       +    return MAKE_FLOAT3(a.x + b, a.y + b, a.z + b);
       +}
       +
       +inline __host__ __device__ Float4 operator+(Float4 a, Float4 b)
       +{
       +    return MAKE_FLOAT4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
       +}
       +inline __host__ __device__ void operator+=(Float4 &a, Float4 b)
       +{
       +    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
       +}
       +inline __host__ __device__ Float4 operator+(Float4 a, Float b)
       +{
       +    return MAKE_FLOAT4(a.x + b, a.y + b, a.z + b, a.w + b);
       +}
       +inline __host__ __device__ Float4 operator+(Float b, Float4 a)
       +{
       +    return MAKE_FLOAT4(a.x + b, a.y + b, a.z + b, a.w + b);
       +}
       +inline __host__ __device__ void operator+=(Float4 &a, Float b)
       +{
       +    a.x += b; a.y += b; a.z += b; a.w += b;
       +}
       +
       +inline __host__ __device__ int4 operator+(int4 a, int4 b)
       +{
       +    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
       +}
       +inline __host__ __device__ void operator+=(int4 &a, int4 b)
       +{
       +    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
       +}
       +inline __host__ __device__ int4 operator+(int4 a, int b)
       +{
       +    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
       +}
       +inline __host__ __device__ int4 operator+(int b, int4 a)
       +{
       +    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
       +}
       +inline __host__ __device__ void operator+=(int4 &a, int b)
       +{
       +    a.x += b; a.y += b; a.z += b; a.w += b;
       +}
       +
       +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
       +{
       +    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
       +}
       +inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
       +{
       +    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
       +}
       +inline __host__ __device__ uint4 operator+(uint4 a, uint b)
       +{
       +    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
       +}
       +inline __host__ __device__ uint4 operator+(uint b, uint4 a)
       +{
       +    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
       +}
       +inline __host__ __device__ void operator+=(uint4 &a, uint b)
       +{
       +    a.x += b; a.y += b; a.z += b; a.w += b;
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// subtract
       +////////////////////////////////////////////////////////////////////////////////
       +
       +
       +inline __host__ __device__ int2 operator-(int2 a, int2 b)
       +{
       +    return make_int2(a.x - b.x, a.y - b.y);
       +}
       +inline __host__ __device__ void operator-=(int2 &a, int2 b)
       +{
       +    a.x -= b.x; a.y -= b.y;
       +}
       +inline __host__ __device__ int2 operator-(int2 a, int b)
       +{
       +    return make_int2(a.x - b, a.y - b);
       +}
       +inline __host__ __device__ int2 operator-(int b, int2 a)
       +{
       +    return make_int2(b - a.x, b - a.y);
       +}
       +inline __host__ __device__ void operator-=(int2 &a, int b)
       +{
       +    a.x -= b; a.y -= b;
       +}
       +
       +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
       +{
       +    return make_uint2(a.x - b.x, a.y - b.y);
       +}
       +inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
       +{
       +    a.x -= b.x; a.y -= b.y;
       +}
       +inline __host__ __device__ uint2 operator-(uint2 a, uint b)
       +{
       +    return make_uint2(a.x - b, a.y - b);
       +}
       +inline __host__ __device__ uint2 operator-(uint b, uint2 a)
       +{
       +    return make_uint2(b - a.x, b - a.y);
       +}
       +inline __host__ __device__ void operator-=(uint2 &a, uint b)
       +{
       +    a.x -= b; a.y -= b;
       +}
       +
       +inline __host__ __device__ Float3 operator-(Float3 a, Float3 b)
       +{
       +    return MAKE_FLOAT3(a.x - b.x, a.y - b.y, a.z - b.z);
       +}
       +inline __host__ __device__ void operator-=(Float3 &a, Float3 b)
       +{
       +    a.x -= b.x; a.y -= b.y; a.z -= b.z;
       +}
       +inline __host__ __device__ Float3 operator-(Float3 a, Float b)
       +{
       +    return MAKE_FLOAT3(a.x - b, a.y - b, a.z - b);
       +}
       +inline __host__ __device__ Float3 operator-(Float b, Float3 a)
       +{
       +    return MAKE_FLOAT3(b - a.x, b - a.y, b - a.z);
       +}
       +inline __host__ __device__ void operator-=(Float3 &a, Float b)
       +{
       +    a.x -= b; a.y -= b; a.z -= b;
       +}
       +
       +inline __host__ __device__ int3 operator-(int3 a, int3 b)
       +{
       +    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
       +}
       +inline __host__ __device__ void operator-=(int3 &a, int3 b)
       +{
       +    a.x -= b.x; a.y -= b.y; a.z -= b.z;
       +}
       +inline __host__ __device__ int3 operator-(int3 a, int b)
       +{
       +    return make_int3(a.x - b, a.y - b, a.z - b);
       +}
       +inline __host__ __device__ int3 operator-(int b, int3 a)
       +{
       +    return make_int3(b - a.x, b - a.y, b - a.z);
       +}
       +inline __host__ __device__ void operator-=(int3 &a, int b)
       +{
       +    a.x -= b; a.y -= b; a.z -= b;
       +}
       +
       +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
       +{
       +    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
       +}
       +inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
       +{
       +    a.x -= b.x; a.y -= b.y; a.z -= b.z;
       +}
       +inline __host__ __device__ uint3 operator-(uint3 a, uint b)
       +{
       +    return make_uint3(a.x - b, a.y - b, a.z - b);
       +}
       +inline __host__ __device__ uint3 operator-(uint b, uint3 a)
       +{
       +    return make_uint3(b - a.x, b - a.y, b - a.z);
       +}
       +inline __host__ __device__ void operator-=(uint3 &a, uint b)
       +{
       +    a.x -= b; a.y -= b; a.z -= b;
       +}
       +
       +inline __host__ __device__ Float4 operator-(Float4 a, Float4 b)
       +{
       +    return MAKE_FLOAT4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
       +}
       +inline __host__ __device__ void operator-=(Float4 &a, Float4 b)
       +{
       +    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
       +}
       +inline __host__ __device__ Float4 operator-(Float4 a, Float b)
       +{
       +    return MAKE_FLOAT4(a.x - b, a.y - b, a.z - b,  a.w - b);
       +}
       +inline __host__ __device__ void operator-=(Float4 &a, Float b)
       +{
       +    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
       +}
       +
       +inline __host__ __device__ int4 operator-(int4 a, int4 b)
       +{
       +    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
       +}
       +inline __host__ __device__ void operator-=(int4 &a, int4 b)
       +{
       +    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
       +}
       +inline __host__ __device__ int4 operator-(int4 a, int b)
       +{
       +    return make_int4(a.x - b, a.y - b, a.z - b,  a.w - b);
       +}
       +inline __host__ __device__ int4 operator-(int b, int4 a)
       +{
       +    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
       +}
       +inline __host__ __device__ void operator-=(int4 &a, int b)
       +{
       +    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
       +}
       +
       +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
       +{
       +    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
       +}
       +inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
       +{
       +    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
       +}
       +inline __host__ __device__ uint4 operator-(uint4 a, uint b)
       +{
       +    return make_uint4(a.x - b, a.y - b, a.z - b,  a.w - b);
       +}
       +inline __host__ __device__ uint4 operator-(uint b, uint4 a)
       +{
       +    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
       +}
       +inline __host__ __device__ void operator-=(uint4 &a, uint b)
       +{
       +    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// multiply
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ int2 operator*(int2 a, int2 b)
       +{
       +    return make_int2(a.x * b.x, a.y * b.y);
       +}
       +inline __host__ __device__ void operator*=(int2 &a, int2 b)
       +{
       +    a.x *= b.x; a.y *= b.y;
       +}
       +inline __host__ __device__ int2 operator*(int2 a, int b)
       +{
       +    return make_int2(a.x * b, a.y * b);
       +}
       +inline __host__ __device__ int2 operator*(int b, int2 a)
       +{
       +    return make_int2(b * a.x, b * a.y);
       +}
       +inline __host__ __device__ void operator*=(int2 &a, int b)
       +{
       +    a.x *= b; a.y *= b;
       +}
       +
       +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
       +{
       +    return make_uint2(a.x * b.x, a.y * b.y);
       +}
       +inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
       +{
       +    a.x *= b.x; a.y *= b.y;
       +}
       +inline __host__ __device__ uint2 operator*(uint2 a, uint b)
       +{
       +    return make_uint2(a.x * b, a.y * b);
       +}
       +inline __host__ __device__ uint2 operator*(uint b, uint2 a)
       +{
       +    return make_uint2(b * a.x, b * a.y);
       +}
       +inline __host__ __device__ void operator*=(uint2 &a, uint b)
       +{
       +    a.x *= b; a.y *= b;
       +}
       +
       +inline __host__ __device__ Float3 operator*(Float3 a, Float3 b)
       +{
       +    return MAKE_FLOAT3(a.x * b.x, a.y * b.y, a.z * b.z);
       +}
       +inline __host__ __device__ void operator*=(Float3 &a, Float3 b)
       +{
       +    a.x *= b.x; a.y *= b.y; a.z *= b.z;
       +}
       +inline __host__ __device__ Float3 operator*(Float3 a, Float b)
       +{
       +    return MAKE_FLOAT3(a.x * b, a.y * b, a.z * b);
       +}
       +inline __host__ __device__ Float3 operator*(Float b, Float3 a)
       +{
       +    return MAKE_FLOAT3(b * a.x, b * a.y, b * a.z);
       +}
       +inline __host__ __device__ void operator*=(Float3 &a, Float b)
       +{
       +    a.x *= b; a.y *= b; a.z *= b;
       +}
       +
       +inline __host__ __device__ int3 operator*(int3 a, int3 b)
       +{
       +    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
       +}
       +inline __host__ __device__ void operator*=(int3 &a, int3 b)
       +{
       +    a.x *= b.x; a.y *= b.y; a.z *= b.z;
       +}
       +inline __host__ __device__ int3 operator*(int3 a, int b)
       +{
       +    return make_int3(a.x * b, a.y * b, a.z * b);
       +}
       +inline __host__ __device__ int3 operator*(int b, int3 a)
       +{
       +    return make_int3(b * a.x, b * a.y, b * a.z);
       +}
       +inline __host__ __device__ void operator*=(int3 &a, int b)
       +{
       +    a.x *= b; a.y *= b; a.z *= b;
       +}
       +
       +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
       +{
       +    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
       +}
       +inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
       +{
       +    a.x *= b.x; a.y *= b.y; a.z *= b.z;
       +}
       +inline __host__ __device__ uint3 operator*(uint3 a, uint b)
       +{
       +    return make_uint3(a.x * b, a.y * b, a.z * b);
       +}
       +inline __host__ __device__ uint3 operator*(uint b, uint3 a)
       +{
       +    return make_uint3(b * a.x, b * a.y, b * a.z);
       +}
       +inline __host__ __device__ void operator*=(uint3 &a, uint b)
       +{
       +    a.x *= b; a.y *= b; a.z *= b;
       +}
       +
       +inline __host__ __device__ Float4 operator*(Float4 a, Float4 b)
       +{
       +    return MAKE_FLOAT4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
       +}
       +inline __host__ __device__ void operator*=(Float4 &a, Float4 b)
       +{
       +    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
       +}
       +inline __host__ __device__ Float4 operator*(Float4 a, Float b)
       +{
       +    return MAKE_FLOAT4(a.x * b, a.y * b, a.z * b,  a.w * b);
       +}
       +inline __host__ __device__ Float4 operator*(Float b, Float4 a)
       +{
       +    return MAKE_FLOAT4(b * a.x, b * a.y, b * a.z, b * a.w);
       +}
       +inline __host__ __device__ void operator*=(Float4 &a, Float b)
       +{
       +    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
       +}
       +
       +inline __host__ __device__ int4 operator*(int4 a, int4 b)
       +{
       +    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
       +}
       +inline __host__ __device__ void operator*=(int4 &a, int4 b)
       +{
       +    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
       +}
       +inline __host__ __device__ int4 operator*(int4 a, int b)
       +{
       +    return make_int4(a.x * b, a.y * b, a.z * b,  a.w * b);
       +}
       +inline __host__ __device__ int4 operator*(int b, int4 a)
       +{
       +    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
       +}
       +inline __host__ __device__ void operator*=(int4 &a, int b)
       +{
       +    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
       +}
       +
       +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
       +{
       +    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
       +}
       +inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
       +{
       +    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
       +}
       +inline __host__ __device__ uint4 operator*(uint4 a, uint b)
       +{
       +    return make_uint4(a.x * b, a.y * b, a.z * b,  a.w * b);
       +}
       +inline __host__ __device__ uint4 operator*(uint b, uint4 a)
       +{
       +    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
       +}
       +inline __host__ __device__ void operator*=(uint4 &a, uint b)
       +{
       +    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// divide
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 operator/(Float3 a, Float3 b)
       +{
       +    return MAKE_FLOAT3(a.x / b.x, a.y / b.y, a.z / b.z);
       +}
       +inline __host__ __device__ void operator/=(Float3 &a, Float3 b)
       +{
       +    a.x /= b.x; a.y /= b.y; a.z /= b.z;
       +}
       +inline __host__ __device__ Float3 operator/(Float3 a, Float b)
       +{
       +    return MAKE_FLOAT3(a.x / b, a.y / b, a.z / b);
       +}
       +inline __host__ __device__ void operator/=(Float3 &a, Float b)
       +{
       +    a.x /= b; a.y /= b; a.z /= b;
       +}
       +inline __host__ __device__ Float3 operator/(Float b, Float3 a)
       +{
       +    return MAKE_FLOAT3(b / a.x, b / a.y, b / a.z);
       +}
       +
       +inline __host__ __device__ Float4 operator/(Float4 a, Float4 b)
       +{
       +    return MAKE_FLOAT4(a.x / b.x, a.y / b.y, a.z / b.z,  a.w / b.w);
       +}
       +inline __host__ __device__ void operator/=(Float4 &a, Float4 b)
       +{
       +    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
       +}
       +inline __host__ __device__ Float4 operator/(Float4 a, Float b)
       +{
       +    return MAKE_FLOAT4(a.x / b, a.y / b, a.z / b,  a.w / b);
       +}
       +inline __host__ __device__ void operator/=(Float4 &a, Float b)
       +{
       +    a.x /= b; a.y /= b; a.z /= b; a.w /= b;
       +}
       +inline __host__ __device__ Float4 operator/(Float b, Float4 a){
       +    return MAKE_FLOAT4(b / a.x, b / a.y, b / a.z, b / a.w);
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// min
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 fminf(Float3 a, Float3 b)
       +{
       +        return MAKE_FLOAT3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
       +}
       +inline  __host__ __device__ Float4 fminf(Float4 a, Float4 b)
       +{
       +        return MAKE_FLOAT4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
       +}
       +
       +inline __host__ __device__ int2 min(int2 a, int2 b)
       +{
       +    return make_int2(min(a.x,b.x), min(a.y,b.y));
       +}
       +inline __host__ __device__ int3 min(int3 a, int3 b)
       +{
       +    return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
       +}
       +inline __host__ __device__ int4 min(int4 a, int4 b)
       +{
       +    return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
       +}
       +
       +inline __host__ __device__ uint2 min(uint2 a, uint2 b)
       +{
       +    return make_uint2(min(a.x,b.x), min(a.y,b.y));
       +}
       +inline __host__ __device__ uint3 min(uint3 a, uint3 b)
       +{
       +    return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
       +}
       +inline __host__ __device__ uint4 min(uint4 a, uint4 b)
       +{
       +    return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// max
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 fmaxf(Float3 a, Float3 b)
       +{
       +        return MAKE_FLOAT3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
       +}
       +inline __host__ __device__ Float4 fmaxf(Float4 a, Float4 b)
       +{
       +        return MAKE_FLOAT4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
       +}
       +
       +inline __host__ __device__ int2 max(int2 a, int2 b)
       +{
       +    return make_int2(max(a.x,b.x), max(a.y,b.y));
       +}
       +inline __host__ __device__ int3 max(int3 a, int3 b)
       +{
       +    return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
       +}
       +inline __host__ __device__ int4 max(int4 a, int4 b)
       +{
       +    return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
       +}
       +
       +inline __host__ __device__ uint2 max(uint2 a, uint2 b)
       +{
       +    return make_uint2(max(a.x,b.x), max(a.y,b.y));
       +}
       +inline __host__ __device__ uint3 max(uint3 a, uint3 b)
       +{
       +    return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
       +}
       +inline __host__ __device__ uint4 max(uint4 a, uint4 b)
       +{
       +    return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// lerp
       +// - linear interpolation between a and b, based on value t in [0, 1] range
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __device__ __host__ Float lerp(Float a, Float b, Float t)
       +{
       +    return a + t*(b-a);
       +}
       +inline __device__ __host__ Float3 lerp(Float3 a, Float3 b, Float t)
       +{
       +    return a + t*(b-a);
       +}
       +inline __device__ __host__ Float4 lerp(Float4 a, Float4 b, Float t)
       +{
       +    return a + t*(b-a);
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// clamp
       +// - clamp the value v to be in the range [a, b]
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __device__ __host__ Float clamp(Float f, Float a, Float b)
       +{
       +    return fmaxf(a, fminf(f, b));
       +}
       +inline __device__ __host__ int clamp(int f, int a, int b)
       +{
       +    return max(a, min(f, b));
       +}
       +inline __device__ __host__ uint clamp(uint f, uint a, uint b)
       +{
       +    return max(a, min(f, b));
       +}
       +
       +inline __device__ __host__ Float3 clamp(Float3 v, Float a, Float b)
       +{
       +    return MAKE_FLOAT3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
       +}
       +inline __device__ __host__ Float3 clamp(Float3 v, Float3 a, Float3 b)
       +{
       +    return MAKE_FLOAT3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
       +}
       +inline __device__ __host__ Float4 clamp(Float4 v, Float a, Float b)
       +{
       +    return MAKE_FLOAT4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
       +}
       +inline __device__ __host__ Float4 clamp(Float4 v, Float4 a, Float4 b)
       +{
       +    return MAKE_FLOAT4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
       +}
       +
       +inline __device__ __host__ int2 clamp(int2 v, int a, int b)
       +{
       +    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
       +}
       +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
       +{
       +    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
       +}
       +inline __device__ __host__ int3 clamp(int3 v, int a, int b)
       +{
       +    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
       +}
       +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
       +{
       +    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
       +}
       +inline __device__ __host__ int4 clamp(int4 v, int a, int b)
       +{
       +    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
       +}
       +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
       +{
       +    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
       +}
       +
       +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
       +{
       +    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
       +}
       +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
       +{
       +    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
       +}
       +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
       +{
       +    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
       +}
       +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
       +{
       +    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
       +}
       +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
       +{
       +    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
       +}
       +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
       +{
       +    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// dot product
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float dot(Float3 a, Float3 b)
       +{ 
       +    return a.x * b.x + a.y * b.y + a.z * b.z;
       +}
       +inline __host__ __device__ Float dot(Float4 a, Float4 b)
       +{ 
       +    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
       +}
       +
       +inline __host__ __device__ int dot(int2 a, int2 b)
       +{ 
       +    return a.x * b.x + a.y * b.y;
       +}
       +inline __host__ __device__ int dot(int3 a, int3 b)
       +{ 
       +    return a.x * b.x + a.y * b.y + a.z * b.z;
       +}
       +inline __host__ __device__ int dot(int4 a, int4 b)
       +{ 
       +    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
       +}
       +
       +inline __host__ __device__ uint dot(uint2 a, uint2 b)
       +{ 
       +    return a.x * b.x + a.y * b.y;
       +}
       +inline __host__ __device__ uint dot(uint3 a, uint3 b)
       +{ 
       +    return a.x * b.x + a.y * b.y + a.z * b.z;
       +}
       +inline __host__ __device__ uint dot(uint4 a, uint4 b)
       +{ 
       +    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// length
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float length(Float3 v)
       +{
       +    return sqrtf(dot(v, v));
       +}
       +inline __host__ __device__ Float length(Float4 v)
       +{
       +    return sqrtf(dot(v, v));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// normalize
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 normalize(Float3 v)
       +{
       +    Float invLen = rsqrtf(dot(v, v));
       +    return v * invLen;
       +}
       +inline __host__ __device__ Float4 normalize(Float4 v)
       +{
       +    Float invLen = rsqrtf(dot(v, v));
       +    return v * invLen;
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// floor
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 floorf(Float3 v)
       +{
       +    return MAKE_FLOAT3(floorf(v.x), floorf(v.y), floorf(v.z));
       +}
       +inline __host__ __device__ Float4 floorf(Float4 v)
       +{
       +    return MAKE_FLOAT4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// frac - returns the fractional portion of a scalar or each vector component
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float fracf(Float v)
       +{
       +    return v - floorf(v);
       +}
       +inline __host__ __device__ Float3 fracf(Float3 v)
       +{
       +    return MAKE_FLOAT3(fracf(v.x), fracf(v.y), fracf(v.z));
       +}
       +inline __host__ __device__ Float4 fracf(Float4 v)
       +{
       +    return MAKE_FLOAT4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// fmod
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 fmodf(Float3 a, Float3 b)
       +{
       +    return MAKE_FLOAT3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
       +}
       +inline __host__ __device__ Float4 fmodf(Float4 a, Float4 b)
       +{
       +    return MAKE_FLOAT4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// absolute value
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 fabs(Float3 v)
       +{
       +        return MAKE_FLOAT3(fabs(v.x), fabs(v.y), fabs(v.z));
       +}
       +inline __host__ __device__ Float4 fabs(Float4 v)
       +{
       +        return MAKE_FLOAT4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
       +}
       +
       +inline __host__ __device__ int2 abs(int2 v)
       +{
       +        return make_int2(abs(v.x), abs(v.y));
       +}
       +inline __host__ __device__ int3 abs(int3 v)
       +{
       +        return make_int3(abs(v.x), abs(v.y), abs(v.z));
       +}
       +inline __host__ __device__ int4 abs(int4 v)
       +{
       +        return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// reflect
       +// - returns reflection of incident ray I around surface normal N
       +// - N should be normalized, reflected vector's length is equal to length of I
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 reflect(Float3 i, Float3 n)
       +{
       +        return i - 2.0f * n * dot(n,i);
       +}
       +
       +////////////////////////////////////////////////////////////////////////////////
       +// cross product
       +////////////////////////////////////////////////////////////////////////////////
       +
       +inline __host__ __device__ Float3 cross(Float3 a, Float3 b)
       +{ 
       +    return MAKE_FLOAT3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 
       +}
       +
       +#endif