tMajor revision: Double precision, kernels split into separate sources - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
HTML git clone git://src.adamsgaard.dk/sphere
DIR Log
DIR Files
DIR Refs
DIR LICENSE
---
DIR commit cd41737e0b69a87e1dbca387031c53793b51ab56
DIR parent d818c179c8ce377aa14d4a4398ff6e6f9907fe75
HTML Author: Anders Damsgaard <adc@geo.au.dk>
Date: Mon, 20 Aug 2012 16:36:05 +0200
Major revision: Double precision, kernels split into separate sources
Diffstat:
A img_out/about.txt | 2 ++
A input/1e3-test-shear.bin | 0
D mfiles/freadbin.m | 118 -------------------------------
D mfiles/fwritebin.m | 98 -------------------------------
D mfiles/initsetup.m | 180 -------------------------------
D mfiles/plotspheres.m | 99 -------------------------------
D mfiles/status.m | 11 -----------
D mfiles/useoutput.m | 112 -------------------------------
D mfiles/visualize.m | 379 -------------------------------
A python/elastic_collision.py | 46 +++++++++++++++++++++++++++++++
A python/inelastic_collision.py | 49 +++++++++++++++++++++++++++++++
A raytracer/colorbar.h | 22 ++++++++++++++++++++++
A raytracer/render_all_outputs_CPU.sh | 27 +++++++++++++++++++++++++++
A raytracer/render_all_outputs_GPU.sh | 29 +++++++++++++++++++++++++++++
A raytracer/rt-kernel.cu | 442 +++++++++++++++++++++++++++++++
A raytracer/rt_GPU_init_pres.sh | 36 +++++++++++++++++++++++++++++++
A raytracer/rt_GPU_pres.sh | 36 +++++++++++++++++++++++++++++++
A raytracer/rt_GPU_tall_pres.sh | 36 +++++++++++++++++++++++++++++++
A src/cohesion.cuh | 126 +++++++++++++++++++++++++++++++
A src/contactmodels.cuh | 411 ++++++++++++++++++++++++++++++
A src/contactsearch.cuh | 625 +++++++++++++++++++++++++++++++
A src/integration.cuh | 207 ++++++++++++++++++++++++++++++
A src/sorting.cuh | 127 +++++++++++++++++++++++++++++++
A src/vector_arithmetic.h | 994 +++++++++++++++++++++++++++++++
24 files changed, 3215 insertions(+), 997 deletions(-)
---
DIR diff --git a/img_out/about.txt b/img_out/about.txt
t@@ -0,0 +1,2 @@
+This folder will contain images rendered from ../output binaries, and
+graphs plotted with ../python/sphere.py.
DIR diff --git a/input/1e3-test-shear.bin b/input/1e3-test-shear.bin
Binary files differ.
DIR diff --git a/mfiles/freadbin.m b/mfiles/freadbin.m
t@@ -1,118 +0,0 @@
-function [p, grids, time, params, walls] = freadbin(path, fn)
- %path = '../output/'; %Target directory
- %fn = 'output0.bin'; %Target binary
-
- %Open binary file for reading with little endian ordering and unicode char encoding.
- %fid = fopen([path fn],'rb','ieee-le','UTF-8');
- fid = fopen([path fn], 'rb', 'ieee-le');
-
- % Read the number of dimensions and particles
- grids.nd = fread(fid, 1, 'int');
- p.np = fread(fid, 1, 'uint32');
-
- % Read the time variables
- time.dt = fread(fid, 1, 'float');
- time.current = fread(fid, 1, 'double');
- time.total = fread(fid, 1, 'double');
- time.file_dt = fread(fid, 1, 'float');
- time.step_count = fread(fid, 1, 'uint32');
-
- %Initiate variables for faster execution
- p.x = zeros(p.np, grids.nd); % Coordinate vector
- p.vel = zeros(p.np, grids.nd); % Velocity vector
- p.angvel = zeros(p.np, grids.nd); % Acceleration vector
- p.force = zeros(p.np, grids.nd); % Linear force vector
- p.torque = zeros(p.np, grids.nd); % Rotational torque vector
- p.fixvel = zeros(p.np, 1); % Fix horizontal particle velocity: 1: yes, 0: no
- p.xsum = zeros(p.np, 1); % Total particle displacement along x-axis
- p.radius = zeros(p.np, 1); % Particle radius
- p.rho = zeros(p.np, 1); % Density
- p.k_n = zeros(p.np, 1); % Normal stiffness
- p.k_s = zeros(p.np, 1); % Shear stiffness
- p.k_r = zeros(p.np, 1); % Rolling stiffness
- p.gamma_s = zeros(p.np, 1); % Shear viscosity
- p.gamma_r = zeros(p.np, 1); % Rolling viscosity
- p.mu_s = zeros(p.np, 1); % Inter-particle contact static shear friction coefficient
- p.mu_d = zeros(p.np, 1); % Inter-particle contact dynamic shear friction coefficient
- p.mu_r = zeros(p.np, 1); % Inter-particle contact rolling friction coefficient
- p.C = zeros(p.np, 1); % Inter-particle cohesion
- p.E = zeros(p.np, 1); % Young's modulus
- p.K = zeros(p.np, 1); % Bulk modulus
- p.nu = zeros(p.np, 1); % Poisson's ratio
- p.es_dot = zeros(p.np, 1); % Current shear energy dissipation
- p.es = zeros(p.np, 1); % Total shear energy dissipation
- p.p = zeros(p.np, 1); % Pressure on particle
- p.bonds = zeros(p.np, 2); % Inter-particle bonds
-
- % Read remaining data from MATLAB binary
- grids.origo = fread(fid, [1, grids.nd], 'float');
- grids.L = fread(fid, [1, grids.nd], 'float');
- grids.num = fread(fid, [1, grids.nd], 'uint32');
-
- for j=1:p.np
- %p.id = fread(fid, 1, 'uint32');
-
- for i=1:grids.nd %Coordinates, velocity, acceleration, pre-velocity
- p.x(j,i) = fread(fid, 1, 'float');
- p.vel(j,i) = fread(fid, 1, 'float');
- p.angvel(j,i) = fread(fid, 1, 'float');
- p.force(j,i) = fread(fid, 1, 'float');
- p.torque(j,i) = fread(fid, 1, 'float');
- end
- end
-
- for j=1:p.np %Parameters with one value per particle
- p.fixvel(j) = fread(fid, 1, 'float');
- p.xsum(j) = fread(fid, 1, 'float');
- p.radius(j) = fread(fid, 1, 'float');
- p.rho(j) = fread(fid, 1, 'float');
- p.k_n(j) = fread(fid, 1, 'float');
- p.k_s(j) = fread(fid, 1, 'float');
- p.k_r(j) = fread(fid, 1, 'float');
- p.gamma_s(j) = fread(fid, 1, 'float');
- p.gamma_r(j) = fread(fid, 1, 'float');
- p.mu_s(j) = fread(fid, 1, 'float');
- p.mu_d(j) = fread(fid, 1, 'float');
- p.mu_r(j) = fread(fid, 1, 'float');
- p.C(j) = fread(fid, 1, 'float');
- p.E(j) = fread(fid, 1, 'float');
- p.K(j) = fread(fid, 1, 'float');
- p.nu(j) = fread(fid, 1, 'float');
- p.es_dot(j) = fread(fid, 1, 'float');
- p.es(j) = fread(fid, 1, 'float');
- p.p(j) = fread(fid, 1, 'float');
- end
-
- %Physical, constant parameters
- %params.global = fread(fid, 1, 'ubit1');
- params.global = fread(fid, 1, 'int');
- params.g = fread(fid, [1, grids.nd], 'float');
- params.kappa = fread(fid, 1, 'float');
- params.db = fread(fid, 1, 'float');
- params.V_b = fread(fid, 1, 'float');
- params.shearmodel = fread(fid, 1, 'uint32');
-
- % Wall data
- walls.nw = fread(fid, 1, 'uint32');
- for j=1:walls.nw
- for i=1:grids.nd
- walls.n(j,i) = fread(fid, 1, 'float');
- end
- walls.x(j) = fread(fid, 1, 'float');
- walls.m(j) = fread(fid, 1, 'float');
- walls.vel(j) = fread(fid, 1, 'float');
- walls.force(j) = fread(fid, 1, 'float');
- walls.devs(j) = fread(fid, 1, 'float');
- end
- params.periodic = fread(fid, 1, 'int');
-
- for j=1:p.np
- p.bonds(j,1) = fread(fid, 1, 'uint32');
- p.bonds(j,2) = fread(fid, 1, 'uint32');
- p.bonds(j,3) = fread(fid, 1, 'uint32');
- p.bonds(j,4) = fread(fid, 1, 'uint32');
- end
-
- fclose(fid);
-
-end
DIR diff --git a/mfiles/fwritebin.m b/mfiles/fwritebin.m
t@@ -1,98 +0,0 @@
-function fwritebin(path, fn, p, grids, time, params, walls)
- %path = '../input/'; %Target directory
- %fn = '3dtest.bin'; %Target binary
-
- %Open binary file for writing, little endian ordering, unicode char encoding.
- %fid = fopen([path fn], 'wb','ieee-le','UTF-8');
- fid = fopen([path fn], 'wb', 'ieee-le');
-
- %fwrite(fid, grids.nd, 'ushort'); %Unsigned short int
- %fwrite(fid, p.np, 'ulong'); %Unsigned long int
- fwrite(fid, grids.nd, 'int');
- fwrite(fid, p.np, 'uint32');
-
- %Time parameters
- fwrite(fid, time.dt, 'float');
- fwrite(fid, time.current, 'double');
- fwrite(fid, time.total, 'double');
- fwrite(fid, time.file_dt, 'float');
- fwrite(fid, time.step_count, 'uint32');
-
- for i=1:grids.nd %Grid origo
- fwrite(fid, grids.origo(i), 'float');
- end
-
- for i=1:grids.nd %Grid dimensions
- fwrite(fid, grids.L(i), 'float');
- end
-
- for i=1:grids.nd %Grid dimensions
- fwrite(fid, grids.num(i), 'uint32');
- end
-
- for j=1:p.np
- for i=1:grids.nd %Coordinates, velocity, acceleration, pre-velocity
- fwrite(fid, p.x(j,i), 'float');
- fwrite(fid, p.vel(j,i), 'float');
- fwrite(fid, p.angvel(j,i), 'float');
- fwrite(fid, p.force(j,i), 'float');
- fwrite(fid, p.torque(j,i), 'float');
- end
- end
-
- for j=1:p.np %Parameters with one value per particle
- fwrite(fid, p.fixvel(j), 'float');
- fwrite(fid, p.xsum(j), 'float');
- fwrite(fid, p.radius(j), 'float');
- fwrite(fid, p.rho(j), 'float');
- fwrite(fid, p.k_n(j), 'float');
- fwrite(fid, p.k_s(j), 'float');
- fwrite(fid, p.k_r(j), 'float');
- fwrite(fid, p.gamma_s(j), 'float');
- fwrite(fid, p.gamma_r(j), 'float');
- fwrite(fid, p.mu_s(j), 'float');
- fwrite(fid, p.mu_d(j), 'float');
- fwrite(fid, p.mu_r(j), 'float');
- fwrite(fid, p.C(j), 'float');
- fwrite(fid, p.E(j), 'float');
- fwrite(fid, p.K(j), 'float');
- fwrite(fid, p.nu(j), 'float');
- fwrite(fid, p.es_dot(j), 'float');
- fwrite(fid, p.es(j), 'float');
- fwrite(fid, p.p(j), 'float');
- end
-
- %Physical, constant parameters
- %fwrite(fid, params.global, 'ubit1');
- fwrite(fid, params.global, 'int');
- for i=1:grids.nd
- fwrite(fid, params.g(i), 'float');
- end
- fwrite(fid, params.kappa, 'float');
- fwrite(fid, params.db, 'float');
- fwrite(fid, params.V_b, 'float');
- fwrite(fid, params.shearmodel, 'uint32');
-
- fwrite(fid, walls.nw, 'uint32');
- for j=1:walls.nw
- for i=1:grids.nd
- fwrite(fid, walls.n(j,i), 'float'); % Wall normal
- end
- fwrite(fid, walls.x(j), 'float'); % Wall pos. on axis parallel to wall normal
- fwrite(fid, walls.m(j), 'float'); % Wall mass
- fwrite(fid, walls.vel(j), 'float'); % Wall vel. on axis parallel to wall normal
- fwrite(fid, walls.force(j), 'float'); % Wall force on axis parallel to wall normal
- fwrite(fid, walls.devs(j), 'float'); % Deviatoric stress on wall normal
- end
- fwrite(fid, params.periodic, 'int');
-
- for j=1:p.np
- fwrite(fid, p.bonds(i,1), 'uint32');
- fwrite(fid, p.bonds(i,2), 'uint32');
- fwrite(fid, p.bonds(i,3), 'uint32');
- fwrite(fid, p.bonds(i,4), 'uint32');
- end
-
- fclose(fid);
-
-end
DIR diff --git a/mfiles/initsetup.m b/mfiles/initsetup.m
t@@ -1,180 +0,0 @@
- function initsetup(plotfigure)
-% initsetup() creates a model setup of particles in 3D with cubic
-% packing. Specify PSD and desired number of particles, and the
-% function will determine the model dimensions, and fill the space
-% with a number of particles from a specified particle size distribution.
-close all;
-
-% Simulation project name
-simulation_name = '1e3-init-visc';
-
-% Plot particle assembly in MATLAB after initialization
-if exist('plotfigure','var')
- if plotfigure == 1
- visualize = 1;
- else
- visualize = 0;
- end
-else
- visualize = 0;
-end
-
-% Physical, constant parameters
-params.g = [0.0, 0.0, -9.80665]; %standard gravity, by definition 9.80665 m/s^2
-%params.g = 0.98;
-
-% No. of dimensions
-grids.nd = 3;
-grids.origo = [0 0 0]; %Coordinate system origo
-
-% Number of particles
-p.np = 1e3;
-
-% Create grid
-form = 4; % For a cubic grid, use 3.
- % For a higher grid, use 4 or more (for cubic end config. use 5).
- % For a flatter grid, use 1 or 2.
-grids.num(1) = ceil(nthroot(p.np, form)); % Particles in x direction
-grids.num(2) = grids.num(1); % Particles in y direction
-grids.num(3) = ceil(p.np/(grids.num(1)*grids.num(2))); % Particles in z direction
-
-disp(['Grid dimensions: x=' num2str(grids.num(1)) ...
- ', y=' num2str(grids.num(2)) ...
- ', z=' num2str(grids.num(3))])
-
-%grids.num = ceil(nthroot(p.np,grids.nd)) * ones(grids.nd, 1); %Square/cubic
-p.np = grids.num(1)*grids.num(2)*grids.num(3);
-
-% Particle size distribution
-p.psd = 'logn'; %PSD: logn or uni
-
-p.m = 440e-6; %Mean size
-p.v = p.m*0.00002; %Variance
-
-p.radius = zeros(p.np, 1);
-p.x = zeros(p.np, grids.nd);
-
-p.bonds = ones(p.np, 4) * p.np; % No bonds when the values equal the no. of particles
-
-if strcmp(p.psd,'logn') %Log-normal PSD. Note: Keep variance small.
- mu = log((p.m^2)/sqrt(p.v+p.m^2));
- sigma = sqrt(log(p.v/(p.m^2)+1));
- p.radius = lognrnd(mu,sigma,1,p.np); %Array of particle radii
-elseif strcmp(p.psd,'uni') %Uniform PSD between rmin and rmax
- rmin = p.m - p.v*1e5;
- rmax = p.m + p.v*1e5;
- %rmin = 0.1*dd; rmax = 0.4*dd;
- p.radius = (rmax-rmin)*rand(p.np,1)+rmin; %Array of particle radii
-end
-
-% Display PSD
-if visualize == 1
- figure;
- hist(p.radius);
- title(['PSD: ' num2str(p.np) ' particles, m = ' num2str(p.m) ' m']);
-end
-
-% Friction angles
-ang_s = 30; % Angle of static shear resistance
-ang_d = 25; % Angle of dynamic shear resistance
-ang_r = 35; % Angle of rolling resistance
-
-% Other particle parameters
-p.vel = zeros(p.np, grids.nd); % Velocity vector
-p.angvel = zeros(p.np, grids.nd); % Angular velocity
-p.fixvel = zeros(p.np, 1); % 0: horizontal particle velocity free, 1: hotiz. particle velocity fixed
-p.xsum = zeros(p.np, 1); % Total displacement along x-axis
-p.force = zeros(p.np, grids.nd); % Force vector
-p.torque = zeros(p.np, grids.nd); % Torque vector
-
-params.global = 1; % 1: Physical parameters global, 0: individual per particle
-p.rho = 3600*ones(p.np,1); % Density
-p.k_n = 4e5*ones(p.np,1); % Normal stiffness [N/m]
-p.k_s = p.k_n(:); % Shear stiffness [N/m]
-p.k_r = p.k_s(:).*(10); % Rolling stiffness
-params.shearmodel = 1; % Contact model. 1=frictional, viscous, 2=frictional, elastic
-p.gamma_s = p.k_n./1e3; % Shear viscosity [Ns/m]
-p.gamma_r = p.gamma_s(:); % Rolling viscosity [?]
-p.mu_s = tand(ang_s)*ones(p.np,1); % Inter-particle shear contact friction coefficient [0;1[
-p.mu_r = tand(ang_r)*ones(p.np,1); % Inter-particle rolling contact friction coefficient [0;1[
-p.C = 0*zeros(p.np,1); % Inter-particle cohesion
-p.E = 10e9*ones(p.np,1); % Young's modulus
-p.K = 38e9*ones(p.np,1); % Bulk modulus
-p.nu = 0.1*2.0*sqrt(min(4/3*pi.*p.radius(:).*p.radius(:).*p.radius(:).*p.rho(1))*p.k_n(1))*ones(p.np,1); % Poisson's ratio (critical damping: 2*sqrt(m*k_n)). Normal component elastic if nu=0
-p.es_dot = zeros(p.np,1); % Current shear energy dissipation
-p.es = zeros(p.np,1); % Total shear energy dissipation
-p.p = zeros(p.np,1); % Pressure on particle
-
-% Parameters related to capillary bonds
-% Disable capillary cohesion by setting kappa to zero.
-enableCapillaryCohesion = 0;
-theta = 0.0; % Wettability (0: perfect)
-if (enableCapillaryCohesion == 1)
- params.kappa = 2*pi*p.gamma_s(1) * cos(theta); % Prefactor
- params.V_b = 1e-12; % Liquid volume at bond
-else
- params.kappa = 0; % Zero capillary force
- params.V_b = 0; % Zero liquid volume at bond
-end
-params.db = (1.0 + theta/2.0) * params.V_b^(1.0/3.0); % Debonding distance
-
-% Time parameters
-%time.dt = 0.2*min(sqrt((p.rho(:).*p.radius(:).^2)./p.K(:))); % Computational delta t
-%time.dt = time.dt*1e1; % Speedup factor
-time.dt = 0.17*sqrt(min(4/3*pi.*p.radius(:).*p.radius(:).*p.radius(:).*p.rho(1))/p.k_n(1)); % Computational time step (O'Sullivan et al. 2003)
-time.current = 0.0;
-time.total = 1.500+2.0*time.dt; % Total simulation time [s]
-time.file_dt = 0.0010; % Interval between output#.bin generation [s]
-time.step_count = 0;
-
-% Calculate particle coordinates
-% Grid unit length. Maximum particle diameter determines grid size
-GU = 2*max(p.radius)*1.40; % Forty percent margin
-grids.L = [GU*grids.num(1) GU*grids.num(2) GU*grids.num(3)];
-
-% Particle coordinates by filling grid.
-x = linspace(GU/2, grids.L(1)-GU, grids.num(1));
-y = linspace(GU/2, grids.L(2)-GU, grids.num(2));
-z = linspace(GU/2, grids.L(3)-GU, grids.num(3));
-
-[X Y Z] = meshgrid(x,y,z);
-X=X(:); Y=Y(:); Z=Z(:);
-
-p.x = [X Y Z];
-
-%Particle positions randomly modified by + 10 percent of a grid unit
-p.x = p.x + (rand(p.np, grids.nd)*0.49*GU);
-
-% Walls with friction
-% Note that the world boundaries already act as fricionless boundaries
-% Upper wall
-wno = 1;
-walls.n(wno,:) = [0.0, 0.0, -1.0]; % Normal along negative z-axis
-walls.x(wno) = grids.L(3); % Positioned at upper boundary
-walls.m(wno) = p.rho(1)*p.np*pi*max(p.radius)*max(p.radius)*max(p.radius); % Wall mass
-walls.vel(wno) = 0.0; % Wall velocity
-walls.force(wno) = 0.0; % Wall force
-walls.devs(wno) = 0.0; % Deviatoric stress
-walls.nw = wno;
-
-% Define behavior of x- and y boundaries.
-% Uncomment only one!
-%params.periodic = 0; % Behave as frictional walls
-params.periodic = 1; % Behave as periodic boundaries
-%params.periodic = 2; % x: periodic, y: frictional walls
-
-% Write output binary
-fwritebin('../input/', [simulation_name '.bin'], p, grids, time, params, walls);
-disp('Writing of binary file complete.');
-
-% Plot particles in bubble plot
-if visualize == 1
- disp('Waiting for visualization.');
- plotspheres(p, grids, 5, 0, 1);
-end
-
-disp(['Project "' simulation_name '" is now ready for processing with SPHERE.']);
-disp(['Call "' pwd '/sphere ' simulation_name '" from the system terminal ']);
-disp(['to initiate, and check progress here in MATLAB using "status(''' simulation_name ''')"']);
-
-end
DIR diff --git a/mfiles/plotspheres.m b/mfiles/plotspheres.m
t@@ -1,98 +0,0 @@
-function plotspheres(p, grids, n, quiver, visible)
-
-if exist('visible','var')
- if visible == 0
- % Create a new invisible figure window
- figure('visible','off');
- else
- % Create a new figure window
- figure('Renderer','OpenGL')
- end
-else
- % Create a new figure window
- figure('Renderer','OpenGL')
-end
-
-% Generate unit sphere, consisting of n-by-n faces (i.e. the resolution)
-if exist('n', 'var')
- [x,y,z] = sphere(n);
-else
- [x,y,z] = sphere(10);
-end
-
-% Iterate through particle data
-hold on
-for i=1:p.np
- spheresurf = surf(x*p.radius(i)+p.x(i,1), ...
- y*p.radius(i)+p.x(i,2), ...
- z*p.radius(i)+p.x(i,3));
- set(spheresurf,'EdgeColor','none', ...
- 'FaceColor',[0.96 0.64 0.38], ... % RGB
- 'FaceLighting','phong', ...
- 'AmbientStrength',0.3, ...
- 'DiffuseStrength',0.8, ...
- 'SpecularStrength',0.9, ...
- 'SpecularExponent',25, ...
- 'BackFaceLighting','lit');
-end
-camlight left;
-hidden off
-
-% Optional quiver3 plot (velocity vectors)
-if exist('quiver','var')
- if quiver == 1
- quiver3(p.x(:,1), p.x(:,2) , p.x(:,3), ...
- p.vel(:,1), p.vel(:,2), p.vel(:,3), 3);
- end
-end
-
-% Draw walls from 8 vertices and connect these to 6 faces
-vertices = [grids.origo(1) grids.origo(2) grids.origo(3); ... % 1
- grids.origo(1) grids.L(2) grids.origo(3); ... % 2
- grids.L(1) grids.L(2) grids.origo(3); ... % 3
- grids.L(1) grids.origo(2) grids.origo(3); ... % 4
- grids.origo(1) grids.origo(2) grids.L(3); ... % 5
- grids.origo(1) grids.L(2) grids.L(3); ... % 6
- grids.L(1) grids.L(2) grids.L(3); ... % 7
- grids.L(1) grids.origo(2) grids.L(3)]; % 8
-
-faces = [1 2 3 4; ... % (observing along pos. y axis direction)
- 2 6 7 3; ... %
- 4 3 7 8; ... %
- 1 5 8 4; ... %
- 1 2 6 5; ... %
- 5 6 7 8]; %
-
-patch('Faces', faces, 'Vertices', vertices, ...
- 'FaceColor','none', 'EdgeColor','black','LineWidth',2);
-
-% View specifications
-%daspect([1 1 1])
-
-view([grids.L(1), -2*grids.L(2), grids.L(3)])
-grid on
-axis equal
-maxr = max(p.radius);
-axis([grids.origo(1)-maxr grids.L(1)+maxr ...
- grids.origo(2)-maxr grids.L(2)+maxr ...
- grids.origo(3)-maxr grids.L(3)+maxr]);
-
-light('Position', [grids.L(1), -grids.L(2), grids.L(3)]);
-material shiny; %shiny dull metal
-camlight(45,45);
-lighting gouraud; %flat gouraud phone none
-
-% Remove hidden lines from mesh plot (hidden on)
-hidden off
-
-% Labels
-title([num2str(p.np) ' particles']);
-xlabel('x [m]');
-ylabel('y [m]');
-zlabel('z [m]');
-
-% Add delaunay triangulation
-%dt = DelaunayTri(p.x(:,1), p.x(:,2), p.x(:,3));
-%tetramesh(dt,'FaceAlpha',0.02);
-
-end
-\ No newline at end of file
DIR diff --git a/mfiles/status.m b/mfiles/status.m
t@@ -1,11 +0,0 @@
-function lastfile = status(project)
-
-% status() writes status of current model run
-
-st = load(['../output/' project '.status.dat']);
-
-disp(['Status: time = ',num2str(st(1)),', ',num2str(st(2)),'% done, filenr = ',num2str(st(3))]);
-
-lastfile = st(3);
-
-end
DIR diff --git a/mfiles/useoutput.m b/mfiles/useoutput.m
t@@ -1,112 +0,0 @@
-%function useoutput(fn, plotfigure)
-% Use particle data from target binary from output directory,
-% create a fitting grid, and zero time variables.
-% Final configuration is saved in a new filename in input/ directory.
-
-% Plot particle assembly in MATLAB after initialization
-visualize = 0; % No
-%visualize = 1; % Yes
-
-% Input binary file
-directory = '../input/';
-inputbin = '5e4-cons_10kPa.bin';
-
-% New configuration name
-simulation_name = '5e4-shear_10kPa';
-
-% Read data
-[p, grids, time, params, walls] = freadbin(directory, inputbin);
-%[p, grids, time, params, walls] = freadbinold(directory, inputbin);
-
-% Change temporal parameters (comment out to use previous values"
-time.current = 0.0; % New starting time
-time.step_count = 0; % First output file number
-time.total = 0.25+2*time.dt; % Total simulation time [s]
-time.file_dt = 0.01; %Interval between output#.bin generation [s]
-time.dt = 0.17*sqrt(min(4/3*pi.*p.radius(:).*p.radius(:).*p.radius(:).*p.rho(1))/p.k_n(1)); % Computational time step (O'Sullivan et al. 2003)
-
-% Particle stiffness
-p.k_n = 4e5*ones(p.np,1); % Normal stiffness [N/m]
-p.gamma_s = p.k_n./1e3; % Shear viscosity [Ns/m]
-p.gamma_r = p.gamma_s(:); % Rolling viscosity
-
-% Friction angles
-ang_s = 25; % Angle of shear resistance
-ang_r = 35; % Angle of rolling resistance
-p.mu_s = tand(ang_s)*ones(p.np,1); % Inter-particle shear contact friction coefficient [0;1[
-p.mu_r = tand(ang_r)*ones(p.np,1); % Inter-particle rolling contact friction coefficient [0;1[
-
-% Compute new grid, scaled to fit max.- & min. particle positions
-GU = 2*max(p.radius); % Cell size
-x_min = min(p.x(:,1));% - p.radius(:));
-x_max = max(p.x(:,1));% + p.radius(:));%*3.0;
-%y_min = min(p.x(:,2) - p.radius(:));
-%y_max = max(p.x(:,2) + p.radius(:));
-z_min = min(p.x(:,3) - p.radius(:));
-z_max = max(p.x(:,3) + p.radius(:));
-z_adjust = 1.2; % Specify overheightening of world to allow for shear dilatancy
-%grids.num(1) = ceil((x_max-x_min)/GU);
-%grids.num(2) = ceil((y_max-y_min)/GU);
-grids.num(3) = ceil((z_max-z_min)*z_adjust/GU);
-%grids.L = [(x_max-x_min) (y_max-y_min) (z_max-z_min)*z_adjust];
-grids.L(3) = (z_max-z_min)*z_adjust;
-
-% Parameters related to capillary bonds
-% Disable capillary cohesion by setting kappa to zero.
-enableCapillaryCohesion = 0;
-theta = 0.0; % Wettability (0: perfect)
-if (enableCapillaryCohesion == 1)
- params.kappa = 2*pi*p.gamma_s(1) * cos(theta); % Prefactor
- params.V_b = 1e-12; % Liquid volume at bond
-else
- params.kappa = 0; % Zero capillary force
- params.V_b = 0; % Zero liquid volume at bond
-end
-params.db = (1.0 + theta/2.0) * params.V_b^(1.0/3.0); % Debonding distance
-
-% Move mobile upper wall to top of domain
-walls.x(1) = max(p.x(:,3)+p.radius(:));
-
-% Define new deviatoric stress [Pa]
-%walls.devs(1) = 0.0;
-walls.devs(1) = 10.0e3;
-
-% Let x- and y boundaries be periodic
-params.periodic = 1; % x- and y boundaries periodic
-%params.periodic = 2; % Only x-boundaries periodic
-
-% By default, all particles are free to move horizontally
-p.fixvel = zeros(p.np, 1); % Start off by defining all particles as free
-shearing = 1; % 1: true, 0: false
-
-if shearing == 1
- % Fix horizontal velocity to 0.0 of lowermost particles
- I = find(p.x(:,3) < (z_max-z_min)*0.1); % Find the lower 10%
- p.fixvel(I) = 1; % Fix horiz. velocity
- p.vel(I,1) = 0.0; % x-value
- p.vel(I,2) = 0.0; % y-value
-
- % Fix horizontal velocity to 0.0 of uppermost particles
- I = find(p.x(:,3) > (z_max-z_min)*0.9); % Find the upper 10%
- p.fixvel(I) = 1; % Fix horiz. velocity
- p.vel(I,1) = (x_max-x_min)*1.0; % x-value: One grid length per second
- p.vel(I,2) = 0.0; % y-value
-end
-
-% Zero x-axis displacement
-p.xsum = zeros(p.np, 1);
-
-% Write output binary
-fwritebin('../input/', [simulation_name '.bin'], p, grids, time, params, walls);
-disp('Writing of binary file complete.');
-
-% Plot particles in bubble plot
-if visualize == 1
- disp('Waiting for visualization.');
- plotspheres(p, grids, 5, 0, 1);
-end
-
-disp(['Project "' simulation_name '" is now ready for processing with SPHERE.']);
-disp(['Call "' pwd '/sphere ' simulation_name '" from the system terminal ']);
-disp(['to initiate, and check progress here in MATLAB using "status(''' simulation_name ''')"']);
-
DIR diff --git a/mfiles/visualize.m b/mfiles/visualize.m
t@@ -1,379 +0,0 @@
-function visualize(project, method, file_nr)
-
- lastfile = status(project);
-
- if exist('file_nr','var')
- filenr = file_nr;
- else
- filenr = lastfile;
- end
-
- figure;
-
- % Plot sum of kinetic energy vs. time
- if strcmpi(method, 'energy')
-
- disp('Energy')
-
- Epot = zeros(1,lastfile);
- Ekin = zeros(1,lastfile);
- Erot = zeros(1,lastfile);
- Es = zeros(1,lastfile);
- Es_dot = zeros(1,lastfile);
- Esum = zeros(1,lastfile);
-
- % Load data from all output files
- for i = 1:lastfile
- fn = [project '.output' num2str(i) '.bin'];
- disp(fn);
- [p, ~, time, params, ~] = freadbin('../output/', fn);
- for j = 1:p.np
- r = p.radius(j);
- m = (4/3*pi*r*r*r*p.rho(j));
- Epot(i) = Epot(i) + m * norm(params.g) * p.x(j,3);
- Ekin(i) = Ekin(i) + 0.5 * m ...
- * norm(p.vel(j,:)) * norm(p.vel(j,:));
- Erot(i) = Erot(i) + 0.5 * (2/5 * m * r*r) ...
- * norm(p.angvel(j,:)) * norm(p.angvel(j,:));
- end
- Es(i) = sum(p.es);
- Es_dot(i) = sum(p.es_dot);
-
- Esum(i) = Epot(i) + Ekin(i) + Erot(i) + Es(i);
- %Esum(i) = Epot(i) + Ekin(i) + Es(i);
- end
-
- % Time axis
- t = linspace(time.file_dt, time.current, length(Ekin));
-
- %disp(num2str(Ekin(:)));
-
- % Visualization, E_pot
- subplot(2,3,1);
- plot(t, Epot, '+-');
- title('Potential energy');
- xlabel('Time [s]');
- ylabel('Total potential energy [J]');
- box on;
- grid on;
-
- % Visualization, E_kin
- subplot(2,3,2);
- plot(t, Ekin, '+-');
- title('Kinetic energy');
- xlabel('Time [s]');
- ylabel('Total kinetic energy [J]');
- box on;
- grid on;
-
- % Visualization, E_rot
- subplot(2,3,3);
- plot(t, Erot, '+-');
- title('Rotational energy');
- xlabel('Time [s]');
- ylabel('Total rotational energy [J]');
- box on;
- grid on;
-
- % Visualizaiton, E_s_dot (current shear energy)
- subplot(2,3,4);
- plot(t, Es_dot, '+-');
- title('Shear energy rate');
- xlabel('Time [s]');
- ylabel('Shear energy rate [W]');
- box on;
- grid on;
-
- % Visualizaiton, E_s (total shear energy)
- subplot(2,3,5);
- plot(t, Es, '+-');
- title('Total shear energy');
- xlabel('Time [s]');
- ylabel('Total shear energy [J]');
- box on;
- grid on;
-
- % Total energy, Esum
- subplot(2,3,6);
- plot(t, Esum, '+-');
- title('Total energy: Pot + Kin + Rot + Shear');
- xlabel('Time [s]');
- ylabel('Total energy [J]');
- box on;
- grid on;
-
- end % End visualize energy
-
-
- % Visualize wall kinematics and force
- if strcmpi(method, 'walls')
- disp('Walls')
-
- [~, ~, ~, ~, walls] = freadbin('../output/', [project '.output0.bin']);
-
- wforce = zeros(lastfile, walls.nw);
- wvel = zeros(lastfile, walls.nw);
- wpos = zeros(lastfile, walls.nw);
- wdevs = zeros(lastfile, walls.nw);
-
- % Load data from all output files
- for i = 1:lastfile
- fn = [project '.output' num2str(i) '.bin'];
- disp(fn);
- [~, ~, time, ~, walls] = freadbin('../output/', fn);
-
- for j = 1:walls.nw
- wforce(i,j) = walls.force(j);
- wvel(i,j) = walls.vel(j);
- wpos(i,j) = walls.x(j);
- wdevs(i,j) = walls.devs(j);
- end
- end
-
- % Time axis
- t = linspace(time.file_dt, time.current, lastfile);
-
- % Visualization, wall position
- subplot(2,2,1);
- hold on
- for i=1:walls.nw
- p = plot(t, wpos(:,i), '+-');
- if i==2
- set(p,'Color','red')
- end
- end
- hold off
- title('Wall positions');
- xlabel('Time [s]');
- ylabel('Position [m]');
- box on;
- grid on;
-
- % Visualization, wall force
- subplot(2,2,2);
- hold on
- for i=1:walls.nw
- p = plot(t, wforce(:,i), '+-');
- if i==2
- set(p,'Color','red')
- end
- end
- hold off
- title('Wall forces');
- xlabel('Time [s]');
- ylabel('Force [N]');
- box on;
- grid on;
-
- % Visualization, wall velocity
- subplot(2,2,3);
- hold on
- for i=1:walls.nw
- p = plot(t, wvel(:,i), '+-');
- if i==2
- set(p,'Color','red')
- end
- end
- hold off
- title('Wall velocities');
- xlabel('Time [s]');
- ylabel('Velocity [m/s]');
- box on;
- grid on;
-
- % Visualization, wall deviatoric stresses
- subplot(2,2,4);
- hold on
- for i=1:walls.nw
- p = plot(t, wdevs(:,i), '+-');
- if i==2
- set(p,'Color','red')
- end
- end
- hold off
- title('Wall deviatoric stresses');
- xlabel('Time [s]');
- ylabel('Stress [Pa]');
- box on;
- grid on;
-
- end % End visualize walls
-
-
- % Visualize first output file with plotspheres
- if strcmpi(method, 'first')
- disp('Visualizing first output file')
- fn = [project '.output0.bin'];
- [p, grids, ~, ~] = freadbin('../output/', fn);
- plotspheres(p, grids, 5, 0, 1);
- end % End visualize last file
-
-
- % Visualize last output file with plotspheres
- if strcmpi(method, 'last')
- disp('Visualizing last output file')
- fn = [project '.output' num2str(lastfile) '.bin'];
- [p, grids, ~, ~] = freadbin('../output/', fn);
- plotspheres(p, grids, 5, 0, 1);
- end % End visualize last file
-
-
- % Visualize horizontal velocities (velocity profile)
- if strcmpi(method, 'veloprofile')
-
- disp('Visualizing velocity profile');
-
- % Read data
- fn = [project '.output' num2str(filenr) '.bin'];
- disp(fn);
- [p, ~, time, params, ~] = freadbin('../output/', fn);
-
- horiz_velo = zeros(2, p.np); % x,y velocities for each particle
-
- for j = 1:p.np
- horiz_velo(1, j) = p.vel(j, 1); % x-velocity
- horiz_velo(2, j) = p.vel(j, 2); % y-velocity
- end
-
- % Find shear velocity (for scaling first axis)
- fixidx = find(p.fixvel > 0.0);
- shearvel = max(p.vel(fixidx,1));
-
- % Plot x- and y velocities vs. z position
- hold on;
- plot(horiz_velo(2,:), p.x(:,3), '.b'); % y-velocity (blue)
- plot(horiz_velo(1,:), p.x(:,3), '.r'); % x-velocity (red)
- axis([-0.5*shearvel, shearvel*1.5, min(p.x(:,3)), max(p.x(:,3))]);
- title(['Velocity profile, t = ' num2str(time.current) ' s']);
- xlabel('Horizontal velocity [m/s]');
- ylabel('Vertical position [m]');
- legend('y', 'x', 'Location', 'SouthEast');
- box on;
- grid on;
- hold off;
-
- end % End visualize velocity profile
-
- if strcmpi(method, 'sheardisp')
- disp('Visualizing shear displacement, 1D');
-
- % Read first datafile
- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output0.bin']);
-
- % Read original z-position at t = 0 s.
- zpos = p.x(:,3);
-
- % Read last datafile
- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfile) '.bin']);
-
- % Plot
- plot(p.xsum(:), zpos(:), 'o');
- title(project);
- xlabel('Shear displacement [m]');
- ylabel('Initial vertical position [m]');
- box on;
- grid on;
-
- end % End visualize shear displacement
-
- if strcmpi(method, 'sheardisp2d')
- disp('Visualizing shear displacement, 2D');
-
- % Read first datafile
- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output0.bin']);
-
- % Read original z-position at t = 0 s.
- zpos = p.x(:,3);
-
- % Read last datafile
- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfile) '.bin']);
-
-
-
- % Plot
- colormap(jet)
- title(project);
- xlabel('Shear displacement [m]');
- ylabel('y [m]');
- zlabel('z [m]');
- box on;
- grid on;
- axis equal;
- axis vis3d;
- rotate3d on;
-
- end % End visualize shear displacement 2D
-
- if strcmpi(method, 'sheardisp3d')
- disp('Visualizing shear displacement, 3D');
-
- % Read last datafile
- [p, ~, ~, ~, ~] = freadbin('../output/',[project '.output' num2str(lastfile) '.bin']);
-
- % Plot
- scatter3(p.xsum(:), p.x(:,2), p.x(:,3), 30, p.x(:,3), 'filled');
- colormap(jet)
- title(project);
- xlabel('Shear displacement [m]');
- ylabel('y [m]');
- zlabel('z [m]');
- box on;
- grid on;
- axis equal;
- axis vis3d;
- rotate3d on;
-
- end % End visualize shear displacement 3D
-
- % Visualize shear stresses
- if strcmpi(method, 'shear')
-
- [p, grids, time, params, walls] = freadbin('../output/',[project '.output0.bin']);
-
- disp('Visualizing shear stress dynamics')
- xdisp = zeros(1, lastfile+1); % Shear displacement
- sigma = zeros(1, lastfile+1); % Effective normal stress
- tau = zeros(1, lastfile+1); % Shear stress
- dilation = zeros(1, lastfile+1); % Dilation
-
- % Calculate the shear velocity
- fixedidx_all = find(p.fixvel(:) > 0.0); % All particles with a fixed horiz. vel.
- shearvel = max(p.vel(fixedidx_all,1));
-
- % Surface area
- A = grids.L(1) * grids.L(2); % x-length * y-length
-
- % Load data from all output files
- for i = 0:lastfile
- fn = [project '.output' num2str(i) '.bin'];
- disp(fn);
- [p, ~, time, ~, walls] = freadbin('../output/', fn);
-
- xdisp(i+1) = time.current * shearvel;
- sigma(i+1) = walls.force(1) / A;
- tau(i+1) = shearstress(p, A);
- dilation(i+1) = walls.x;
- end
-
- % Plot stresses
- subplot(2,1,1);
- plot(xdisp, sigma, 'o-g', xdisp, tau, '+-r');
- title('Stress dynamics');
- xlabel('Shear distance [m]');
- ylabel('Stress [Pa]');
- legend('\sigma`','\tau');
- box on;
- grid on;
-
- % Plot dilation
- subplot(2,1,2);
- plot(xdisp, dilation, '+-b');
- title('Dilation');
- xlabel('Shear distance [m]');
- ylabel('Upper wall pos. [m]');
- box on;
- grid on;
-
- end
-
-end % End function
DIR diff --git a/python/elastic_collision.py b/python/elastic_collision.py
t@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+import subprocess
+import numpy
+import matplotlib.pyplot as plt
+
+# Import sphere functionality
+from sphere import *
+
+# New class
+init = Spherebin(np = 2, nd = 3)
+
+#init.generateRadii(radius_mean = 0.5, radius_variance = 1e-15, histogram = 0)
+init.radius[0] = numpy.ones(1, dtype=numpy.float64) * 0.5;
+init.radius[1] = numpy.ones(1, dtype=numpy.float64) * 0.52;
+
+init.defaultparams()
+
+# The world should never be less than 3 cells in ANY direction, due to contact searcg algorithm
+init.initsetup(gridnum = numpy.array([12,4,4]), periodic = 1, shearmodel = 2, g = numpy.array([0.0, 0.0, 0.0]))
+
+init.x[0] = numpy.array([1, 2, 2])
+init.x[1] = numpy.array([6, 2, 2])
+#init.x[2] = numpy.array([7, 2, 2])
+#init.x[3] = numpy.array([8, 2, 2])
+
+init.nu = numpy.zeros(init.np, dtype=numpy.float64)
+
+#for i in range(init.np):
+# init.x[i] = numpy.array([4+i*init.radius[i]*2, 2, 2])
+
+init.vel[0] = numpy.array([10.0, 0.0, 0.0])
+
+
+init.initTemporal(total = 1.0)
+
+init.writebin("../input/nc-test.bin")
+#render("../input/nc-test.bin", out = "~/Desktop/nc-test")
+
+subprocess.call("cd ..; rm output/*; ./sphere_darwin_X86_64 nc-test", shell=True)
+
+visualize("nc-test", "energy", savefig=True, outformat='png')
+#visualize("nc-test", "walls", savefig=True)
+
+subprocess.call("rm ../img_out/*; cd ../raytracer; ./render_all_outputs_GPU_clever.sh", shell=True)
+
+
DIR diff --git a/python/inelastic_collision.py b/python/inelastic_collision.py
t@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+import subprocess
+import numpy
+import matplotlib.pyplot as plt
+
+# Import sphere functionality
+from sphere import *
+
+# New class
+init = Spherebin(np = 2, nd = 3)
+
+init.radius = numpy.ones(init.np, dtype=numpy.float64) * 0.5;
+
+init.defaultparams()
+
+# The world should never be less than 3 cells in ANY direction, due to contact searcg algorithm
+init.initsetup(gridnum = numpy.array([12,4,4]), periodic = 1, shearmodel = 2, g = numpy.array([0.0, 0.0, 0.0]))
+
+init.x[0] = numpy.array([1, 2, 2])
+init.x[1] = numpy.array([6, 2, 2])
+#init.x[2] = numpy.array([7, 2, 2])
+#init.x[3] = numpy.array([8, 2, 2])
+
+# Set fraction of critical damping (0 = elastic, 1 = completely inelastic)
+damping_fraction = 1.0
+init.nu = numpy.ones(init.np, dtype=numpy.float64) \
+ * damping_fraction * 2.0 * math.sqrt(4.0/3.0 * math.pi * init.radius.min()**3 \
+ * init.rho[0] * init.k_n[0])
+
+
+#for i in range(init.np):
+# init.x[i] = numpy.array([4+i*init.radius[i]*2, 2, 2])
+
+init.vel[0] = numpy.array([10.0, 0.0, 0.0])
+
+
+init.initTemporal(total = 1.0)
+
+init.writebin("../input/nc-test.bin")
+#render("../input/nc-test.bin", out = "~/Desktop/nc-test")
+
+subprocess.call("cd ..; rm output/*; ./sphere_darwin_X86_64 nc-test", shell=True)
+
+visualize("nc-test", "energy", savefig=True, outformat='png')
+#visualize("nc-test", "walls", savefig=True)
+
+subprocess.call("rm ../img_out/*; cd ../raytracer; ./render_all_outputs_GPU_clever.sh", shell=True)
+
+
DIR diff --git a/raytracer/colorbar.h b/raytracer/colorbar.h
t@@ -0,0 +1,22 @@
+#ifndef COLORBAR_H_
+#define COLORBAR_H_
+
+// Functions that determine red-, green- and blue color components
+// in a blue-white-red colormap. Ratio should be between 0.0-1.0.
+
+__inline__ __host__ __device__ float red(float ratio)
+{
+ return fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio + 0.0109f);
+};
+
+__inline__ __host__ __device__ float green(float ratio)
+{
+ return fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f);
+};
+
+__inline__ __host__ __device__ float blue(float ratio)
+{
+ return fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f);
+};
+
+#endif
DIR diff --git a/raytracer/render_all_outputs_CPU.sh b/raytracer/render_all_outputs_CPU.sh
t@@ -0,0 +1,27 @@
+#!/bin/bash
+
+FILES=`ls ../output/*.bin`
+
+XRES=800
+YRES=800
+
+WORKHORSE=CPU
+
+echo "# Rendering PPM images and converting to JPG"
+for F in ../output/*.bin
+do
+ (BASE=`basename $F`;
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm;
+ convert ../img_out/$BASE.ppm ../img_out/$BASE.jpg;)
+ #rm ../img_out/$BASE.ppm &)
+done
+
+#echo "# Converting PPM files to JPG using ImageMagick in parallel"
+#for F in ../img_out/*.ppm
+#do
+# (BASE=`basename $F`; convert $F $F.jpg &)
+#done
+
+#echo "# Removed temporary files"
+#rm ../img_out/*.ppm
+
DIR diff --git a/raytracer/render_all_outputs_GPU.sh b/raytracer/render_all_outputs_GPU.sh
t@@ -0,0 +1,29 @@
+#!/bin/bash
+
+FILES=`ls ../output/*.bin`
+
+XRES=800
+YRES=800
+
+WORKHORSE=GPU
+
+echo "# Rendering PPM images"
+for F in ../output/*.bin
+do
+ BASE=`basename $F`
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
+ if [ $? -ne 0 ] ; then
+ echo "Error rendering $F, trying again..."
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
+ fi
+done
+
+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
+for F in ../img_out/*.ppm
+do
+ (BASE=`basename $F`; convert $F $F.jpg &)
+done
+
+#echo "# Removed temporary files"
+#rm ../img_out/*.ppm
+
DIR diff --git a/raytracer/rt-kernel.cu b/raytracer/rt-kernel.cu
t@@ -0,0 +1,442 @@
+#include <iostream>
+#include <cutil_math.h>
+#include "header.h"
+#include "rt-kernel.h"
+#include "colorbar.h"
+
+unsigned int iDivUp (unsigned int a, unsigned int b) {
+ return (a % b != 0) ? (a / b + 1) : (a / b);
+}
+
+__inline__ __host__ __device__ float3 f4_to_f3(float4 in)
+{
+ return make_float3(in.x, in.y, in.z);
+}
+
+__inline__ __host__ __device__ float4 f3_to_f4(float3 in)
+{
+ return make_float4(in.x, in.y, in.z, 0.0f);
+}
+
+// Kernel for initializing image data
+__global__ void imageInit(unsigned char* _img, unsigned int pixels)
+{
+ // Compute pixel position from threadIdx/blockIdx
+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
+ if (mempos > pixels)
+ return;
+
+ _img[mempos*4] = 255; // Red channel
+ _img[mempos*4 + 1] = 255; // Green channel
+ _img[mempos*4 + 2] = 255; // Blue channel
+}
+
+// Calculate ray origins and directions
+__global__ void rayInitPerspective(float4* _ray_origo,
+ float4* _ray_direction,
+ float4 eye,
+ unsigned int width,
+ unsigned int height)
+{
+ // Compute pixel position from threadIdx/blockIdx
+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
+ if (mempos > width*height)
+ return;
+
+ // Calculate 2D position from linear index
+ unsigned int i = mempos % width;
+ unsigned int j = (int)floor((float)mempos/width) % width;
+
+ // Calculate pixel coordinates in image plane
+ float p_u = const_imgplane.x + (const_imgplane.y - const_imgplane.x)
+ * (i + 0.5f) / width;
+ float p_v = const_imgplane.z + (const_imgplane.w - const_imgplane.z)
+ * (j + 0.5f) / height;
+
+ // Write ray origo and direction to global memory
+ _ray_origo[mempos] = const_eye;
+ _ray_direction[mempos] = -const_d*const_w + p_u*const_u + p_v*const_v;
+}
+
+// Check wether the pixel's viewing ray intersects with the spheres,
+// and shade the pixel correspondingly
+__global__ void rayIntersectSpheres(float4* _ray_origo,
+ float4* _ray_direction,
+ float4* _p,
+ unsigned char* _img)
+{
+ // Compute pixel position from threadIdx/blockIdx
+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
+ if (mempos > const_pixels)
+ return;
+
+ // Read ray data from global memory
+ float3 e = f4_to_f3(_ray_origo[mempos]);
+ float3 d = f4_to_f3(_ray_direction[mempos]);
+ //float step = length(d);
+
+ // Distance, in ray steps, between object and eye initialized with a large value
+ float tdist = 1e10f;
+
+ // Surface normal at closest sphere intersection
+ float3 n;
+
+ // Intersection point coordinates
+ float3 p;
+
+ // Iterate through all particles
+ for (Inttype i=0; i<const_np; ++i) {
+
+ // Read sphere coordinate and radius
+ float3 c = f4_to_f3(_p[i]);
+ float R = _p[i].w;
+
+ // Calculate the discriminant: d = B^2 - 4AC
+ float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
+ - 4.0f*dot(d,d) // -4*A
+ * (dot((e-c),(e-c)) - R*R); // C
+
+ // If the determinant is positive, there are two solutions
+ // One where the line enters the sphere, and one where it exits
+ if (Delta > 0.0f) {
+
+ // Calculate roots, Shirley 2009 p. 77
+ float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
+ * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+
+ // Check wether intersection is closer than previous values
+ if (fabs(t_minus) < tdist) {
+ p = e + t_minus*d;
+ tdist = fabs(t_minus);
+ n = normalize(2.0f * (p - c)); // Surface normal
+ }
+
+ } // End of solution branch
+
+ } // End of particle loop
+
+ // Write pixel color
+ if (tdist < 1e10f) {
+
+ // Lambertian shading parameters
+ float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
+ float I_d = 40.0f; // Light intensity
+ float k_d = 5.0f; // Diffuse coefficient
+
+ // Ambient shading
+ float k_a = 10.0f;
+ float I_a = 5.0f; // 5.0 for black background
+
+ // Write shading model values to pixel color channels
+ _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
+ + k_a * I_a)*0.48f);
+ _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
+ + k_a * I_a)*0.41f);
+ _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
+ + k_a * I_a)*0.27f);
+
+ }
+}
+
+// Check wether the pixel's viewing ray intersects with the spheres,
+// and shade the pixel correspondingly using a colormap
+__global__ void rayIntersectSpheresColormap(float4* _ray_origo,
+ float4* _ray_direction,
+ float4* _p,
+ float* _fixvel,
+ float* _linarr,
+ float max_val,
+ unsigned char* _img)
+{
+ // Compute pixel position from threadIdx/blockIdx
+ unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
+ if (mempos > const_pixels)
+ return;
+
+ // Read ray data from global memory
+ float3 e = f4_to_f3(_ray_origo[mempos]);
+ float3 d = f4_to_f3(_ray_direction[mempos]);
+
+ // Distance, in ray steps, between object and eye initialized with a large value
+ float tdist = 1e10f;
+
+ // Surface normal at closest sphere intersection
+ float3 n;
+
+ // Intersection point coordinates
+ float3 p;
+
+ unsigned int hitidx;
+
+ // Iterate through all particles
+ for (Inttype i=0; i<const_np; ++i) {
+
+ // Read sphere coordinate and radius
+ float3 c = f4_to_f3(_p[i]);
+ float R = _p[i].w;
+
+ // Calculate the discriminant: d = B^2 - 4AC
+ float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
+ - 4.0f*dot(d,d) // -4*A
+ * (dot((e-c),(e-c)) - R*R); // C
+
+ // If the determinant is positive, there are two solutions
+ // One where the line enters the sphere, and one where it exits
+ if (Delta > 0.0f) {
+
+ // Calculate roots, Shirley 2009 p. 77
+ float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
+ * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+
+ // Check wether intersection is closer than previous values
+ if (fabs(t_minus) < tdist) {
+ p = e + t_minus*d;
+ tdist = fabs(t_minus);
+ n = normalize(2.0f * (p - c)); // Surface normal
+ hitidx = i;
+ }
+
+ } // End of solution branch
+
+ } // End of particle loop
+
+ // Write pixel color
+ if (tdist < 1e10) {
+
+ // Fetch particle data used for color
+ float ratio = _linarr[hitidx] / max_val;
+ float fixvel = _fixvel[hitidx];
+
+ // Make sure the ratio doesn't exceed the 0.0-1.0 interval
+ if (ratio < 0.01f)
+ ratio = 0.01f;
+ if (ratio > 0.99f)
+ ratio = 0.99f;
+
+ // Lambertian shading parameters
+ float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
+ float I_d = 40.0f; // Light intensity
+ float k_d = 5.0f; // Diffuse coefficient
+
+ // Ambient shading
+ float k_a = 10.0f;
+ float I_a = 5.0f;
+
+ float redv = red(ratio);
+ float greenv = green(ratio);
+ float bluev = blue(ratio);
+
+ // Make particle dark grey if the horizontal velocity is fixed
+ if (fixvel > 0.f) {
+ redv = 0.5;
+ greenv = 0.5;
+ bluev = 0.5;
+ }
+
+ // Write shading model values to pixel color channels
+ _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
+ + k_a * I_a)*redv);
+ _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
+ + k_a * I_a)*greenv);
+ _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
+ + k_a * I_a)*bluev);
+ }
+}
+
+extern "C"
+__host__ void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
+ unsigned int pixels, Inttype np)
+{
+ // Image dimensions in world space (l, r, b, t)
+ float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.5f*imgw*hw_ratio);
+
+ // The view vector
+ float4 view = eye - lookat;
+
+ // Construct the camera view orthonormal base
+ //float4 up = make_float4(0.0f, 1.0f, 0.0f, 0.0f); // Pointing upward along +y
+ float4 up = make_float4(0.0f, 0.0f, 1.0f, 0.0f); // Pointing upward along +z
+ float4 w = -view/length(view); // w: Pointing backwards
+ float4 u = make_float4(cross(make_float3(up.x, up.y, up.z),
+ make_float3(w.x, w.y, w.z)), 0.0f)
+ / length(cross(make_float3(up.x, up.y, up.z), make_float3(w.x, w.y, w.z)));
+ float4 v = make_float4(cross(make_float3(w.x, w.y, w.z), make_float3(u.x, u.y, u.z)), 0.0f);
+
+ // Focal length 20% of eye vector length
+ float d = length(view)*0.8f;
+
+ // Light direction (points towards light source)
+ float4 light = normalize(-1.0f*eye*make_float4(1.0f, 0.2f, 0.6f, 0.0f));
+
+ std::cout << " Transfering camera values to constant memory\n";
+
+ cudaMemcpyToSymbol("const_u", &u, sizeof(u));
+ cudaMemcpyToSymbol("const_v", &v, sizeof(v));
+ cudaMemcpyToSymbol("const_w", &w, sizeof(w));
+ cudaMemcpyToSymbol("const_eye", &eye, sizeof(eye));
+ cudaMemcpyToSymbol("const_imgplane", &imgplane, sizeof(imgplane));
+ cudaMemcpyToSymbol("const_d", &d, sizeof(d));
+ cudaMemcpyToSymbol("const_light", &light, sizeof(light));
+ cudaMemcpyToSymbol("const_pixels", &pixels, sizeof(pixels));
+ cudaMemcpyToSymbol("const_np", &np, sizeof(np));
+}
+
+// Check for CUDA errors
+extern "C"
+__host__ void checkForCudaErrors(const char* checkpoint_description)
+{
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess) {
+ std::cout << "\nCuda error detected, checkpoint: " << checkpoint_description
+ << "\nError string: " << cudaGetErrorString(err) << "\n";
+ exit(EXIT_FAILURE);
+ }
+}
+
+
+// Wrapper for the rt kernel
+extern "C"
+__host__ int rt(float4* p, Inttype np,
+ rgb* img, unsigned int width, unsigned int height,
+ f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
+ int visualize, float max_val,
+ float* fixvel, float* pres, float* es_dot, float* es, float* vel)
+{
+ using std::cout;
+
+ cout << "Initializing CUDA:\n";
+
+ // Initialize GPU timestamp recorders
+ float t1, t2;
+ cudaEvent_t t1_go, t2_go, t1_stop, t2_stop;
+ cudaEventCreate(&t1_go);
+ cudaEventCreate(&t2_go);
+ cudaEventCreate(&t2_stop);
+ cudaEventCreate(&t1_stop);
+
+ // Start timer 1
+ cudaEventRecord(t1_go, 0);
+
+ // Allocate memory
+ cout << " Allocating device memory\n";
+ static float4 *_p; // Particle positions (x,y,z) and radius (w)
+ static float *_fixvel; // Indicates whether a particle has a fixed horizontal velocity
+ static float *_linarr; // Array for linear values to color the particles after
+ static unsigned char *_img; // RGBw values in image
+ static float4 *_ray_origo; // Ray origo (x,y,z)
+ static float4 *_ray_direction; // Ray direction (x,y,z)
+ cudaMalloc((void**)&_p, np*sizeof(float4));
+ cudaMalloc((void**)&_fixvel, np*sizeof(float));
+ cudaMalloc((void**)&_linarr, np*sizeof(float)); // 0 size if visualize = 0;
+ cudaMalloc((void**)&_img, width*height*4*sizeof(unsigned char));
+ cudaMalloc((void**)&_ray_origo, width*height*sizeof(float4));
+ cudaMalloc((void**)&_ray_direction, width*height*sizeof(float4));
+
+ // Transfer particle data
+ cout << " Transfering particle data: host -> device\n";
+ cudaMemcpy(_p, p, np*sizeof(float4), cudaMemcpyHostToDevice);
+ cudaMemcpy(_fixvel, fixvel, np*sizeof(float), cudaMemcpyHostToDevice);
+ if (visualize == 1 || visualize == 4)
+ cudaMemcpy(_linarr, pres, np*sizeof(float), cudaMemcpyHostToDevice);
+ if (visualize == 2)
+ cudaMemcpy(_linarr, es_dot, np*sizeof(float), cudaMemcpyHostToDevice);
+ if (visualize == 3)
+ cudaMemcpy(_linarr, es, np*sizeof(float), cudaMemcpyHostToDevice);
+ if (visualize == 4)
+ cudaMemcpy(_linarr, vel, np*sizeof(float), cudaMemcpyHostToDevice);
+
+ // Check for errors after memory allocation
+ checkForCudaErrors("CUDA error after memory allocation");
+
+ // Arrange thread/block structure
+ unsigned int pixels = width*height;
+ float hw_ratio = (float)height/(float)width;
+ const unsigned int threadsPerBlock = 256;
+ const unsigned int blocksPerGrid = iDivUp(pixels, threadsPerBlock);
+
+ // Start timer 2
+ cudaEventRecord(t2_go, 0);
+
+ // Initialize image to background color
+ imageInit<<< blocksPerGrid, threadsPerBlock >>>(_img, pixels);
+
+ // Initialize camera
+ cameraInit(make_float4(eye.x, eye.y, eye.z, 0.0f),
+ make_float4(lookat.x, lookat.y, lookat.z, 0.0f),
+ imgw, hw_ratio, pixels, np);
+ checkForCudaErrors("CUDA error after cameraInit");
+
+ // Construct rays for perspective projection
+ rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
+ _ray_origo, _ray_direction,
+ make_float4(eye.x, eye.y, eye.z, 0.0f),
+ width, height);
+
+ cudaThreadSynchronize();
+
+ // Find closest intersection between rays and spheres
+ if (visualize == 1) { // Visualize pressure
+ cout << " Pressure color map range: [0, " << max_val << "] Pa\n";
+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
+ _ray_origo, _ray_direction,
+ _p, _fixvel, _linarr, max_val, _img);
+ } else if (visualize == 2) { // es_dot visualization
+ cout << " Shear heat production rate color map range: [0, " << max_val << "] W\n";
+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
+ _ray_origo, _ray_direction,
+ _p, _fixvel, _linarr, max_val, _img);
+ } else if (visualize == 3) { // es visualization
+ cout << " Total shear heat color map range: [0, " << max_val << "] J\n";
+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
+ _ray_origo, _ray_direction,
+ _p, _fixvel, _linarr, max_val, _img);
+ } else if (visualize == 4) { // velocity visualization
+ cout << " Velocity color map range: [0, " << max_val << "] m/s\n";
+ rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
+ _ray_origo, _ray_direction,
+ _p, _fixvel, _linarr, max_val, _img);
+ } else { // Normal visualization
+ rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
+ _ray_origo, _ray_direction,
+ _p, _img);
+ }
+
+ // Make sure all threads are done before continuing CPU control sequence
+ cudaThreadSynchronize();
+
+ // Check for errors
+ checkForCudaErrors("CUDA error after kernel execution");
+
+ // Stop timer 2
+ cudaEventRecord(t2_stop, 0);
+ cudaEventSynchronize(t2_stop);
+
+ // Transfer image data from device to host
+ cout << " Transfering image data: device -> host\n";
+ cudaMemcpy(img, _img, width*height*4*sizeof(unsigned char), cudaMemcpyDeviceToHost);
+
+ // Free dynamically allocated device memory
+ cudaFree(_p);
+ cudaFree(_fixvel);
+ cudaFree(_linarr);
+ cudaFree(_img);
+ cudaFree(_ray_origo);
+ cudaFree(_ray_direction);
+
+ // Stop timer 1
+ cudaEventRecord(t1_stop, 0);
+ cudaEventSynchronize(t1_stop);
+
+ // Calculate time spent in t1 and t2
+ cudaEventElapsedTime(&t1, t1_go, t1_stop);
+ cudaEventElapsedTime(&t2, t2_go, t2_stop);
+
+ // Report time spent
+ cout << " Time spent on entire GPU routine: "
+ << t1 << " ms\n";
+ cout << " - Kernels: " << t2 << " ms\n"
+ << " - Memory alloc. and transfer: " << t1-t2 << " ms\n";
+
+ // Return successfully
+ return 0;
+}
DIR diff --git a/raytracer/rt_GPU_init_pres.sh b/raytracer/rt_GPU_init_pres.sh
t@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Pass max. pressure as command line argument
+
+XRES=400
+YRES=1600
+
+WORKHORSE=GPU
+
+echo "# Rendering PPM images"
+for F in ../output/*init*.bin
+do
+ BASE=`basename $F`
+ OUTFILE=$BASE.ppm.jpg
+ if [ -e ../img_out/$OUTFILE ]
+ then
+ echo "$OUTFILE exists." > /dev/null
+ else
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
+ if [ $? -ne 0 ] ; then
+ echo "Error rendering $F, trying again..."
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
+ fi
+ fi
+done
+
+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
+for F in ../img_out/*.ppm
+do
+ (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
+done
+
+sleep 5
+echo "# Removing temporary PPM files"
+rm ../img_out/*.ppm
+
DIR diff --git a/raytracer/rt_GPU_pres.sh b/raytracer/rt_GPU_pres.sh
t@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Pass max. pressure as command line argument
+
+XRES=800
+YRES=800
+
+WORKHORSE=GPU
+
+echo "# Rendering PPM images"
+for F in ../output/*.bin
+do
+ BASE=`basename $F`
+ OUTFILE=$BASE.ppm.jpg
+ if [ -e ../img_out/$OUTFILE ]
+ then
+ echo "$OUTFILE exists." > /dev/null
+ else
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
+ if [ $? -ne 0 ] ; then
+ echo "Error rendering $F, trying again..."
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
+ fi
+ fi
+done
+
+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
+for F in ../img_out/*.ppm
+do
+ (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
+done
+
+sleep 5
+echo "# Removing temporary PPM files"
+rm ../img_out/*.ppm
+
DIR diff --git a/raytracer/rt_GPU_tall_pres.sh b/raytracer/rt_GPU_tall_pres.sh
t@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Pass max. pressure as command line argument
+
+XRES=800
+YRES=1200
+
+WORKHORSE=GPU
+
+echo "# Rendering PPM images"
+for F in ../output/*.bin
+do
+ BASE=`basename $F`
+ OUTFILE=$BASE.ppm.jpg
+ if [ -e ../img_out/$OUTFILE ]
+ then
+ echo "$OUTFILE exists." > /dev/null
+ else
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
+ if [ $? -ne 0 ] ; then
+ echo "Error rendering $F, trying again..."
+ ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
+ fi
+ fi
+done
+
+echo "# Converting PPM files to JPEG using ImageMagick in parallel"
+for F in ../img_out/*.ppm
+do
+ (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
+done
+
+sleep 5
+echo "# Removing temporary PPM files"
+rm ../img_out/*.ppm
+
DIR diff --git a/src/cohesion.cuh b/src/cohesion.cuh
t@@ -0,0 +1,126 @@
+#ifndef COHESION_CUH_
+#define COHESION_CUH_
+
+// cohesion.cuh
+// Functions governing attractive forces between contacts
+
+
+// Linear-elastic bond: Attractive force with normal- and shear components
+// acting upon particle A in a bonded particle pair
+__device__ void bondLinear(Float3* N, Float3* T, Float* es_dot, Float* p,
+ unsigned int idx_a, unsigned int idx_b,
+ Float4* dev_x_sorted, Float4* dev_vel_sorted,
+ Float4* dev_angvel_sorted,
+ Float radius_a, Float radius_b,
+ Float3 x_ab, Float x_ab_length,
+ Float delta_ab)
+{
+
+ // If particles are not overlapping, apply bond force
+ if (delta_ab > 0.0f) {
+
+ // Allocate variables and fetch missing time=t values for particle A and B
+ Float4 vel_a = dev_vel_sorted[idx_a];
+ Float4 vel_b = dev_vel_sorted[idx_b];
+ Float4 angvel4_a = dev_angvel_sorted[idx_a];
+ Float4 angvel4_b = dev_angvel_sorted[idx_b];
+
+ // Convert to Float3's
+ Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
+ Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
+
+ // Normal vector of contact
+ Float3 n_ab = x_ab/x_ab_length;
+
+ // Relative contact interface velocity, w/o rolling
+ Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
+
+ // Relative contact interface velocity of particle surfaces at
+ // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
+ Float3 vel_ab = vel_ab_linear
+ + radius_a * cross(n_ab, angvel_a)
+ + radius_b * cross(n_ab, angvel_b);
+
+ // Relative contact interface rolling velocity
+ //Float3 angvel_ab = angvel_a - angvel_b;
+ //Float angvel_ab_length = length(angvel_ab);
+
+ // Normal component of the relative contact interface velocity
+ //Float vel_n_ab = dot(vel_ab_linear, n_ab);
+
+ // Tangential component of the relative contact interface velocity
+ // Hinrichsen and Wolf 2004, eq. 13.9
+ Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
+ //Float vel_t_ab_length = length(vel_t_ab);
+
+ Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+ Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ // Mean radius
+ Float R_bar = (radius_a + radius_b)/2.0f;
+
+ // Normal force component: Elastic
+ f_n = devC_k_n * delta_ab * n_ab;
+
+ if (length(vel_t_ab) > 0.f) {
+ // Shear force component: Viscous
+ f_t = -1.0f * devC_gamma_s * vel_t_ab;
+
+ // Shear friction production rate [W]
+ //*es_dot += -dot(vel_t_ab, f_t);
+ }
+
+ // Add force components from this bond to total force for particle
+ *N += f_n + f_t;
+ *T += -R_bar * cross(n_ab, f_t);
+
+ // Pressure excerted onto the particle from this bond
+ *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
+
+ }
+} // End of bondLinear()
+
+
+// Capillary cohesion after Richefeu et al. (2006)
+__device__ void capillaryCohesion_exp(Float3* N, Float radius_a,
+ Float radius_b, Float delta_ab,
+ Float3 x_ab, Float x_ab_length,
+ Float kappa)
+{
+
+ // Normal vector
+ Float3 n_ab = x_ab/x_ab_length;
+
+ Float3 f_c;
+ Float lambda, R_geo, R_har, r, h;
+
+ // Determine the ratio; r = max{Ri/Rj;Rj/Ri}
+ if ((radius_a/radius_b) > (radius_b/radius_a))
+ r = radius_a/radius_b;
+ else
+ r = radius_b/radius_a;
+
+ // Exponential decay function
+ h = -sqrtf(r);
+
+ // The harmonic mean
+ R_har = (2.0f * radius_a * radius_b) / (radius_a + radius_b);
+
+ // The geometrical mean
+ R_geo = sqrtf(radius_a * radius_b);
+
+ // The exponential falloff of the capillary force with distance
+ lambda = 0.9f * h * sqrtf(devC_V_b/R_har);
+
+ // Calculate cohesional force
+ f_c = -kappa * R_geo * expf(-delta_ab/lambda) * n_ab;
+
+ // Add force components from this collision to total force for particle
+ *N += f_c;
+
+} // End of capillaryCohesion_exp
+
+
+#endif
DIR diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
t@@ -0,0 +1,411 @@
+#ifndef CONTACTMODELS_CUH_
+#define CONTACTMODELS_CUH_
+
+// contactmodels.cuh
+// Functions governing repulsive forces between contacts
+
+
+// Linear viscoelastic contact model for particle-wall interactions
+// with tangential friction and rolling resistance
+__device__ Float contactLinear_wall(Float3* N, Float3* T, Float* es_dot, Float* p,
+ unsigned int idx_a, Float radius_a,
+ Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
+ Float3 n, Float delta, Float wvel)
+{
+ // Fetch particle velocities from global memory
+ Float4 linvel_tmp = dev_vel_sorted[idx_a];
+ Float4 angvel_tmp = dev_angvel_sorted[idx_a];
+
+ // Convert velocities to three-component vectors
+ Float3 linvel = MAKE_FLOAT3(linvel_tmp.x,
+ linvel_tmp.y,
+ linvel_tmp.z);
+ Float3 angvel = MAKE_FLOAT3(angvel_tmp.x,
+ angvel_tmp.y,
+ angvel_tmp.z);
+
+ // Store the length of the angular velocity for later use
+ Float angvel_length = length(angvel);
+
+ // Contact velocity is the sum of the linear and
+ // rotational components
+ Float3 vel = linvel + radius_a * cross(n, angvel) + wvel;
+
+ // Normal component of the contact velocity
+ Float vel_n = dot(vel, n);
+
+ // The tangential velocity is the contact velocity
+ // with the normal component subtracted
+ Float3 vel_t = vel - n * (dot(vel, n));
+ Float vel_t_length = length(vel_t);
+
+ // Calculate elastic normal component
+ //Float3 f_n = -devC_k_n * delta * n;
+
+ // Normal force component: Elastic - viscous damping
+ Float3 f_n = (-devC_k_n * delta - devC_gamma_wn * vel_n) * n;
+
+ // Make sure the viscous damping doesn't exceed the elastic component,
+ // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
+ if (dot(f_n, n) < 0.0f)
+ f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ Float f_n_length = length(f_n); // Save length for later use
+
+ // Initialize vectors
+ Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+ Float3 T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ // Check that the tangential velocity is high enough to avoid
+ // divide by zero (producing a NaN)
+ if (vel_t_length > 0.f) {
+
+ Float f_t_visc = devC_gamma_ws * vel_t_length; // Tangential force by viscous model
+ Float f_t_limit = devC_mu_s * f_n_length; // Max. friction
+
+ // If the shear force component exceeds the friction,
+ // the particle slips and energy is dissipated
+ if (f_t_visc < f_t_limit) {
+ f_t = -1.0f * f_t_visc * vel_t/vel_t_length;
+
+ } else { // Dynamic friction, friction failure
+ f_t = -1.0f * f_t_limit * vel_t/vel_t_length;
+
+ // Shear energy production rate [W]
+ //*es_dot += -dot(vel_t, f_t);
+ }
+ }
+
+/* if (angvel_length > 0.f) {
+ // Apply rolling resistance (Zhou et al. 1999)
+ //T_res = -angvel_a/angvel_length * devC_mu_r * radius_a * f_n_length;
+
+ // New rolling resistance model
+ T_res = -1.0f * fmin(devC_gamma_r * radius_a * angvel_length,
+ devC_mu_r * radius_a * f_n_length)
+ * angvel_a/angvel_length;
+ }*/
+
+ // Total force from wall
+ *N += f_n + f_t;
+
+ // Total torque from wall
+ *T += -radius_a * cross(n, f_t) + T_res;
+
+ // Pressure excerted onto particle from this contact
+ *p += f_n_length / (4.0f * PI * radius_a*radius_a);
+
+ // Return force excerted onto the wall
+ //return -dot(*N, n);
+ return dot(f_n, n);
+}
+
+
+// Linear vicoelastic contact model for particle-particle interactions
+// with tangential friction and rolling resistance
+__device__ void contactLinearViscous(Float3* N, Float3* T, Float* es_dot, Float* p,
+ unsigned int idx_a, unsigned int idx_b,
+ Float4* dev_vel_sorted,
+ Float4* dev_angvel_sorted,
+ Float radius_a, Float radius_b,
+ Float3 x_ab, Float x_ab_length,
+ Float delta_ab, Float kappa)
+{
+
+ // Allocate variables and fetch missing time=t values for particle A and B
+ Float4 vel_a = dev_vel_sorted[idx_a];
+ Float4 vel_b = dev_vel_sorted[idx_b];
+ Float4 angvel4_a = dev_angvel_sorted[idx_a];
+ Float4 angvel4_b = dev_angvel_sorted[idx_b];
+
+ // Convert to Float3's
+ Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
+ Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
+
+ // Force between grain pair decomposed into normal- and tangential part
+ Float3 f_n, f_t, f_c, T_res;
+
+ // Normal vector of contact
+ Float3 n_ab = x_ab/x_ab_length;
+
+ // Relative contact interface velocity, w/o rolling
+ Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
+
+ // Relative contact interface velocity of particle surfaces at
+ // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
+ Float3 vel_ab = vel_ab_linear
+ + radius_a * cross(n_ab, angvel_a)
+ + radius_b * cross(n_ab, angvel_b);
+
+ // Relative contact interface rolling velocity
+ Float3 angvel_ab = angvel_a - angvel_b;
+ Float angvel_ab_length = length(angvel_ab);
+
+ // Normal component of the relative contact interface velocity
+ Float vel_n_ab = dot(vel_ab_linear, n_ab);
+
+ // Tangential component of the relative contact interface velocity
+ // Hinrichsen and Wolf 2004, eq. 13.9
+ Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
+ Float vel_t_ab_length = length(vel_t_ab);
+
+ // Compute the normal stiffness of the contact
+ //Float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
+
+ // Calculate rolling radius
+ Float R_bar = (radius_a + radius_b) / 2.0f;
+
+ // Normal force component: linear-elastic approximation (Augier 2009, eq. 3)
+ // with velocity dependant damping
+ // Damping coefficient: alpha = 0.8
+ //f_n = (-k_n_ab * delta_ab + 2.0f * 0.8f * sqrtf(m_eff*k_n_ab) * vel_ab) * n_ab;
+
+ // Linear spring for normal component (Renzo 2004, eq. 35)
+ // Dissipation due to plastic deformation is modelled by using a different
+ // unloading spring constant (Walton and Braun 1986)
+ // Here the factor in the second term determines the relative strength of the
+ // unloading spring relative to the loading spring.
+ /* if (vel_n_ab > 0.0f) { // Loading
+ f_n = (-k_n_ab * delta_ab) * n_ab;
+ } else { // Unloading
+ f_n = (-k_n_ab * 0.90f * delta_ab) * n_ab;
+ } // f_n is OK! */
+
+ // Normal force component: Elastic
+ //f_n = -k_n_ab * delta_ab * n_ab;
+
+ // Normal force component: Elastic - viscous damping
+ f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
+
+ // Make sure the viscous damping doesn't exceed the elastic component,
+ // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
+ if (dot(f_n, n_ab) < 0.0f)
+ f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ Float f_n_length = length(f_n);
+
+ // Add max. capillary force
+ f_c = -kappa * sqrtf(radius_a * radius_b) * n_ab;
+
+ // Initialize force vectors to zero
+ f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+ T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ // Shear force component: Nonlinear relation
+ // Coulomb's law of friction limits the tangential force to less or equal
+ // to the normal force
+ if (vel_t_ab_length > 0.f) {
+
+ // Tangential force by viscous model
+ Float f_t_visc = devC_gamma_s * vel_t_ab_length;
+
+ // Determine max. friction
+ Float f_t_limit;
+ if (vel_t_ab_length > 0.001f) { // Dynamic
+ f_t_limit = devC_mu_d * length(f_n-f_c);
+ } else { // Static
+ f_t_limit = devC_mu_s * length(f_n-f_c);
+ }
+
+ // If the shear force component exceeds the friction,
+ // the particle slips and energy is dissipated
+ if (f_t_visc < f_t_limit) { // Static
+ f_t = -1.0f * f_t_visc * vel_t_ab/vel_t_ab_length;
+
+ } else { // Dynamic, friction failure
+ f_t = -1.0f * f_t_limit * vel_t_ab/vel_t_ab_length;
+
+ // Shear friction production rate [W]
+ //*es_dot += -dot(vel_t_ab, f_t);
+ }
+ }
+
+/* if (angvel_ab_length > 0.f) {
+ // Apply rolling resistance (Zhou et al. 1999)
+ //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
+
+ // New rolling resistance model
+ T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
+ devC_mu_r * R_bar * f_n_length)
+ * angvel_ab/angvel_ab_length;
+ }
+*/
+
+ // Add force components from this collision to total force for particle
+ *N += f_n + f_t + f_c;
+ *T += -R_bar * cross(n_ab, f_t) + T_res;
+
+ // Pressure excerted onto the particle from this contact
+ *p += f_n_length / (4.0f * PI * radius_a*radius_a);
+
+} // End of contactLinearViscous()
+
+
+// Linear elastic contact model for particle-particle interactions
+__device__ void contactLinear(Float3* N, Float3* T,
+ Float* es_dot, Float* p,
+ unsigned int idx_a_orig,
+ unsigned int idx_b_orig,
+ Float4 vel_a,
+ Float4* dev_vel,
+ Float3 angvel_a,
+ Float4* dev_angvel,
+ Float radius_a, Float radius_b,
+ Float3 x_ab, Float x_ab_length,
+ Float delta_ab, Float4* dev_delta_t,
+ unsigned int mempos)
+{
+
+ // Allocate variables and fetch missing time=t values for particle A and B
+ Float4 vel_b = dev_vel[idx_b_orig];
+ Float4 angvel4_b = dev_angvel[idx_b_orig];
+
+ // Fetch previous sum of shear displacement for the contact pair
+ Float4 delta_t0_4 = dev_delta_t[mempos];
+
+ Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x,
+ delta_t0_4.y,
+ delta_t0_4.z);
+
+ // Convert to Float3
+ Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
+
+ // Force between grain pair decomposed into normal- and tangential part
+ Float3 f_n, f_t, f_c, T_res;
+
+ // Normal vector of contact
+ Float3 n_ab = x_ab/x_ab_length;
+
+ // Relative contact interface velocity, w/o rolling
+ Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
+
+ // Relative contact interface velocity of particle surfaces at
+ // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
+ Float3 vel_ab = vel_ab_linear
+ + radius_a * cross(n_ab, angvel_a)
+ + radius_b * cross(n_ab, angvel_b);
+
+ // Relative contact interface rolling velocity
+ Float3 angvel_ab = angvel_a - angvel_b;
+ Float angvel_ab_length = length(angvel_ab);
+
+ // Normal component of the relative contact interface velocity
+ Float vel_n_ab = dot(vel_ab_linear, n_ab);
+
+ // Tangential component of the relative contact interface velocity
+ // Hinrichsen and Wolf 2004, eq. 13.9
+ Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
+ Float vel_t_ab_length = length(vel_t_ab);
+
+ // Correct tangential displacement vector, which is
+ // necessary if the tangential plane rotated
+ Float3 delta_t0 = delta_t0_uncor - (n_ab * dot(delta_t0_uncor, n_ab));
+ Float delta_t0_length = length(delta_t0);
+
+ // New tangential displacement vector
+ Float3 delta_t;
+
+ // Compute the normal stiffness of the contact
+ //Float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
+
+ // Calculate rolling radius
+ Float R_bar = (radius_a + radius_b) / 2.0f;
+
+ // Normal force component: Elastic
+ //f_n = -devC_k_n * delta_ab * n_ab;
+
+ // Normal force component: Elastic - viscous damping
+ f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
+
+ // Make sure the viscous damping doesn't exceed the elastic component,
+ // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
+ if (dot(f_n, n_ab) < 0.0f)
+ f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ Float f_n_length = length(f_n);
+
+ // Add max. capillary force
+ f_c = -devC_kappa * sqrtf(radius_a * radius_b) * n_ab;
+
+ // Initialize force vectors to zero
+ f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+ T_res = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ // Apply a tangential force if the previous tangential displacement
+ // is non-zero, or the current sliding velocity is non-zero.
+ if (delta_t0_length > 0.f || vel_t_ab_length > 0.f) {
+
+ // Shear force: Visco-Elastic, limited by Coulomb friction
+ Float3 f_t_elast = -1.0f * devC_k_s * delta_t0;
+ Float3 f_t_visc = -1.0f * devC_gamma_s * vel_t_ab;
+
+ Float f_t_limit;
+
+ if (vel_t_ab_length > 0.001f) { // Dynamic friciton
+ f_t_limit = devC_mu_d * length(f_n-f_c);
+ } else { // Static friction
+ f_t_limit = devC_mu_s * length(f_n-f_c);
+ }
+
+ // Tangential force before friction limit correction
+ f_t = f_t_elast + f_t_visc;
+ Float f_t_length = length(f_t);
+
+ // If failure criterion is not met, contact is viscous-linear elastic.
+ // If failure criterion is met, contact force is limited,
+ // resulting in a slip and energy dissipation
+ if (f_t_length > f_t_limit) { // Dynamic case
+
+ // Frictional force is reduced to equal the limit
+ f_t *= f_t_limit/f_t_length;
+
+ // A slip event zeros the displacement vector
+ //delta_t = make_Float3(0.0f, 0.0f, 0.0f);
+
+ // In a slip event, the tangential spring is adjusted to a
+ // length which is consistent with Coulomb's equation
+ // (Hinrichsen and Wolf, 2004)
+ delta_t = -1.0f/devC_k_s * f_t + devC_gamma_s * vel_t_ab;
+
+ // Shear friction heat production rate:
+ // The energy lost from the tangential spring is dissipated as heat
+ //*es_dot += -dot(vel_t_ab, f_t);
+ *es_dot += length(delta_t0 - delta_t) * devC_k_s / devC_dt; // Seen in EsyS-Particle
+
+ } else { // Static case
+
+ // No correction of f_t is required
+
+ // Add tangential displacement to total tangential displacement
+ delta_t = delta_t0 + vel_t_ab * devC_dt;
+ }
+ }
+
+ if (angvel_ab_length > 0.f) {
+ // Apply rolling resistance (Zhou et al. 1999)
+ //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
+
+ // New rolling resistance model
+ T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
+ devC_mu_r * R_bar * f_n_length)
+ * angvel_ab/angvel_ab_length;
+ }
+
+ // Add force components from this collision to total force for particle
+ *N += f_n + f_t + f_c;
+ *T += -R_bar * cross(n_ab, f_t) + T_res;
+
+ // Pressure excerted onto the particle from this contact
+ *p += f_n_length / (4.0f * PI * radius_a*radius_a);
+
+ // Store sum of tangential displacements
+ dev_delta_t[mempos] = MAKE_FLOAT4(delta_t.x, delta_t.y, delta_t.z, 0.0f);
+
+} // End of contactLinear()
+
+
+#endif
DIR diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
t@@ -0,0 +1,625 @@
+#ifndef CONTACTSEARCH_CUH_
+#define CONTACTSEARCH_CUH_
+
+// contactsearch.cuh
+// Functions for identifying contacts and processing boundaries
+
+
+// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
+// Kernel executed on device, and callable from device only.
+__device__ void overlapsInCell(int3 targetCell,
+ unsigned int idx_a,
+ Float4 x_a, Float radius_a,
+ Float3* N, Float3* T,
+ Float* es_dot, Float* p,
+ Float4* dev_x_sorted,
+ Float* dev_radius_sorted,
+ Float4* dev_vel_sorted,
+ Float4* dev_angvel_sorted,
+ unsigned int* dev_cellStart,
+ unsigned int* dev_cellEnd,
+ Float4* dev_w_nx,
+ Float4* dev_w_mvfd)
+ //uint4 bonds)
+{
+
+ // Variable containing modifier for interparticle
+ // vector, if it crosses a periodic boundary
+ Float3 distmod = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ // Check whether x- and y boundaries are to be treated as periodic
+ // 1: x- and y boundaries periodic
+ // 2: x boundaries periodic
+ if (devC_periodic == 1) {
+
+ // Periodic x-boundary
+ if (targetCell.x < 0) {
+ targetCell.x = devC_num[0] - 1;
+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+ if (targetCell.x == devC_num[0]) {
+ targetCell.x = 0;
+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+
+ // Periodic y-boundary
+ if (targetCell.y < 0) {
+ targetCell.y = devC_num[1] - 1;
+ distmod += MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
+ }
+ if (targetCell.y == devC_num[1]) {
+ targetCell.y = 0;
+ distmod -= MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
+ }
+
+ } else if (devC_periodic == 2) {
+
+ // Periodic x-boundary
+ if (targetCell.x < 0) {
+ targetCell.x = devC_num[0] - 1;
+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+ if (targetCell.x == devC_num[0]) {
+ targetCell.x = 0;
+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+
+ // Hande out-of grid cases on y-axis
+ if (targetCell.y < 0 || targetCell.y == devC_num[1])
+ return;
+
+ } else {
+
+ // Hande out-of grid cases on x- and y-axes
+ if (targetCell.x < 0 || targetCell.x == devC_num[0])
+ return;
+ if (targetCell.y < 0 || targetCell.y == devC_num[1])
+ return;
+ }
+
+ // Handle out-of-grid cases on z-axis
+ if (targetCell.z < 0 || targetCell.z == devC_num[2])
+ return;
+
+
+ //// Check and process particle-particle collisions
+
+ // Calculate linear cell ID
+ unsigned int cellID = targetCell.x
+ + __umul24(targetCell.y, devC_num[0])
+ + __umul24(__umul24(devC_num[0],
+ devC_num[1]),
+ targetCell.z);
+
+ // Lowest particle index in cell
+ unsigned int startIdx = dev_cellStart[cellID];
+
+ // Make sure cell is not empty
+ if (startIdx != 0xffffffff) {
+
+ // Highest particle index in cell + 1
+ unsigned int endIdx = dev_cellEnd[cellID];
+
+ // Iterate over cell particles
+ for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
+ if (idx_b != idx_a) { // Do not collide particle with itself
+
+ // Fetch position and velocity of particle B.
+ Float4 x_b = dev_x_sorted[idx_b];
+ Float radius_b = dev_radius_sorted[idx_b];
+ Float kappa = devC_kappa;
+
+ // Distance between particle centers (Float4 -> Float3)
+ Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
+ x_a.y - x_b.y,
+ x_a.z - x_b.z);
+
+ // Adjust interparticle vector if periodic boundary/boundaries
+ // are crossed
+ x_ab += distmod;
+
+ Float x_ab_length = length(x_ab);
+
+ // Distance between particle perimeters
+ Float delta_ab = x_ab_length - (radius_a + radius_b);
+
+ // Check for particle overlap
+ if (delta_ab < 0.0f) {
+ contactLinearViscous(N, T, es_dot, p,
+ idx_a, idx_b,
+ dev_vel_sorted,
+ dev_angvel_sorted,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_ab, kappa);
+ } else if (delta_ab < devC_db) {
+ // Check wether particle distance satisfies the capillary bond distance
+ capillaryCohesion_exp(N, radius_a, radius_b, delta_ab,
+ x_ab, x_ab_length, kappa);
+ }
+
+ // Check wether particles are bonded together
+ /*if (bonds.x == idx_b || bonds.y == idx_b ||
+ bonds.z == idx_b || bonds.w == idx_b) {
+ bondLinear(N, T, es_dot, p,
+ idx_a, idx_b,
+ dev_x_sorted, dev_vel_sorted,
+ dev_angvel_sorted,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_ab);
+ }*/
+
+ } // Do not collide particle with itself end
+ } // Iterate over cell particles end
+ } // Check wether cell is empty end
+} // End of overlapsInCell(...)
+
+// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
+// Write the indexes of the overlaps in array contacts[]
+// Kernel executed on device, and callable from device only.
+__device__ void findContactsInCell(int3 targetCell,
+ unsigned int idx_a,
+ Float4 x_a, Float radius_a,
+ Float4* dev_x_sorted,
+ Float* dev_radius_sorted,
+ unsigned int* dev_cellStart,
+ unsigned int* dev_cellEnd,
+ unsigned int* dev_gridParticleIndex,
+ int* nc,
+ unsigned int* dev_contacts,
+ Float4* dev_distmod)
+{
+ // Variable containing modifier for interparticle
+ // vector, if it crosses a periodic boundary
+ Float3 distmod = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ // Check whether x- and y boundaries are to be treated as periodic
+ // 1: x- and y boundaries periodic
+ // 2: x boundaries periodic
+ if (devC_periodic == 1) {
+
+ // Periodic x-boundary
+ if (targetCell.x < 0) {
+ targetCell.x = devC_num[0] - 1;
+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+ if (targetCell.x == devC_num[0]) {
+ targetCell.x = 0;
+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+
+ // Periodic y-boundary
+ if (targetCell.y < 0) {
+ targetCell.y = devC_num[1] - 1;
+ distmod += MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
+ }
+ if (targetCell.y == devC_num[1]) {
+ targetCell.y = 0;
+ distmod -= MAKE_FLOAT3(0.0f, devC_L[1], 0.0f);
+ }
+
+ } else if (devC_periodic == 2) {
+
+ // Periodic x-boundary
+ if (targetCell.x < 0) {
+ targetCell.x = devC_num[0] - 1;
+ distmod += MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+ if (targetCell.x == devC_num[0]) {
+ targetCell.x = 0;
+ distmod -= MAKE_FLOAT3(devC_L[0], 0.0f, 0.0f);
+ }
+
+ // Hande out-of grid cases on y-axis
+ if (targetCell.y < 0 || targetCell.y == devC_num[1])
+ return;
+
+ } else {
+
+ // Hande out-of grid cases on x- and y-axes
+ if (targetCell.x < 0 || targetCell.x == devC_num[0])
+ return;
+ if (targetCell.y < 0 || targetCell.y == devC_num[1])
+ return;
+ }
+
+ // Handle out-of-grid cases on z-axis
+ if (targetCell.z < 0 || targetCell.z == devC_num[2])
+ return;
+
+
+ //// Check and process particle-particle collisions
+
+ // Calculate linear cell ID
+ unsigned int cellID = targetCell.x
+ + __umul24(targetCell.y, devC_num[0])
+ + __umul24(__umul24(devC_num[0],
+ devC_num[1]),
+ targetCell.z);
+
+ // Lowest particle index in cell
+ unsigned int startIdx = dev_cellStart[cellID];
+
+ // Make sure cell is not empty
+ if (startIdx != 0xffffffff) {
+
+ // Highest particle index in cell + 1
+ unsigned int endIdx = dev_cellEnd[cellID];
+
+ // Read the original index of particle A
+ unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
+
+ // Iterate over cell particles
+ for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
+ if (idx_b != idx_a) { // Do not collide particle with itself
+
+ // Fetch position and radius of particle B.
+ Float4 x_b = dev_x_sorted[idx_b];
+ Float radius_b = dev_radius_sorted[idx_b];
+
+ // Read the original index of particle B
+ unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
+
+ __syncthreads();
+
+ // Distance between particle centers (Float4 -> Float3)
+ Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
+ x_a.y - x_b.y,
+ x_a.z - x_b.z);
+
+ // Adjust interparticle vector if periodic boundary/boundaries
+ // are crossed
+ x_ab += distmod;
+
+ Float x_ab_length = length(x_ab);
+
+ // Distance between particle perimeters
+ Float delta_ab = x_ab_length - (radius_a + radius_b);
+
+ // Check for particle overlap
+ if (delta_ab < 0.0f) {
+
+ // If the particle is not yet registered in the contact list,
+ // use the next position in the array
+ int cpos = *nc;
+ unsigned int cidx;
+
+ // Find out, if particle is already registered in contact list
+ for (int i=0; i<devC_nc; ++i) {
+ __syncthreads();
+ cidx = dev_contacts[(unsigned int)(idx_a_orig*devC_nc+i)];
+ if (cidx == idx_b_orig)
+ cpos = i;
+ }
+
+ __syncthreads();
+
+ // Write the particle index to the relevant position,
+ // no matter if it already is there or not (concurrency of write)
+ dev_contacts[(unsigned int)(idx_a_orig*devC_nc+cpos)] = idx_b_orig;
+
+
+ // Write the interparticle vector and radius of particle B
+ //dev_x_ab_r_b[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_Float4(x_ab, radius_b);
+ dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = MAKE_FLOAT4(distmod.x, distmod.y, distmod.z, radius_b);
+
+ // Increment contact counter
+ ++*nc;
+ }
+
+ // Write the inter-particle position vector correction and radius of particle B
+ //dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_Float4(distmod, radius_b);
+
+ // Check wether particles are bonded together
+ /*if (bonds.x == idx_b || bonds.y == idx_b ||
+ bonds.z == idx_b || bonds.w == idx_b) {
+ bondLinear(N, T, es_dot, p,
+ idx_a, idx_b,
+ dev_x_sorted, dev_vel_sorted,
+ dev_angvel_sorted,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_ab);
+ }*/
+
+ } // Do not collide particle with itself end
+ } // Iterate over cell particles end
+ } // Check wether cell is empty end
+} // End of findContactsInCell(...)
+
+
+// For a single particle:
+// Search for neighbors to particle 'idx' inside the 27 closest cells,
+// and save the contact pairs in global memory.
+__global__ void topology(unsigned int* dev_cellStart,
+ unsigned int* dev_cellEnd, // Input: Particles in cell
+ unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
+ Float4* dev_x_sorted, Float* dev_radius_sorted,
+ unsigned int* dev_contacts,
+ Float4* dev_distmod)
+{
+ // Thread index equals index of particle A
+ unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
+ if (idx_a < devC_np) {
+ // Fetch particle data in global read
+ Float4 x_a = dev_x_sorted[idx_a];
+ Float radius_a = dev_radius_sorted[idx_a];
+
+ // Count the number of contacts in this time step
+ int nc = 0;
+
+ // Grid position of host and neighbor cells in uniform, cubic grid
+ int3 gridPos;
+ int3 targetPos;
+
+ // Calculate cell address in grid from position of particle
+ gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
+ gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
+ gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
+
+ // Find overlaps between particle no. idx and all particles
+ // from its own cell + 26 neighbor cells
+ for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
+ targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
+ findContactsInCell(targetPos, idx_a, x_a, radius_a,
+ dev_x_sorted, dev_radius_sorted,
+ dev_cellStart, dev_cellEnd,
+ dev_gridParticleIndex,
+ &nc, dev_contacts, dev_distmod);
+ }
+ }
+ }
+ }
+} // End of topology(...)
+
+
+// For a single particle:
+// Search for neighbors to particle 'idx' inside the 27 closest cells,
+// and compute the resulting normal- and shear force on it.
+// Collide with top- and bottom walls, save resulting force on upper wall.
+// Kernel is executed on device, and is callable from host only.
+__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
+ unsigned int* dev_cellStart,
+ unsigned int* dev_cellEnd,
+ Float4* dev_x, Float* dev_radius,
+ Float4* dev_x_sorted, Float* dev_radius_sorted,
+ Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
+ Float4* dev_vel, Float4* dev_angvel,
+ Float4* dev_force, Float4* dev_torque,
+ Float* dev_es_dot, Float* dev_es, Float* dev_p,
+ Float4* dev_w_nx, Float4* dev_w_mvfd,
+ Float* dev_w_force, //uint4* dev_bonds_sorted,
+ unsigned int* dev_contacts,
+ Float4* dev_distmod,
+ Float4* dev_delta_t)
+{
+ // Thread index equals index of particle A
+ unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
+
+ if (idx_a < devC_np) {
+
+ // Fetch particle data in global read
+ unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
+ Float4 x_a = dev_x_sorted[idx_a];
+ Float radius_a = dev_radius_sorted[idx_a];
+
+ // Fetch wall data in global read
+ Float4 w_up_nx = dev_w_nx[0];
+ Float4 w_up_mvfd = dev_w_mvfd[0];
+
+ // Fetch world dimensions in constant memory read
+ Float3 origo = MAKE_FLOAT3(devC_origo[0],
+ devC_origo[1],
+ devC_origo[2]);
+ Float3 L = MAKE_FLOAT3(devC_L[0],
+ devC_L[1],
+ devC_L[2]);
+
+ // Index of particle which is bonded to particle A.
+ // The index is equal to the particle no (p.np)
+ // if particle A is bond-less.
+ //uint4 bonds = dev_bonds_sorted[idx_a];
+
+ // Initiate shear friction loss rate at 0.0
+ Float es_dot = 0.0f;
+
+ // Initiate pressure on particle at 0.0
+ Float p = 0.0f;
+
+ // Allocate memory for temporal force/torque vector values
+ Float3 N = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+ Float3 T = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
+
+ // Apply linear elastic, frictional contact model to registered contacts
+ if (devC_shearmodel == 2) {
+ unsigned int idx_b_orig, mempos;
+ Float delta_n, x_ab_length, radius_b;
+ Float3 x_ab;
+ Float4 x_b, distmod;
+ Float4 vel_a = dev_vel_sorted[idx_a];
+ Float4 angvel4_a = dev_angvel_sorted[idx_a];
+ Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
+
+ // Loop over all possible contacts, and remove contacts
+ // that no longer are valid (delta_n > 0.0)
+ for (int i = 0; i<(int)devC_nc; ++i) {
+ mempos = (unsigned int)(idx_a_orig * devC_nc + i);
+ __syncthreads();
+ idx_b_orig = dev_contacts[mempos];
+ distmod = dev_distmod[mempos];
+ x_b = dev_x[idx_b_orig];
+ //radius_b = dev_radius[idx_b_orig];
+ radius_b = distmod.w;
+
+ // Inter-particle vector, corrected for periodic boundaries
+ x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x,
+ x_a.y - x_b.y + distmod.y,
+ x_a.z - x_b.z + distmod.z);
+
+ x_ab_length = length(x_ab);
+ delta_n = x_ab_length - (radius_a + radius_b);
+
+
+ if (idx_b_orig != (unsigned int)devC_np) {
+
+ // Process collision if the particles are overlapping
+ if (delta_n < 0.0f) {
+ //cuPrintf("\nProcessing contact, idx_a_orig = %u, idx_b_orig = %u, contact = %d, delta_n = %f\n",
+ // idx_a_orig, idx_b_orig, i, delta_n);
+ contactLinear(&N, &T, &es_dot, &p,
+ idx_a_orig,
+ idx_b_orig,
+ vel_a,
+ dev_vel,
+ angvel_a,
+ dev_angvel,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_n, dev_delta_t,
+ mempos);
+ } else {
+ __syncthreads();
+ // Remove this contact (there is no particle with index=np)
+ dev_contacts[mempos] = devC_np;
+ // Zero sum of shear displacement in this position
+ dev_delta_t[mempos] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ } else {
+ __syncthreads();
+ dev_delta_t[mempos] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ } // Contact loop end
+
+ } else if (devC_shearmodel == 1) {
+
+ int3 gridPos;
+ int3 targetPos;
+
+ // Calculate address in grid from position
+ gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
+ gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
+ gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
+
+ // Find overlaps between particle no. idx and all particles
+ // from its own cell + 26 neighbor cells.
+ // Calculate resulting normal- and shear-force components and
+ // torque for the particle on the base of contactLinearViscous()
+ for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
+ targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
+ overlapsInCell(targetPos, idx_a, x_a, radius_a,
+ &N, &T, &es_dot, &p,
+ dev_x_sorted, dev_radius_sorted,
+ dev_vel_sorted, dev_angvel_sorted,
+ dev_cellStart, dev_cellEnd,
+ dev_w_nx, dev_w_mvfd);
+ }
+ }
+ }
+
+ }
+
+ //// Interact with walls
+ Float delta_w; // Overlap distance
+ Float3 w_n; // Wall surface normal
+ Float w_force = 0.0f; // Force on wall from particle A
+
+ // Upper wall (idx 0)
+ delta_w = w_up_nx.w - (x_a.z + radius_a);
+ w_n = MAKE_FLOAT3(0.0f, 0.0f, -1.0f);
+ if (delta_w < 0.0f) {
+ w_force = contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, w_up_mvfd.y);
+ }
+
+ // Lower wall (force on wall not stored)
+ delta_w = x_a.z - radius_a - origo.z;
+ w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f);
+ if (delta_w < 0.0f) {
+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
+ }
+
+
+ if (devC_periodic == 0) {
+
+ // Left wall
+ delta_w = x_a.x - radius_a - origo.x;
+ w_n = MAKE_FLOAT3(1.0f, 0.0f, 0.0f);
+ if (delta_w < 0.0f) {
+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
+ }
+
+ // Right wall
+ delta_w = L.x - (x_a.x + radius_a);
+ w_n = MAKE_FLOAT3(-1.0f, 0.0f, 0.0f);
+ if (delta_w < 0.0f) {
+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
+ }
+
+ // Front wall
+ delta_w = x_a.y - radius_a - origo.y;
+ w_n = MAKE_FLOAT3(0.0f, 1.0f, 0.0f);
+ if (delta_w < 0.0f) {
+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
+ }
+
+ // Back wall
+ delta_w = L.y - (x_a.y + radius_a);
+ w_n = MAKE_FLOAT3(0.0f, -1.0f, 0.0f);
+ if (delta_w < 0.0f) {
+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
+ }
+
+ } else if (devC_periodic == 2) {
+
+ // Front wall
+ delta_w = x_a.y - radius_a - origo.y;
+ w_n = MAKE_FLOAT3(0.0f, 1.0f, 0.0f);
+ if (delta_w < 0.0f) {
+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
+ }
+
+ // Back wall
+ delta_w = L.y - (x_a.y + radius_a);
+ w_n = MAKE_FLOAT3(0.0f, -1.0f, 0.0f);
+ if (delta_w < 0.0f) {
+ (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
+ dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
+ }
+ }
+
+
+ // Hold threads for coalesced write
+ __syncthreads();
+
+ // Write force to unsorted position
+ unsigned int orig_idx = dev_gridParticleIndex[idx_a];
+ dev_force[orig_idx] = MAKE_FLOAT4(N.x, N.y, N.z, 0.0f);
+ dev_torque[orig_idx] = MAKE_FLOAT4(T.x, T.y, T.z, 0.0f);
+ dev_es_dot[orig_idx] = es_dot;
+ dev_es[orig_idx] += es_dot * devC_dt;
+ dev_p[orig_idx] = p;
+ dev_w_force[orig_idx] = w_force;
+ }
+} // End of interact(...)
+
+
+#endif
DIR diff --git a/src/integration.cuh b/src/integration.cuh
t@@ -0,0 +1,207 @@
+#ifndef INTEGRATION_CUH_
+#define INTEGRATION_CUH_
+
+// integration.cuh
+// Functions responsible for temporal integration
+
+
+// Second order integration scheme based on Taylor expansion of particle kinematics.
+// Kernel executed on device, and callable from host only.
+__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
+ Float4* dev_angvel_sorted, Float* dev_radius_sorted, // Input
+ Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
+ Float4* dev_force, Float4* dev_torque, // Input
+ unsigned int* dev_gridParticleIndex) // Input: Sorted-Unsorted key
+{
+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
+
+ if (idx < devC_np) { // Condition prevents block size error
+
+ // Copy data to temporary arrays to avoid any potential read-after-write,
+ // write-after-read, or write-after-write hazards.
+ unsigned int orig_idx = dev_gridParticleIndex[idx];
+ Float4 force = dev_force[orig_idx];
+ Float4 torque = dev_torque[orig_idx];
+
+ // Initialize acceleration vectors to zero
+ Float4 acc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
+ Float4 angacc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ // Fetch particle position and velocity values from global read
+ Float4 x = dev_x_sorted[idx];
+ Float4 vel = dev_vel_sorted[idx];
+ Float4 angvel = dev_angvel_sorted[idx];
+ Float radius = dev_radius_sorted[idx];
+
+ // Coherent read from constant memory to registers
+ Float dt = devC_dt;
+ Float3 origo = MAKE_FLOAT3(devC_origo[0], devC_origo[1], devC_origo[2]);
+ Float3 L = MAKE_FLOAT3(devC_L[0], devC_L[1], devC_L[2]);
+ Float rho = devC_rho;
+
+ // Particle mass
+ Float m = 4.0f/3.0f * PI * radius*radius*radius * rho;
+
+ // Update linear acceleration of particle
+ acc.x = force.x / m;
+ acc.y = force.y / m;
+ acc.z = force.z / m;
+
+ // Update angular acceleration of particle
+ // (angacc = (total moment)/Intertia, intertia = 2/5*m*r^2)
+ angacc.x = torque.x * 1.0f / (2.0f/5.0f * m * radius*radius);
+ angacc.y = torque.y * 1.0f / (2.0f/5.0f * m * radius*radius);
+ angacc.z = torque.z * 1.0f / (2.0f/5.0f * m * radius*radius);
+
+ // Add gravity
+ acc.x += devC_g[0];
+ acc.y += devC_g[1];
+ acc.z += devC_g[2];
+
+ // Check if particle has a fixed horizontal velocity
+ if (vel.w > 0.0f) {
+
+ // Zero horizontal acceleration and disable
+ // gravity to counteract segregation.
+ // Particles may move in the z-dimension,
+ // to allow for dilation.
+ acc.x = 0.0f;
+ acc.y = 0.0f;
+ acc.z -= devC_g[2];
+
+ // Zero the angular acceleration
+ angacc = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ // Update angular velocity
+ angvel.x += angacc.x * dt;
+ angvel.y += angacc.y * dt;
+ angvel.z += angacc.z * dt;
+
+ // Update linear velocity
+ vel.x += acc.x * dt;
+ vel.y += acc.y * dt;
+ vel.z += acc.z * dt;
+
+ // Update position. First-order Euler's scheme:
+ //x.x += vel.x * dt;
+ //x.y += vel.y * dt;
+ //x.z += vel.z * dt;
+
+ // Update position. Second-order scheme based on Taylor expansion
+ // (greater accuracy than the first-order Euler's scheme)
+ x.x += vel.x * dt + (acc.x * dt*dt)/2.0f;
+ x.y += vel.y * dt + (acc.y * dt*dt)/2.0f;
+ x.z += vel.z * dt + (acc.z * dt*dt)/2.0f;
+
+ // Add x-displacement for this time step to
+ // sum of x-displacements
+ x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
+
+ // Move particle across boundary if it is periodic
+ if (devC_periodic == 1) {
+ if (x.x < origo.x)
+ x.x += L.x;
+ if (x.x > L.x)
+ x.x -= L.x;
+ if (x.y < origo.y)
+ x.y += L.y;
+ if (x.y > L.y)
+ x.y -= L.y;
+ } else if (devC_periodic == 2) {
+ if (x.x < origo.x)
+ x.x += L.x;
+ if (x.x > L.x)
+ x.x -= L.x;
+ }
+
+ // Hold threads for coalesced write
+ __syncthreads();
+
+ // Store data in global memory at original, pre-sort positions
+ dev_angvel[orig_idx] = angvel;
+ dev_vel[orig_idx] = vel;
+ dev_x[orig_idx] = x;
+ }
+} // End of integrate(...)
+
+
+// Reduce wall force contributions from particles to a single value per wall
+__global__ void summation(Float* in, Float *out)
+{
+ __shared__ Float cache[256];
+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ unsigned int cacheIdx = threadIdx.x;
+
+ Float temp = 0.0f;
+ while (idx < devC_np) {
+ temp += in[idx];
+ idx += blockDim.x * gridDim.x;
+ }
+
+ // Set the cache values
+ cache[cacheIdx] = temp;
+
+ __syncthreads();
+
+ // For reductions, threadsPerBlock must be a power of two
+ // because of the following code
+ unsigned int i = blockDim.x/2;
+ while (i != 0) {
+ if (cacheIdx < i)
+ cache[cacheIdx] += cache[cacheIdx + i];
+ __syncthreads();
+ i /= 2;
+ }
+
+ // Write sum for block to global memory
+ if (cacheIdx == 0)
+ out[blockIdx.x] = cache[0];
+}
+
+// Update wall positions
+__global__ void integrateWalls(Float4* dev_w_nx,
+ Float4* dev_w_mvfd,
+ Float* dev_w_force_partial,
+ unsigned int blocksPerGrid)
+{
+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
+
+ if (idx < devC_nw) { // Condition prevents block size error
+
+ // Copy data to temporary arrays to avoid any potential read-after-write,
+ // write-after-read, or write-after-write hazards.
+ Float4 w_nx = dev_w_nx[idx];
+ Float4 w_mvfd = dev_w_mvfd[idx];
+
+ // Find the final sum of forces on wall
+ w_mvfd.z = 0.0f;
+ for (int i=0; i<blocksPerGrid; ++i) {
+ w_mvfd.z += dev_w_force_partial[i];
+ }
+
+ Float dt = devC_dt;
+
+ // Normal load = Deviatoric stress times wall surface area,
+ // directed downwards.
+ Float N = -w_mvfd.w*devC_L[0]*devC_L[1];
+
+ // Calculate resulting acceleration of wall
+ // (Wall mass is stored in w component of position Float4)
+ Float acc = (w_mvfd.z+N)/w_mvfd.x;
+
+ // Update linear velocity
+ w_mvfd.y += acc * dt;
+
+ // Update position. Second-order scheme based on Taylor expansion
+ // (greater accuracy than the first-order Euler's scheme)
+ w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
+
+ // Store data in global memory
+ dev_w_nx[idx] = w_nx;
+ dev_w_mvfd[idx] = w_mvfd;
+ }
+} // End of integrateWalls(...)
+
+
+#endif
DIR diff --git a/src/sorting.cuh b/src/sorting.cuh
t@@ -0,0 +1,127 @@
+// Returns the cellID containing the particle, based cubic grid
+// See Bayraktar et al. 2009
+// Kernel is executed on the device, and is callable from the device only
+__device__ int calcCellID(Float3 x)
+{
+ unsigned int i_x, i_y, i_z;
+
+ // Calculate integral coordinates:
+ i_x = floor((x.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
+ i_y = floor((x.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
+ i_z = floor((x.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
+
+ // Integral coordinates are converted to 1D coordinate:
+ return __umul24(__umul24(i_z, devC_num[1]),
+ devC_num[0])
+ + __umul24(i_y, devC_num[0]) + i_x;
+
+} // End of calcCellID(...)
+
+
+// Calculate hash value for each particle, based on position in grid.
+// Kernel executed on device, and callable from host only.
+__global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID,
+ unsigned int* dev_gridParticleIndex,
+ Float4* dev_x)
+{
+ //unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
+ unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
+
+ if (idx < devC_np) { // Condition prevents block size error
+
+ //volatile Float4 x = dev_x[idx]; // Ensure coalesced read
+ Float4 x = dev_x[idx]; // Ensure coalesced read
+
+ unsigned int cellID = calcCellID(MAKE_FLOAT3(x.x, x.y, x.z));
+
+ // Store values
+ dev_gridParticleCellID[idx] = cellID;
+ dev_gridParticleIndex[idx] = idx;
+
+ }
+} // End of calcParticleCellID(...)
+
+
+// Reorder particle data into sorted order, and find the start and end particle indexes
+// of each cell in the sorted hash array.
+// Kernel executed on device, and callable from host only.
+__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cellEnd,
+ unsigned int* dev_gridParticleCellID,
+ unsigned int* dev_gridParticleIndex,
+ Float4* dev_x, Float4* dev_vel,
+ Float4* dev_angvel, Float* dev_radius,
+ //uint4* dev_bonds,
+ Float4* dev_x_sorted, Float4* dev_vel_sorted,
+ Float4* dev_angvel_sorted, Float* dev_radius_sorted)
+ //uint4* dev_bonds_sorted)
+{
+
+ // Create hash array in shared on-chip memory. The size of the array
+ // (threadsPerBlock + 1) is determined at launch time (extern notation).
+ extern __shared__ unsigned int shared_data[];
+
+ // Thread index in block
+ unsigned int tidx = threadIdx.x;
+
+ // Thread index in grid
+ unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ //unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
+
+ // CellID hash value of particle idx
+ unsigned int cellID;
+
+ // Read cellID data and store it in shared memory (shared_data)
+ if (idx < devC_np) { // Condition prevents block size error
+ cellID = dev_gridParticleCellID[idx];
+
+ // Load hash data into shared memory, allowing access to neighbor particle cellID values
+ shared_data[tidx+1] = cellID;
+
+ if (idx > 0 && tidx == 0) {
+ // First thread in block must load neighbor particle hash
+ shared_data[0] = dev_gridParticleCellID[idx-1];
+ }
+ }
+
+ // Pause completed threads in this block, until all
+ // threads are done loading data into shared memory
+ __syncthreads();
+
+ // Find lowest and highest particle index in each cell
+ if (idx < devC_np) { // Condition prevents block size error
+ // If this particle has a different cell index to the previous particle, it's the first
+ // particle in the cell -> Store the index of this particle in the cell.
+ // The previous particle must be the last particle in the previous cell.
+ if (idx == 0 || cellID != shared_data[tidx]) {
+ dev_cellStart[cellID] = idx;
+ if (idx > 0)
+ dev_cellEnd[shared_data[tidx]] = idx;
+ }
+
+ // Check wether the thread is the last one
+ if (idx == (devC_np - 1))
+ dev_cellEnd[cellID] = idx + 1;
+
+
+ // Use the sorted index to reorder the position and velocity data
+ unsigned int sortedIndex = dev_gridParticleIndex[idx];
+
+ // Fetch from global read
+ Float4 x = dev_x[sortedIndex];
+ Float4 vel = dev_vel[sortedIndex];
+ Float4 angvel = dev_angvel[sortedIndex];
+ Float radius = dev_radius[sortedIndex];
+ //uint4 bonds = dev_bonds[sortedIndex];
+
+ __syncthreads();
+ // Write sorted data to global memory
+ dev_x_sorted[idx] = x;
+ dev_vel_sorted[idx] = vel;
+ dev_angvel_sorted[idx] = angvel;
+ dev_radius_sorted[idx] = radius;
+ //dev_bonds_sorted[idx] = bonds;
+ }
+} // End of reorderArrays(...)
+
+
+
DIR diff --git a/src/vector_arithmetic.h b/src/vector_arithmetic.h
t@@ -0,0 +1,994 @@
+#ifndef VECTOR_ARITHMETIC_H_
+#define VECTOR_ARITHMETIC_H_
+
+#include "cuda_runtime.h"
+#include "datatypes.h"
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#ifndef __CUDACC__
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline Float fminf(Float a, Float b)
+{
+ return a < b ? a : b;
+}
+
+inline Float fmaxf(Float a, Float b)
+{
+ return a > b ? a : b;
+}
+
+inline int max(int a, int b)
+{
+ return a > b ? a : b;
+}
+
+inline int min(int a, int b)
+{
+ return a < b ? a : b;
+}
+
+inline Float rsqrtf(Float x)
+{
+ return 1.0f / sqrtf(x);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+/*inline __host__ __device__ Float3 MAKE_FLOAT3(Float s)
+{
+ return MAKE_FLOAT3(s, s, s);
+}
+inline __host__ __device__ Float3 MAKE_FLOAT3(Float4 a)
+{
+ return MAKE_FLOAT3(a.x, a.y, a.z);
+}
+inline __host__ __device__ Float3 MAKE_FLOAT3(int3 a)
+{
+ return MAKE_FLOAT3(Float(a.x), Float(a.y), Float(a.z));
+}
+inline __host__ __device__ Float3 MAKE_FLOAT3(uint3 a)
+{
+ return MAKE_FLOAT3(Float(a.x), Float(a.y), Float(a.z));
+}
+
+inline __host__ __device__ Float4 MAKE_FLOAT4(Float s)
+{
+ return MAKE_FLOAT4(s, s, s, s);
+}
+inline __host__ __device__ Float4 MAKE_FLOAT4(Float3 a)
+{
+ return MAKE_FLOAT4(a.x, a.y, a.z, 0.0f);
+}
+inline __host__ __device__ Float4 MAKE_FLOAT4(Float3 a, Float w)
+{
+ return MAKE_FLOAT4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ Float4 MAKE_FLOAT4(int4 a)
+{
+ return MAKE_FLOAT4(Float(a.x), Float(a.y), Float(a.z), Float(a.w));
+}
+inline __host__ __device__ Float4 MAKE_FLOAT4(uint4 a)
+{
+ return MAKE_FLOAT4(Float(a.x), Float(a.y), Float(a.z), Float(a.w));
+}
+*/
+
+////////////////////////////////////////////////////////////////////////////////
+// negate
+////////////////////////////////////////////////////////////////////////////////
+inline __host__ __device__ Float3 operator-(Float3 &a)
+{
+ return MAKE_FLOAT3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ Float4 operator-(Float4 &a)
+{
+ return MAKE_FLOAT4(-a.x, -a.y, -a.z, -a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ int2 operator+(int2 a, int2 b)
+{
+ return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)
+{
+ a.x += b.x; a.y += b.y;
+}
+inline __host__ __device__ int2 operator+(int2 a, int b)
+{
+ return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ int2 operator+(int b, int2 a)
+{
+ return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(int2 &a, int b)
+{
+ a.x += b; a.y += b;
+}
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{
+ return make_uint2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
+{
+ a.x += b.x; a.y += b.y;
+}
+inline __host__ __device__ uint2 operator+(uint2 a, uint b)
+{
+ return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ uint2 operator+(uint b, uint2 a)
+{
+ return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint b)
+{
+ a.x += b; a.y += b;
+}
+
+
+inline __host__ __device__ Float3 operator+(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(Float3 &a, Float3 b)
+{
+ a.x += b.x; a.y += b.y; a.z += b.z;
+}
+inline __host__ __device__ Float3 operator+(Float3 a, Float b)
+{
+ return MAKE_FLOAT3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(Float3 &a, Float b)
+{
+ a.x += b; a.y += b; a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int3 a, int3 b)
+{
+ return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(int3 &a, int3 b)
+{
+ a.x += b.x; a.y += b.y; a.z += b.z;
+}
+inline __host__ __device__ int3 operator+(int3 a, int b)
+{
+ return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(int3 &a, int b)
+{
+ a.x += b; a.y += b; a.z += b;
+}
+
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
+{
+ return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
+{
+ a.x += b.x; a.y += b.y; a.z += b.z;
+}
+inline __host__ __device__ uint3 operator+(uint3 a, uint b)
+{
+ return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint b)
+{
+ a.x += b; a.y += b; a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int b, int3 a)
+{
+ return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ uint3 operator+(uint b, uint3 a)
+{
+ return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ Float3 operator+(Float b, Float3 a)
+{
+ return MAKE_FLOAT3(a.x + b, a.y + b, a.z + b);
+}
+
+inline __host__ __device__ Float4 operator+(Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+inline __host__ __device__ void operator+=(Float4 &a, Float4 b)
+{
+ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+}
+inline __host__ __device__ Float4 operator+(Float4 a, Float b)
+{
+ return MAKE_FLOAT4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ Float4 operator+(Float b, Float4 a)
+{
+ return MAKE_FLOAT4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(Float4 &a, Float b)
+{
+ a.x += b; a.y += b; a.z += b; a.w += b;
+}
+
+inline __host__ __device__ int4 operator+(int4 a, int4 b)
+{
+ return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+inline __host__ __device__ void operator+=(int4 &a, int4 b)
+{
+ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+}
+inline __host__ __device__ int4 operator+(int4 a, int b)
+{
+ return make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ int4 operator+(int b, int4 a)
+{
+ return make_int4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(int4 &a, int b)
+{
+ a.x += b; a.y += b; a.z += b; a.w += b;
+}
+
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
+{
+ return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
+{
+ a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
+}
+inline __host__ __device__ uint4 operator+(uint4 a, uint b)
+{
+ return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ uint4 operator+(uint b, uint4 a)
+{
+ return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint b)
+{
+ a.x += b; a.y += b; a.z += b; a.w += b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+
+
+inline __host__ __device__ int2 operator-(int2 a, int2 b)
+{
+ return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)
+{
+ a.x -= b.x; a.y -= b.y;
+}
+inline __host__ __device__ int2 operator-(int2 a, int b)
+{
+ return make_int2(a.x - b, a.y - b);
+}
+inline __host__ __device__ int2 operator-(int b, int2 a)
+{
+ return make_int2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int b)
+{
+ a.x -= b; a.y -= b;
+}
+
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
+{
+ return make_uint2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
+{
+ a.x -= b.x; a.y -= b.y;
+}
+inline __host__ __device__ uint2 operator-(uint2 a, uint b)
+{
+ return make_uint2(a.x - b, a.y - b);
+}
+inline __host__ __device__ uint2 operator-(uint b, uint2 a)
+{
+ return make_uint2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint b)
+{
+ a.x -= b; a.y -= b;
+}
+
+inline __host__ __device__ Float3 operator-(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(Float3 &a, Float3 b)
+{
+ a.x -= b.x; a.y -= b.y; a.z -= b.z;
+}
+inline __host__ __device__ Float3 operator-(Float3 a, Float b)
+{
+ return MAKE_FLOAT3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ Float3 operator-(Float b, Float3 a)
+{
+ return MAKE_FLOAT3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(Float3 &a, Float b)
+{
+ a.x -= b; a.y -= b; a.z -= b;
+}
+
+inline __host__ __device__ int3 operator-(int3 a, int3 b)
+{
+ return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int3 b)
+{
+ a.x -= b.x; a.y -= b.y; a.z -= b.z;
+}
+inline __host__ __device__ int3 operator-(int3 a, int b)
+{
+ return make_int3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ int3 operator-(int b, int3 a)
+{
+ return make_int3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int b)
+{
+ a.x -= b; a.y -= b; a.z -= b;
+}
+
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
+{
+ return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
+{
+ a.x -= b.x; a.y -= b.y; a.z -= b.z;
+}
+inline __host__ __device__ uint3 operator-(uint3 a, uint b)
+{
+ return make_uint3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ uint3 operator-(uint b, uint3 a)
+{
+ return make_uint3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint b)
+{
+ a.x -= b; a.y -= b; a.z -= b;
+}
+
+inline __host__ __device__ Float4 operator-(Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ void operator-=(Float4 &a, Float4 b)
+{
+ a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+}
+inline __host__ __device__ Float4 operator-(Float4 a, Float b)
+{
+ return MAKE_FLOAT4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+inline __host__ __device__ void operator-=(Float4 &a, Float b)
+{
+ a.x -= b; a.y -= b; a.z -= b; a.w -= b;
+}
+
+inline __host__ __device__ int4 operator-(int4 a, int4 b)
+{
+ return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int4 b)
+{
+ a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+}
+inline __host__ __device__ int4 operator-(int4 a, int b)
+{
+ return make_int4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+inline __host__ __device__ int4 operator-(int b, int4 a)
+{
+ return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int b)
+{
+ a.x -= b; a.y -= b; a.z -= b; a.w -= b;
+}
+
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
+{
+ return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
+{
+ a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
+}
+inline __host__ __device__ uint4 operator-(uint4 a, uint b)
+{
+ return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+inline __host__ __device__ uint4 operator-(uint b, uint4 a)
+{
+ return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint b)
+{
+ a.x -= b; a.y -= b; a.z -= b; a.w -= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ int2 operator*(int2 a, int2 b)
+{
+ return make_int2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int2 b)
+{
+ a.x *= b.x; a.y *= b.y;
+}
+inline __host__ __device__ int2 operator*(int2 a, int b)
+{
+ return make_int2(a.x * b, a.y * b);
+}
+inline __host__ __device__ int2 operator*(int b, int2 a)
+{
+ return make_int2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int b)
+{
+ a.x *= b; a.y *= b;
+}
+
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
+{
+ return make_uint2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
+{
+ a.x *= b.x; a.y *= b.y;
+}
+inline __host__ __device__ uint2 operator*(uint2 a, uint b)
+{
+ return make_uint2(a.x * b, a.y * b);
+}
+inline __host__ __device__ uint2 operator*(uint b, uint2 a)
+{
+ return make_uint2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint b)
+{
+ a.x *= b; a.y *= b;
+}
+
+inline __host__ __device__ Float3 operator*(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(Float3 &a, Float3 b)
+{
+ a.x *= b.x; a.y *= b.y; a.z *= b.z;
+}
+inline __host__ __device__ Float3 operator*(Float3 a, Float b)
+{
+ return MAKE_FLOAT3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ Float3 operator*(Float b, Float3 a)
+{
+ return MAKE_FLOAT3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(Float3 &a, Float b)
+{
+ a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __host__ __device__ int3 operator*(int3 a, int3 b)
+{
+ return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int3 b)
+{
+ a.x *= b.x; a.y *= b.y; a.z *= b.z;
+}
+inline __host__ __device__ int3 operator*(int3 a, int b)
+{
+ return make_int3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ int3 operator*(int b, int3 a)
+{
+ return make_int3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int b)
+{
+ a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
+{
+ return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
+{
+ a.x *= b.x; a.y *= b.y; a.z *= b.z;
+}
+inline __host__ __device__ uint3 operator*(uint3 a, uint b)
+{
+ return make_uint3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ uint3 operator*(uint b, uint3 a)
+{
+ return make_uint3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint b)
+{
+ a.x *= b; a.y *= b; a.z *= b;
+}
+
+inline __host__ __device__ Float4 operator*(Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ void operator*=(Float4 &a, Float4 b)
+{
+ a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+}
+inline __host__ __device__ Float4 operator*(Float4 a, Float b)
+{
+ return MAKE_FLOAT4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+inline __host__ __device__ Float4 operator*(Float b, Float4 a)
+{
+ return MAKE_FLOAT4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(Float4 &a, Float b)
+{
+ a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+inline __host__ __device__ int4 operator*(int4 a, int4 b)
+{
+ return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int4 b)
+{
+ a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+}
+inline __host__ __device__ int4 operator*(int4 a, int b)
+{
+ return make_int4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+inline __host__ __device__ int4 operator*(int b, int4 a)
+{
+ return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int b)
+{
+ a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
+{
+ return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
+{
+ a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
+}
+inline __host__ __device__ uint4 operator*(uint4 a, uint b)
+{
+ return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+inline __host__ __device__ uint4 operator*(uint b, uint4 a)
+{
+ return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint b)
+{
+ a.x *= b; a.y *= b; a.z *= b; a.w *= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// divide
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 operator/(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+inline __host__ __device__ void operator/=(Float3 &a, Float3 b)
+{
+ a.x /= b.x; a.y /= b.y; a.z /= b.z;
+}
+inline __host__ __device__ Float3 operator/(Float3 a, Float b)
+{
+ return MAKE_FLOAT3(a.x / b, a.y / b, a.z / b);
+}
+inline __host__ __device__ void operator/=(Float3 &a, Float b)
+{
+ a.x /= b; a.y /= b; a.z /= b;
+}
+inline __host__ __device__ Float3 operator/(Float b, Float3 a)
+{
+ return MAKE_FLOAT3(b / a.x, b / a.y, b / a.z);
+}
+
+inline __host__ __device__ Float4 operator/(Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+inline __host__ __device__ void operator/=(Float4 &a, Float4 b)
+{
+ a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
+}
+inline __host__ __device__ Float4 operator/(Float4 a, Float b)
+{
+ return MAKE_FLOAT4(a.x / b, a.y / b, a.z / b, a.w / b);
+}
+inline __host__ __device__ void operator/=(Float4 &a, Float b)
+{
+ a.x /= b; a.y /= b; a.z /= b; a.w /= b;
+}
+inline __host__ __device__ Float4 operator/(Float b, Float4 a){
+ return MAKE_FLOAT4(b / a.x, b / a.y, b / a.z, b / a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// min
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 fminf(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
+}
+inline __host__ __device__ Float4 fminf(Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 min(int2 a, int2 b)
+{
+ return make_int2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ int3 min(int3 a, int3 b)
+{
+ return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ int4 min(int4 a, int4 b)
+{
+ return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 min(uint2 a, uint2 b)
+{
+ return make_uint2(min(a.x,b.x), min(a.y,b.y));
+}
+inline __host__ __device__ uint3 min(uint3 a, uint3 b)
+{
+ return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
+}
+inline __host__ __device__ uint4 min(uint4 a, uint4 b)
+{
+ return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// max
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 fmaxf(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
+}
+inline __host__ __device__ Float4 fmaxf(Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
+}
+
+inline __host__ __device__ int2 max(int2 a, int2 b)
+{
+ return make_int2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ int3 max(int3 a, int3 b)
+{
+ return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ int4 max(int4 a, int4 b)
+{
+ return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+inline __host__ __device__ uint2 max(uint2 a, uint2 b)
+{
+ return make_uint2(max(a.x,b.x), max(a.y,b.y));
+}
+inline __host__ __device__ uint3 max(uint3 a, uint3 b)
+{
+ return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
+}
+inline __host__ __device__ uint4 max(uint4 a, uint4 b)
+{
+ return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ Float lerp(Float a, Float b, Float t)
+{
+ return a + t*(b-a);
+}
+inline __device__ __host__ Float3 lerp(Float3 a, Float3 b, Float t)
+{
+ return a + t*(b-a);
+}
+inline __device__ __host__ Float4 lerp(Float4 a, Float4 b, Float t)
+{
+ return a + t*(b-a);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ Float clamp(Float f, Float a, Float b)
+{
+ return fmaxf(a, fminf(f, b));
+}
+inline __device__ __host__ int clamp(int f, int a, int b)
+{
+ return max(a, min(f, b));
+}
+inline __device__ __host__ uint clamp(uint f, uint a, uint b)
+{
+ return max(a, min(f, b));
+}
+
+inline __device__ __host__ Float3 clamp(Float3 v, Float a, Float b)
+{
+ return MAKE_FLOAT3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ Float3 clamp(Float3 v, Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ Float4 clamp(Float4 v, Float a, Float b)
+{
+ return MAKE_FLOAT4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ Float4 clamp(Float4 v, Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ int2 clamp(int2 v, int a, int b)
+{
+ return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
+{
+ return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ int3 clamp(int3 v, int a, int b)
+{
+ return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
+{
+ return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ int4 clamp(int4 v, int a, int b)
+{
+ return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
+{
+ return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
+{
+ return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
+{
+ return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
+{
+ return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
+{
+ return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
+{
+ return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
+{
+ return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float dot(Float3 a, Float3 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ Float dot(Float4 a, Float4 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ int dot(int2 a, int2 b)
+{
+ return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ int dot(int3 a, int3 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ int dot(int4 a, int4 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ uint dot(uint2 a, uint2 b)
+{
+ return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ uint dot(uint3 a, uint3 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ uint dot(uint4 a, uint4 b)
+{
+ return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// length
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float length(Float3 v)
+{
+ return sqrtf(dot(v, v));
+}
+inline __host__ __device__ Float length(Float4 v)
+{
+ return sqrtf(dot(v, v));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 normalize(Float3 v)
+{
+ Float invLen = rsqrtf(dot(v, v));
+ return v * invLen;
+}
+inline __host__ __device__ Float4 normalize(Float4 v)
+{
+ Float invLen = rsqrtf(dot(v, v));
+ return v * invLen;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// floor
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 floorf(Float3 v)
+{
+ return MAKE_FLOAT3(floorf(v.x), floorf(v.y), floorf(v.z));
+}
+inline __host__ __device__ Float4 floorf(Float4 v)
+{
+ return MAKE_FLOAT4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float fracf(Float v)
+{
+ return v - floorf(v);
+}
+inline __host__ __device__ Float3 fracf(Float3 v)
+{
+ return MAKE_FLOAT3(fracf(v.x), fracf(v.y), fracf(v.z));
+}
+inline __host__ __device__ Float4 fracf(Float4 v)
+{
+ return MAKE_FLOAT4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 fmodf(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
+}
+inline __host__ __device__ Float4 fmodf(Float4 a, Float4 b)
+{
+ return MAKE_FLOAT4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 fabs(Float3 v)
+{
+ return MAKE_FLOAT3(fabs(v.x), fabs(v.y), fabs(v.z));
+}
+inline __host__ __device__ Float4 fabs(Float4 v)
+{
+ return MAKE_FLOAT4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
+}
+
+inline __host__ __device__ int2 abs(int2 v)
+{
+ return make_int2(abs(v.x), abs(v.y));
+}
+inline __host__ __device__ int3 abs(int3 v)
+{
+ return make_int3(abs(v.x), abs(v.y), abs(v.z));
+}
+inline __host__ __device__ int4 abs(int4 v)
+{
+ return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 reflect(Float3 i, Float3 n)
+{
+ return i - 2.0f * n * dot(n,i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ Float3 cross(Float3 a, Float3 b)
+{
+ return MAKE_FLOAT3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+}
+
+#endif