The following algorithms are provided in sufficient detail so the various parameters and GPU-related functions calls can be examined, however many details such as variable declarations, parameter definitions, error checking, utility functions, etc. have been eliminated for brevity. The full source code is provided as Additional file 1. Also as we go from one algorithm to the next, adding new functionality along the way, we only show the new code while leaving just comments for the previous code.

### Algorithm 1

void SEM(int numOfCells, int maxCells, int *elements, int maxElements,

float *hostX, float *hostY, float *hostZ, float *hostType,

float *hostParameters, float dt, float timeSteps)

{

//Allocate device memory

//cells and elements

cudaMalloc(&numOfElements, maxCells * sizeof(int));

cudaMallocPitch(&elementType, &pitch, maxCells * sizeof(int), maxElements);

cudaMallocPitch(&X, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&X_F1, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&X_F2, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&Y, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&Y_F1, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&Y_F2, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&Z, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&Z_F1, &pitch, maxCells * sizeof(float), maxElements);

cudaMallocPitch(&Z_F2, &pitch, maxCells * sizeof(float), maxElements);

//Copy host memory to device memory

//spatial positions

cudaMemcpy(numOfElements, elements, maxCells * sizeof(int),

cudaMemcpyHostToDevice);

cudaMemcpy2D(elementType, pitch, hostType, maxCells * sizeof(int),

maxCells * sizeof(int), maxElements, cudaMemcpyHostToDevice);

cudaMemcpy2D(X, pitch, hostX, maxCells * sizeof(float),

maxCells * sizeof(float), maxElements, cudaMemcpyHostToDevice);

cudaMemcpy2D(Y, pitch, hostY, maxCells * sizeof(float),

maxCells * sizeof(float), maxElements, cudaMemcpyHostToDevice);

cudaMemcpy2D(Z, pitch, hostZ, maxCells * sizeof(float),

maxCells * sizeof(float), maxElements, cudaMemcpyHostToDevice);

//parameters

cudaMemcpyToSymbol(skin_parameters, hostParameters, 100 * sizeof(float), 0,

cudaMemcpyHostToDevice);

//execute kernel

for (t = 0; t < timeSteps; ++t) {

//movement kernels

sem_kernel_F1 < < < blocksPerGrid, threadsPerBlock > > > (numOfElements, X, X_F1,

X_F2, Y, Y_F1, Y_F2, Z, Z_F1, Z_F2, elementType,

pitch/sizeof(float), numOfCells, maxCells, maxElements, dt);

sem_kernel_F2 < < < blocksPerGrid, threadsPerBlock > > > (numOfElements, X, X_F1,

X_F2, Y, Y_F1, Y_F2, Z, Z_F1, Z_F2, elementType,

pitch/sizeof(float), numOfCells, maxCells, maxElements, dt);

cudaMemcpy2D(X, pitch, X_F2, pitch, maxCells * sizeof(float),

maxElements, cudaMemcpyDeviceToDevice);

cudaMemcpy2D(Y, pitch, Y_F2, pitch, maxCells * sizeof(float),

maxElements, cudaMemcpyDeviceToDevice);

cudaMemcpy2D(Z, pitch, Z_F2, pitch, maxCells * sizeof(float),

maxElements, cudaMemcpyDeviceToDevice);

}

//Copy result to host memory

cudaMemcpy2D(hostX, maxCells * sizeof(float), X, pitch,

maxCells * sizeof(float), maxElements, cudaMemcpyDeviceToHost);

cudaMemcpy2D(hostY, maxCells * sizeof(float), Y, pitch,

maxCells * sizeof(float), maxElements, cudaMemcpyDeviceToHost);

cudaMemcpy2D(hostZ, maxCells * sizeof(float), Z, pitch,

maxCells * sizeof(float), maxElements, cudaMemcpyDeviceToHost);

}

__global__ void

sem_kernel_F1(int *numOfElements, float *X, float *X_F1, float *X_F2, float *Y,

float *Y_F1, float *Y_F2, float *Z, float *Z_F1, float *Z_F2,

int *elementType, size_t pitch, int numOfCells, int maxCells,

int maxElements, float dt)

{

int cellNum = blockIdx.x * blockDim.x + threadIdx.x;

int elemNum = blockIdx.y * blockDim.y + threadIdx.y;

if (cellNum > = numOfCells) return;

if (elemNum > = numOfElements[cellNum]) return;

//intracellular

float intraX = 0.0;

float intraY = 0.0;

float intraZ = 0.0;

for (k = 0; k < numOfElements[cellNum]; ++k) {

if (k == elemNum) continue;

r = dist(X[elemNum*pitch+cellNum], Y[elemNum*pitch+cellNum],

Z[elemNum*pitch+cellNum], X[k*pitch+cellNum],

Y[k*pitch+cellNum], Z[k*pitch+cellNum]);

V = MORSE(r, INTRA_U0, INTRA_ETA0, INTRA_U1, INTRA_ETA1);

intraX += V * (X[elemNum*pitch+cellNum] - X[k*pitch+cellNum]);

intraY += V * (Y[elemNum*pitch+cellNum] - Y[k*pitch+cellNum]);

intraZ += V * (Z[elemNum*pitch+cellNum] - Z[k*pitch+cellNum]);

#if PERIODIC_BOUNDARY

intracellular_mirror(elemNum, cellNum, pitch, k, &intraX, &intraY,

&intraZ, X, Y, Z);

#endif

}

//intercellular

float interX = 0.0;

float interY = 0.0;

float interZ = 0.0;

for (j = 0; j < numOfCells; ++j) {

if (j == cellNum) continue;

for (k = 0; k < numOfElements[j]; ++k) {

r = dist(X[elemNum*pitch+cellNum], Y[elemNum*pitch+cellNum],

Z[elemNum*pitch+cellNum],

X[k*pitch+j], Y[k*pitch+j], Z[k*pitch+j]);

if (r <= INTER_DIST) {

V = P_MORSE(r, INTER_U0, INTER_ETA0, INTER_U1, INTER_ETA1);

interX += V * (X[elemNum*pitch+cellNum] - X[k*pitch+j]);

interY += V * (Y[elemNum*pitch+cellNum] - Y[k*pitch+j]);

interZ += V * (Z[elemNum*pitch+cellNum] - Z[k*pitch+j]);

}

#if PERIODIC_BOUNDARY

intercellular_mirror(elemNum, cellNum, pitch, k, j,

&interX, &interY, &interZ, X, Y, Z);

#endif

}

}

//basement membrane

if (elementType[elemNum*pitch+cellNum] == 1) {

r = dist(X[elemNum*pitch+cellNum], Y[elemNum*pitch+cellNum],

Z[elemNum*pitch+cellNum], X[elemNum*pitch+cellNum],

Y[elemNum*pitch+cellNum], 0);

V = MORSE(r, INTRA_U0, INTRA_ETA0, INTRA_U1, INTRA_ETA1);

interZ += V * (Z[elemNum*pitch+cellNum] - 0);

}

//update

X_F1[elemNum*pitch+cellNum] = X[elemNum*pitch+cellNum]

+ 0.5 * dt * (intraX + interX);

Y_F1[elemNum*pitch+cellNum] = Y[elemNum*pitch+cellNum]

+ 0.5 * dt * (intraY + interY);

Z_F1[elemNum*pitch+cellNum] = Z[elemNum*pitch+cellNum]

+ 0.5 * dt * (intraZ + interZ);

}

### Algorithm 2

void SEM(int numOfCells, int maxCells, int *elements, int maxElements,

float *hostX, float *hostY, float *hostZ, float *hostType,

float *hostParameters, float dt, float timeSteps)

{

//Allocate device memory

//cells and elements ..

//cell centers

cudaMalloc(&cellCenterX, maxCells * sizeof(float));

cudaMalloc(&cellCenterY, maxCells * sizeof(float));

cudaMalloc(&cellCenterZ, maxCells * sizeof(float));

//Copy host memory to device memory

//spatial positions ..

//parameters ..

//execute kernel

for (t = 0; t < timeSteps; ++t) {

//movement kernels

skin_center_kernel < < < blocksPerGrid1 D,threadsPerBlock1D > > > (numOfElements,

X, Y, Z, elementType, cellCenterX, cellCenterY,

cellCenterZ, pitch/sizeof(float), numOfCells,

maxCells, maxElements);

sem_kernel_F1 < < < blocksPerGrid, threadsPerBlock > > > (numOfElements, X, X_F1,

X_F2, Y, Y_F1, Y_F2, Z, Z_F1, Z_F2, elementType,

pitch/sizeof(float), numOfCells, maxCells, maxElements, dt);

sem_kernel_F2 < < < blocksPerGrid, threadsPerBlock > > > (numOfElements, X, X_F1,

X_F2, Y, Y_F1, Y_F2, Z, Z_F1, Z_F2, elementType,

pitch/sizeof(float), numOfCells, maxCells, maxElements, dt);

}

//Copy result to host memory ..

}

__global__ void

skin_center_kernel(int *numOfElements, float *X, float *Y, float *Z,

int *elementType, float *cellCenterX, float *cellCenterY,

float *cellCenterZ, size_t pitch,

int numOfCells, int maxCells, int maxElements, float dt)

{

int cellNum = blockIdx.x * blockDim.x + threadIdx.x;

int elemNum;

if (cellNum > = numOfCells) return;

float cX = 0.0;

float cY = 0.0;

float cZ = 0.0;

float minX, maxX;

float minY, maxY;

minX = X[cellNum];

maxX = X[cellNum];

minY = Y[cellNum];

maxY = Y[cellNum];

for (elemNum = 0; elemNum < numOfElements[cellNum]; ++elemNum) {

cX += X[elemNum*pitch+cellNum];

cY += Y[elemNum*pitch+cellNum];

cZ += Z[elemNum*pitch+cellNum];

if (X[elemNum*pitch+cellNum] < minX) minX = X[elemNum*pitch+cellNum];

if (X[elemNum*pitch+cellNum] > maxX) maxX = X[elemNum*pitch+cellNum];

if (Y[elemNum*pitch+cellNum] < minY) minY = Y[elemNum*pitch+cellNum];

if (Y[elemNum*pitch+cellNum] > maxY) maxY = Y[elemNum*pitch+cellNum];

}

cX = cX/(float)numOfElements[cellNum];

cY = cY/(float)numOfElements[cellNum];

cZ = cZ/(float)numOfElements[cellNum];

//handle special case when cell is split across periodic boundary

if ((maxX - minX) > (BOUNDARY_X/2)) {

cX = 0;

for (elemNum = 0; elemNum < numOfElements[cellNum]; ++elemNum) {

if (X[elemNum*pitch+cellNum] > (BOUNDARY_X/2))

cX += X[elemNum*pitch+cellNum] - BOUNDARY_X;

else

cX += X[elemNum*pitch+cellNum];

}

cX = cX/(float)numOfElements[cellNum];

if (cX < 0) cX += BOUNDARY_X;

}

if ((maxY - minY) > (BOUNDARY_Y/2)) {

cY = 0;

for (elemNum = 0; elemNum < numOfElements[cellNum]; ++elemNum) {

if (Y[elemNum*pitch+cellNum] > (BOUNDARY_Y/2))

cY += Y[elemNum*pitch+cellNum] - BOUNDARY_Y;

else

cY += Y[elemNum*pitch+cellNum];

}

cY = cY/(float)numOfElements[cellNum];

if (cY < 0) cY += BOUNDARY_Y;

}

cellCenterX[cellNum] = cX;

cellCenterY[cellNum] = cY;

cellCenterZ[cellNum] = cZ;

}

__global__ void

sem_kernel_F1(int *numOfElements, float *X, float *X_F1, float *X_F2, float *Y,

float *Y_F1, float *Y_F2, float *Z, float *Z_F1, float *Z_F2,

float *cX, float *cY, float *cZ,

int *elementType, size_t pitch, int numOfCells, int maxCells,

int maxElements, float dt)

{

int cellNum = blockIdx.x * blockDim.x + threadIdx.x;

int elemNum = blockIdx.y * blockDim.y + threadIdx.y;

if (cellNum > = numOfCells) return;

if (elemNum > = numOfElements[cellNum]) return;

//intracellular ..

//intercellular

float interX = 0.0;

float interY = 0.0;

float interZ = 0.0;

for (j = 0; j < numOfCells; ++j) {

if (j == cellNum) continue;

//check if cell centers are close enough

r = dist(cX[cellNum], cY[cellNum], cZ[cellNum], cX[j], cY[j], cZ[j]);

if (r > INTER_DIST) continue;

for (k = 0; k < numOfElements[j]; ++k) {

r = dist(X[elemNum*pitch+cellNum], Y[elemNum*pitch+cellNum],

Z[elemNum*pitch+cellNum],

X[k*pitch+j], Y[k*pitch+j], Z[k*pitch+j]);

if (r <= INTER_DIST) {

V = P_MORSE(r, INTER_U0, INTER_ETA0, INTER_U1, INTER_ETA1);

interX += V * (X[elemNum*pitch+cellNum] - X[k*pitch+j]);

interY += V * (Y[elemNum*pitch+cellNum] - Y[k*pitch+j]);

interZ += V * (Z[elemNum*pitch+cellNum] - Z[k*pitch+j]);

}

#if PERIODIC_BOUNDARY

intercellular_mirror(elemNum, cellNum, pitch, k, j,

&interX, &interY, &interZ, X, Y, Z);

#endif

}

}

//basement membrane ..

//update ..

}

### Algorithm 3

void SEM(int numOfCells, int maxCells, int *elements, int maxElements,

float *hostX, float *hostY, float *hostZ, float *hostType,

float *hostParameters, int numOfSpecies, float *speciesData,

float dt, float timeSteps)

{

//Allocate device memory

//cells and elements ..

//cell centers ..

//intracellular gene network

cudaMalloc(&neighborNum, maxCells * sizeof(float));

cudaMallocPitch(&sData, &pitch, maxCells * sizeof(float), numOfSpecies);

cudaMallocPitch(&sData_F1, &pitch, maxCells * sizeof(float), numOfSpecies);

cudaMallocPitch(&sData_F2, &pitch, maxCells * sizeof(float), numOfSpecies);

cudaMallocPitch(&neighborData, &pitch, maxCells * sizeof(float),

numOfSpecies);

//Copy host memory to device memory ..

//spatial positions ..

//parameters ..

//intracellular gene network

cudaMemcpy2D(sData, pitch, speciesData, maxCells * sizeof(float),

maxCells * sizeof(float), numOfSpecies, cudaMemcpyHostToDevice);

//execute kernel

for (t = 0; t < timeSteps; ++t) {

//movement kernels ..

//gene network kernels

skin_neighbor_kernel < < < blocksPerGrid1 D,threadsPerBlock1D > > > (numOfElements,

cellCenterX, cellCenterY, cellCenterZ, elementType,

neighborNum, pitch/sizeof(float),

sData, sData_F1, sData_F2, neighborData, numOfCells,

maxCells, maxElements);

skin_kernel_F1 < < < blocksPerGrid1 D,threadsPerBlock1D > > > (numOfElements,

X, Y, Z, elementType, neighborNum, pitch/sizeof(float),

sData, sData_F1, sData_F2, neighborData,

numOfCells, maxCells, maxElements, dt);

skin_kernel_F2 < < < blocksPerGrid1 D,threadsPerBlock1D > > > (numOfElements,

X, Y, Z, elementType, neighborNum, pitch/sizeof(float),

sData, sData_F1, sData_F2, neighborData,

numOfCells, maxCells, maxElements, dt);

}

//Copy result to host memory ..

}

__global__ void

skin_neighbor_kernel(int *numOfElements, float *cellCenterX,

float *cellCenterY, float *cellCenterZ, int *elementType,

int *neighborNum, size_t pitch,

float *speciesData, float *speciesData_F1,

float *speciesData_F2, float *neighborData,

int numOfCells, int maxCells, int maxElements)

{

int cellNum = blockIdx.x * blockDim.x + threadIdx.x;

int j;

if (cellNum > = numOfCells) return;

//totals from neighbors

float neighbor_NOTCH = 0.0;

float neighbor_DELTA = 0.0;

float neighbor_BOUND = 0.0;

int numOfNeighbors = 0;

for (j = 0; j < numOfCells; ++j) {

if (j == cellNum) continue;

if (distance_check(cellCenterX[cellNum], cellCenterY[cellNum],

cellCenterZ[cellNum], cellCenterX[j],

cellCenterY[j], cellCenterZ[j],

S_XY_TOT, NEIGHBOR_DIST) != S_NONE) {

++numOfNeighbors;

neighbor_NOTCH += speciesData[NOTCH_species*pitch+j];

neighbor_DELTA += speciesData[DELTA_species*pitch+j];

neighbor_BOUND += speciesData[BOUND_species*pitch+j];

}

}

neighborNum[cellNum] = numOfNeighbors;

neighborData[NOTCH_species*pitch+cellNum] = neighbor_NOTCH;

neighborData[DELTA_species*pitch+cellNum] = neighbor_DELTA;

neighborData[BOUND_species*pitch+cellNum] = neighbor_BOUND;

}

__global__ void

skin_kernel_F1(int *numOfElements, float *X, float *Y, float *Z,

int *elementType, int *neighborNum, size_t pitch,

float *speciesData, float *speciesData_F1,

float *speciesData_F2, float *neighborData,

int numOfCells, int maxCells, int maxElements, float dt)

{

int cellNum = blockIdx.x * blockDim.x + threadIdx.x;

if (cellNum > = numOfCells) return;

float interactionValue, F1_val;

//calculate F1

float NOTCH_val = speciesData[NOTCH_species*pitch+cellNum];

float DELTA_val = speciesData[DELTA_species*pitch+cellNum];

float BOUND_val = speciesData[BOUND_species*pitch+cellNum];

float BMA_val = speciesData[BMA_species*pitch+cellNum];

float OVOL1_val = speciesData[OVOL1_species*pitch+cellNum];

float OVOL2_val = speciesData[OVOL2_species*pitch+cellNum];

float CMYC_val = speciesData[CMYC_species*pitch+cellNum];

//averages from neighbors

float neighbor_NOTCH = 0.0;

float neighbor_DELTA = 0.0;

float neighbor_BOUND = 0.0;

int numOfNeighbors = 0;

neighbor_NOTCH = neighborData[NOTCH_species*pitch+cellNum];

neighbor_DELTA = neighborData[DELTA_species*pitch+cellNum];

neighbor_BOUND = neighborData[BOUND_species*pitch+cellNum];

numOfNeighbors = neighborNum[cellNum];

if (numOfNeighbors != 0) {

neighbor_NOTCH = neighbor_NOTCH/(float)numOfNeighbors;

neighbor_DELTA = neighbor_DELTA/(float)numOfNeighbors;

neighbor_BOUND = neighbor_BOUND/(float)numOfNeighbors;

}

//NOTCH

interactionValue = HILL(BOUND_val, NOTCH_pmin_BOUND, NOTCH_pmax_BOUND,

NOTCH_c_BOUND, NOTCH_h_BOUND);

interactionValue *= HILL(OVOL2_val, NOTCH_pmin_OVOL2, NOTCH_pmax_OVOL2,

NOTCH_c_OVOL2, NOTCH_h_OVOL2);

F1_val = NOTCH_val + 0.5 * dt * ((-KA) * NOTCH_val * neighbor_DELTA

+ KD * BOUND_val - DF * NOTCH_val

+ interactionValue);

if (F1_val < 0.0) F1_val = 0;

speciesData_F1[NOTCH_species*pitch+cellNum] = F1_val;

//DELTA

interactionValue = HILL(NOTCH_val, DELTA_pmin_BOUND, DELTA_pmax_BOUND,

DELTA_c_BOUND, DELTA_h_BOUND);

F1_val = DELTA_val + 0.5 * dt * ((-KA) * DELTA_val * neighbor_NOTCH

+ KD * neighbor_BOUND - DA * DELTA_val

+ interactionValue);

if (F1_val < 0.0) F1_val = 0;

speciesData_F1[DELTA_species*pitch+cellNum] = F1_val;

//BOUND RECEPTOR

F1_val = BOUND_val + 0.5 * dt * (KA * NOTCH_val * neighbor_DELTA

- KD * BOUND_val - KI * BOUND_val);

if (F1_val < 0.0) F1_val = 0;

speciesData_F1[BOUND_species*pitch+cellNum] = F1_val;

//Basement Membrane Adhesion

speciesData_F1[BMA_species*pitch+cellNum] =

speciesData[BMA_species*pitch+cellNum];

//OVOL1

interactionValue = HILL(OVOL2_val, OVOL1_pmin_OVOL2, OVOL1_pmax_OVOL2,

OVOL1_c_OVOL2, OVOL1_h_OVOL2);

interactionValue *= HILL(BMA_val, OVOL1_pmin_BMA, OVOL1_pmax_BMA,

OVOL1_c_BMA, OVOL1_h_BMA);

F1_val = OVOL1_val + 0.5 * dt * (interactionValue - OVOL1_decay * OVOL1_val);

if (F1_val < 0.0) F1_val = 0;

speciesData_F1[OVOL1_species*pitch+cellNum] = F1_val;

//OVOL2

interactionValue = HILL(OVOL1_val, OVOL2_pmin_OVOL1, OVOL2_pmax_OVOL1,

OVOL2_c_OVOL1, OVOL2_h_OVOL1);

interactionValue *= HILL(0.4, 0.0, 1.0, 1.0, 1.0);//TGF-beta

F1_val = OVOL2_val + 0.5 * dt * (interactionValue - OVOL2_decay * OVOL2_val);

if (F1_val < 0.0) F1_val = 0;

speciesData_F1[OVOL2_species*pitch+cellNum] = F1_val;

//CMYC

interactionValue = HILL(OVOL2_val, CMYC_pmin_OVOL2, CMYC_pmax_OVOL2,

CMYC_c_OVOL2, CMYC_h_OVOL2);

F1_val = CMYC_val + 0.5 * dt * (interactionValue - CMYC_decay * CMYC_val);

if (F1_val < 0.0) F1_val = 0;

speciesData_F1[CMYC_species*pitch+cellNum] = F1_val;

}