diff --git a/cuda/common.h b/cuda/common.h
index cef3b4f..ea02225 100644
--- a/cuda/common.h
+++ b/cuda/common.h
@@ -53,4 +53,6 @@ struct Params
     OptixTraversableHandle handle;
     Ray*                   rays;
     Hit*                   hits;
+    int*                   primitive_ids;  // Optional: triangle index per ray (-1 for miss)
+    int*                   instance_ids;   // Optional: geometry/instance index per ray (-1 for miss)
 };
diff --git a/cuda/kernel.cu b/cuda/kernel.cu
index 317596e..8e30adb 100644
--- a/cuda/kernel.cu
+++ b/cuda/kernel.cu
@@ -23,7 +23,7 @@ extern "C" __global__ void __raygen__main()
     const uint3        dim        = optixGetLaunchDimensions();
     const uint64_t linear_idx = idx.z * dim.y * dim.x + idx.y * dim.x + idx.x;
 
-    unsigned int t, nx, ny, nz;
+    unsigned int t, nx, ny, nz, prim_id, inst_id;
     Ray ray = params.rays[linear_idx];
     optixTrace(
         params.handle,
@@ -40,7 +40,9 @@ extern "C" __global__ void __raygen__main()
         t,
         nx,
         ny,
-        nz
+        nz,
+        prim_id,
+        inst_id
     );
 
     Hit hit;
@@ -49,6 +51,14 @@ extern "C" __global__ void __raygen__main()
     hit.geom_normal.y       = int_as_float( ny );
     hit.geom_normal.z       = int_as_float( nz );
     params.hits[linear_idx] = hit;
+
+    // Write optional primitive and instance IDs
+    if (params.primitive_ids != nullptr) {
+        params.primitive_ids[linear_idx] = static_cast<int>(prim_id);
+    }
+    if (params.instance_ids != nullptr) {
+        params.instance_ids[linear_idx] = static_cast<int>(inst_id);
+    }
 }
 
 
@@ -58,6 +68,8 @@ extern "C" __global__ void __miss__miss()
     optixSetPayload_1( float_as_int( 1.0f ) );
     optixSetPayload_2( float_as_int( 0.0f ) );
     optixSetPayload_3( float_as_int( 0.0f ) );
+    optixSetPayload_4( static_cast<unsigned int>(-1) );  // primitive_id = -1 for miss
+    optixSetPayload_5( static_cast<unsigned int>(-1) );  // instance_id = -1 for miss
 }
 
 
@@ -99,4 +111,6 @@ extern "C" __global__ void __closesthit__chit()
     optixSetPayload_1(float_as_int(n.x));
     optixSetPayload_2(float_as_int(n.y));
     optixSetPayload_3(float_as_int(n.z));
+    optixSetPayload_4(primIdx);                     // primitive/triangle index
+    optixSetPayload_5(optixGetInstanceId());        // instance/geometry index
 }
diff --git a/examples/cuda_utils.py b/examples/cuda_utils.py
deleted file mode 100644
index 5b60057..0000000
--- a/examples/cuda_utils.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from numba import cuda
-import numba as nb
-import numpy as np
-
-def calc_dims(shape):
-    threadsperblock = (32,32)
-    blockspergrid = (
-        (shape[0] + (threadsperblock[0] - 1)) // threadsperblock[0],
-        (shape[1] + (threadsperblock[1] - 1)) // threadsperblock[1]
-    )
-    return blockspergrid, threadsperblock
-
-@cuda.jit(device=True)
-def add(a, b):
-    return float3(a[0]+b[0], a[1]+b[1], a[2]+b[2])
-
-@cuda.jit(device=True)
-def diff(a, b):
-    return float3(a[0]-b[0], a[1]-b[1], a[2]-b[2])
-
-@cuda.jit(device=True)
-def mul(a, b):
-    return float3(a[0]*b, a[1]*b, a[2]*b)
-
-@cuda.jit(device=True)
-def multColor(a, b):
-    return float3(a[0]*b[0], a[1]*b[1], a[2]*b[2])
-
-@cuda.jit(device=True)
-def dot(a, b):
-    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
-
-@cuda.jit(device=True)
-def mix(a, b, k):
-    return add(mul(a, k), mul(b, 1-k))
-
-@cuda.jit(device=True)
-def make_float3(a, offset):
-    return float3(a[offset], a[offset+1], a[offset+2])
-
-@cuda.jit(device=True)
-def invert(a):
-    return float3(-a[0], -a[1], -a[2])
-
-@cuda.jit(device=True)
-def float3(a, b, c):
-    return (np.float32(a), np.float32(b), np.float32(c))
diff --git a/examples/hillshade.py b/examples/hillshade.py
deleted file mode 100644
index fa433a7..0000000
--- a/examples/hillshade.py
+++ /dev/null
@@ -1,228 +0,0 @@
-import numpy as np
-from numba import cuda
-
-import cupy
-import xarray as xr
-from rtxpy import RTX
-
-from typing import Optional
-
-from scipy.spatial.transform import Rotation as R
-
-from cuda_utils import *
-import mesh_utils
-
-@cuda.jit
-def _generatePrimaryRays(data, x_coords, y_coords, H, W):
-    """
-    A GPU kernel that given a set of x and y discrete coordinates on a raster terrain
-    generates in @data a list of parallel rays that represent camera rays generated from an ortographic camera
-    that is looking straight down at the surface from an origin height 10000
-    """
-    i, j = cuda.grid(2)
-    if i>=0 and i < H and j>=0 and j < W:
-        #data[i,j,0] = j + 1e-6 # x_coords[j] + 1e-6
-        #data[i,j,1] = i + 1e-6 # y_coords[i] + 1e-6
-
-        if (j == W-1):
-            data[i,j,0] = j - 1e-3
-        else:
-            data[i,j,0] = j + 1e-3
-
-        if (i == H-1):
-            data[i,j,1] = i - 1e-3
-        else:
-            data[i,j,1] = i + 1e-3
-
-        data[i,j,2] = 10000 # Location of the camera (height)
-        data[i,j,3] = 1e-3
-        data[i,j,4] = 0
-        data[i,j,5] = 0
-        data[i,j,6] = -1
-        data[i,j,7] = np.inf
-
-
-def generatePrimaryRays(rays, x_coords, y_coords, H, W):
-    griddim, blockdim = calc_dims((H, W))
-    _generatePrimaryRays[griddim, blockdim](rays, x_coords, y_coords, H, W)
-    return 0
-
-
-@cuda.jit
-def _generateShadowRays(rays, hits, normals, H, W, sunDir):
-    """
-    A GPU kernel that given a set rays and their respective intersection points,
-    generates in rays (overwriting the original content) a new set of rays (shadow rays)
-    That have their origins at the point of intersection of their parent ray and direction - the direction towards the sun
-    The normals vectors at the point of intersection of the original rays are cached in @normals
-    Thus we can later use them to do lambertian shading, after the shadow rays have been traced
-    """
-    i, j = cuda.grid(2)
-    if i>=0 and i < H and j>=0 and j < W:
-        dist = hits[i,j,0]
-        norm = make_float3(hits[i,j], 1)
-        if (norm[2] < 0):
-            norm = invert(norm)
-        ray = rays[i,j]
-        rayOrigin = make_float3(ray, 0)
-        rayDir    = make_float3(ray, 4)
-        p = add(rayOrigin, mul(rayDir,dist))
-
-        newOrigin = add(p, mul(norm, 1e-3))
-        ray[0] = newOrigin[0]
-        ray[1] = newOrigin[1]
-        ray[2] = newOrigin[2]
-        ray[3] = 1e-3
-        ray[4] = sunDir[0]
-        ray[5] = sunDir[1]
-        ray[6] = sunDir[2]
-        ray[7] = np.inf if dist > 0 else 0
-
-        normals[i,j,0] = norm[0]
-        normals[i,j,1] = norm[1]
-        normals[i,j,2] = norm[2]
-
-def generateShadowRays(rays, hits, normals, H, W, sunDir):
-    griddim, blockdim = calc_dims((H, W))
-    _generateShadowRays[griddim, blockdim](rays, hits, normals, H, W, sunDir)
-    return 0
-
-
-@cuda.jit
-def _shadeLambert(hits, normals, output, H, W, sunDir, castShadows):
-    """
-    This kernel does a simple Lambertian shading
-    The hits array contains the results of tracing the shadow rays through the scene.
-    If the value in hits[x,y,0] is > 0, then a valid intersection occurred and that means that the point
-    at location x,y is in shadow.
-    The normals array stores the normal at the intersecion point of each camera ray
-    We then use the information for light visibility and normal to apply Lambert's cosine law
-    The final result is stored in output which is an RGB array
-    """
-    i, j = cuda.grid(2)
-    if i>=0 and i < H and j>=0 and j < W:
-        # Normal at the intersection of camera ray (i,j) with the scene
-        norm = make_float3(normals[i,j], 0)
-
-        # Below is same as existing algorithm without shadows and is OK with shadows.
-        # Could be improved with a bit of antialiasing at edges of shadow???
-
-        light_dir = make_float3(sunDir, 0)  # Might have to make it zero if back cull.
-        cos_theta = dot(light_dir, norm)  # light_dir and norm are already normalised.
-
-        temp = (cos_theta + 1) / 2
-
-        if castShadows and hits[i, j, 0] >= 0:
-            temp = temp / 2
-
-        if temp > 1:
-            temp = 1
-        elif temp < 0:
-            temp = 0
-
-        output[i, j] = temp
-
-
-
-def shadeLambert(hits, normals, output, H, W, sunDir, castShadows):
-    griddim, blockdim = calc_dims((H, W))
-    _shadeLambert[griddim, blockdim](hits, normals, output, H, W, sunDir, castShadows)
-    return 0
-
-
-def getSunDir(angle_altitude, azimuth):
-    """
-    Calculate the vector towards the sun based on sun altitude angle and azimuth
-    """
-    north = (0,1,0)
-    rx = R.from_euler('x', angle_altitude, degrees=True)
-    rz = R.from_euler('z', azimuth+180, degrees=True)
-    sunDir = rx.apply(north)
-    sunDir = rz.apply(sunDir)
-    return sunDir
-
-
-def hillshade_rt(raster: xr.DataArray,
-              optix: RTX,
-              shadows: bool = False,
-              azimuth: int = 225,
-              angle_altitude: int = 25,
-              name: Optional[str] = 'hillshade') -> xr.DataArray:
-
-    H,W = raster.shape
-    sunDir = cupy.array(getSunDir(angle_altitude, azimuth))
-
-    #output = np.zeros((H,W,3), np.float32)
-
-    # Device buffers
-    d_rays   = cupy.empty((H,W,8), np.float32)
-    d_hits   = cupy.empty((H,W,4), np.float32)
-    d_aux    = cupy.empty((H,W,3), np.float32)
-    d_output = cupy.empty((H,W), np.float32)
-
-    y_coords = cupy.array(raster.indexes.get('y').values)
-    x_coords = cupy.array(raster.indexes.get('x').values)
-
-    generatePrimaryRays(d_rays, x_coords, y_coords, H, W)
-    device = cupy.cuda.Device(0)
-    device.synchronize()
-    res = optix.trace(d_rays, d_hits, W*H)
-
-    generateShadowRays(d_rays, d_hits, d_aux, H, W, sunDir)
-    if shadows:
-        device.synchronize()
-        res = optix.trace(d_rays, d_hits, W*H)
-
-    shadeLambert(d_hits, d_aux, d_output, H, W, sunDir, shadows)
-
-    if isinstance(raster.data, np.ndarray):
-        output = cupy.asnumpy(d_output[:, :])
-        nanValue = np.nan
-    else:
-        output = d_output[:, :]
-        nanValue = cupy.nan
-
-    output[0, :] = nanValue
-    output[-1, :] = nanValue
-    output[:, 0] = nanValue
-    output[:, -1] = nanValue
-
-    hill = xr.DataArray(output,
-                    name=name,
-                    coords=raster.coords,
-                    dims=raster.dims,
-                    attrs=raster.attrs)
-    return hill
-
-def hillshade_gpu(raster: xr.DataArray,
-              shadows: bool = False,
-              azimuth: int = 225,
-              angle_altitude: int = 25,
-              name: Optional[str] = 'hillshade') -> xr.DataArray:
-    # Move the terrain to GPU for testing the GPU path
-    if not isinstance(raster.data, cupy.ndarray):
-        print("WARNING: raster.data is not a cupy array. Additional overhead will be incurred")
-    H,W = raster.data.squeeze().shape
-    optix = RTX()
-
-    datahash = np.uint64(hash(str(raster.data.get())) % (1 << 64))
-    optixhash = np.uint64(optix.getHash())
-    if (optixhash != datahash):
-        numTris = (H - 1) * (W - 1) * 2
-        verts = cupy.empty(H * W * 3, np.float32)
-        triangles = cupy.empty(numTris * 3, np.int32)
-
-        # Generate a mesh from the terrain (buffers are on the GPU, so generation happens also on GPU)
-        res = mesh_utils.triangulateTerrain(verts, triangles, raster)
-        if res:
-            raise ValueError("Failed to generate mesh from terrain. Error code:{}".format(res))
-        res = optix.build(datahash, verts, triangles)
-        if res:
-            raise ValueError("OptiX failed to build GAS with error code:{}".format(res))
-        #Clear some GPU memory that we no longer need
-        verts = None
-        triangles = None
-        cupy.get_default_memory_pool().free_all_blocks()
-
-    hill = hillshade_rt(raster, optix, azimuth=azimuth, angle_altitude=angle_altitude, shadows=shadows, name=name)
-    return hill
diff --git a/examples/mesh_utils.py b/examples/mesh_utils.py
deleted file mode 100644
index 599f9c9..0000000
--- a/examples/mesh_utils.py
+++ /dev/null
@@ -1,134 +0,0 @@
-
-import numba as nb
-from numba import cuda
-import numpy as np
-import cupy
-
-
-@cuda.jit
-def _triangulateTerrain(verts, triangles, data, H, W, scale, stride):
-    globalId = stride + cuda.grid(1)
-    if globalId < W*H:
-        h = globalId // W
-        w = globalId % W
-        meshMapIndex = h * W + w
-
-        val = data[h, w]
-
-        offset = 3*meshMapIndex
-        verts[offset] = w  # x_coords[w] # w
-        verts[offset+1] = h  # y_coords[h] # h
-        verts[offset+2] = val * scale
-
-        if w != W - 1 and h != H - 1:
-            offset = 6*(h * (W-1) + w)
-            triangles[offset+0] = np.int32(meshMapIndex + W)
-            triangles[offset+1] = np.int32(meshMapIndex + W + 1)
-            triangles[offset+2] = np.int32(meshMapIndex)
-            triangles[offset+3] = np.int32(meshMapIndex + W + 1)
-            triangles[offset+4] = np.int32(meshMapIndex + 1)
-            triangles[offset+5] = np.int32(meshMapIndex)
-
-
-@nb.njit(parallel=True)
-def triangulateCPU(verts, triangles, data, H, W, scale):
-    for h in nb.prange(H):
-        for w in range(W):
-            meshMapIndex = h * W + w
-
-            val = data[h, w]
-
-            offset = 3*meshMapIndex
-            verts[offset] = w  # x_coords[w] # w
-            verts[offset+1] = h  # y_coords[h] # h
-            verts[offset+2] = val * scale
-
-            if w != W - 1 and h != H - 1:
-                offset = 6*(h * (W-1) + w)
-                triangles[offset+0] = np.int32(meshMapIndex + W)
-                triangles[offset+1] = np.int32(meshMapIndex + W + 1)
-                triangles[offset+2] = np.int32(meshMapIndex)
-                triangles[offset+3] = np.int32(meshMapIndex + W+1)
-                triangles[offset+4] = np.int32(meshMapIndex + 1)
-                triangles[offset+5] = np.int32(meshMapIndex)
-
-
-def triangulateTerrain(verts, triangles, terrain, scale=1):
-    H, W = terrain.shape
-    if isinstance(terrain.data, np.ndarray):
-        triangulateCPU(verts, triangles, terrain.data, H, W, scale)
-    if isinstance(terrain.data, cupy.ndarray):
-        jobSize = H*W
-        blockdim = 1024
-        griddim = (jobSize + blockdim - 1) // 1024
-        d = 100
-        offset = 0
-        while (jobSize > 0):
-            batch = min(d, griddim)
-            _triangulateTerrain[batch, blockdim](verts, triangles,
-                                                 terrain.data, H, W,
-                                                 scale, offset)
-            offset += batch*blockdim
-            jobSize -= batch*blockdim
-    return 0
-
-
-@nb.jit(nopython=True)
-def fillContents(content, verts, triangles, numTris):
-    v = np.empty(12, np.float32)
-    pad = np.zeros(2, np.int8)
-    offset = 0
-    for i in range(numTris):
-        t0 = triangles[3*i+0]
-        t1 = triangles[3*i+1]
-        t2 = triangles[3*i+2]
-        v[3*0+0] = 0
-        v[3*0+1] = 0
-        v[3*0+2] = 0
-        v[3*1+0] = verts[3*t0+0]
-        v[3*1+1] = verts[3*t0+1]
-        v[3*1+2] = verts[3*t0+2]
-        v[3*2+0] = verts[3*t1+0]
-        v[3*2+1] = verts[3*t1+1]
-        v[3*2+2] = verts[3*t1+2]
-        v[3*3+0] = verts[3*t2+0]
-        v[3*3+1] = verts[3*t2+1]
-        v[3*3+2] = verts[3*t2+2]
-
-        offset = 50*i
-        content[offset:offset+48] = v.view(np.uint8)
-        content[offset+48:offset+50] = pad
-
-
-def write(name, verts, triangles):
-    """
-    Save a triangulated raster to a standard STL file.
-    Windows has a default STL viewer and probably all 3D viewers have native support for it
-    because of its simplicity. Can be used to verify the correctness of the algorithm or
-    to visualize the mesh to get a notion of the size/complexity etc.
-    @param name - The name of the mesh file we're going to save. Should end in .stl
-    @param verts - A numpy array containing all the vertices of the mesh. Format is 3 float32 per vertex (vertex buffer)
-    @param triangles - A numpy array containing all the triangles of the mesh. Format is 3 int32 per triangle (index buffer)
-    """
-    ib = triangles
-    vb = verts
-    if isinstance(ib, cupy.ndarray):
-        ib = cupy.asnumpy(ib)
-    if isinstance(vb, cupy.ndarray):
-        vb = cupy.asnumpy(vb)
-
-    header = np.zeros(80, np.uint8)
-    nf = np.empty(1, np.uint32)
-    numTris = triangles.shape[0] // 3
-    nf[0] = numTris
-    f = open(name, 'wb')
-    f.write(header)
-    f.write(nf)
-
-    # size of 1 triangle in STL is 50 bytes
-    # 12 floats (each 4 bytes) for a total of 48
-    # And additional 2 bytes for padding
-    content = np.empty(numTris*(50), np.uint8)
-    fillContents(content, vb, ib, numTris)
-    f.write(content)
-    f.close()
diff --git a/examples/viewshed.py b/examples/viewshed.py
deleted file mode 100644
index 986c1b9..0000000
--- a/examples/viewshed.py
+++ /dev/null
@@ -1,222 +0,0 @@
-from numba import cuda
-import numpy as np
-import numba as nb
-
-import cupy
-import xarray as xr
-from rtxpy import RTX
-import math
-
-from typing import Union
-
-from scipy.spatial.transform import Rotation as R
-
-from cuda_utils import *
-import mesh_utils
-
-# view options default values
-OBS_ELEV = 0
-TARGET_ELEV = 0
-
-# if a cell is invisible, its value is set to -1
-INVISIBLE = -1
-
-@cuda.jit
-def _generatePrimaryRays(data, x_coords, y_coords, H, W):
-    i, j = cuda.grid(2)
-    if i>=0 and i < H and j>=0 and j < W:
-#        if (j == W-1):
-#            data[i,j,0] = y_coords[j] - 1e-6
-#        else:
-#            data[i,j,0] = y_coords[j] + 1e-6
-#
-#        if (i == H-1):
-#            data[i,j,1] = x_coords[i] - 1e-6
-#        else:
-#            data[i,j,1] = x_coords[i] + 1e-6
-
-        if (j == W-1):
-            data[i,j,0] = j - 1e-3
-        else:
-            data[i,j,0] = j + 1e-3
-
-        if (i == H-1):
-            data[i,j,1] = i - 1e-3
-        else:
-            data[i,j,1] = i + 1e-3
-
-        data[i,j,2] = 10000 # Location of the camera (height)
-        data[i,j,3] = 1e-3
-        data[i,j,4] = 0
-        data[i,j,5] = 0
-        data[i,j,6] = -1
-        data[i,j,7] = np.inf
-
-def generatePrimaryRays(rays, x_coords, y_coords, H, W):
-    griddim, blockdim = calc_dims((H, W))
-    d_y_coords = cupy.array(y_coords)
-    d_x_coords = cupy.array(x_coords)
-    _generatePrimaryRays[griddim, blockdim](rays, d_x_coords, d_y_coords, H, W)
-    return 0
-
-@cuda.jit
-def _generateViewshedRays(camRays, hits, vsrays, visibility_grid, H, W, vp):
-    i, j = cuda.grid(2)
-    if i>=0 and i < H and j>=0 and j < W:
-        elevationOffset = vp[2]
-        targetElevation = vp[3]
-        dist = hits[i,j,0]  # distance to surface from camera
-        norm = make_float3(hits[i,j], 1)  # normal vector at intersection with surface
-        #inorm = norm
-        if (norm[2] < 0):  # if back hit, face forward
-            norm = invert(norm)
-        cameraRay = camRays[i,j]
-        rayOrigin = make_float3(cameraRay, 0)  # get the camera ray origin
-        rayDir    = make_float3(cameraRay, 4)  # get the camera ray direction
-        hitP = add(rayOrigin, mul(rayDir,dist))  # calculate intersection point
-        newOrigin = add(hitP, mul(norm, 1e-3))  # generate new ray origin, and a little offset to avoid self-intersection
-        newOrigin = add(newOrigin, float3(0, 0, targetElevation))  # move the new origin up by the selected by user targetElevation factor
-
-        w = int(vp[0])
-        h = int(vp[1])
-        viewshedRay = camRays[h,w]  # get the camera ray that was cast for the location of the viewshed origin
-        dist = hits[h,w,0]  # get the distance from the camera to the viewshed point
-        rayOrigin = make_float3(viewshedRay, 0)  # get the origin on the camera of the ray towards VP point
-        rayDir    = make_float3(viewshedRay, 4)  # get the direction from camera to VP point
-        hitP = add(rayOrigin, mul(rayDir,dist))  # calculate distance from camera to VP
-        viewshedPoint = add(hitP, float3(0, 0, elevationOffset))  # calculate the VP location on the surface and add the VP offset
-
-        newDir = diff(viewshedPoint, newOrigin)  # calculate vector from SurfaceHit to VP
-        length = math.sqrt(dot(newDir, newDir))  # calculate distance from surface to VP
-        newDir = mul(newDir, 1/length)  # normalize the direction (vector v)
-
-        cosine = dot(norm, newDir)  # cosine of the angle between n and v
-        theta = math.acos(cosine)  # Cosine angle in radians
-        theta = (180*theta)/math.pi  # Cosine angle in degrees
-
-        # prepare a viewshed ray to cast to determine visibility
-        vsray = vsrays[i,j]
-        vsray[0] = newOrigin[0]
-        vsray[1] = newOrigin[1]
-        vsray[2] = newOrigin[2]
-        vsray[3] = 0
-        vsray[4] = newDir[0]
-        vsray[5] = newDir[1]
-        vsray[6] = newDir[2]
-        vsray[7] = length
-
-        visibility_grid[i,j] = theta
-
-def generateViewshedRays(rays, hits, vsrays, visibility_grid, H, W, vp):
-    griddim, blockdim = calc_dims((H, W))
-    _generateViewshedRays[griddim, blockdim](rays, hits, vsrays, visibility_grid, H, W, vp)
-    return 0
-
-@cuda.jit
-def _calcViewshed(hits, visibility_grid, H, W):
-    i, j = cuda.grid(2)
-    if i>=0 and i < H and j>=0 and j < W:
-        dist = hits[i,j,0]
-        # We traced the viewshed rays and now hits contains the intersection data
-        # if dist > 0, then we were able to hit something along the length of the ray
-        # which means that the pixel we targeted is not directly visible from the view point
-        if dist>=0:
-            visibility_grid[i,j] = INVISIBLE
-
-def calcViewshed(hits, visibility_grid, H, W):
-    griddim, blockdim = calc_dims((H, W))
-    _calcViewshed[griddim, blockdim](hits, visibility_grid, H, W)
-    return 0
-
-def viewshed_rt(raster: xr.DataArray,
-             optix: RTX,
-             x: Union[int, float],
-             y: Union[int, float],
-             observer_elev: float = OBS_ELEV,
-             target_elev: float = TARGET_ELEV) -> xr.DataArray:
-
-    H,W = raster.shape
-
-    y_coords = raster.indexes.get('y').values
-    x_coords = raster.indexes.get('x').values
-
-    # validate x arg
-    if x < x_coords.min():
-        raise ValueError("x argument outside of raster x_range")
-    elif x > x_coords.max():
-        raise ValueError("x argument outside of raster x_range")
-
-    # validate y arg
-    if y < y_coords.min():
-        raise ValueError("y argument outside of raster y_range")
-    elif y > y_coords.max():
-        raise ValueError("y argument outside of raster y_range")
-
-    selection = raster.sel(x=[x], y=[y], method='nearest')
-    x = selection.x.values[0]
-    y = selection.y.values[0]
-
-    y_view = np.where(y_coords == y)[0][0]
-    x_view = np.where(x_coords == x)[0][0]
-
-    # Device buffers
-    d_rays = cupy.empty((H,W,8), np.float32)
-    d_hits = cupy.empty((H,W,4), np.float32)
-    d_visgrid = cupy.empty((H,W), np.float32)
-    d_vsrays  = cupy.empty((H,W,8), np.float32)
-
-    generatePrimaryRays(d_rays, x_coords, y_coords, H, W)
-    device = cupy.cuda.Device(0)
-    device.synchronize()
-    res = optix.trace(d_rays, d_hits, W*H)
-
-    generateViewshedRays(d_rays, d_hits, d_vsrays, d_visgrid, H, W, (x_view, y_view, observer_elev, target_elev))
-    device.synchronize()
-    res = optix.trace(d_vsrays, d_hits, W*H)
-
-    calcViewshed(d_hits, d_visgrid, H, W)
-
-    if isinstance(raster.data, np.ndarray):
-        visgrid = cupy.asnumpy(d_visgrid)
-    else:
-        visgrid = d_visgrid
-
-    view = xr.DataArray(visgrid,
-                    name="viewshed",
-                    coords=raster.coords,
-                    dims=raster.dims,
-                    attrs=raster.attrs)
-    return view
-
-def viewshed_gpu(raster: xr.DataArray,
-             x: Union[int, float],
-             y: Union[int, float],
-             observer_elev: float = OBS_ELEV,
-             target_elev: float = TARGET_ELEV) -> xr.DataArray:
-    if not isinstance(raster.data, cupy.ndarray):
-        raise ValueError("raster.data must be a cupy array")
-
-    H,W = raster.shape
-    optix = RTX()
-
-    datahash = np.uint64(hash(str(raster.data.get())) % (1 << 64))
-    optixhash = np.uint64(optix.getHash())
-    if (optixhash != datahash):
-        numTris = (H - 1) * (W - 1) * 2
-        verts = cupy.empty(H * W * 3, np.float32)
-        triangles = cupy.empty(numTris * 3, np.int32)
-
-        # Generate a mesh from the terrain (buffers are on the GPU, so generation happens also on GPU)
-        res = mesh_utils.triangulateTerrain(verts, triangles, raster)
-        if res:
-            raise ValueError("Failed to generate mesh from terrain. Error code:{}".format(res))
-        res = optix.build(datahash, verts, triangles)
-        if res:
-            raise ValueError("OptiX failed to build GAS with error code:{}".format(res))
-        #Clear some GPU memory that we no longer need
-        verts = None
-        triangles = None
-        cupy.get_default_memory_pool().free_all_blocks()
-
-    view = viewshed_rt(raster, optix, x, y, observer_elev, target_elev)
-    return view
diff --git a/rtxpy/rtx.py b/rtxpy/rtx.py
index 73978ea..8a866ee 100644
--- a/rtxpy/rtx.py
+++ b/rtxpy/rtx.py
@@ -322,7 +322,7 @@ def _init_optix(device: Optional[int] = None):
     pipeline_options = optix.PipelineCompileOptions(
         usesMotionBlur=False,
         traversableGraphFlags=optix.TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY,
-        numPayloadValues=4,
+        numPayloadValues=6,  # t, nx, ny, nz, primitive_id, instance_id
         numAttributeValues=2,
         exceptionFlags=optix.EXCEPTION_FLAG_NONE,
         pipelineLaunchParamsVariableName="params",
@@ -403,7 +403,7 @@ def _init_optix(device: Optional[int] = None):
     _create_sbt()
 
     # Allocate params buffer (24 bytes: handle(8) + rays_ptr(8) + hits_ptr(8))
-    _state.d_params = cupy.zeros(24, dtype=cupy.uint8)
+    _state.d_params = cupy.zeros(40, dtype=cupy.uint8)  # 5 pointers * 8 bytes
 
     _state.initialized = True
     atexit.register(_cleanup_at_exit)
@@ -771,7 +771,7 @@ def _build_accel(hash_value: int, vertices, indices) -> int:
 # Ray tracing
 # -----------------------------------------------------------------------------
 
-def _trace_rays(rays, hits, num_rays: int) -> int:
+def _trace_rays(rays, hits, num_rays: int, primitive_ids=None, instance_ids=None) -> int:
     """
     Trace rays against the current acceleration structure.
 
@@ -782,6 +782,11 @@ def _trace_rays(rays, hits, num_rays: int) -> int:
         rays: Ray buffer (Nx8 float32: ox,oy,oz,tmin,dx,dy,dz,tmax)
         hits: Hit buffer (Nx4 float32: t,nx,ny,nz)
         num_rays: Number of rays to trace
+        primitive_ids: Optional output buffer (Nx1 int32) for triangle indices.
+                       -1 indicates a miss.
+        instance_ids: Optional output buffer (Nx1 int32) for geometry/instance indices.
+                      -1 indicates a miss. Useful in multi-GAS mode to identify
+                      which geometry was hit.
 
     Returns:
         0 on success, non-zero on error
@@ -808,6 +813,12 @@ def _trace_rays(rays, hits, num_rays: int) -> int:
     if rays.size != num_rays * 8 or hits.size != num_rays * 4:
         return -1
 
+    # Validate optional buffers
+    if primitive_ids is not None and primitive_ids.size != num_rays:
+        return -1
+    if instance_ids is not None and instance_ids.size != num_rays:
+        return -1
+
     # Ensure rays are on GPU
     if isinstance(rays, cupy.ndarray):
         d_rays = rays
@@ -835,12 +846,38 @@ def _trace_rays(rays, hits, num_rays: int) -> int:
         d_hits = _state.d_hits
         hits_on_host = True
 
-    # Pack params: handle(8 bytes) + rays_ptr(8 bytes) + hits_ptr(8 bytes)
+    # Handle optional primitive_ids buffer
+    d_prim_ids_ptr = 0
+    prim_ids_on_host = False
+    if primitive_ids is not None:
+        if isinstance(primitive_ids, cupy.ndarray):
+            d_prim_ids = primitive_ids
+            prim_ids_on_host = False
+        else:
+            d_prim_ids = cupy.zeros(num_rays, dtype=cupy.int32)
+            prim_ids_on_host = True
+        d_prim_ids_ptr = d_prim_ids.data.ptr
+
+    # Handle optional instance_ids buffer
+    d_inst_ids_ptr = 0
+    inst_ids_on_host = False
+    if instance_ids is not None:
+        if isinstance(instance_ids, cupy.ndarray):
+            d_inst_ids = instance_ids
+            inst_ids_on_host = False
+        else:
+            d_inst_ids = cupy.zeros(num_rays, dtype=cupy.int32)
+            inst_ids_on_host = True
+        d_inst_ids_ptr = d_inst_ids.data.ptr
+
+    # Pack params: handle(8) + rays_ptr(8) + hits_ptr(8) + prim_ids_ptr(8) + inst_ids_ptr(8)
     params_data = struct.pack(
-        'QQQ',
+        'QQQQQ',
         trace_handle,
         d_rays.data.ptr,
         d_hits.data.ptr,
+        d_prim_ids_ptr,
+        d_inst_ids_ptr,
     )
     _state.d_params[:] = cupy.frombuffer(np.frombuffer(params_data, dtype=np.uint8), dtype=cupy.uint8)
 
@@ -849,17 +886,23 @@ def _trace_rays(rays, hits, num_rays: int) -> int:
         _state.pipeline,
         0,  # stream
         _state.d_params.data.ptr,
-        24,  # sizeof(Params)
+        40,  # sizeof(Params): 5 pointers * 8 bytes
         _state.sbt,
         num_rays,  # width
         1,  # height
         1,  # depth
     )
 
-    # Copy results back if hits was on host
-    if hits_on_host:
+    # Copy results back if buffers were on host
+    if hits_on_host or prim_ids_on_host or inst_ids_on_host:
         cupy.cuda.Stream.null.synchronize()
+
+    if hits_on_host:
         hits[:] = d_hits.get()
+    if prim_ids_on_host:
+        primitive_ids[:] = d_prim_ids.get()
+    if inst_ids_on_host:
+        instance_ids[:] = d_inst_ids.get()
 
     return 0
 
@@ -936,7 +979,7 @@ def getHash(self) -> int:
         """
         return _state.current_hash
 
-    def trace(self, rays, hits, numRays: int) -> int:
+    def trace(self, rays, hits, numRays: int, primitive_ids=None, instance_ids=None) -> int:
         """
         Trace rays against the current acceleration structure.
 
@@ -948,11 +991,17 @@ def trace(self, rays, hits, numRays: int) -> int:
             hits: Hit buffer (4 float32 per hit: t,nx,ny,nz)
                   t=-1 indicates a miss
             numRays: Number of rays to trace
+            primitive_ids: Optional output buffer (numRays x int32) for triangle indices.
+                           Will contain the index of the hit triangle within its geometry,
+                           or -1 for rays that missed.
+            instance_ids: Optional output buffer (numRays x int32) for geometry/instance indices.
+                          Will contain the instance ID of the hit geometry, or -1 for misses.
+                          Useful in multi-GAS mode to identify which geometry was hit.
 
         Returns:
             0 on success, non-zero on error
         """
-        return _trace_rays(rays, hits, numRays)
+        return _trace_rays(rays, hits, numRays, primitive_ids, instance_ids)
 
     # -------------------------------------------------------------------------
     # Multi-GAS API
diff --git a/rtxpy/tests/test_simple.py b/rtxpy/tests/test_simple.py
index 42b4ff0..6dfb463 100644
--- a/rtxpy/tests/test_simple.py
+++ b/rtxpy/tests/test_simple.py
@@ -927,3 +927,252 @@ def test_cupy_buffers_multi_gas(test_cupy):
         hits_np = hits
 
     np.testing.assert_almost_equal(hits_np[0], 10.0, decimal=1)
+
+
+# =============================================================================
+# Primitive ID and Instance ID Tests
+# =============================================================================
+
+@pytest.mark.parametrize("test_cupy", [False, True])
+def test_primitive_ids_single_gas(test_cupy):
+    """Test primitive_ids returns correct triangle indices in single-GAS mode."""
+    if test_cupy:
+        if not has_cupy:
+            pytest.skip("cupy not available")
+        import cupy
+        backend = cupy
+    else:
+        backend = np
+
+    rtx = RTX()
+    rtx.clear_scene()
+
+    # Create a mesh with 2 triangles (a quad)
+    # Triangle 0: vertices 0,1,2 (bottom-left triangle)
+    # Triangle 1: vertices 2,1,3 (top-right triangle)
+    verts = backend.float32([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0])
+    triangles = backend.int32([0, 1, 2, 2, 1, 3])
+
+    res = rtx.build(0, verts, triangles)
+    assert res == 0
+
+    # Ray hitting triangle 0 (bottom-left area)
+    rays = backend.float32([0.25, 0.25, 10, 0, 0, 0, -1, 1000])
+    hits = backend.float32([0, 0, 0, 0])
+    prim_ids = backend.int32([0])
+
+    res = rtx.trace(rays, hits, 1, primitive_ids=prim_ids)
+    assert res == 0
+    assert float(hits[0]) > 0  # Should hit
+    assert int(prim_ids[0]) == 0  # Should hit triangle 0
+
+    # Ray hitting triangle 1 (top-right area)
+    rays2 = backend.float32([0.75, 0.75, 10, 0, 0, 0, -1, 1000])
+    hits2 = backend.float32([0, 0, 0, 0])
+    prim_ids2 = backend.int32([0])
+
+    res = rtx.trace(rays2, hits2, 1, primitive_ids=prim_ids2)
+    assert res == 0
+    assert float(hits2[0]) > 0  # Should hit
+    assert int(prim_ids2[0]) == 1  # Should hit triangle 1
+
+
+@pytest.mark.parametrize("test_cupy", [False, True])
+def test_primitive_ids_miss(test_cupy):
+    """Test primitive_ids returns -1 for misses."""
+    if test_cupy:
+        if not has_cupy:
+            pytest.skip("cupy not available")
+        import cupy
+        backend = cupy
+    else:
+        backend = np
+
+    rtx = RTX()
+    rtx.clear_scene()
+
+    verts = backend.float32([0, 0, 0, 1, 0, 0, 0.5, 1, 0])
+    triangles = backend.int32([0, 1, 2])
+
+    res = rtx.build(0, verts, triangles)
+    assert res == 0
+
+    # Ray that misses
+    rays = backend.float32([100, 100, 10, 0, 0, 0, -1, 1000])
+    hits = backend.float32([0, 0, 0, 0])
+    prim_ids = backend.int32([999])  # Initialize with non-zero to verify it's set
+
+    res = rtx.trace(rays, hits, 1, primitive_ids=prim_ids)
+    assert res == 0
+    assert float(hits[0]) == -1.0  # Miss
+    assert int(prim_ids[0]) == -1  # primitive_id should be -1 for miss
+
+
+@pytest.mark.parametrize("test_cupy", [False, True])
+def test_instance_ids_multi_gas(test_cupy):
+    """Test instance_ids returns correct geometry indices in multi-GAS mode."""
+    if test_cupy:
+        if not has_cupy:
+            pytest.skip("cupy not available")
+        import cupy
+        backend = cupy
+    else:
+        backend = np
+
+    rtx = RTX()
+    rtx.clear_scene()
+
+    # Small triangle
+    verts = backend.float32([0, 0, 0, 1, 0, 0, 0.5, 1, 0])
+    tris = backend.int32([0, 1, 2])
+
+    # Add 3 geometries at different X positions
+    rtx.add_geometry("mesh0", verts, tris, transform=[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0])
+    rtx.add_geometry("mesh1", verts, tris, transform=[1, 0, 0, 5, 0, 1, 0, 0, 0, 0, 1, 0])
+    rtx.add_geometry("mesh2", verts, tris, transform=[1, 0, 0, 10, 0, 1, 0, 0, 0, 0, 1, 0])
+
+    # Ray hitting mesh0 (at x=0.5)
+    rays0 = backend.float32([0.5, 0.33, 10, 0, 0, 0, -1, 1000])
+    hits0 = backend.float32([0, 0, 0, 0])
+    prim_ids0 = backend.int32([0])
+    inst_ids0 = backend.int32([0])
+
+    res = rtx.trace(rays0, hits0, 1, primitive_ids=prim_ids0, instance_ids=inst_ids0)
+    assert res == 0
+    assert float(hits0[0]) > 0
+    assert int(inst_ids0[0]) == 0  # Should hit mesh0
+
+    # Ray hitting mesh1 (at x=5.5)
+    rays1 = backend.float32([5.5, 0.33, 10, 0, 0, 0, -1, 1000])
+    hits1 = backend.float32([0, 0, 0, 0])
+    prim_ids1 = backend.int32([0])
+    inst_ids1 = backend.int32([0])
+
+    res = rtx.trace(rays1, hits1, 1, primitive_ids=prim_ids1, instance_ids=inst_ids1)
+    assert res == 0
+    assert float(hits1[0]) > 0
+    assert int(inst_ids1[0]) == 1  # Should hit mesh1
+
+    # Ray hitting mesh2 (at x=10.5)
+    rays2 = backend.float32([10.5, 0.33, 10, 0, 0, 0, -1, 1000])
+    hits2 = backend.float32([0, 0, 0, 0])
+    prim_ids2 = backend.int32([0])
+    inst_ids2 = backend.int32([0])
+
+    res = rtx.trace(rays2, hits2, 1, primitive_ids=prim_ids2, instance_ids=inst_ids2)
+    assert res == 0
+    assert float(hits2[0]) > 0
+    assert int(inst_ids2[0]) == 2  # Should hit mesh2
+
+
+@pytest.mark.parametrize("test_cupy", [False, True])
+def test_instance_ids_miss(test_cupy):
+    """Test instance_ids returns -1 for misses in multi-GAS mode."""
+    if test_cupy:
+        if not has_cupy:
+            pytest.skip("cupy not available")
+        import cupy
+        backend = cupy
+    else:
+        backend = np
+
+    rtx = RTX()
+    rtx.clear_scene()
+
+    verts = backend.float32([0, 0, 0, 1, 0, 0, 0.5, 1, 0])
+    tris = backend.int32([0, 1, 2])
+
+    rtx.add_geometry("mesh", verts, tris)
+
+    # Ray that misses
+    rays = backend.float32([100, 100, 10, 0, 0, 0, -1, 1000])
+    hits = backend.float32([0, 0, 0, 0])
+    prim_ids = backend.int32([999])
+    inst_ids = backend.int32([999])
+
+    res = rtx.trace(rays, hits, 1, primitive_ids=prim_ids, instance_ids=inst_ids)
+    assert res == 0
+    assert float(hits[0]) == -1.0  # Miss
+    assert int(prim_ids[0]) == -1
+    assert int(inst_ids[0]) == -1
+
+
+@pytest.mark.parametrize("test_cupy", [False, True])
+def test_primitive_ids_multiple_rays(test_cupy):
+    """Test primitive_ids with multiple rays hitting different triangles."""
+    if test_cupy:
+        if not has_cupy:
+            pytest.skip("cupy not available")
+        import cupy
+        backend = cupy
+    else:
+        backend = np
+
+    rtx = RTX()
+    rtx.clear_scene()
+
+    # Create a mesh with 2 triangles
+    verts = backend.float32([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0])
+    triangles = backend.int32([0, 1, 2, 2, 1, 3])
+
+    res = rtx.build(0, verts, triangles)
+    assert res == 0
+
+    # 3 rays: one hitting triangle 0, one hitting triangle 1, one missing
+    rays = backend.float32([
+        0.25, 0.25, 10, 0, 0, 0, -1, 1000,  # hits triangle 0
+        0.75, 0.75, 10, 0, 0, 0, -1, 1000,  # hits triangle 1
+        100, 100, 10, 0, 0, 0, -1, 1000,    # misses
+    ])
+    hits = backend.float32([0] * 12)  # 3 rays * 4 floats
+    prim_ids = backend.int32([0, 0, 0])
+
+    res = rtx.trace(rays, hits, 3, primitive_ids=prim_ids)
+    assert res == 0
+
+    # Check results
+    assert int(prim_ids[0]) == 0   # First ray hits triangle 0
+    assert int(prim_ids[1]) == 1   # Second ray hits triangle 1
+    assert int(prim_ids[2]) == -1  # Third ray misses
+
+
+@pytest.mark.parametrize("test_cupy", [False, True])
+def test_primitive_and_instance_ids_optional(test_cupy):
+    """Test that primitive_ids and instance_ids are truly optional."""
+    if test_cupy:
+        if not has_cupy:
+            pytest.skip("cupy not available")
+        import cupy
+        backend = cupy
+    else:
+        backend = np
+
+    rtx = RTX()
+    rtx.clear_scene()
+
+    verts = backend.float32([0, 0, 0, 1, 0, 0, 0.5, 1, 0])
+    triangles = backend.int32([0, 1, 2])
+
+    res = rtx.build(0, verts, triangles)
+    assert res == 0
+
+    rays = backend.float32([0.5, 0.33, 10, 0, 0, 0, -1, 1000])
+    hits = backend.float32([0, 0, 0, 0])
+
+    # Should work without any optional params
+    res = rtx.trace(rays, hits, 1)
+    assert res == 0
+    np.testing.assert_almost_equal(float(hits[0]), 10.0, decimal=1)
+
+    # Should work with only primitive_ids
+    hits = backend.float32([0, 0, 0, 0])
+    prim_ids = backend.int32([0])
+    res = rtx.trace(rays, hits, 1, primitive_ids=prim_ids)
+    assert res == 0
+    assert int(prim_ids[0]) == 0
+
+    # Should work with only instance_ids
+    hits = backend.float32([0, 0, 0, 0])
+    inst_ids = backend.int32([0])
+    res = rtx.trace(rays, hits, 1, instance_ids=inst_ids)
+    assert res == 0