diff --git a/pyproject.toml b/pyproject.toml
index b1e1b8d..449a7c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rtxpy"
-version = "0.0.4"
+version = "0.0.5"
 description = "Ray tracing using CUDA accessible from Python"
 readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.10"
@@ -36,6 +36,7 @@ dependencies = [
 # Note: otk-pyoptix must be installed separately from NVIDIA
 # See: https://github.com/NVIDIA/otk-pyoptix
 tests = ["pytest"]
+terrain = ["numba", "xarray", "scipy"]
 
 [project.urls]
 Homepage = "https://github.com/makepath/rtxpy"
diff --git a/rtxpy/__init__.py b/rtxpy/__init__.py
index b7083aa..df4a355 100644
--- a/rtxpy/__init__.py
+++ b/rtxpy/__init__.py
@@ -1,11 +1,6 @@
-from .rtx import (
-    RTX,
-    has_cupy,
-    get_device_count,
-    get_device_properties,
-    list_devices,
-    get_current_device,
-)
+from .rtx import RTX, has_cupy
+from . import mesh
+from . import viewshed
+from . import hillshade
 
-
-__version__ = "0.0.4"
+__version__ = "0.0.5"
diff --git a/rtxpy/_cuda_utils.py b/rtxpy/_cuda_utils.py
new file mode 100644
index 0000000..2ee3cc6
--- /dev/null
+++ b/rtxpy/_cuda_utils.py
@@ -0,0 +1,64 @@
+"""
+Internal CUDA utility functions for rtxpy terrain analysis.
+
+This module provides low-level CUDA device functions for vector math operations
+used by viewshed and hillshade computations.
+"""
+
+from numba import cuda
+import numpy as np
+
+
+def calc_dims(shape):
+    """Calculate CUDA grid and block dimensions for a 2D array."""
+    threadsperblock = (32, 32)
+    blockspergrid = (
+        (shape[0] + (threadsperblock[0] - 1)) // threadsperblock[0],
+        (shape[1] + (threadsperblock[1] - 1)) // threadsperblock[1]
+    )
+    return blockspergrid, threadsperblock
+
+
+@cuda.jit(device=True)
+def add(a, b):
+    return float3(a[0]+b[0], a[1]+b[1], a[2]+b[2])
+
+
+@cuda.jit(device=True)
+def diff(a, b):
+    return float3(a[0]-b[0], a[1]-b[1], a[2]-b[2])
+
+
+@cuda.jit(device=True)
+def mul(a, b):
+    return float3(a[0]*b, a[1]*b, a[2]*b)
+
+
+@cuda.jit(device=True)
+def multColor(a, b):
+    return float3(a[0]*b[0], a[1]*b[1], a[2]*b[2])
+
+
+@cuda.jit(device=True)
+def dot(a, b):
+    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
+
+
+@cuda.jit(device=True)
+def mix(a, b, k):
+    return add(mul(a, k), mul(b, 1-k))
+
+
+@cuda.jit(device=True)
+def make_float3(a, offset):
+    return float3(a[offset], a[offset+1], a[offset+2])
+
+
+@cuda.jit(device=True)
+def invert(a):
+    return float3(-a[0], -a[1], -a[2])
+
+
+@cuda.jit(device=True)
+def float3(a, b, c):
+    return (np.float32(a), np.float32(b), np.float32(c))
diff --git a/rtxpy/hillshade.py b/rtxpy/hillshade.py
new file mode 100644
index 0000000..9eb7e85
--- /dev/null
+++ b/rtxpy/hillshade.py
@@ -0,0 +1,290 @@
+"""
+GPU-accelerated hillshade rendering using ray tracing.
+
+This module provides functions for computing hillshade (terrain shading)
+on raster data using NVIDIA OptiX ray tracing for optional shadow casting.
+
+Functions
+---------
+hillshade
+    Compute hillshade with optional ray-traced shadows.
+hillshade_with_context
+    Compute hillshade using an existing RTX context.
+"""
+
+import numpy as np
+from numba import cuda
+from typing import Optional
+
+import cupy
+import xarray as xr
+from scipy.spatial.transform import Rotation as R
+
+from .rtx import RTX
+from ._cuda_utils import calc_dims, make_float3, add, mul, dot, invert, float3
+from . import mesh
+
+
+@cuda.jit
+def _generatePrimaryRays(data, x_coords, y_coords, H, W):
+    """
+    Generate orthographic camera rays looking straight down at the terrain.
+    """
+    i, j = cuda.grid(2)
+    if i >= 0 and i < H and j >= 0 and j < W:
+        if (j == W-1):
+            data[i, j, 0] = j - 1e-3
+        else:
+            data[i, j, 0] = j + 1e-3
+
+        if (i == H-1):
+            data[i, j, 1] = i - 1e-3
+        else:
+            data[i, j, 1] = i + 1e-3
+
+        data[i, j, 2] = 10000  # Camera height
+        data[i, j, 3] = 1e-3
+        data[i, j, 4] = 0
+        data[i, j, 5] = 0
+        data[i, j, 6] = -1
+        data[i, j, 7] = np.inf
+
+
+def _generatePrimaryRaysWrapper(rays, x_coords, y_coords, H, W):
+    griddim, blockdim = calc_dims((H, W))
+    _generatePrimaryRays[griddim, blockdim](rays, x_coords, y_coords, H, W)
+    return 0
+
+
+@cuda.jit
+def _generateShadowRays(rays, hits, normals, H, W, sunDir):
+    """
+    Generate shadow rays from surface intersection points toward the sun.
+    """
+    i, j = cuda.grid(2)
+    if i >= 0 and i < H and j >= 0 and j < W:
+        dist = hits[i, j, 0]
+        norm = make_float3(hits[i, j], 1)
+        if (norm[2] < 0):
+            norm = invert(norm)
+        ray = rays[i, j]
+        rayOrigin = make_float3(ray, 0)
+        rayDir = make_float3(ray, 4)
+        p = add(rayOrigin, mul(rayDir, dist))
+
+        newOrigin = add(p, mul(norm, 1e-3))
+        ray[0] = newOrigin[0]
+        ray[1] = newOrigin[1]
+        ray[2] = newOrigin[2]
+        ray[3] = 1e-3
+        ray[4] = sunDir[0]
+        ray[5] = sunDir[1]
+        ray[6] = sunDir[2]
+        ray[7] = np.inf if dist > 0 else 0
+
+        normals[i, j, 0] = norm[0]
+        normals[i, j, 1] = norm[1]
+        normals[i, j, 2] = norm[2]
+
+
+def _generateShadowRaysWrapper(rays, hits, normals, H, W, sunDir):
+    griddim, blockdim = calc_dims((H, W))
+    _generateShadowRays[griddim, blockdim](rays, hits, normals, H, W, sunDir)
+    return 0
+
+
+@cuda.jit
+def _shadeLambert(hits, normals, output, H, W, sunDir, castShadows):
+    """
+    Apply Lambertian shading with optional shadow darkening.
+    """
+    i, j = cuda.grid(2)
+    if i >= 0 and i < H and j >= 0 and j < W:
+        norm = make_float3(normals[i, j], 0)
+        light_dir = make_float3(sunDir, 0)
+        cos_theta = dot(light_dir, norm)
+
+        temp = (cos_theta + 1) / 2
+
+        if castShadows and hits[i, j, 0] >= 0:
+            temp = temp / 2
+
+        if temp > 1:
+            temp = 1
+        elif temp < 0:
+            temp = 0
+
+        output[i, j] = temp
+
+
+def _shadeLambertWrapper(hits, normals, output, H, W, sunDir, castShadows):
+    griddim, blockdim = calc_dims((H, W))
+    _shadeLambert[griddim, blockdim](hits, normals, output, H, W, sunDir, castShadows)
+    return 0
+
+
+def _getSunDir(angle_altitude, azimuth):
+    """
+    Calculate the vector towards the sun based on altitude angle and azimuth.
+    """
+    north = (0, 1, 0)
+    rx = R.from_euler('x', angle_altitude, degrees=True)
+    rz = R.from_euler('z', azimuth+180, degrees=True)
+    sunDir = rx.apply(north)
+    sunDir = rz.apply(sunDir)
+    return sunDir
+
+
+def hillshade_with_context(raster: xr.DataArray,
+                           rtx: RTX,
+                           shadows: bool = False,
+                           azimuth: int = 225,
+                           angle_altitude: int = 25,
+                           name: Optional[str] = 'hillshade') -> xr.DataArray:
+    """
+    Compute hillshade using an existing RTX context.
+
+    This function performs hillshade rendering with optional ray-traced shadows
+    using an existing RTX context that already has terrain geometry built.
+
+    Parameters
+    ----------
+    raster : xr.DataArray
+        Input terrain raster with x and y coordinates.
+    rtx : RTX
+        Pre-configured RTX instance with terrain geometry already built.
+    shadows : bool, optional
+        Whether to compute ray-traced shadows. Default is False.
+    azimuth : int, optional
+        Sun azimuth angle in degrees (0=North, 90=East). Default is 225 (SW).
+    angle_altitude : int, optional
+        Sun altitude angle in degrees above horizon. Default is 25.
+    name : str, optional
+        Name for the output DataArray. Default is 'hillshade'.
+
+    Returns
+    -------
+    xr.DataArray
+        Hillshade result with values from 0 (dark) to 1 (bright).
+    """
+    H, W = raster.shape
+    sunDir = cupy.array(_getSunDir(angle_altitude, azimuth))
+
+    # Device buffers
+    d_rays = cupy.empty((H, W, 8), np.float32)
+    d_hits = cupy.empty((H, W, 4), np.float32)
+    d_aux = cupy.empty((H, W, 3), np.float32)
+    d_output = cupy.empty((H, W), np.float32)
+
+    y_coords = cupy.array(raster.indexes.get('y').values)
+    x_coords = cupy.array(raster.indexes.get('x').values)
+
+    _generatePrimaryRaysWrapper(d_rays, x_coords, y_coords, H, W)
+    device = cupy.cuda.Device(0)
+    device.synchronize()
+    rtx.trace(d_rays, d_hits, W*H)
+
+    _generateShadowRaysWrapper(d_rays, d_hits, d_aux, H, W, sunDir)
+    if shadows:
+        device.synchronize()
+        rtx.trace(d_rays, d_hits, W*H)
+
+    _shadeLambertWrapper(d_hits, d_aux, d_output, H, W, sunDir, shadows)
+
+    if isinstance(raster.data, np.ndarray):
+        output = cupy.asnumpy(d_output[:, :])
+        nanValue = np.nan
+    else:
+        output = d_output[:, :]
+        nanValue = cupy.nan
+
+    output[0, :] = nanValue
+    output[-1, :] = nanValue
+    output[:, 0] = nanValue
+    output[:, -1] = nanValue
+
+    hill = xr.DataArray(output,
+                        name=name,
+                        coords=raster.coords,
+                        dims=raster.dims,
+                        attrs=raster.attrs)
+    return hill
+
+
+def hillshade(raster: xr.DataArray,
+              shadows: bool = False,
+              azimuth: int = 225,
+              angle_altitude: int = 25,
+              name: Optional[str] = 'hillshade') -> xr.DataArray:
+    """
+    Compute hillshade with optional ray-traced shadows.
+
+    This function performs hillshade (terrain shading) rendering on a raster
+    DEM. It automatically builds the necessary ray tracing acceleration
+    structure if needed.
+
+    Parameters
+    ----------
+    raster : xr.DataArray
+        Input terrain raster as an xarray DataArray with x and y coordinates.
+        Should be a cupy array for best performance.
+    shadows : bool, optional
+        Whether to compute ray-traced shadows. When True, areas occluded from
+        the sun are darkened. Default is False.
+    azimuth : int, optional
+        Sun azimuth angle in degrees (0=North, 90=East, 180=South, 270=West).
+        Default is 225 (Southwest).
+    angle_altitude : int, optional
+        Sun altitude angle in degrees above the horizon. Default is 25.
+    name : str, optional
+        Name for the output DataArray. Default is 'hillshade'.
+
+    Returns
+    -------
+    xr.DataArray
+        Hillshade result with values from 0 (dark) to 1 (bright).
+        Edge pixels are set to NaN.
+
+    Raises
+    ------
+    ValueError
+        If mesh generation or OptiX build fails.
+
+    Examples
+    --------
+    >>> import rtxpy
+    >>> result = rtxpy.hillshade.hillshade(terrain_raster, shadows=True, azimuth=315)
+    """
+    if not isinstance(raster.data, cupy.ndarray):
+        print("WARNING: raster.data is not a cupy array. Additional overhead will be incurred")
+    H, W = raster.data.squeeze().shape
+    rtx = RTX()
+
+    datahash = np.uint64(hash(str(raster.data.get())) % (1 << 64))
+    optixhash = np.uint64(rtx.getHash())
+    if (optixhash != datahash):
+        numTris = (H - 1) * (W - 1) * 2
+        verts = cupy.empty(H * W * 3, np.float32)
+        triangles = cupy.empty(numTris * 3, np.int32)
+
+        # Generate mesh from terrain
+        res = mesh.triangulate_terrain(verts, triangles, raster)
+        if res:
+            raise ValueError("Failed to generate mesh from terrain. Error code:{}".format(res))
+        res = rtx.build(datahash, verts, triangles)
+        if res:
+            raise ValueError("OptiX failed to build GAS with error code:{}".format(res))
+        # Clear GPU memory no longer needed
+        verts = None
+        triangles = None
+        cupy.get_default_memory_pool().free_all_blocks()
+
+    hill = hillshade_with_context(raster, rtx, azimuth=azimuth,
+                                  angle_altitude=angle_altitude,
+                                  shadows=shadows, name=name)
+    return hill
+
+
+# Backwards compatibility aliases
+hillshade_gpu = hillshade
+hillshade_rt = hillshade_with_context
diff --git a/rtxpy/mesh.py b/rtxpy/mesh.py
new file mode 100644
index 0000000..af9bb82
--- /dev/null
+++ b/rtxpy/mesh.py
@@ -0,0 +1,197 @@
+"""
+Mesh utilities for rtxpy terrain processing.
+
+This module provides functions for triangulating terrain data and exporting
+meshes to STL format.
+
+Functions
+---------
+triangulate_terrain
+    Convert a 2D terrain array into a triangulated mesh.
+write_stl
+    Export a triangulated mesh to STL file format.
+"""
+
+import numba as nb
+from numba import cuda
+import numpy as np
+import cupy
+
+
+@cuda.jit
+def _triangulateTerrain(verts, triangles, data, H, W, scale, stride):
+    globalId = stride + cuda.grid(1)
+    if globalId < W*H:
+        h = globalId // W
+        w = globalId % W
+        meshMapIndex = h * W + w
+
+        val = data[h, w]
+
+        offset = 3*meshMapIndex
+        verts[offset] = w  # x_coords[w] # w
+        verts[offset+1] = h  # y_coords[h] # h
+        verts[offset+2] = val * scale
+
+        if w != W - 1 and h != H - 1:
+            offset = 6*(h * (W-1) + w)
+            triangles[offset+0] = np.int32(meshMapIndex + W)
+            triangles[offset+1] = np.int32(meshMapIndex + W + 1)
+            triangles[offset+2] = np.int32(meshMapIndex)
+            triangles[offset+3] = np.int32(meshMapIndex + W + 1)
+            triangles[offset+4] = np.int32(meshMapIndex + 1)
+            triangles[offset+5] = np.int32(meshMapIndex)
+
+
+@nb.njit(parallel=True)
+def _triangulateCPU(verts, triangles, data, H, W, scale):
+    for h in nb.prange(H):
+        for w in range(W):
+            meshMapIndex = h * W + w
+
+            val = data[h, w]
+
+            offset = 3*meshMapIndex
+            verts[offset] = w  # x_coords[w] # w
+            verts[offset+1] = h  # y_coords[h] # h
+            verts[offset+2] = val * scale
+
+            if w != W - 1 and h != H - 1:
+                offset = 6*(h * (W-1) + w)
+                triangles[offset+0] = np.int32(meshMapIndex + W)
+                triangles[offset+1] = np.int32(meshMapIndex + W + 1)
+                triangles[offset+2] = np.int32(meshMapIndex)
+                triangles[offset+3] = np.int32(meshMapIndex + W+1)
+                triangles[offset+4] = np.int32(meshMapIndex + 1)
+                triangles[offset+5] = np.int32(meshMapIndex)
+
+
+def triangulate_terrain(verts, triangles, terrain, scale=1):
+    """
+    Convert a 2D terrain array into a triangulated mesh.
+
+    This function populates pre-allocated vertex and triangle buffers with
+    mesh data generated from the terrain heightmap. The mesh is created by
+    generating two triangles for each cell in the terrain grid.
+
+    Parameters
+    ----------
+    verts : array-like
+        Pre-allocated buffer for vertex data. Should have shape (H * W * 3,)
+        where H and W are the terrain dimensions. Each vertex consists of
+        3 float32 values (x, y, z).
+    triangles : array-like
+        Pre-allocated buffer for triangle indices. Should have shape
+        ((H-1) * (W-1) * 2 * 3,) for the index buffer.
+    terrain : array-like
+        2D terrain heightmap with shape (H, W). Can be a numpy array,
+        cupy array, or xarray DataArray.
+    scale : float, optional
+        Scale factor for elevation values. Default is 1.
+
+    Returns
+    -------
+    int
+        0 on success.
+
+    Notes
+    -----
+    - If terrain.data is a numpy array, uses parallel CPU processing via numba.
+    - If terrain.data is a cupy array, uses GPU processing via CUDA kernels.
+    """
+    H, W = terrain.shape
+    if isinstance(terrain.data, np.ndarray):
+        _triangulateCPU(verts, triangles, terrain.data, H, W, scale)
+    if isinstance(terrain.data, cupy.ndarray):
+        jobSize = H*W
+        blockdim = 1024
+        griddim = (jobSize + blockdim - 1) // 1024
+        d = 100
+        offset = 0
+        while (jobSize > 0):
+            batch = min(d, griddim)
+            _triangulateTerrain[batch, blockdim](verts, triangles,
+                                                 terrain.data, H, W,
+                                                 scale, offset)
+            offset += batch*blockdim
+            jobSize -= batch*blockdim
+    return 0
+
+
+@nb.jit(nopython=True)
+def _fillContents(content, verts, triangles, numTris):
+    v = np.empty(12, np.float32)
+    pad = np.zeros(2, np.int8)
+    offset = 0
+    for i in range(numTris):
+        t0 = triangles[3*i+0]
+        t1 = triangles[3*i+1]
+        t2 = triangles[3*i+2]
+        v[3*0+0] = 0
+        v[3*0+1] = 0
+        v[3*0+2] = 0
+        v[3*1+0] = verts[3*t0+0]
+        v[3*1+1] = verts[3*t0+1]
+        v[3*1+2] = verts[3*t0+2]
+        v[3*2+0] = verts[3*t1+0]
+        v[3*2+1] = verts[3*t1+1]
+        v[3*2+2] = verts[3*t1+2]
+        v[3*3+0] = verts[3*t2+0]
+        v[3*3+1] = verts[3*t2+1]
+        v[3*3+2] = verts[3*t2+2]
+
+        offset = 50*i
+        content[offset:offset+48] = v.view(np.uint8)
+        content[offset+48:offset+50] = pad
+
+
+def write_stl(name, verts, triangles):
+    """
+    Save a triangulated mesh to a standard STL file.
+
+    STL is a widely supported 3D mesh format. Windows has a default STL viewer
+    and most 3D applications support it natively due to its simplicity.
+
+    Parameters
+    ----------
+    name : str
+        The name of the mesh file to save. Should end in .stl
+    verts : array-like
+        Vertex buffer containing all mesh vertices. Format is 3 float32 values
+        per vertex (x, y, z coordinates). Can be numpy or cupy array.
+    triangles : array-like
+        Index buffer containing all mesh triangles. Format is 3 int32 values
+        per triangle (vertex indices). Can be numpy or cupy array.
+
+    Notes
+    -----
+    If input arrays are cupy arrays, they will be automatically converted
+    to numpy arrays for file writing.
+    """
+    ib = triangles
+    vb = verts
+    if isinstance(ib, cupy.ndarray):
+        ib = cupy.asnumpy(ib)
+    if isinstance(vb, cupy.ndarray):
+        vb = cupy.asnumpy(vb)
+
+    header = np.zeros(80, np.uint8)
+    nf = np.empty(1, np.uint32)
+    numTris = triangles.shape[0] // 3
+    nf[0] = numTris
+    f = open(name, 'wb')
+    f.write(header)
+    f.write(nf)
+
+    # size of 1 triangle in STL is 50 bytes
+    # 12 floats (each 4 bytes) for a total of 48
+    # And additional 2 bytes for padding
+    content = np.empty(numTris*(50), np.uint8)
+    _fillContents(content, vb, ib, numTris)
+    f.write(content)
+    f.close()
+
+
+# Backwards compatibility aliases
+triangulateTerrain = triangulate_terrain
+write = write_stl
diff --git a/rtxpy/rtx.py b/rtxpy/rtx.py
index 73978ea..9615f6b 100644
--- a/rtxpy/rtx.py
+++ b/rtxpy/rtx.py
@@ -49,7 +49,6 @@ class _OptixState:
     """
 
     def __init__(self):
-        self.device_id = None  # CUDA device ID used for this context
         self.context = None
         self.module = None
         self.pipeline = None
@@ -84,9 +83,6 @@ def __init__(self):
 
     def cleanup(self):
         """Release all OptiX and CUDA resources."""
-        # Reset device tracking
-        self.device_id = None
-
         # Free device buffers
         self.d_params = None
         self.d_rays = None
@@ -118,10 +114,6 @@ def cleanup(self):
 
         self.initialized = False
 
-    def reset_device(self):
-        """Reset device tracking (called during cleanup)."""
-        self.device_id = None
-
 
 _state = _OptixState()
 
@@ -133,106 +125,6 @@ def _cleanup_at_exit():
         _state.cleanup()
 
 
-# -----------------------------------------------------------------------------
-# Device utilities
-# -----------------------------------------------------------------------------
-
-def get_device_count() -> int:
-    """
-    Get the number of available CUDA devices.
-
-    Returns:
-        Number of CUDA-capable GPUs available.
-
-    Example:
-        >>> import rtxpy
-        >>> rtxpy.get_device_count()
-        2
-    """
-    return cupy.cuda.runtime.getDeviceCount()
-
-
-def get_device_properties(device: int = 0) -> dict:
-    """
-    Get properties of a CUDA device.
-
-    Args:
-        device: Device ID (0, 1, 2, ...). Defaults to device 0.
-
-    Returns:
-        Dictionary containing device properties including:
-        - name: Device name (e.g., "NVIDIA GeForce RTX 3090")
-        - compute_capability: Tuple of (major, minor) compute capability
-        - total_memory: Total device memory in bytes
-        - multiprocessor_count: Number of streaming multiprocessors
-
-    Raises:
-        ValueError: If device ID is invalid.
-
-    Example:
-        >>> import rtxpy
-        >>> props = rtxpy.get_device_properties(0)
-        >>> print(props['name'])
-        NVIDIA GeForce RTX 3090
-    """
-    device_count = cupy.cuda.runtime.getDeviceCount()
-    if device < 0 or device >= device_count:
-        raise ValueError(
-            f"Invalid device ID {device}. "
-            f"Available devices: 0-{device_count - 1}"
-        )
-
-    with cupy.cuda.Device(device):
-        props = cupy.cuda.runtime.getDeviceProperties(device)
-
-    return {
-        'name': props['name'].decode('utf-8') if isinstance(props['name'], bytes) else props['name'],
-        'compute_capability': (props['major'], props['minor']),
-        'total_memory': props['totalGlobalMem'],
-        'multiprocessor_count': props['multiProcessorCount'],
-    }
-
-
-def list_devices() -> list:
-    """
-    List all available CUDA devices with their properties.
-
-    Returns:
-        List of dictionaries, each containing device properties.
-        Each dict includes 'id' (device index) plus all properties
-        from get_device_properties().
-
-    Example:
-        >>> import rtxpy
-        >>> for dev in rtxpy.list_devices():
-        ...     print(f"GPU {dev['id']}: {dev['name']}")
-        GPU 0: NVIDIA GeForce RTX 3090
-        GPU 1: NVIDIA GeForce RTX 2080
-    """
-    devices = []
-    for i in range(get_device_count()):
-        props = get_device_properties(i)
-        props['id'] = i
-        devices.append(props)
-    return devices
-
-
-def get_current_device() -> Optional[int]:
-    """
-    Get the CUDA device ID that RTX is currently using.
-
-    Returns:
-        Device ID if RTX has been initialized, None otherwise.
-
-    Example:
-        >>> import rtxpy
-        >>> rtx = rtxpy.RTX(device=1)
-        >>> rtxpy.get_current_device()
-        1
-    """
-    return _state.device_id if _state.initialized else None
-
-
 # -----------------------------------------------------------------------------
 # PTX loading
 # -----------------------------------------------------------------------------
@@ -265,43 +157,13 @@ def _log_callback(level, tag, message):
     print(f"[OPTIX][{level}][{tag}]: {message}")
 
 
-def _init_optix(device: Optional[int] = None):
-    """
-    Initialize OptiX context, module, pipeline, and SBT.
-
-    Args:
-        device: CUDA device ID to use. If None, uses the current CuPy device.
-                If already initialized, this parameter is ignored (a warning
-                would be appropriate if it differs from the active device).
-    """
+def _init_optix():
+    """Initialize OptiX context, module, pipeline, and SBT."""
     global _state
 
     if _state.initialized:
-        # Already initialized - check if user requested a different device
-        if device is not None and _state.device_id != device:
-            import warnings
-            warnings.warn(
-                f"RTX already initialized on device {_state.device_id}. "
-                f"Ignoring request for device {device}. "
-                "Create a new Python process to use a different device.",
-                RuntimeWarning
-            )
         return
 
-    # Select the CUDA device if specified
-    if device is not None:
-        device_count = cupy.cuda.runtime.getDeviceCount()
-        if device < 0 or device >= device_count:
-            raise ValueError(
-                f"Invalid device ID {device}. "
-                f"Available devices: 0-{device_count - 1}"
-            )
-        cupy.cuda.Device(device).use()
-        _state.device_id = device
-    else:
-        # Use current device
-        _state.device_id = cupy.cuda.Device().id
-
     # Create OptiX device context (uses cupy's CUDA context)
     _state.context = optix.deviceContextCreate(
         cupy.cuda.get_current_stream().ptr,
@@ -874,34 +736,11 @@ class RTX:
 
     This class provides GPU-accelerated ray-triangle intersection using
     NVIDIA's OptiX ray tracing engine.
-
-    Args:
-        device: CUDA device ID to use (0, 1, 2, ...). If None (default),
-                uses the currently active CuPy device. Use get_device_count()
-                to see available devices.
-
-    Example:
-        # Use default device (device 0 or current CuPy device)
-        rtx = RTX()
-
-        # Use specific GPU
-        rtx = RTX(device=1)
-
-    Note:
-        The RTX context is a singleton - all RTX instances share the same
-        underlying OptiX context. The device can only be set on first
-        initialization. Subsequent RTX() calls with a different device
-        will emit a warning.
     """
 
-    def __init__(self, device: Optional[int] = None):
-        """
-        Initialize the RTX context.
-
-        Args:
-            device: CUDA device ID to use. If None, uses the current device.
-        """
-        _init_optix(device)
+    def __init__(self):
+        """Initialize the RTX context."""
+        _init_optix()
 
     def build(self, hashValue: int, vertexBuffer, indexBuffer) -> int:
         """
@@ -917,16 +756,6 @@ def build(self, hashValue: int, vertexBuffer, indexBuffer) -> int:
         """
         return _build_accel(hashValue, vertexBuffer, indexBuffer)
 
-    @property
-    def device(self) -> Optional[int]:
-        """
-        The CUDA device ID this RTX instance is using.
-
-        Returns:
-            Device ID (0, 1, 2, ...) or None if not initialized.
-        """
-        return _state.device_id
-
     def getHash(self) -> int:
         """
         Get the hash of the current acceleration structure.
diff --git a/rtxpy/viewshed.py b/rtxpy/viewshed.py
new file mode 100644
index 0000000..b69e7ad
--- /dev/null
+++ b/rtxpy/viewshed.py
@@ -0,0 +1,304 @@
+"""
+GPU-accelerated viewshed analysis using ray tracing.
+
+This module provides functions for computing viewshed (visibility) analysis
+on terrain data using NVIDIA OptiX ray tracing.
+
+Functions
+---------
+viewshed
+    Compute viewshed from a point on a raster terrain.
+viewshed_with_context
+    Compute viewshed using an existing RTX context.
+"""
+
+from numba import cuda
+import numpy as np
+import math
+from typing import Union
+
+import cupy
+import xarray as xr
+
+from .rtx import RTX
+from ._cuda_utils import calc_dims, make_float3, add, mul, diff, dot, invert, float3
+from . import mesh
+
+# view options default values
+OBS_ELEV = 0
+TARGET_ELEV = 0
+
+# if a cell is invisible, its value is set to -1
+INVISIBLE = -1
+
+
+@cuda.jit
+def _generatePrimaryRays(data, x_coords, y_coords, H, W):
+    i, j = cuda.grid(2)
+    if i >= 0 and i < H and j >= 0 and j < W:
+        if (j == W-1):
+            data[i, j, 0] = j - 1e-3
+        else:
+            data[i, j, 0] = j + 1e-3
+
+        if (i == H-1):
+            data[i, j, 1] = i - 1e-3
+        else:
+            data[i, j, 1] = i + 1e-3
+
+        data[i, j, 2] = 10000  # Location of the camera (height)
+        data[i, j, 3] = 1e-3
+        data[i, j, 4] = 0
+        data[i, j, 5] = 0
+        data[i, j, 6] = -1
+        data[i, j, 7] = np.inf
+
+
+def _generatePrimaryRaysWrapper(rays, x_coords, y_coords, H, W):
+    griddim, blockdim = calc_dims((H, W))
+    d_y_coords = cupy.array(y_coords)
+    d_x_coords = cupy.array(x_coords)
+    _generatePrimaryRays[griddim, blockdim](rays, d_x_coords, d_y_coords, H, W)
+    return 0
+
+
+@cuda.jit
+def _generateViewshedRays(camRays, hits, vsrays, visibility_grid, H, W, vp):
+    i, j = cuda.grid(2)
+    if i >= 0 and i < H and j >= 0 and j < W:
+        elevationOffset = vp[2]
+        targetElevation = vp[3]
+        dist = hits[i, j, 0]  # distance to surface from camera
+        norm = make_float3(hits[i, j], 1)  # normal vector at intersection with surface
+        if (norm[2] < 0):  # if back hit, face forward
+            norm = invert(norm)
+        cameraRay = camRays[i, j]
+        rayOrigin = make_float3(cameraRay, 0)  # get the camera ray origin
+        rayDir = make_float3(cameraRay, 4)  # get the camera ray direction
+        hitP = add(rayOrigin, mul(rayDir, dist))  # calculate intersection point
+        newOrigin = add(hitP, mul(norm, 1e-3))  # generate new ray origin with offset
+        newOrigin = add(newOrigin, float3(0, 0, targetElevation))  # add target elevation
+
+        w = int(vp[0])
+        h = int(vp[1])
+        viewshedRay = camRays[h, w]  # get camera ray for viewshed origin
+        dist = hits[h, w, 0]  # get distance from camera to viewshed point
+        rayOrigin = make_float3(viewshedRay, 0)
+        rayDir = make_float3(viewshedRay, 4)
+        hitP = add(rayOrigin, mul(rayDir, dist))
+        viewshedPoint = add(hitP, float3(0, 0, elevationOffset))
+
+        newDir = diff(viewshedPoint, newOrigin)  # vector from SurfaceHit to VP
+        length = math.sqrt(dot(newDir, newDir))  # distance from surface to VP
+        newDir = mul(newDir, 1/length)  # normalize direction
+
+        cosine = dot(norm, newDir)  # cosine of angle between n and v
+        theta = math.acos(cosine)  # angle in radians
+        theta = (180*theta)/math.pi  # angle in degrees
+
+        # prepare viewshed ray for visibility test
+        vsray = vsrays[i, j]
+        vsray[0] = newOrigin[0]
+        vsray[1] = newOrigin[1]
+        vsray[2] = newOrigin[2]
+        vsray[3] = 0
+        vsray[4] = newDir[0]
+        vsray[5] = newDir[1]
+        vsray[6] = newDir[2]
+        vsray[7] = length
+
+        visibility_grid[i, j] = theta
+
+
+def _generateViewshedRaysWrapper(rays, hits, vsrays, visibility_grid, H, W, vp):
+    griddim, blockdim = calc_dims((H, W))
+    _generateViewshedRays[griddim, blockdim](rays, hits, vsrays, visibility_grid, H, W, vp)
+    return 0
+
+
+@cuda.jit
+def _calcViewshed(hits, visibility_grid, H, W):
+    i, j = cuda.grid(2)
+    if i >= 0 and i < H and j >= 0 and j < W:
+        dist = hits[i, j, 0]
+        # If dist > 0, intersection occurred along ray length
+        # meaning pixel is not directly visible from view point
+        if dist >= 0:
+            visibility_grid[i, j] = INVISIBLE
+
+
+def _calcViewshedWrapper(hits, visibility_grid, H, W):
+    griddim, blockdim = calc_dims((H, W))
+    _calcViewshed[griddim, blockdim](hits, visibility_grid, H, W)
+    return 0
+
+
+def viewshed_with_context(raster: xr.DataArray,
+                          rtx: RTX,
+                          x: Union[int, float],
+                          y: Union[int, float],
+                          observer_elev: float = OBS_ELEV,
+                          target_elev: float = TARGET_ELEV) -> xr.DataArray:
+    """
+    Compute viewshed analysis using an existing RTX context.
+
+    This function performs viewshed (visibility) analysis from a specified
+    observer point on the terrain. It uses an existing RTX context that
+    already has the terrain geometry built, avoiding rebuild overhead.
+
+    Parameters
+    ----------
+    raster : xr.DataArray
+        Input terrain raster with x and y coordinates.
+    rtx : RTX
+        Pre-configured RTX instance with terrain geometry already built.
+    x : int or float
+        X coordinate of the observer position.
+    y : int or float
+        Y coordinate of the observer position.
+    observer_elev : float, optional
+        Height of the observer above the terrain surface. Default is 0.
+    target_elev : float, optional
+        Height of target points above the terrain surface. Default is 0.
+
+    Returns
+    -------
+    xr.DataArray
+        Viewshed result where visible cells contain the viewing angle in
+        degrees, and invisible cells contain -1.
+
+    Raises
+    ------
+    ValueError
+        If x or y coordinates are outside the raster extent.
+    """
+    H, W = raster.shape
+
+    y_coords = raster.indexes.get('y').values
+    x_coords = raster.indexes.get('x').values
+
+    # validate x arg
+    if x < x_coords.min():
+        raise ValueError("x argument outside of raster x_range")
+    elif x > x_coords.max():
+        raise ValueError("x argument outside of raster x_range")
+
+    # validate y arg
+    if y < y_coords.min():
+        raise ValueError("y argument outside of raster y_range")
+    elif y > y_coords.max():
+        raise ValueError("y argument outside of raster y_range")
+
+    selection = raster.sel(x=[x], y=[y], method='nearest')
+    x = selection.x.values[0]
+    y = selection.y.values[0]
+
+    y_view = np.where(y_coords == y)[0][0]
+    x_view = np.where(x_coords == x)[0][0]
+
+    # Device buffers
+    d_rays = cupy.empty((H, W, 8), np.float32)
+    d_hits = cupy.empty((H, W, 4), np.float32)
+    d_visgrid = cupy.empty((H, W), np.float32)
+    d_vsrays = cupy.empty((H, W, 8), np.float32)
+
+    _generatePrimaryRaysWrapper(d_rays, x_coords, y_coords, H, W)
+    device = cupy.cuda.Device(0)
+    device.synchronize()
+    rtx.trace(d_rays, d_hits, W*H)
+
+    _generateViewshedRaysWrapper(d_rays, d_hits, d_vsrays, d_visgrid, H, W,
+                                 (x_view, y_view, observer_elev, target_elev))
+    device.synchronize()
+    rtx.trace(d_vsrays, d_hits, W*H)
+
+    _calcViewshedWrapper(d_hits, d_visgrid, H, W)
+
+    if isinstance(raster.data, np.ndarray):
+        visgrid = cupy.asnumpy(d_visgrid)
+    else:
+        visgrid = d_visgrid
+
+    view = xr.DataArray(visgrid,
+                        name="viewshed",
+                        coords=raster.coords,
+                        dims=raster.dims,
+                        attrs=raster.attrs)
+    return view
+
+
+def viewshed(raster: xr.DataArray,
+             x: Union[int, float],
+             y: Union[int, float],
+             observer_elev: float = OBS_ELEV,
+             target_elev: float = TARGET_ELEV) -> xr.DataArray:
+    """
+    Compute viewshed analysis from a point on a raster terrain.
+
+    This function performs viewshed (visibility) analysis from a specified
+    observer point on the terrain. It automatically builds the necessary
+    ray tracing acceleration structure if needed.
+
+    Parameters
+    ----------
+    raster : xr.DataArray
+        Input terrain raster as an xarray DataArray with x and y coordinates.
+        The data must be a cupy array on GPU.
+    x : int or float
+        X coordinate of the observer position.
+    y : int or float
+        Y coordinate of the observer position.
+    observer_elev : float, optional
+        Height of the observer above the terrain surface. Default is 0.
+    target_elev : float, optional
+        Height of target points above the terrain surface. Default is 0.
+
+    Returns
+    -------
+    xr.DataArray
+        Viewshed result where visible cells contain the viewing angle in
+        degrees, and invisible cells contain -1.
+
+    Raises
+    ------
+    ValueError
+        If raster.data is not a cupy array, or if the mesh generation
+        or OptiX build fails.
+
+    Examples
+    --------
+    >>> import rtxpy
+    >>> result = rtxpy.viewshed.viewshed(terrain_raster, x=1000, y=2000, observer_elev=2)
+    """
+    if not isinstance(raster.data, cupy.ndarray):
+        raise ValueError("raster.data must be a cupy array")
+
+    H, W = raster.shape
+    rtx = RTX()
+
+    datahash = np.uint64(hash(str(raster.data.get())) % (1 << 64))
+    optixhash = np.uint64(rtx.getHash())
+    if (optixhash != datahash):
+        numTris = (H - 1) * (W - 1) * 2
+        verts = cupy.empty(H * W * 3, np.float32)
+        triangles = cupy.empty(numTris * 3, np.int32)
+
+        # Generate mesh from terrain
+        res = mesh.triangulate_terrain(verts, triangles, raster)
+        if res:
+            raise ValueError("Failed to generate mesh from terrain. Error code:{}".format(res))
+        res = rtx.build(datahash, verts, triangles)
+        if res:
+            raise ValueError("OptiX failed to build GAS with error code:{}".format(res))
+        # Clear GPU memory no longer needed
+        verts = None
+        triangles = None
+        cupy.get_default_memory_pool().free_all_blocks()
+
+    view = viewshed_with_context(raster, rtx, x, y, observer_elev, target_elev)
+    return view
+
+
+# Backwards compatibility aliases
+viewshed_gpu = viewshed
+viewshed_rt = viewshed_with_context