diff --git a/GEMstack/scripts/geotag_from_files.py b/GEMstack/offboard/geo_tagging/geotag_from_files.py similarity index 100% rename from GEMstack/scripts/geotag_from_files.py rename to GEMstack/offboard/geo_tagging/geotag_from_files.py diff --git a/GEMstack/scripts/geotag_from_rosbag.py b/GEMstack/offboard/geo_tagging/geotag_from_rosbag.py similarity index 100% rename from GEMstack/scripts/geotag_from_rosbag.py rename to GEMstack/offboard/geo_tagging/geotag_from_rosbag.py diff --git a/GEMstack/scripts/register_lidar_scans.py b/GEMstack/offboard/geo_tagging/register_lidar_scans.py similarity index 100% rename from GEMstack/scripts/register_lidar_scans.py rename to GEMstack/offboard/geo_tagging/register_lidar_scans.py diff --git a/GEMstack/offboard/lidar_colorization/README.MD b/GEMstack/offboard/lidar_colorization/README.MD new file mode 100644 index 000000000..20e7f662b --- /dev/null +++ b/GEMstack/offboard/lidar_colorization/README.MD @@ -0,0 +1,87 @@ +# Point Cloud Colorizer from Multi-View Images + +This script projects LiDAR point clouds into multi-view camera images and colorizes the 3D points using RGB values interpolated from the corresponding camera views. It supports multiple camera types and exports both raw and colorized point clouds in PLY format. + +## Features + +- Projects LiDAR point clouds into camera frames using calibrated extrinsic and intrinsic parameters +- Handles undistortion of input images using OpenCV +- Samples RGB values at subpixel precision from multi-view images +- Merges color information into a unified point cloud and saves as `.ply` +- Supports multiple camera types: front-right (`fr`), rear-right (`rr`), front-left (`fl`), rear-left (`rl`) + +## Installation + +Make sure you have the following dependencies installed: + +```bash +pip install numpy opencv-python open3d +``` +### Usage: + +```bash +python path/to/colorize_pointcloud.py \ + --folder_path /path/to/your/pointcloud_and_image_data \ + --output_path /path/to/save/output \ + --camera_types fr,rr +``` +##### 1. Arguments +```bash + --folder_path: Directory containing .b files (NumPy arrays) with point cloud data and corresponding images ({camera_type}_{index}.png) + + --output_path: Directory where the resulting .ply point clouds will be saved + + --camera_types: Comma-separated list of camera types to project and colorize from. Supported values: fr, rr, fl, rl +``` + +###### 2. Data Format Requirements +- Point cloud files: .b files that store N x 3 or N x 6 NumPy arrays of 3D points (with optional RGB). You can rename your saved pointcloud in NumPy arrays format to .b and then use this script. +
- Images: RGB `.png` images named like `fr_001.png`, `rr_001.png`, etc., located in the same directory as the `.b` files; the shared numeric index links each image to its point cloud (see the sketch below)
<br>
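
As noted above, a `.b` file is simply a NumPy array saved to disk and renamed. A minimal sketch of producing one (the file names here are illustrative only; the numeric index should match the index in the corresponding image names, and the script reads the file back with `np.load(..., allow_pickle=True)`):

```python
import os
import numpy as np

points = np.random.rand(100000, 3).astype(np.float32)  # stand-in for an (N, 3) LiDAR scan
np.save("pointcloud_001.npy", points)                   # write a standard .npy container
os.rename("pointcloud_001.npy", "pointcloud_001.b")     # rename to the .b extension this script expects
```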
+ +###### 3. Calibration: +Calibration parameters must be accessible via the settings module: + +- Extrinsic: + + ```python + settings.get("calibration..extrinsics.rotation") + settings.get("calibration..extrinsics.position") + ``` + ```python + settings.get("calibration.top_lidar.rotation") + settings.get("calibration.top_lidar.position") + ``` + +- Intrinsic: + ```python + settings.get("calibration..intrinsics.focal") + settings.get("calibration..intrinsics.center") + settings.get("calibration..intrinsics.distort") + settings.get("calibration..intrinsics.skew") + ``` +###### 4. Outputs +For each .b file: + + pure_pointcloud_.ply: raw point cloud without color + + colored_pointcloud_.ply: point cloud with interpolated RGB from visible camera views + +##### Example +```python +python colorize_pointcloud.py \ + --folder_path data/session1 \ + --output_path data/colored_output \ + --camera_types fr,rr +``` + +## Notes + Only the 3D points that are visible in at least one selected camera are assigned color. + + If no image exists for a particular view or projection fails, that camera is skipped. + + Requires accurate and synchronized calibration between LiDAR and camera systems. + + Author: Henry Yi + License: MIT diff --git a/GEMstack/offboard/lidar_colorization/colorization.py b/GEMstack/offboard/lidar_colorization/colorization.py new file mode 100644 index 000000000..1528d633f --- /dev/null +++ b/GEMstack/offboard/lidar_colorization/colorization.py @@ -0,0 +1,313 @@ +import os +from pathlib import Path +import numpy as np +import argparse +import open3d as o3d +import cv2 +import open3d as o3d + +from ...utils import settings + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--folder_path", type=str, required=True, help="Path to the folder containing the point clouds and images") + parser.add_argument("--output_path", type=str, required=True, help="Path to save the output point cloud") + parser.add_argument("--camera_types", type=str, required=True, help="Comma-separated list of supported camera types (fr, rr, fl, rl) to colorize") + return parser.parse_args() + + +def save_ply_with_open3d(points, filename): + ''' + Save the point cloud to a PLY file using Open3D + ''' + pc = o3d.geometry.PointCloud() + pc.points = o3d.utility.Vector3dVector(points[:, :3]) + if points.shape[1] == 6: + colors = points[:, 3:6] / 255.0 # normalize RGB to [0, 1] + pc.colors = o3d.utility.Vector3dVector(colors) + o3d.io.write_point_cloud(filename, pc) + +def get_camera_extrinsic_matrix(camera_type): + ''' + Get the camera extrinsic matrix for the given camera type + ''' + if camera_type == "front_right" or camera_type == "fr": + # From the settings + rotation = np.array(settings.get("calibration.front_right_camera.extrinsics.rotation")) + + translation = np.array(settings.get("calibration.front_right_camera.extrinsics.position")) + + # Build 4x4 homogeneous transformation matrix + fr_to_vehicle = np.eye(4) + fr_to_vehicle[:3, :3] = rotation + fr_to_vehicle[:3, 3] = translation + return fr_to_vehicle + elif camera_type == "rear_right" or camera_type == "rr": + # From the settings + rotation = np.array(settings.get("calibration.rear_right_camera.extrinsics.rotation")) + + translation = np.array(settings.get("calibration.rear_right_camera.extrinsics.position")) + + # Build 4x4 homogeneous transformation matrix + rr_to_vehicle = np.eye(4) + rr_to_vehicle[:3, :3] = rotation + rr_to_vehicle[:3, 3] = translation + return rr_to_vehicle + elif camera_type == "front_left" or camera_type == 
"fl": + rotation = np.array(settings.get("calibration.front_left_camera.extrinsics.rotation")) + translation = np.array(settings.get("calibration.front_left_camera.extrinsics.position")) + + # Build 4x4 homogeneous transformation matrix + fl_to_vehicle = np.eye(4) + fl_to_vehicle[:3, :3] = rotation + fl_to_vehicle[:3, 3] = translation + return fl_to_vehicle + elif camera_type == "rear_left" or camera_type == "rl": + rotation = np.array(settings.get("calibration.rear_left_camera.extrinsics.rotation")) + translation = np.array(settings.get("calibration.rear_left_camera.extrinsics.position")) + + # Build 4x4 homogeneous transformation matrix + rl_to_vehicle = np.eye(4) + rl_to_vehicle[:3, :3] = rotation + rl_to_vehicle[:3, 3] = translation + return rl_to_vehicle + else: + raise ValueError(f"Camera type {camera_type} not supported") + +def get_camera_intrinsic_matrix(camera_type): + ''' + Get the camera intrinsic matrix for the given camera type + ''' + if camera_type == "front_right" or camera_type == "fr": + # Provided intrinsics + focal = settings.get("calibration.front_right_camera.intrinsics.focal") # fx, fy + center = settings.get("calibration.front_right_camera.intrinsics.center") # cx, cy + fr_cam_distort = settings.get("calibration.front_right_camera.intrinsics.distort") + skew = settings.get("calibration.front_right_camera.intrinsics.skew") + + fx, fy = focal + cx, cy = center + + # Build K matrix + fr_cam_K = np.array([ + [fx, skew, cx], + [0, fy, cy], + [0, 0, 1] + ]) + return fr_cam_K, np.array(fr_cam_distort) + elif camera_type == "rear_right" or camera_type == "rr": + # Provided intrinsics + focal = settings.get("calibration.rear_right_camera.intrinsics.focal") # fx, fy + center = settings.get("calibration.rear_right_camera.intrinsics.center") # cx, cy + rr_cam_distort = settings.get("calibration.rear_right_camera.intrinsics.distort") + skew = settings.get("calibration.rear_right_camera.intrinsics.skew") + + fx, fy = focal + cx, cy = center + + # Build K matrix + rr_cam_K = np.array([ + [fx, skew, cx], + [0, fy, cy], + [0, 0, 1] + ]) + return rr_cam_K, np.array(rr_cam_distort) + elif camera_type == "front_left" or camera_type == "fl": + # Provided intrinsics + focal = settings.get("calibration.front_left_camera.intrinsics.focal") # fx, fy + center = settings.get("calibration.front_left_camera.intrinsics.center") # cx, cy + fl_cam_distort = settings.get("calibration.front_left_camera.intrinsics.distort") + skew = settings.get("calibration.front_left_camera.intrinsics.skew") + + fx, fy = focal + cx, cy = center + + # Build K matrix + fl_cam_K = np.array([ + [fx, skew, cx], + [0, fy, cy], + [0, 0, 1] + ]) + return fl_cam_K, np.array(fl_cam_distort) + elif camera_type == "rear_left" or camera_type == "rl": + # Provided intrinsics + focal = settings.get("calibration.rear_left_camera.intrinsics.focal") # fx, fy + center = settings.get("calibration.rear_left_camera.intrinsics.center") # cx, cy + rl_cam_distort = settings.get("calibration.rear_left_camera.intrinsics.distort") + skew = settings.get("calibration.rear_left_camera.intrinsics.skew") + + fx, fy = focal + cx, cy = center + + # Build K matrix + rl_cam_K = np.array([ + [fx, skew, cx], + [0, fy, cy], + [0, 0, 1] + ]) + return rl_cam_K, np.array(rl_cam_distort) + else: + raise ValueError(f"Camera type {camera_type} not supported") + +def get_lidar_extrinsic_matrix(lidar_type): + ''' + Get the lidar extrinsic matrix for the given lidar type + ''' + if lidar_type == "ouster": + # From the settings + rotation = 
np.array(settings.get("calibration.top_lidar.rotation")) + + translation = np.array(settings.get("calibration.top_lidar.position")) + + # Build 4x4 homogeneous transformation matrix + ouster_to_vehicle = np.eye(4) + ouster_to_vehicle[:3, :3] = rotation + ouster_to_vehicle[:3, 3] = translation + return ouster_to_vehicle + else: + raise ValueError(f"Lidar type {lidar_type} not supported") + +def sample_rgb_colors(image_rgb, u_f, v_f): + """ + Sample RGB colors at subpixel positions (u_f, v_f) from image. + Args: + image_rgb: np.ndarray (H, W, 3), float32 in [0,1] + u_f: (N,) float32 + v_f: (N,) float32 + Returns: + (N, 3) array of RGB colors + """ + assert image_rgb.dtype == np.float32 and image_rgb.max() <= 1.0 + colors = [] + + for x, y in zip(u_f, v_f): + patch = cv2.getRectSubPix(image_rgb, patchSize=(1, 1), center=(x, y)) + colors.append(patch[0, 0]) # (1, 1, 3) → get RGB triplet + + return np.array(colors) + + +def main(): + args = parse_args() + folder_path = args.folder_path + output_path = args.output_path + camera_types = args.camera_types.split(',') + print('camera_types', camera_types) + + folder = Path(folder_path) + files = list(folder.glob("*.b")) # all files + + for file in files: + # print(file.name) + data = np.load(file, allow_pickle=True) + # Save + os.makedirs(output_path, exist_ok=True) + save_ply_with_open3d(data, f"{output_path}/pure_pointcloud_{file.name}.ply") + colors_full = np.zeros((data.shape[0], 3), dtype=np.float32) # default black + points_h = np.hstack([data, np.ones((data.shape[0], 1))]) # shape: (N, 4) + + for camera_type in camera_types: + + # Get camera extrinsic matrix + camera_to_vehicle = get_camera_extrinsic_matrix(camera_type) + + # Get lidar extrinsic matrix + lidar_to_vehicle = get_lidar_extrinsic_matrix("ouster") + + # Get camera intrinsic matrix + camera_K, camera_distort = get_camera_intrinsic_matrix(camera_type) + + # Project points to camera + # Transform lidar points into camera frame + T_vehicle_to_cam = np.linalg.inv(camera_to_vehicle) + T_lidar_to_cam = T_vehicle_to_cam @ lidar_to_vehicle + + # Convert points to homogeneous coordinates + + # Transform to camera frame + points_cam = (T_lidar_to_cam @ points_h.T).T # shape: (N, 4) + points_cam = points_cam[:, :3] # drop homogeneous component + + # Filter out points behind the camera + mask = points_cam[:, 2] > 0 + # points_cam = points_cam[mask] + + # Load image + point_cloud_number = file.name.split("_")[1].split('.')[0] + # image_path = f"{file.parent.name}/{camera_type}_{point_cloud_number}.png" + image_path = file.parent / f"{camera_type}_{point_cloud_number}.png" + image_path = str(image_path) + if not Path(image_path).exists(): + print(f"❌ Image file does not exist: {image_path}") + continue + image_raw = cv2.imread(image_path) + h, w = image_raw.shape[:2] + + cam_distort_matrix = camera_distort + cam_k = camera_K + # Get optimal camera matrix (undistorted intrinsics) + cam_K_new, roi = cv2.getOptimalNewCameraMatrix(cam_k, cam_distort_matrix, (w, h), 0) + + # Undistort the image + image_undistorted = cv2.undistort(image_raw, cam_k, cam_distort_matrix, None, cam_K_new) + + # Project to image + proj = (cam_K_new @ points_cam.T).T # shape: (N, 3) + with np.errstate(divide='ignore', invalid='ignore'): + proj[:, 0] = np.divide(proj[:, 0], proj[:, 2], out=np.zeros_like(proj[:, 0]), where=proj[:, 2] != 0) + proj[:, 1] = np.divide(proj[:, 1], proj[:, 2], out=np.zeros_like(proj[:, 1]), where=proj[:, 2] != 0) + + # proj[:, 0] /= proj[:, 2] + # proj[:, 1] /= proj[:, 2] + pixels = proj[:, :2] # 
shape: (N, 2) + + # Step 1: filter front-facing + finite_mask = np.isfinite(proj[:, 0]) & np.isfinite(proj[:, 1]) + # mask_front = mask # Z_cam > 0 + mask_front = mask & finite_mask + + # Projected pixel coords (u, v) + u = pixels[:, 0] + v = pixels[:, 1] + if np.any(np.isnan(u)) or np.any(np.isnan(v)): + print(f"❌ NaN values in u or v for {camera_type}") + continue + if np.any(np.isinf(u)) or np.any(np.isinf(v)): + print(f"❌ Inf values in u or v for {camera_type}") + continue + # Step 2: in image bounds + + # Load image and convert to RGB + image_undistorted = cv2.cvtColor(image_undistorted, cv2.COLOR_BGR2RGB) + image_undistorted = image_undistorted.astype(np.float32) / 255.0 + h, w, _ = image_undistorted.shape + in_bounds = (u >= 0) & (u < w - 1) & (v >= 0) & (v < h - 1) + # in_bounds = (u >= 0) & (v >= 0) + # Step 3: Combine masks + visible_mask = mask_front & in_bounds + # visible_mask = mask_front + + # Step 4: Get visible pixel coords + u_f = u[visible_mask].astype(np.float32) + v_f = v[visible_mask].astype(np.float32) + if u_f.shape[0] == 0 or v_f.shape[0] == 0: + print(f"❌ No visible points for {camera_type}") + continue + # Step 5: Interpolate RGB + colors_interpolated = sample_rgb_colors(image_undistorted, u_f, v_f) + # Step 6: Assign to full color array + # colors_interpolated = np.concatenate(colors_interpolated, axis=1) # shape: (N, 3) + colors_full[visible_mask] = colors_interpolated # only visible points get color + + # Step 7: Create point cloud with full color info + + colored_pcd = o3d.geometry.PointCloud() + colored_pcd.points = o3d.utility.Vector3dVector(data) # full point set + colored_pcd.colors = o3d.utility.Vector3dVector(colors_full) # partial color + + o3d.io.write_point_cloud(f"{output_path}/colored_pointcloud_{file.name}.ply", colored_pcd) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/README.md b/GEMstack/offboard/mast3r_3d_reconstruction/README.md new file mode 100644 index 000000000..12237e9ae --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/README.md @@ -0,0 +1,138 @@ +# MASt3R-Based 3D Scene Reconstruction with GPS-Based Scaling + +This script performs 3D scene reconstruction from multi-view images using [MASt3R](https://github.com/naver/mast3r), then scales the reconstructed scene using GPS data embedded in the image EXIF metadata. + +The scaled point cloud is saved in `.ply` format, optionally with TSDF filtering and depth cleaning. + +--- + +## Features + +- Runs MASt3R pipeline to reconstruct a 3D point cloud from a set of input images +- Extracts GPS metadata (latitude, longitude, altitude) from image EXIF data +- Converts GPS coordinates into 3D Cartesian space using `pyproj` +- Estimates real-world scale using: + - **RANSAC-based robust ratio estimation** + - **Median ratio estimation** +- Exports `.ply` point cloud with real-world scale + +--- + +## Installation + +Install the required packages (assuming `mast3r` is cloned and download the checkpoint files): + +The simplest way to install is to use the mast3r.yml file, I exported. It would create a conda virtual environment named `mast3r`. + +And then please use the following command to install faiss-gpu: +```bash +canda install -c pytorch -r faiss-gpu +``` + +Make sure you have: +- `open3d` +- `numpy`, `opencv-python`, `torch`, `Pillow`, `pyproj`, `pandas`, `scipy` + +And that `mast3r` is correctly installed and importable. 
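
Putting the installation steps together, a typical setup might look like the following (a sketch only: it assumes `mast3r.yml` sits next to this README and that the `pytorch` conda channel provides a `faiss-gpu` build compatible with your CUDA version):

```bash
conda env create -f mast3r.yml
conda activate mast3r
conda install -c pytorch faiss-gpu
```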
+ +--- + +## 🚀 Usage + +### Basic usage (with reconstruction) + +```bash +python scale_pointcloud_based_on_geotag.py \ + --weights path/to/mast3r/core/checkpoint \ + --retrieval_model path/to/mast3r/retrieval/checkpoint \ + --folder_path path/to/images \ + --output_path output/output_scene.ply \ + --scale_method ransac \ + --scenegraph_type retrieval \ + --winsize 20 \ + --refid 10 +``` + +### Usage (with precomputed scene): + to directly get scaled pointcloud based on Geo Location info tagged to the images + +```bash +python scale_pointcloud_based_on_geotag.py \ + --scene_path path/to/scene.pkl \ + --folder_path path/to/images \ + --output_path output/output_scene.ply \ + --scale_method median +``` + +--- + +## 🧠 Arguments + +### Required (basic run) + +| Argument | Description | +|------------------|-----------------------------------------------------------------------------| +| `--folder_path` | Path to folder containing input images (JPG, PNG) | +| `--output_path` | Output file path for the scaled `.ply` point cloud | +| `--scale_method` | Method to estimate scale (`ransac` or `median`) | + +### Optional + +| Argument | Description | +|------------------|-----------------------------------------------------------------------------| +| `--scene_path` | (Optional) Path to a precomputed MASt3R scene `.pkl` file | +| `--crs_from` | Input coordinate system (default: `EPSG:4979` — lat/lon/alt) | +| `--crs_to` | Target coordinate system (default: `EPSG:32616` — UTM) | + +### MASt3R-specific args (if no `--scene_path` is given) + +You can also configure MASt3R parameters such as: + +| Argument | Description | +|----------------------|---------------------------------------------| +| `--optim_level` | Optimization stage (default: `refine+depth`)| +| `--lr1`, `--niter1` | Learning rate and iteration count (stage 1) (default: 0.07, 300)| +| `--lr2`, `--niter2` | Learning rate and iteration count (stage 2) (default: 0.01, 300)| +| `--min_conf_thr` | Minimum confidence threshold (default: 1.5) | +| `--matching_conf_thr`| Optional Matching confidence threshold | +| `--scenegraph_type` | Graph type: `retrieval`, `swin`, `complete`, etc | +| `--winsize` | Optional Window size for `swin` or `logwin` | +| `--clean_depth` | Clean depth maps (default: `True`) | +| `--TSDF_thresh` | Optional TSDF threshold for filtering | +| `--refid` | Optional Reference images. In `retrieval` scenegraph_type, this indicates the number of neighboring images | + +--- + +## Scale Estimation + +The script computes the relative scale between reconstructed camera centers and GPS-based 3D coordinates. Two methods are supported: + +- `ransac`: Robust estimate using inlier filtering +- `median`: Simple median ratio of pairwise distances + +--- + +## Output + +- A `.ply` point cloud in the specified `output_path`, scaled using estimated GPS-to-reconstruction ratio +- The point cloud preserves geometry from MASt3R and optionally applies depth/TSDF filtering + +--- + +## Notes + +- Input images **must contain valid EXIF GPS tags**. +- If no `--scene_path` is given, the full MASt3R pipeline will run (can be slow). +- You can reuse a `scene.pkl` from MASt3R by supplying `--scene_path`. 
+ +--- + +## 📚 Reference + +- [MASt3R GitHub](https://github.com/naver/mast3r) +- [pyproj](https://pyproj4.github.io/pyproj/stable/) + +--- + +**Author**: Henry Yi +**License**: MIT diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r.yml b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r.yml new file mode 100644 index 000000000..7a22b4bf0 --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r.yml @@ -0,0 +1,208 @@ +name: mast3r +channels: + - pytorch + - nvidia + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - blas=1.0=mkl + - brotli-python=1.0.9=py311h6a678d5_9 + - bzip2=1.0.8=h5eee18b_6 + - c-ares=1.19.1=h5eee18b_0 + - ca-certificates=2025.2.25=h06a4308_0 + - certifi=2025.1.31=py311h06a4308_0 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - cmake=3.14.0=h52cb24c_0 + - cuda-cudart=12.4.127=0 + - cuda-cupti=12.4.127=0 + - cuda-libraries=12.4.1=0 + - cuda-nvrtc=12.4.127=0 + - cuda-nvtx=12.4.127=0 + - cuda-opencl=12.8.90=0 + - cuda-runtime=12.4.1=0 + - cuda-version=12.8=3 + - expat=2.7.1=h6a678d5_0 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.17.0=py311h06a4308_0 + - freetype=2.13.3=h4a9f257_0 + - giflib=5.2.2=h5eee18b_0 + - gmp=6.3.0=h6a678d5_0 + - gmpy2=2.2.1=py311h5eee18b_0 + - gnutls=3.6.15=he1e5248_0 + - idna=3.7=py311h06a4308_0 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - jinja2=3.1.6=py311h06a4308_0 + - jpeg=9e=h5eee18b_3 + - krb5=1.20.1=h143b758_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.16=h92b89f2_1 + - ld_impl_linux-64=2.40=h12ee557_0 + - lerc=4.0.0=h6a678d5_0 + - libcublas=12.4.5.8=0 + - libcufft=11.2.1.3=0 + - libcufile=1.13.1.3=0 + - libcurand=10.3.9.90=0 + - libcurl=8.12.1=hc9e6f67_0 + - libcusolver=11.6.1.9=0 + - libcusparse=12.3.1.170=0 + - libdeflate=1.22=h5eee18b_0 + - libedit=3.1.20230828=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h5eee18b_3 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libnghttp2=1.57.0=h2d74bed_0 + - libnpp=12.2.5.30=0 + - libnvfatbin=12.8.90=0 + - libnvjitlink=12.4.127=0 + - libnvjpeg=12.3.1.117=0 + - libpng=1.6.39=h5eee18b_0 + - libssh2=1.11.1=h251f7ec_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.7.0=hde9077f_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp=1.3.2=h9f374a3_1 + - libwebp-base=1.3.2=h5eee18b_1 + - llvm-openmp=14.0.6=h9e868ea_0 + - lz4-c=1.9.4=h6a678d5_1 + - markupsafe=3.0.2=py311h5eee18b_0 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py311h5eee18b_2 + - mkl_fft=1.3.11=py311h5eee18b_0 + - mkl_random=1.2.8=py311ha02d727_0 + - mpc=1.3.1=h5eee18b_0 + - mpfr=4.2.1=h5eee18b_0 + - mpmath=1.3.0=py311h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.4.2=py311h06a4308_0 + - numpy=2.0.1=py311h08b1b3b_1 + - numpy-base=2.0.1=py311hf175353_1 + - ocl-icd=2.3.2=h5eee18b_1 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.5.2=h0d4d230_1 + - openssl=3.0.16=h5eee18b_0 + - pillow=11.1.0=py311hac6e08b_1 + - pip=25.0=py311h06a4308_0 + - pysocks=1.7.1=py311h06a4308_0 + - python=3.11.11=he870216_0 + - pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0 + - pytorch-cuda=12.4=hc786d27_7 + - pytorch-mutex=1.0=cuda + - pyyaml=6.0.2=py311h5eee18b_0 + - readline=8.2=h5eee18b_0 + - requests=2.32.3=py311h06a4308_1 + - rhash=1.4.3=hdbd6064_0 + - setuptools=75.8.0=py311h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tbb=2021.8.0=hdb19cb5_0 + - tk=8.6.14=h39e8969_0 + - torchtriton=3.1.0=py311 + - 
torchvision=0.20.1=py311_cu124 + - typing_extensions=4.12.2=py311h06a4308_0 + - urllib3=2.3.0=py311h06a4308_0 + - wheel=0.45.1=py311h06a4308_0 + - xz=5.6.4=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.6=hc292b87_0 + - pip: + - absl-py==2.2.2 + - aiofiles==24.1.0 + - annotated-types==0.7.0 + - anyio==4.9.0 + - asmk==0.1.1 + - bracex==2.5.post1 + - build==1.2.2.post1 + - clarabel==0.10.0 + - click==8.1.8 + - contourpy==1.3.2 + - cvxpy==1.6.5 + - cycler==0.12.1 + - cython==3.0.12 + - einops==0.8.1 + - fastapi==0.115.12 + - ffmpy==0.5.0 + - fonttools==4.57.0 + - freetype-py==2.5.1 + - fsspec==2025.3.2 + - gradio==5.27.0 + - gradio-client==1.9.0 + - groovy==0.1.2 + - grpcio==1.71.0 + - h11==0.16.0 + - httpcore==1.0.9 + - httpx==0.28.1 + - huggingface-hub==0.30.2 + - imageio==2.37.0 + - joblib==1.4.2 + - kapture==1.1.10 + - kapture-localization==1.1.10 + - kiwisolver==1.4.8 + - llvmlite==0.44.0 + - markdown==3.8 + - markdown-it-py==3.0.0 + - matplotlib==3.10.1 + - mdurl==0.1.2 + - numba==0.61.2 + - numpy-quaternion==2024.0.8 + - opencv-python==4.11.0.86 + - open3d==0.19.0 + - orjson==3.10.16 + - osqp==1.0.3 + - packaging==25.0 + - pandas==2.2.3 + - piexif==1.1.3 + - pillow-heif==0.22.0 + - poselib==2.0.4 + - protobuf==6.30.2 + - pyaml==25.1.0 + - pycolmap==3.11.1 + - pydantic==2.11.3 + - pydantic-core==2.33.1 + - pydub==0.25.1 + - pyglet==1.5.31 + - pygments==2.19.1 + - pyopengl==3.1.0 + - pyparsing==3.2.3 + - pyproject-hooks==1.2.0 + - pyrender==0.1.45 + - python-dateutil==2.9.0.post0 + - python-multipart==0.0.20 + - pytz==2025.2 + - pyproj==3.7.1 + - rich==14.0.0 + - roma==1.5.2.1 + - ruff==0.11.7 + - safehttpx==0.1.6 + - safetensors==0.5.3 + - scikit-learn==1.6.1 + - scipy==1.15.2 + - scs==3.2.7.post2 + - semantic-version==2.10.0 + - shellingham==1.5.4 + - six==1.17.0 + - sniffio==1.3.1 + - starlette==0.46.2 + - sympy==1.13.1 + - tabulate==0.9.0 + - tensorboard==2.19.0 + - tensorboard-data-server==0.7.2 + - threadpoolctl==3.6.0 + - tomlkit==0.13.2 + - tqdm==4.67.1 + - trimesh==4.6.8 + - typer==0.15.2 + - typing-inspection==0.4.0 + - tzdata==2025.2 + - uvicorn==0.34.2 + - wcmatch==10.0 + - websockets==15.0.1 + - werkzeug==3.1.3 +prefix: /u/hyi1/.conda/envs/mast3r diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py new file mode 100644 index 000000000..403ea8ed3 --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py @@ -0,0 +1,426 @@ +import math +import gradio +import os +import numpy as np +import functools +import trimesh +import copy +from scipy.spatial.transform import Rotation +import tempfile +import shutil +import torch + +from mast3r.cloud_opt.sparse_ga import sparse_global_alignment,anchor_depth_offsets,make_pts3d,show_reconstruction,forward_mast3r,prepare_canonical_data,condense_data,compute_min_spanning_tree,sparse_scene_optimizer +from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess +from mast3r.image_pairs import make_pairs +from mast3r.retrieval.processor import Retriever + +import mast3r.utils.path_to_dust3r # noqa +from dust3r.utils.geometry import inv, geotrf # noqa +from dust3r.utils.image import load_images +from dust3r.utils.device import to_numpy +from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes +from dust3r.demo import get_args_parser as dust3r_get_args_parser +from dust3r.cloud_opt.base_opt import clean_pointcloud + +import matplotlib.pyplot as pl +import open3d as o3d + +import 
scipy.cluster.hierarchy as sch + + +def get_args_parser(): + parser = dust3r_get_args_parser() + parser.add_argument('--share', action='store_true') + parser.add_argument('--gradio_delete_cache', default=None, type=int, + help='age/frequency at which gradio removes the file. If >0, matching cache is purged') + parser.add_argument('--retrieval_model', default=None, type=str, help="retrieval_model to be loaded") + + actions = parser._actions + for action in actions: + if action.dest == 'model_name': + action.choices = ["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"] + # change defaults + parser.prog = 'mast3r demo' + return parser + +import pickle +import os +from pathlib import Path + + +class SparseGA(): + ''' + PointCloud class. + ''' + def __init__(self, img_paths, pairs_in, res_fine, anchors, canonical_paths=None): + def fetch_img(im): + def torgb(x): return (x[0].permute(1, 2, 0).numpy() * .5 + .5).clip(min=0., max=1.) + for im1, im2 in pairs_in: + if im1['instance'] == im: + return torgb(im1['img']) + if im2['instance'] == im: + return torgb(im2['img']) + self.canonical_paths = canonical_paths + self.img_paths = img_paths + self.imgs = [fetch_img(img) for img in img_paths] + self.intrinsics = res_fine['intrinsics'] + self.cam2w = res_fine['cam2w'] + self.depthmaps = res_fine['depthmaps'] + self.pts3d = res_fine['pts3d'] + self.pts3d_colors = [] + self.working_device = self.cam2w.device + for i in range(len(self.imgs)): + im = self.imgs[i] + x, y = anchors[i][0][..., :2].detach().cpu().numpy().T + self.pts3d_colors.append(im[y, x]) + assert self.pts3d_colors[-1].shape == self.pts3d[i].shape + self.n_imgs = len(self.imgs) + + def get_focals(self): + return torch.tensor([ff[0, 0] for ff in self.intrinsics]).to(self.working_device) + + def get_principal_points(self): + return torch.stack([ff[:2, -1] for ff in self.intrinsics]).to(self.working_device) + + def get_im_poses(self): + return self.cam2w + + def get_sparse_pts3d(self): + return self.pts3d + + def get_dense_pts3d(self, clean_depth=True, subsample=8): + ''' + Get dense 3D points. 
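        Densifies the sparse depth maps using the cached canonical data, optionally
        cleans the resulting point cloud, and returns (pts3d, depthmaps, confs),
        one entry per input image.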
+ ''' + assert self.canonical_paths, 'cache_path is required for dense 3d points' + device = self.cam2w.device + confs = [] + base_focals = [] + anchors = {} + for i, canon_path in enumerate(self.canonical_paths): + (canon, canon2, conf), focal = torch.load(canon_path, map_location=device) + confs.append(conf) + base_focals.append(focal) + + H, W = conf.shape + pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device) + idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample) + anchors[i] = (pixels, idxs[i], offsets[i]) + + # densify sparse depthmaps + pts3d, depthmaps = make_pts3d(anchors, self.intrinsics, self.cam2w, [ + d.ravel() for d in self.depthmaps], base_focals=base_focals, ret_depth=True) + self.confs_dense = confs + if clean_depth: + confs = clean_pointcloud(confs, self.intrinsics, inv(self.cam2w), depthmaps, pts3d) + self.confs_dense_clean = confs + + self.pts3d_dense = pts3d + self.depthmaps_dense = depthmaps + return pts3d, depthmaps, confs + + def get_pts3d_colors(self): + return self.pts3d_colors + + def get_depthmaps(self): + return self.depthmaps + + def get_masks(self): + return [slice(None, None) for _ in range(len(self.imgs))] + + def show(self, show_cams=True): + pts3d, _, confs = self.get_dense_pts3d() + show_reconstruction(self.imgs, self.intrinsics if show_cams else None, self.cam2w, + [p.clip(min=-50, max=50) for p in pts3d], + masks=[c > 1 for c in confs]) + + +def convert_dust3r_pairs_naming(imgs, pairs_in): + for pair_id in range(len(pairs_in)): + for i in range(2): + pairs_in[pair_id][i]['instance'] = imgs[pairs_in[pair_id][i]['idx']] + return pairs_in + + +def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc_conf='desc_conf', + kinematic_mode='hclust-ward', device='cuda', dtype=torch.float32, shared_intrinsics=False, **kw): + """ Sparse alignment with MASt3R + imgs: list of image paths + cache_path: path where to dump temporary files (str) + + lr1, niter1: learning rate and #iterations for coarse global alignment (3D matching) + lr2, niter2: learning rate and #iterations for refinement (2D reproj error) + + lora_depth: smart dimensionality reduction with depthmaps + """ + # Convert pair naming convention from dust3r to mast3r + pairs_in = convert_dust3r_pairs_naming(imgs, pairs_in) + # forward pass + pairs, cache_path = forward_mast3r(pairs_in, model, + cache_path=cache_path, subsample=subsample, + desc_conf=desc_conf, device=device) + + # extract canonical pointmaps + tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21 = \ + prepare_canonical_data(imgs, pairs, subsample, cache_path=cache_path, mode='avg-angle', device=device) + + # smartly combine all useful data + imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21 = \ + condense_data(imgs, tmp_pairs, canonical_views, preds_21, dtype) + + # Build kinematic chain + if kinematic_mode == 'mst': + # compute minimal spanning tree + mst = compute_min_spanning_tree(pairwise_scores) + + elif kinematic_mode.startswith('hclust'): + mode, linkage = kinematic_mode.split('-') + + # Convert the affinity matrix to a distance matrix (if needed) + n_patches = (imsizes // subsample).prod(dim=1) + max_n_corres = 3 * torch.minimum(n_patches[:,None], n_patches[None,:]) + pws = (pairwise_scores.clone() / max_n_corres).clip(max=1) + pws.fill_diagonal_(1) + pws = to_numpy(pws) + distance_matrix = np.where(pws, 1 - pws, 2) + + # Compute the condensed distance matrix + condensed_distance_matrix = 
sch.distance.squareform(distance_matrix) + + # Perform hierarchical clustering using the linkage method + Z = sch.linkage(condensed_distance_matrix, method=linkage) + # dendrogram = sch.dendrogram(Z) + + tree = np.eye(len(imgs)) + new_to_old_nodes = {i:i for i in range(len(imgs))} + for i, (a, b) in enumerate(Z[:,:2].astype(int)): + # given two nodes to be merged, we choose which one is the best representant + a = new_to_old_nodes[a] + b = new_to_old_nodes[b] + tree[a,b] = tree[b,a] = 1 + best = a if pws[a].sum() > pws[b].sum() else b + new_to_old_nodes[len(imgs)+i] = best + pws[best] = np.maximum(pws[a], pws[b]) # update the node + + pairwise_scores = torch.from_numpy(tree) # this output just gives 1s for connected edges and zeros for other, i.e. no scores or priority + mst = compute_min_spanning_tree(pairwise_scores) + + else: + raise ValueError(f'bad {kinematic_mode=}') + + # remove all edges not in the spanning tree? + # min_spanning_tree = {(imgs[i],imgs[j]) for i,j in mst[1]} + # tmp_pairs = {(a,b):v for (a,b),v in tmp_pairs.items() if {(a,b),(b,a)} & min_spanning_tree} + + imgs, res_coarse, res_fine = sparse_scene_optimizer( + imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21, canonical_paths, mst, + shared_intrinsics=shared_intrinsics, cache_path=cache_path, device=device, dtype=dtype, **kw) + + scene = SparseGA(imgs, pairs_in, res_fine or res_coarse, anchors, canonical_paths) + # scene.get_dense_pts3d() + # path = Path("saved_data") / "scene.pkl" + # with path.open("wb") as f: + # pickle.dump(scene, f) + + print("Current working directory:", os.getcwd()) + return scene + +def _convert_scene_output_to_glb(outfile, imgs, pts3d, mask, focals, cams2world, cam_size=0.05, + cam_color=None, as_pointcloud=False, + transparent_cams=False, silent=False): + ''' + Convert scene output to GLB file. + ''' + assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) + pts3d = to_numpy(pts3d) + imgs = to_numpy(imgs) + focals = to_numpy(focals) + cams2world = to_numpy(cams2world) + + scene = trimesh.Scene() + + # full pointcloud + if as_pointcloud: + pts = np.concatenate([p[m.ravel()] for p, m in zip(pts3d, mask)]).reshape(-1, 3) + col = np.concatenate([p[m] for p, m in zip(imgs, mask)]).reshape(-1, 3) + valid_msk = np.isfinite(pts.sum(axis=1)) + pct = trimesh.PointCloud(pts[valid_msk], colors=col[valid_msk]) + scene.add_geometry(pct) + else: + meshes = [] + for i in range(len(imgs)): + pts3d_i = pts3d[i].reshape(imgs[i].shape) + msk_i = mask[i] & np.isfinite(pts3d_i.sum(axis=-1)) + meshes.append(pts3d_to_trimesh(imgs[i], pts3d_i, msk_i)) + mesh = trimesh.Trimesh(**cat_meshes(meshes)) + scene.add_geometry(mesh) + + # add each camera + for i, pose_c2w in enumerate(cams2world): + if isinstance(cam_color, list): + camera_edge_color = cam_color[i] + else: + camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)] + add_scene_cam(scene, pose_c2w, camera_edge_color, + None if transparent_cams else imgs[i], focals[i], + imsize=imgs[i].shape[1::-1], screen_width=cam_size) + + rot = np.eye(4) + rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() + scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot)) + if not silent: + print('(exporting 3D scene to', outfile, ')') + scene.export(file_obj=outfile) + return outfile + +def convert_scene_output_to_ply_impl(outfile, imgs, pts3d, mask, scale=1.0, apply_y_flip=False, silent=False): + """ + Export a scaled and colored 3D point cloud to PLY format using Open3D. 
+ + Args: + outfile (str): Path to save the .ply file. + imgs (list of np.ndarray): RGB images (H, W, 3) per view. + pts3d (list of np.ndarray): 3D points per view, shape (H * W, 3). + mask (list of np.ndarray): Boolean masks indicating valid points per view (H, W). + scale (float): Scale factor to apply to the 3D points. + apply_y_flip (bool): If True, apply a y-axis flip to the 3D points. + silent (bool): If False, print export message. + + Returns: + str: Output file path to the saved PLY. + """ + # Convert per-image valid 3D points and corresponding colors + all_pts = [] + all_colors = [] + + for p, m, img in zip(pts3d, mask, imgs): + pts = p[m.ravel()] * scale # shape (N_valid, 3) + colors = img[m] # shape (N_valid, 3) + valid = np.isfinite(pts).all(axis=1) # remove NaNs or Infs + + all_pts.append(pts[valid]) + all_colors.append(colors[valid]) # normalize RGB to [0, 1] + + # Concatenate across all views + all_pts = np.concatenate(all_pts, axis=0) + all_colors = np.concatenate(all_colors, axis=0) + + apply_y_flip = True + if apply_y_flip: + rot = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() + all_pts = all_pts @ rot.T # apply rotation to all points + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(all_pts) + pcd.colors = o3d.utility.Vector3dVector(all_colors) + # Save to .ply + final_outfile = os.path.join(os.getcwd(), outfile) + os.makedirs(os.path.dirname(final_outfile), exist_ok=True) + o3d.io.write_point_cloud(final_outfile, pcd) + if not silent: + print(f"✅ Exported scaled point cloud to: {outfile}") + + return outfile + +def convert_scene_output_to_ply(outfile, data, scale=1.0, apply_y_flip=False, min_conf_thr=1.5, clean=True, TSDF_thresh=0): + ''' + Convert scene output to PLY file. This is to filter out points that are not visible in the images either using TSDF or normal confidence thresholding. 
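
    Args:
        outfile (str): path of the .ply file to write.
        data: reconstructed scene (SparseGA) whose dense points and confidences are used.
        scale (float): metric scale factor applied to the points.
        apply_y_flip (bool): rotate the cloud 180 degrees about the y axis (the current implementation forces this on).
        min_conf_thr (float): confidence threshold used to mask out low-confidence points.
        clean (bool): use the cleaned dense confidences when TSDF post-processing is not applied.
        TSDF_thresh (float): if > 0, apply TSDF post-processing instead.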
+ ''' + imgs = to_numpy(data.imgs) + if TSDF_thresh > 0: + tsdf = TSDFPostProcess(data, TSDF_thresh=TSDF_thresh) + dense_pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean)) + msk = to_numpy([c > min_conf_thr for c in confs]) + else : + confs = data.confs_dense_clean if clean else data.confs_dense + dense_pts3d = to_numpy(data.pts3d_dense) + msk = to_numpy([c > min_conf_thr for c in confs]) + # 3D pointcloud from depthmap, poses and intrinsics + return convert_scene_output_to_ply_impl(outfile, imgs, dense_pts3d, msk, scale=scale, apply_y_flip=apply_y_flip) + +def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=False, mask_sky=False, + clean_depth=False, transparent_cams=False, cam_size=0.05, TSDF_thresh=0): + """ + extract 3D_model (glb file) from a reconstructed scene + """ + if scene_state is None: + return None + outfile = scene_state.outfile_name + if outfile is None: + return None + + # get optimized values from scene + scene = scene_state.sparse_ga + rgbimg = scene.imgs + focals = scene.get_focals().cpu() + cams2world = scene.get_im_poses().cpu() + + # 3D pointcloud from depthmap, poses and intrinsics + if TSDF_thresh > 0: + tsdf = TSDFPostProcess(scene, TSDF_thresh=TSDF_thresh) + pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean_depth)) + else: + pts3d, _, confs = to_numpy(scene.get_dense_pts3d(clean_depth=clean_depth)) + msk = to_numpy([c > min_conf_thr for c in confs]) + return _convert_scene_output_to_glb(outfile, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud, + transparent_cams=transparent_cams, cam_size=cam_size, silent=silent) + +def get_reconstructed_scene(outdir, gradio_delete_cache, model, retrieval_model, device, silent, image_size, + current_scene_state, filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, + matching_conf_thr, as_pointcloud, mask_sky, clean_depth, + scenegraph_type, winsize, win_cyclic, refid, TSDF_thresh, shared_intrinsics, **kw): + """ + from a list of images, run mast3r inference, sparse global aligner. 
+ then run get_3D_model_from_scene + """ + imgs = load_images(filelist, size=image_size, verbose=not silent) + if len(imgs) == 1: + imgs = [imgs[0], copy.deepcopy(imgs[0])] + imgs[1]['idx'] = 1 + filelist = [filelist[0], filelist[0] + '_2'] + + scene_graph_params = [scenegraph_type] + if scenegraph_type in ["swin", "logwin"]: + scene_graph_params.append(str(winsize)) + elif scenegraph_type == "oneref": + scene_graph_params.append(str(refid)) + elif scenegraph_type == "retrieval": + scene_graph_params.append(str(winsize)) # Na + scene_graph_params.append(str(refid)) # k + if scenegraph_type in ["swin", "logwin"] and not win_cyclic: + scene_graph_params.append('noncyclic') + scene_graph = '-'.join(scene_graph_params) + + sim_matrix = None + if 'retrieval' in scenegraph_type: + assert retrieval_model is not None + retriever = Retriever(retrieval_model, backbone=model, device=device) + with torch.no_grad(): + sim_matrix = retriever(filelist) + + # Cleanup + del retriever + torch.cuda.empty_cache() + + pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True, sim_mat=sim_matrix) + if optim_level == 'coarse': + niter2 = 0 + # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation) + if current_scene_state is not None and \ + not current_scene_state.should_delete and \ + current_scene_state.cache_dir is not None: + cache_dir = current_scene_state.cache_dir + elif gradio_delete_cache: + cache_dir = tempfile.mkdtemp(suffix='_cache', dir=outdir) + else: + cache_dir = os.path.join(outdir, 'cache') + os.makedirs(cache_dir, exist_ok=True) + scene = sparse_global_alignment(filelist, pairs, cache_dir, + model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device, + opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics, + matching_conf_thr=matching_conf_thr, **kw) + + scene.get_dense_pts3d() + return scene + diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py new file mode 100644 index 000000000..6e851a025 --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py @@ -0,0 +1,378 @@ +from contextlib import nullcontext +from itertools import combinations +from pathlib import Path +import numpy as np +from mast3r.model import AsymmetricMASt3R +from mast3r.utils.misc import hash_md5 +from pyproj import Transformer +import numpy as np +from PIL import Image +from PIL.ExifTags import TAGS, GPSTAGS +import os +import torch + +import numpy as np +from itertools import combinations +import random + +import pickle +import pandas as pd +import tempfile + +import numpy as np +import open3d as o3d +from scipy.spatial.transform import Rotation + +from mast3r.demo import get_args_parser, main_demo +import argparse +from mast3r_runner import get_reconstructed_scene, convert_scene_output_to_ply + + +def todevice(batch, device, callback=None, non_blocking=False): + ''' Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy). + + batch: list, tuple, dict of tensors or other things + device: pytorch device or 'numpy' + callback: function that would be called on every sub-elements. 
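    non_blocking: forwarded to tensor.to() for asynchronous copies.

    Returns the same nested structure, with tensors moved to `device`
    (or converted to numpy arrays when device == 'numpy').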
+ ''' + if callback: + batch = callback(batch) + + if isinstance(batch, dict): + return {k: todevice(v, device) for k, v in batch.items()} + + if isinstance(batch, (tuple, list)): + return type(batch)(todevice(x, device) for x in batch) + + x = batch + if device == 'numpy': + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + elif x is not None: + if isinstance(x, np.ndarray): + x = torch.from_numpy(x) + if torch.is_tensor(x): + x = x.to(device, non_blocking=non_blocking) + return x + + +def to_numpy(x): return todevice(x, 'numpy') +def to_cpu(x): return todevice(x, 'cpu') +def to_cuda(x): return todevice(x, 'cuda') + +def dms_to_decimal(d, m, s, ref): + ''' + Convert degrees, minutes, seconds to decimal degrees. + ''' + dd = d + m / 60 + s / 3600 + return -dd if ref in ['S', 'W'] else dd + +def get_gps_from_exif(image_path): + ''' + Get GPS information from an image file. + ''' + img = Image.open(image_path) + exif = img._getexif() + if not exif: + return None + gps_info = {} + for tag, value in exif.items(): + decoded = TAGS.get(tag) + if decoded == "GPSInfo": + for t in value: + sub_decoded = GPSTAGS.get(t) + gps_info[sub_decoded] = value[t] + + return gps_info + +def parse_gps_info(image_path): + ''' + Parse GPS information from an image file. + ''' + gps_info = get_gps_from_exif(image_path) + # Latitude + lat_ref = gps_info.get('GPSLatitudeRef', 'N') + lat_dms = gps_info.get('GPSLatitude') + lat = dms_to_decimal(lat_dms[0], lat_dms[1], lat_dms[2], lat_ref) + + # Longitude + lon_ref = gps_info.get('GPSLongitudeRef', 'E') + lon_dms = gps_info.get('GPSLongitude') + lon = dms_to_decimal(lon_dms[0], lon_dms[1], lon_dms[2], lon_ref) + + # Altitude + alt = gps_info.get('GPSAltitude', 0.0) + alt_ref = gps_info.get('GPSAltitudeRef', b'\x00') + if isinstance(alt_ref, bytes) and alt_ref == b'\x01': + alt = -alt # below sea level + + # Timestamp (optional) + timestamp = gps_info.get('GPSTimeStamp', (0.0, 0.0, 0.0)) + timestamp = f'{int(timestamp[0])}:{int(timestamp[1])}:{int(timestamp[2])} UTC' + datestamp = gps_info.get('GPSDateStamp', '0000:00:00') + + return { + 'latitude': float(lat), + 'longitude': float(lon), + 'altitude': float(alt), + 'timestamp': timestamp, + 'date': datestamp + } + +def gps_to_xyz(gps_lookup, crs_from, crs_to): + ''' + Convert GPS coordinates to xyz coordinates. + ''' + xyz_lookup = {} + transformer = Transformer.from_crs(crs_from, crs_to, always_xy=True) # includes altitude + for image_name, (lat, lon, alt) in gps_lookup.items(): + xyz = transformer.transform(lon, lat, alt) + xyz_lookup[image_name] = xyz + return xyz_lookup + +def estimate_3d_scale_from_gps(camera_centers, gps_xyz, camera_image_names, min_dist_threshold=1.0): + """ + Estimate scale factor between MASt3r's camera centers and GPS 3D coordinates. 
+ + Inputs: + camera_centers: (N, 3) array in MASt3r units (arbitrary scale) + gps_xyz: (N, 3) array in meters [x, y, z] from lat/lon/alt + camera_image_names: list of camera image names + min_dist_threshold: minimum distance threshold for valid GPS pairs + Returns: + scale: estimated meters-per-unit scale factor + """ + assert camera_centers.shape == np.ones((len(gps_xyz), len(list(gps_xyz.items())[0][1]))).shape + sfm_dists = [] + gps_dists = [] + + for i, j in combinations(range(len(camera_centers)), 2): + file_name_i = camera_image_names[i] + file_name_j = camera_image_names[j] + d_sfm = np.linalg.norm(camera_centers[i] - camera_centers[j]) + d_gps = np.linalg.norm(np.array(gps_xyz[file_name_i]) - np.array(gps_xyz[file_name_j])) + + if d_gps > min_dist_threshold: + sfm_dists.append(d_sfm) + gps_dists.append(d_gps) + + sfm_dists = np.array(sfm_dists) + gps_dists = np.array(gps_dists) + + if len(gps_dists) == 0: + raise ValueError("Not enough valid GPS pairs for scale estimation.") + + scale = np.median(gps_dists / sfm_dists) + return scale, sfm_dists, gps_dists + + +def estimate_scale_ransac(camera_centers, gps_xyz, camera_image_names, threshold=0.05, iterations=1000, min_dist=1.0): + ''' + Estimate scale factor between MASt3r's camera centers and GPS 3D coordinates using RANSAC. + + Inputs: + camera_centers: (N, 3) array in MASt3r units (arbitrary scale) + gps_xyz: (N, 3) array in meters [x, y, z] from lat/lon/alt + camera_image_names: list of camera image names + threshold: threshold for inliers + iterations: number of RANSAC iterations + min_dist: minimum distance threshold for valid GPS pairs + Returns: + scale: estimated meters-per-unit scale factor + ''' + scales = [] + pairs = [] + + # Build all valid distance pairs + for i, j in combinations(range(len(camera_centers)), 2): + file_name_i = camera_image_names[i] + file_name_j = camera_image_names[j] + d_sfm = np.linalg.norm(camera_centers[i] - camera_centers[j]) + d_gps = np.linalg.norm(np.array(gps_xyz[file_name_i]) - np.array(gps_xyz[file_name_j])) + if d_gps > min_dist: + scale = d_gps / d_sfm + scales.append(scale) + pairs.append((i, j)) + + scales = np.array(scales) + pairs = np.array(pairs) + + best_inliers = [] + best_scale = None + + for _ in range(iterations): + # Sample one index randomly + idx = random.randint(0, len(scales) - 1) + candidate_scale = scales[idx] + + # Compute residuals for all + residuals = np.abs(scales - candidate_scale) + + # Inliers: within the threshold + inliers = np.where(residuals < threshold)[0] + + if len(inliers) > len(best_inliers): + best_inliers = inliers + best_scale = np.median(scales[inliers]) + + return best_scale, len(best_inliers), len(scales) + +def extract_image_names(image_paths): + ''' + Extract image names from a list of image paths. + ''' + return [path.split('/')[-1] for path in image_paths] + + + +def collect_gps_data(data_folder): + ''' + Collect GPS data with extra metadata from a folder of images. 
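    Returns a pandas DataFrame with one row per geotagged image
    (columns: filename, latitude, longitude, altitude, timestamp, date);
    images whose EXIF GPS data cannot be parsed are skipped with a warning.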
+ ''' + records = [] + for fname in sorted(os.listdir(data_folder)): + if fname.lower().endswith(('.jpg', '.jpeg', '.png')): + full_path = os.path.join(data_folder, fname) + try: + gps_data = parse_gps_info(full_path) + if gps_data: + parsed_gps_data = gps_data + records.append({ + 'filename': fname, + 'latitude': parsed_gps_data['latitude'], + 'longitude': parsed_gps_data['longitude'], + 'altitude': parsed_gps_data['altitude'], + 'timestamp': parsed_gps_data['timestamp'], + 'date': parsed_gps_data['date'] + }) + except Exception as e: + print(f"Warning: {fname} - {e}") + return pd.DataFrame(records) + + +def run_mast3r(args): + ''' + Run MASt3R on a folder of images. + ''' + if args.weights is not None: + weights_path = args.weights + else: + weights_path = "naver/" + args.model_name + model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device) + chkpt_tag = hash_md5(weights_path) + + def get_context(tmp_dir): + return tempfile.TemporaryDirectory(suffix='_mast3r_context') if tmp_dir is None \ + else nullcontext(tmp_dir) + with get_context(args.tmp_dir) as tmpdirname: + cache_path = os.path.join(tmpdirname, chkpt_tag) + os.makedirs(cache_path, exist_ok=True) + inputfiles = [os.path.join(args.folder_path, f) for f in os.listdir(args.folder_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))] + scene_state = get_reconstructed_scene(tmpdirname, False, model, + args.retrieval_model, args.device, args.silent, args.image_size, None, inputfiles, args.optim_level, args.lr1, args.niter1, args.lr2, args.niter2, args.min_conf_thr, args.matching_conf_thr, + True, False, args.clean_depth, + args.scenegraph_type, args.winsize, args.win_cyclic, args.refid, 0, args.shared_intrinsics) + return scene_state + +def add_parse_args(parser, is_scene_path=False): + parser.add_argument('--folder_path', type=str, required=True, help='Path to the folder containing the images') + parser.add_argument('--output_path', type=str, required=True, help='Path to the output file') + parser.add_argument('--scale_method', type=str, required=True, help='Method to use for scale estimation') + parser.add_argument('--scene_path', type=str, required=False, help='Path to the scene file') + parser.add_argument('--crs_from', type=str, required=False, default='EPSG:4979', help='EPSG code of the input CRS') + parser.add_argument('--crs_to', type=str, required=False, default='EPSG:32616', help='EPSG code of the output CRS') + if not is_scene_path: + parser.add_argument('--optim_level', type=str, required=False, default='refine+depth', choices=['coarse', 'refine', 'refine+depth'], help='Optimization level') + parser.add_argument('--lr1', type=float, required=False, default=0.07, help='Learning rate for the first refinement iteration stage') + parser.add_argument('--niter1', type=int, required=False, default=300, help='Number of iterations for the first refinement iteration stage') + parser.add_argument('--lr2', type=float, required=False, default=0.01, help='Learning rate for the second refinement iteration stage') + parser.add_argument('--niter2', type=int, required=False, default=300, help='Number of iterations for the second refinement iteration stage') + parser.add_argument('--min_conf_thr', type=float, required=False, default=1.5, help='Minimum confidence threshold') + parser.add_argument('--matching_conf_thr', type=float, required=False, default=0., help='Matching confidence threshold') + parser.add_argument('--clean_depth', type=bool, required=False, default=True, help='Whether to clean the depth') + + 
available_scenegraph_type = [ + ("complete: all possible image pairs", "complete"), + ("swin: sliding window", "swin"), + ("logwin: sliding window with long range", "logwin"), + ("oneref: match one image with all", "oneref"), + ("retrieval: connect views based on similarity", "retrieval") + ] + + # Extract the actual values for argparse + scenegraph_choices = [val for _, val in available_scenegraph_type] + + # Construct help string + help_msg = "Type of scene graph:\n" + "\n".join([f" {val}: {desc.split(':', 1)[1].strip()}" for desc, val in available_scenegraph_type]) + + parser.add_argument( + '--scenegraph_type', + type=str, + choices=scenegraph_choices, + required=False, + default='complete', + help=help_msg + ) + parser.add_argument('--winsize', type=int, required=False, default=1, help='Window size for sliding window pair making scenegraph_type') + parser.add_argument('--win_cyclic', type=bool, required=False, default=False, help='Whether to use a cyclic sliding window') + parser.add_argument('--refid', type=str, required=False, default=0, help='Reference image for retrieval. For retrieval scenegraph_type, this indicates the number of neighbors.') + parser.add_argument('--TSDF_thresh', type=float, required=False, default=0, help='TSDF refinement threshold') + parser.add_argument('--shared_intrinsics', type=bool, required=False, default=False, help='Whether to use a shared intrinsics model') + + + return parser + +def scale_pointcloud_based_on_geotag(): + ''' + Scale a pointcloud based on GPS data. If no scene file is provided, MASt3R will be run to generate a scene file. + ''' + parser = argparse.ArgumentParser() + + # Add known args + parser.add_argument('--scene_path', type=str, help='This is the path to the scene file', required=False) + + # Parse known and unknown args + args, unknown = parser.parse_known_args() + + if not args.scene_path: + args_parser = get_args_parser() + args_parser = add_parse_args(args_parser, is_scene_path=False) + args = args_parser.parse_args() + scene = run_mast3r(args) + data = scene + else: + args_parser = argparse.ArgumentParser() + args_parser = add_parse_args(args_parser, is_scene_path=True) + args = args_parser.parse_args() + with open(args.scene_path, 'rb') as f: + data = pickle.load(f) + + # Get camera centers + cam2w = data.get_im_poses() + camera_centers = cam2w[:, :3, 3] # Extract translation component from [R|t] + + # Collect GPS data + df = collect_gps_data(args.folder_path) + image_gps_data = df.to_numpy() + gps_lookup = { + row[0]: [float(row[1]), float(row[2]), float(row[3])] + for row in image_gps_data + } + image_names = extract_image_names(data.img_paths) + xyz_lookup = gps_to_xyz(gps_lookup, args.crs_from, args.crs_to) + + # Estimate scale + scale = 1.0 + if args.scale_method == 'ransac': + scale, sfm_dists, gps_dists = estimate_scale_ransac(camera_centers.cpu().numpy(), xyz_lookup, image_names) + elif args.scale_method == 'median': + scale, sfm_dists, gps_dists = estimate_3d_scale_from_gps(camera_centers.cpu().numpy(), xyz_lookup, image_names) + else: + raise ValueError(f"Invalid scale method: {args.scale_method}") + print(f"Estimated scale: {scale}") + + # Convert scene output to PLY + convert_scene_output_to_ply(args.output_path, data, scale=scale, apply_y_flip=False, min_conf_thr=args.min_conf_thr, clean=args.clean_depth, TSDF_thresh=args.TSDF_thresh) + +if __name__ == "__main__": + scale_pointcloud_based_on_geotag() \ No newline at end of file diff --git a/GEMstack/scripts/__init__.py b/GEMstack/scripts/__init__.py deleted file 
mode 100644 index e69de29bb..000000000