From cdb49e4e1fa72e01bf50924a4ec468f03b215af7 Mon Sep 17 00:00:00 2001 From: Henry-YiW Date: Thu, 15 May 2025 04:13:06 -0500 Subject: [PATCH 1/6] lidar_colorization and mast3r reconstruction codes added --- .../lidar_colorization/colorization.py | 261 ++++++++++ .../mast3r_3d_reconstruction/README.MD | 18 + .../mast3r_3d_reconstruction/mast3r_runner.py | 448 ++++++++++++++++++ .../scale_pointcloud_based_on_geotag.py | 317 +++++++++++++ 4 files changed, 1044 insertions(+) create mode 100644 GEMstack/offboard/lidar_colorization/colorization.py create mode 100644 GEMstack/offboard/mast3r_3d_reconstruction/README.MD create mode 100644 GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py create mode 100644 GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py diff --git a/GEMstack/offboard/lidar_colorization/colorization.py b/GEMstack/offboard/lidar_colorization/colorization.py new file mode 100644 index 000000000..f7033d804 --- /dev/null +++ b/GEMstack/offboard/lidar_colorization/colorization.py @@ -0,0 +1,261 @@ +import os +from pathlib import Path +import numpy as np +import argparse +import open3d as o3d +import cv2 +import open3d as o3d + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--folder_path", type=str, required=True) + parser.add_argument("--output_path", type=str, required=True) + parser.add_argument("--camera_types", type=str, required=True) + return parser.parse_args() + + +def save_ply_with_open3d(points, filename): + pc = o3d.geometry.PointCloud() + pc.points = o3d.utility.Vector3dVector(points[:, :3]) + if points.shape[1] == 6: + colors = points[:, 3:6] / 255.0 # normalize RGB to [0, 1] + pc.colors = o3d.utility.Vector3dVector(colors) + o3d.io.write_point_cloud(filename, pc) + +def get_camera_extrinsic_matrix(camera_type): + if camera_type == "front_right" or camera_type == "fr": + # From your file + rotation = np.array([ + [-0.7168464770690616, -0.10046018208578958, 0.6899557088168523], + [-0.6970911725372957, 0.12308618950445319, -0.7063382243117325], + [-0.01396515249660048, -0.9872981017750231, -0.15826380744561577] + ]) + + translation = np.array([1.8861563355156226, -0.7733611068168774, 1.6793040225335112]) + + # Build 4x4 homogeneous transformation matrix + fr_to_vehicle = np.eye(4) + fr_to_vehicle[:3, :3] = rotation + fr_to_vehicle[:3, 3] = translation + return fr_to_vehicle + elif camera_type == "rear_right" or camera_type == "rr": + # From your file + rotation = np.array([ + [-0.7359657309159472, 0.15986191414426415, -0.6578743127098735], + [0.6768157805459531, 0.14993386619459964, -0.7207220233709469], + [-0.016578363047300385, -0.9756864271752846, -0.21854325362408236] + ]) + + translation = np.array([0.11419591502518789, -0.6896311735924415, 1.711181163333824]) + + # Build 4x4 homogeneous transformation matrix + rr_to_vehicle = np.eye(4) + rr_to_vehicle[:3, :3] = rotation + rr_to_vehicle[:3, 3] = translation + return rr_to_vehicle + else: + raise ValueError(f"Camera type {camera_type} not supported") + +def get_camera_intrinsic_matrix(camera_type): + if camera_type == "front_right" or camera_type == "fr": + # Provided intrinsics + focal = [1176.2554468073797, 1175.1456876174707] # fx, fy + center = [966.4326452411585, 608.5803255934914] # cx, cy + fr_cam_distort = [-0.2701363254469883, 0.16439325523243875, -0.001607207824773341, -7.412467081891699e-05, + -0.06199397580030171] + skew = 0 # assume no skew + + fx, fy = focal + cx, cy = center + + # Build K matrix + fr_cam_K = np.array([ + [fx, skew, 
cx], + [0, fy, cy], + [0, 0, 1] + ]) + return fr_cam_K, np.array(fr_cam_distort) + elif camera_type == "rear_right" or camera_type == "rr": + # Provided intrinsics + focal = [1162.3787554048329, 1162.855381183851] # fx, fy + center = [956.2663906909728, 569.2039945552984] # cx, cy + rr_cam_distort = [-0.25040910859151444, 0.1109210921906881, -0.00041247665414900384, 0.0008205455176671751, + -0.026395952816984845] + skew = 0 # assume no skew + + fx, fy = focal + cx, cy = center + + # Build K matrix + rr_cam_K = np.array([ + [fx, skew, cx], + [0, fy, cy], + [0, 0, 1] + ]) + return rr_cam_K, np.array(rr_cam_distort) + else: + raise ValueError(f"Camera type {camera_type} not supported") + +def get_lidar_extrinsic_matrix(lidar_type): + if lidar_type == "ouster": + # From your file + rotation = np.array([ + [1,0,0], + [0,1,0], + [0,0,1] + ]) + + translation = np.array([1.10,0,2.03]) + + # Build 4x4 homogeneous transformation matrix + ouster_to_vehicle = np.eye(4) + ouster_to_vehicle[:3, :3] = rotation + ouster_to_vehicle[:3, 3] = translation + return ouster_to_vehicle + else: + raise ValueError(f"Lidar type {lidar_type} not supported") + +def sample_rgb_colors(image_rgb, u_f, v_f): + """ + Sample RGB colors at subpixel positions (u_f, v_f) from image. + Args: + image_rgb: np.ndarray (H, W, 3), float32 in [0,1] + u_f: (N,) float32 + v_f: (N,) float32 + Returns: + (N, 3) array of RGB colors + """ + assert image_rgb.dtype == np.float32 and image_rgb.max() <= 1.0 + colors = [] + + for x, y in zip(u_f, v_f): + patch = cv2.getRectSubPix(image_rgb, patchSize=(1, 1), center=(x, y)) + colors.append(patch[0, 0]) # (1, 1, 3) → get RGB triplet + + return np.array(colors) + + +def main(): + args = parse_args() + folder_path = args.folder_path + output_path = args.output_path + camera_types = args.camera_types.split(',') + print('camera_types', camera_types) + + folder = Path(folder_path) + files = list(folder.glob("*.b")) # all files + + for file in files: + # print(file.name) + data = np.load(file, allow_pickle=True) + # Save + os.makedirs(output_path, exist_ok=True) + save_ply_with_open3d(data, f"{output_path}/pure_pointcloud_{file.name}.ply") + colors_full = np.zeros((data.shape[0], 3), dtype=np.float32) # default black + points_h = np.hstack([data, np.ones((data.shape[0], 1))]) # shape: (N, 4) + + for camera_type in camera_types: + + # Get camera extrinsic matrix + camera_to_vehicle = get_camera_extrinsic_matrix(camera_type) + + # Get lidar extrinsic matrix + lidar_to_vehicle = get_lidar_extrinsic_matrix("ouster") + + # Get camera intrinsic matrix + camera_K, camera_distort = get_camera_intrinsic_matrix(camera_type) + + # Project points to camera + # Transform lidar points into camera frame + T_vehicle_to_cam = np.linalg.inv(camera_to_vehicle) + T_lidar_to_cam = T_vehicle_to_cam @ lidar_to_vehicle + + # Convert points to homogeneous coordinates + + # Transform to camera frame + points_cam = (T_lidar_to_cam @ points_h.T).T # shape: (N, 4) + points_cam = points_cam[:, :3] # drop homogeneous component + + # Filter out points behind the camera + mask = points_cam[:, 2] > 0 + # points_cam = points_cam[mask] + + # Load image + point_cloud_number = file.name.split("_")[1].split('.')[0] + # image_path = f"{file.parent.name}/{camera_type}_{point_cloud_number}.png" + image_path = file.parent / f"{camera_type}_{point_cloud_number}.png" + image_path = str(image_path) + if not Path(image_path).exists(): + print(f"❌ Image file does not exist: {image_path}") + continue + image_raw = cv2.imread(image_path) + h, w = 
image_raw.shape[:2] + + cam_distort_matrix = camera_distort + cam_k = camera_K + # Get optimal camera matrix (undistorted intrinsics) + cam_K_new, roi = cv2.getOptimalNewCameraMatrix(cam_k, cam_distort_matrix, (w, h), 0) + + # Undistort the image + image_undistorted = cv2.undistort(image_raw, cam_k, cam_distort_matrix, None, cam_K_new) + + # Project to image + proj = (cam_K_new @ points_cam.T).T # shape: (N, 3) + with np.errstate(divide='ignore', invalid='ignore'): + proj[:, 0] = np.divide(proj[:, 0], proj[:, 2], out=np.zeros_like(proj[:, 0]), where=proj[:, 2] != 0) + proj[:, 1] = np.divide(proj[:, 1], proj[:, 2], out=np.zeros_like(proj[:, 1]), where=proj[:, 2] != 0) + + # proj[:, 0] /= proj[:, 2] + # proj[:, 1] /= proj[:, 2] + pixels = proj[:, :2] # shape: (N, 2) + + # Step 1: filter front-facing + finite_mask = np.isfinite(proj[:, 0]) & np.isfinite(proj[:, 1]) + # mask_front = mask # Z_cam > 0 + mask_front = mask & finite_mask + + # Projected pixel coords (u, v) + u = pixels[:, 0] + v = pixels[:, 1] + if np.any(np.isnan(u)) or np.any(np.isnan(v)): + print(f"❌ NaN values in u or v for {camera_type}") + continue + if np.any(np.isinf(u)) or np.any(np.isinf(v)): + print(f"❌ Inf values in u or v for {camera_type}") + continue + # Step 2: in image bounds + + # Load image and convert to RGB + image_undistorted = cv2.cvtColor(image_undistorted, cv2.COLOR_BGR2RGB) + image_undistorted = image_undistorted.astype(np.float32) / 255.0 + h, w, _ = image_undistorted.shape + in_bounds = (u >= 0) & (u < w - 1) & (v >= 0) & (v < h - 1) + # in_bounds = (u >= 0) & (v >= 0) + # Step 3: Combine masks + visible_mask = mask_front & in_bounds + # visible_mask = mask_front + + # Step 4: Get visible pixel coords + u_f = u[visible_mask].astype(np.float32) + v_f = v[visible_mask].astype(np.float32) + if u_f.shape[0] == 0 or v_f.shape[0] == 0: + print(f"❌ No visible points for {camera_type}") + continue + # Step 5: Interpolate RGB + colors_interpolated = sample_rgb_colors(image_undistorted, u_f, v_f) + # Step 6: Assign to full color array + # colors_interpolated = np.concatenate(colors_interpolated, axis=1) # shape: (N, 3) + colors_full[visible_mask] = colors_interpolated # only visible points get color + + # Step 7: Create point cloud with full color info + + colored_pcd = o3d.geometry.PointCloud() + colored_pcd.points = o3d.utility.Vector3dVector(data) # full point set + colored_pcd.colors = o3d.utility.Vector3dVector(colors_full) # partial color + + o3d.io.write_point_cloud(f"{output_path}/colored_pointcloud_{file.name}.ply", colored_pcd) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/README.MD b/GEMstack/offboard/mast3r_3d_reconstruction/README.MD new file mode 100644 index 000000000..9977c4a2a --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/README.MD @@ -0,0 +1,18 @@ +# Scaled 3D Scene Reconstruction from Images + +This repository provides code for generating a **scaled 3D reconstructed scene** from a set of input images, using [MASt3R](https://github.com/naver/mast3r.git) as the core image matching and reconstruction backend. + +## 🧩 Dependency + +This project relies on the official [MASt3R repository](https://github.com/naver/mast3r.git) by NAVER Labs Europe. +Please follow their installation instructions to set up the environment and dependencies. + +## 🛠️ Installation + +1. 
Clone this repository and the MASt3R submodule: + +```bash +git clone https://github.com/your-username/your-project.git +cd your-project +git clone --recursive https://github.com/naver/mast3r.git +``` \ No newline at end of file diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py new file mode 100644 index 000000000..12cd9d7be --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# sparse gradio demo functions +# -------------------------------------------------------- +import math +import gradio +import os +import numpy as np +import functools +import trimesh +import copy +from scipy.spatial.transform import Rotation +import tempfile +import shutil +import torch + +from mast3r.cloud_opt.sparse_ga import sparse_global_alignment,anchor_depth_offsets,make_pts3d,show_reconstruction,forward_mast3r,prepare_canonical_data,condense_data,compute_min_spanning_tree,sparse_scene_optimizer +from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess +from mast3r.image_pairs import make_pairs +from mast3r.retrieval.processor import Retriever + +import mast3r.utils.path_to_dust3r # noqa +from dust3r.dust3r.utils.geometry import inv, geotrf # noqa +from dust3r.dust3r.utils.image import load_images +from dust3r.dust3r.utils.device import to_numpy +from dust3r.dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes +from dust3r.dust3r.demo import get_args_parser as dust3r_get_args_parser +from dust3r.dust3r.cloud_opt.base_opt import clean_pointcloud + +import matplotlib.pyplot as pl +import open3d as o3d + +import scipy.cluster.hierarchy as sch + + +def get_args_parser(): + parser = dust3r_get_args_parser() + parser.add_argument('--share', action='store_true') + parser.add_argument('--gradio_delete_cache', default=None, type=int, + help='age/frequency at which gradio removes the file. If >0, matching cache is purged') + parser.add_argument('--retrieval_model', default=None, type=str, help="retrieval_model to be loaded") + + actions = parser._actions + for action in actions: + if action.dest == 'model_name': + action.choices = ["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"] + # change defaults + parser.prog = 'mast3r demo' + return parser + +import pickle +import os +from pathlib import Path + + +class SparseGA(): + def __init__(self, img_paths, pairs_in, res_fine, anchors, canonical_paths=None): + def fetch_img(im): + def torgb(x): return (x[0].permute(1, 2, 0).numpy() * .5 + .5).clip(min=0., max=1.) 
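            # 'instance' holds each view's image path (set by convert_dust3r_pairs_naming),
            # so match on it to recover that view's RGB image; torgb above maps the
            # model's [-1, 1] pixel normalization back to [0, 1]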
+ for im1, im2 in pairs_in: + if im1['instance'] == im: + return torgb(im1['img']) + if im2['instance'] == im: + return torgb(im2['img']) + self.canonical_paths = canonical_paths + self.img_paths = img_paths + self.imgs = [fetch_img(img) for img in img_paths] + self.intrinsics = res_fine['intrinsics'] + self.cam2w = res_fine['cam2w'] + self.depthmaps = res_fine['depthmaps'] + self.pts3d = res_fine['pts3d'] + self.pts3d_colors = [] + self.working_device = self.cam2w.device + for i in range(len(self.imgs)): + im = self.imgs[i] + x, y = anchors[i][0][..., :2].detach().cpu().numpy().T + self.pts3d_colors.append(im[y, x]) + assert self.pts3d_colors[-1].shape == self.pts3d[i].shape + self.n_imgs = len(self.imgs) + + def get_focals(self): + return torch.tensor([ff[0, 0] for ff in self.intrinsics]).to(self.working_device) + + def get_principal_points(self): + return torch.stack([ff[:2, -1] for ff in self.intrinsics]).to(self.working_device) + + def get_im_poses(self): + return self.cam2w + + def get_sparse_pts3d(self): + return self.pts3d + + def get_dense_pts3d(self, clean_depth=True, subsample=8): + assert self.canonical_paths, 'cache_path is required for dense 3d points' + device = self.cam2w.device + confs = [] + base_focals = [] + anchors = {} + for i, canon_path in enumerate(self.canonical_paths): + (canon, canon2, conf), focal = torch.load(canon_path, map_location=device) + confs.append(conf) + base_focals.append(focal) + + H, W = conf.shape + pixels = torch.from_numpy(np.mgrid[:W, :H].T.reshape(-1, 2)).float().to(device) + idxs, offsets = anchor_depth_offsets(canon2, {i: (pixels, None)}, subsample=subsample) + anchors[i] = (pixels, idxs[i], offsets[i]) + + # densify sparse depthmaps + pts3d, depthmaps = make_pts3d(anchors, self.intrinsics, self.cam2w, [ + d.ravel() for d in self.depthmaps], base_focals=base_focals, ret_depth=True) + self.confs_dense = confs + if clean_depth: + confs = clean_pointcloud(confs, self.intrinsics, inv(self.cam2w), depthmaps, pts3d) + self.confs_dense_clean = confs + + self.pts3d_dense = pts3d + self.depthmaps_dense = depthmaps + return pts3d, depthmaps, confs + + def get_pts3d_colors(self): + return self.pts3d_colors + + def get_depthmaps(self): + return self.depthmaps + + def get_masks(self): + return [slice(None, None) for _ in range(len(self.imgs))] + + def show(self, show_cams=True): + pts3d, _, confs = self.get_dense_pts3d() + show_reconstruction(self.imgs, self.intrinsics if show_cams else None, self.cam2w, + [p.clip(min=-50, max=50) for p in pts3d], + masks=[c > 1 for c in confs]) + + +def convert_dust3r_pairs_naming(imgs, pairs_in): + for pair_id in range(len(pairs_in)): + for i in range(2): + pairs_in[pair_id][i]['instance'] = imgs[pairs_in[pair_id][i]['idx']] + return pairs_in + + +def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc_conf='desc_conf', + kinematic_mode='hclust-ward', device='cuda', dtype=torch.float32, shared_intrinsics=False, **kw): + """ Sparse alignment with MASt3R + imgs: list of image paths + cache_path: path where to dump temporary files (str) + + lr1, niter1: learning rate and #iterations for coarse global alignment (3D matching) + lr2, niter2: learning rate and #iterations for refinement (2D reproj error) + + lora_depth: smart dimensionality reduction with depthmaps + """ + # Convert pair naming convention from dust3r to mast3r + pairs_in = convert_dust3r_pairs_naming(imgs, pairs_in) + # forward pass + pairs, cache_path = forward_mast3r(pairs_in, model, + cache_path=cache_path, subsample=subsample, + 
desc_conf=desc_conf, device=device) + + # extract canonical pointmaps + tmp_pairs, pairwise_scores, canonical_views, canonical_paths, preds_21 = \ + prepare_canonical_data(imgs, pairs, subsample, cache_path=cache_path, mode='avg-angle', device=device) + + # smartly combine all useful data + imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21 = \ + condense_data(imgs, tmp_pairs, canonical_views, preds_21, dtype) + + # Build kinematic chain + if kinematic_mode == 'mst': + # compute minimal spanning tree + mst = compute_min_spanning_tree(pairwise_scores) + + elif kinematic_mode.startswith('hclust'): + mode, linkage = kinematic_mode.split('-') + + # Convert the affinity matrix to a distance matrix (if needed) + n_patches = (imsizes // subsample).prod(dim=1) + max_n_corres = 3 * torch.minimum(n_patches[:,None], n_patches[None,:]) + pws = (pairwise_scores.clone() / max_n_corres).clip(max=1) + pws.fill_diagonal_(1) + pws = to_numpy(pws) + distance_matrix = np.where(pws, 1 - pws, 2) + + # Compute the condensed distance matrix + condensed_distance_matrix = sch.distance.squareform(distance_matrix) + + # Perform hierarchical clustering using the linkage method + Z = sch.linkage(condensed_distance_matrix, method=linkage) + # dendrogram = sch.dendrogram(Z) + + tree = np.eye(len(imgs)) + new_to_old_nodes = {i:i for i in range(len(imgs))} + for i, (a, b) in enumerate(Z[:,:2].astype(int)): + # given two nodes to be merged, we choose which one is the best representant + a = new_to_old_nodes[a] + b = new_to_old_nodes[b] + tree[a,b] = tree[b,a] = 1 + best = a if pws[a].sum() > pws[b].sum() else b + new_to_old_nodes[len(imgs)+i] = best + pws[best] = np.maximum(pws[a], pws[b]) # update the node + + pairwise_scores = torch.from_numpy(tree) # this output just gives 1s for connected edges and zeros for other, i.e. no scores or priority + mst = compute_min_spanning_tree(pairwise_scores) + + else: + raise ValueError(f'bad {kinematic_mode=}') + + # remove all edges not in the spanning tree? 
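    # (note: tmp_pairs is not referenced again below in this function, so the pruning
    #  sketched in the commented lines that follow would not change the optimization as written)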
+ # min_spanning_tree = {(imgs[i],imgs[j]) for i,j in mst[1]} + # tmp_pairs = {(a,b):v for (a,b),v in tmp_pairs.items() if {(a,b),(b,a)} & min_spanning_tree} + + imgs, res_coarse, res_fine = sparse_scene_optimizer( + imgs, subsample, imsizes, pps, base_focals, core_depth, anchors, corres, corres2d, preds_21, canonical_paths, mst, + shared_intrinsics=shared_intrinsics, cache_path=cache_path, device=device, dtype=dtype, **kw) + + scene = SparseGA(imgs, pairs_in, res_fine or res_coarse, anchors, canonical_paths) + # scene.get_dense_pts3d() + # path = Path("saved_data") / "scene.pkl" + # with path.open("wb") as f: + # pickle.dump(scene, f) + + print("Current working directory:", os.getcwd()) + return scene + +def _convert_scene_output_to_glb(outfile, imgs, pts3d, mask, focals, cams2world, cam_size=0.05, + cam_color=None, as_pointcloud=False, + transparent_cams=False, silent=False): + assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) + pts3d = to_numpy(pts3d) + imgs = to_numpy(imgs) + focals = to_numpy(focals) + cams2world = to_numpy(cams2world) + + scene = trimesh.Scene() + + # full pointcloud + if as_pointcloud: + pts = np.concatenate([p[m.ravel()] for p, m in zip(pts3d, mask)]).reshape(-1, 3) + col = np.concatenate([p[m] for p, m in zip(imgs, mask)]).reshape(-1, 3) + valid_msk = np.isfinite(pts.sum(axis=1)) + pct = trimesh.PointCloud(pts[valid_msk], colors=col[valid_msk]) + scene.add_geometry(pct) + else: + meshes = [] + for i in range(len(imgs)): + pts3d_i = pts3d[i].reshape(imgs[i].shape) + msk_i = mask[i] & np.isfinite(pts3d_i.sum(axis=-1)) + meshes.append(pts3d_to_trimesh(imgs[i], pts3d_i, msk_i)) + mesh = trimesh.Trimesh(**cat_meshes(meshes)) + scene.add_geometry(mesh) + + # add each camera + for i, pose_c2w in enumerate(cams2world): + if isinstance(cam_color, list): + camera_edge_color = cam_color[i] + else: + camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)] + add_scene_cam(scene, pose_c2w, camera_edge_color, + None if transparent_cams else imgs[i], focals[i], + imsize=imgs[i].shape[1::-1], screen_width=cam_size) + + rot = np.eye(4) + rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() + scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot)) + if not silent: + print('(exporting 3D scene to', outfile, ')') + scene.export(file_obj=outfile) + return outfile + +def convert_scene_output_to_ply_impl(outfile, imgs, pts3d, mask, scale=1.0, apply_y_flip=False, silent=False): + """ + Export a scaled and colored 3D point cloud to PLY format using Open3D. + + Args: + outfile (str): Path to save the .ply file. + imgs (list of np.ndarray): RGB images (H, W, 3) per view. + pts3d (list of np.ndarray): 3D points per view, shape (H * W, 3). + mask (list of np.ndarray): Boolean masks indicating valid points per view (H, W). + scale (float): Scale factor to apply to the 3D points. + silent (bool): If False, print export message. + + Returns: + str: Output file path to the saved PLY. 
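    Note:
        apply_y_flip is currently overridden to True inside the function body,
        so the 180-degree rotation about the y-axis is always applied on export.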
+ """ + # Convert per-image valid 3D points and corresponding colors + all_pts = [] + all_colors = [] + + for p, m, img in zip(pts3d, mask, imgs): + pts = p[m.ravel()] * scale # shape (N_valid, 3) + colors = img[m] # shape (N_valid, 3) + valid = np.isfinite(pts).all(axis=1) # remove NaNs or Infs + + all_pts.append(pts[valid]) + all_colors.append(colors[valid]) # normalize RGB to [0, 1] + + # Concatenate across all views + all_pts = np.concatenate(all_pts, axis=0) + print(all_pts.shape) + all_colors = np.concatenate(all_colors, axis=0) + print(all_colors.shape) + print(all_colors[5]) + + apply_y_flip = True + if apply_y_flip: + rot = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() + all_pts = all_pts @ rot.T # apply rotation to all points + # Create Open3D point cloud + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(all_pts) + pcd.colors = o3d.utility.Vector3dVector(all_colors) + + # Save to .ply + o3d.io.write_point_cloud(outfile, pcd) + if not silent: + print(f"✅ Exported scaled point cloud to: {outfile}") + + return outfile + +def convert_scene_output_to_ply(outfile, data, scale=1.0, apply_y_flip=False, min_conf_thr=1.5, clean=True): + confs = data.confs_dense_clean if clean else data.confs_dense + msk = to_numpy([c > min_conf_thr for c in confs]) + imgs = to_numpy(data.imgs) + dense_pts3d = to_numpy(data.pts3d_dense) + return convert_scene_output_to_ply_impl(outfile, imgs, dense_pts3d, msk, scale=scale, apply_y_flip=apply_y_flip) + +def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=False, mask_sky=False, + clean_depth=False, transparent_cams=False, cam_size=0.05, TSDF_thresh=0): + """ + extract 3D_model (glb file) from a reconstructed scene + """ + if scene_state is None: + return None + outfile = scene_state.outfile_name + if outfile is None: + return None + + # get optimized values from scene + scene = scene_state.sparse_ga + rgbimg = scene.imgs + focals = scene.get_focals().cpu() + cams2world = scene.get_im_poses().cpu() + + # 3D pointcloud from depthmap, poses and intrinsics + if TSDF_thresh > 0: + tsdf = TSDFPostProcess(scene, TSDF_thresh=TSDF_thresh) + pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean_depth)) + else: + pts3d, _, confs = to_numpy(scene.get_dense_pts3d(clean_depth=clean_depth)) + msk = to_numpy([c > min_conf_thr for c in confs]) + return _convert_scene_output_to_glb(outfile, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud, + transparent_cams=transparent_cams, cam_size=cam_size, silent=silent) + +def sort_images_from_longest_endpoint(D_square, data_length): + D_square = D_square.copy() + # Find the two farthest points + i, j = np.unravel_index(np.argmax(D_square), D_square.shape) + start_idx = i # or j — either works + + # Greedy traversal using the precomputed distance matrix + N = data_length + visited = np.zeros(N, dtype=bool) + visited[start_idx] = True + path = [start_idx] + + current_idx = start_idx + for _ in range(N - 1): + dists = D_square[current_idx] + dists[visited] = np.inf # Ignore visited + next_idx = np.argmin(dists) + path.append(next_idx) + visited[next_idx] = True + current_idx = next_idx + return path + +def get_reconstructed_scene(outdir, gradio_delete_cache, model, retrieval_model, device, silent, image_size, + current_scene_state, filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, + matching_conf_thr, as_pointcloud, mask_sky, clean_depth, + scenegraph_type, winsize, win_cyclic, refid, TSDF_thresh, shared_intrinsics, **kw): + """ + from 
a list of images, run mast3r inference, sparse global aligner. + then run get_3D_model_from_scene + """ + imgs = load_images(filelist, size=image_size, verbose=not silent) + if len(imgs) == 1: + imgs = [imgs[0], copy.deepcopy(imgs[0])] + imgs[1]['idx'] = 1 + filelist = [filelist[0], filelist[0] + '_2'] + + scene_graph_params = [scenegraph_type] + if scenegraph_type in ["swin", "logwin"]: + scene_graph_params.append(str(winsize)) + elif scenegraph_type == "oneref": + scene_graph_params.append(str(refid)) + elif scenegraph_type == "retrieval": + scene_graph_params.append(str(winsize)) # Na + scene_graph_params.append(str(refid)) # k + + if scenegraph_type in ["swin", "logwin"] and not win_cyclic: + scene_graph_params.append('noncyclic') + scene_graph = '-'.join(scene_graph_params) + + sim_matrix = None + if 'retrieval' in scenegraph_type: + assert retrieval_model is not None + retriever = Retriever(retrieval_model, backbone=model, device=device) + with torch.no_grad(): + sim_matrix = retriever(filelist) + + # Cleanup + del retriever + torch.cuda.empty_cache() + + pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True, sim_mat=sim_matrix) + if optim_level == 'coarse': + niter2 = 0 + # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation) + if current_scene_state is not None and \ + not current_scene_state.should_delete and \ + current_scene_state.cache_dir is not None: + cache_dir = current_scene_state.cache_dir + elif gradio_delete_cache: + cache_dir = tempfile.mkdtemp(suffix='_cache', dir=outdir) + else: + cache_dir = os.path.join(outdir, 'cache') + os.makedirs(cache_dir, exist_ok=True) + scene = sparse_global_alignment(filelist, pairs, cache_dir, + model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device, + opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics, + matching_conf_thr=matching_conf_thr, **kw) + if current_scene_state is not None and \ + not current_scene_state.should_delete and \ + current_scene_state.outfile_name is not None: + outfile_name = current_scene_state.outfile_name + else: + outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir) + + # scene_state = SparseGAState(scene, gradio_delete_cache, cache_dir, outfile_name) + # outfile = get_3D_model_from_scene(silent, scene_state, min_conf_thr, as_pointcloud, mask_sky, + # clean_depth, transparent_cams, cam_size, TSDF_thresh) + outfile = convert_scene_output_to_ply(outfile_name, scene, scale=1.0, apply_y_flip=False, min_conf_thr=min_conf_thr, clean=clean_depth) + return scene, outfile + diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py new file mode 100644 index 000000000..949779efd --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py @@ -0,0 +1,317 @@ +from contextlib import nullcontext +from itertools import combinations +from pathlib import Path +import numpy as np +from mast3r.mast3r.model import AsymmetricMASt3R +from mast3r.mast3r.utils.misc import hash_md5 +from pyproj import Transformer +import numpy as np +from PIL import Image +from PIL.ExifTags import TAGS, GPSTAGS +import os +import torch + +import numpy as np +from itertools import combinations +import random + +import pickle +import pandas as pd +import tempfile + +import numpy as np +import open3d as o3d +from scipy.spatial.transform import Rotation + +from mast3r.mast3r.demo import get_args_parser, 
main_demo +import argparse +from mast3r_runner import get_reconstructed_scene, convert_scene_output_to_ply + + +def todevice(batch, device, callback=None, non_blocking=False): + ''' Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy). + + batch: list, tuple, dict of tensors or other things + device: pytorch device or 'numpy' + callback: function that would be called on every sub-elements. + ''' + if callback: + batch = callback(batch) + + if isinstance(batch, dict): + return {k: todevice(v, device) for k, v in batch.items()} + + if isinstance(batch, (tuple, list)): + return type(batch)(todevice(x, device) for x in batch) + + x = batch + if device == 'numpy': + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + elif x is not None: + if isinstance(x, np.ndarray): + x = torch.from_numpy(x) + if torch.is_tensor(x): + x = x.to(device, non_blocking=non_blocking) + return x + + +def to_numpy(x): return todevice(x, 'numpy') +def to_cpu(x): return todevice(x, 'cpu') +def to_cuda(x): return todevice(x, 'cuda') + +def dms_to_decimal(d, m, s, ref): + dd = d + m / 60 + s / 3600 + return -dd if ref in ['S', 'W'] else dd + +def get_gps_from_exif(image_path): + img = Image.open(image_path) + exif = img._getexif() + if not exif: + return None + gps_info = {} + for tag, value in exif.items(): + decoded = TAGS.get(tag) + # print(decoded) + if decoded == "GPSInfo": + for t in value: + sub_decoded = GPSTAGS.get(t) + # print(sub_decoded) + gps_info[sub_decoded] = value[t] + + return gps_info + +def parse_gps_info(image_path): + gps_info = get_gps_from_exif(image_path) + # Latitude + lat_ref = gps_info.get('GPSLatitudeRef', 'N') + lat_dms = gps_info.get('GPSLatitude') + lat = dms_to_decimal(lat_dms[0], lat_dms[1], lat_dms[2], lat_ref) + + # Longitude + lon_ref = gps_info.get('GPSLongitudeRef', 'E') + lon_dms = gps_info.get('GPSLongitude') + lon = dms_to_decimal(lon_dms[0], lon_dms[1], lon_dms[2], lon_ref) + + # Altitude + alt = gps_info.get('GPSAltitude', 0.0) + alt_ref = gps_info.get('GPSAltitudeRef', b'\x00') + if isinstance(alt_ref, bytes) and alt_ref == b'\x01': + alt = -alt # below sea level + + # Timestamp (optional) + timestamp = gps_info.get('GPSTimeStamp', (0.0, 0.0, 0.0)) + timestamp = f'{int(timestamp[0])}:{int(timestamp[1])}:{int(timestamp[2])} UTC' + datestamp = gps_info.get('GPSDateStamp', '0000:00:00') + + return { + 'latitude': float(lat), + 'longitude': float(lon), + 'altitude': float(alt), + 'timestamp': timestamp, + 'date': datestamp + } + +def gps_to_xyz(gps_lookup): + xyz_lookup = {} + transformer = Transformer.from_crs("EPSG:4979", "EPSG:32616", always_xy=True) # includes altitude + for image_name, (lat, lon, alt) in gps_lookup.items(): + xyz = transformer.transform(lon, lat, alt) + xyz_lookup[image_name] = xyz + return xyz_lookup + +def estimate_3d_scale_from_gps(camera_centers, gps_xyz, camera_image_names, min_dist_threshold=1.0): + """ + Estimate scale factor between MASt3r's camera centers and GPS 3D coordinates. 
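    The scale is computed as the median of d_gps / d_sfm over all camera pairs
    whose pairwise GPS distance exceeds min_dist_threshold (closer pairs are
    skipped as numerically unreliable).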
+ + Inputs: + camera_centers: (N, 3) array in MASt3r units (arbitrary scale) + gps_xyz: (N, 3) array in meters [x, y, z] from lat/lon/alt + Returns: + scale: estimated meters-per-unit scale factor + """ + assert camera_centers.shape == np.ones((len(gps_xyz), len(list(gps_xyz.items())[0][1]))).shape + sfm_dists = [] + gps_dists = [] + + for i, j in combinations(range(len(camera_centers)), 2): + file_name_i = camera_image_names[i] + file_name_j = camera_image_names[j] + d_sfm = np.linalg.norm(camera_centers[i] - camera_centers[j]) + # print(file_name_i, file_name_j) + # print(gps_xyz[file_name_i], gps_xyz[file_name_j]) + d_gps = np.linalg.norm(np.array(gps_xyz[file_name_i]) - np.array(gps_xyz[file_name_j])) + + if d_gps > min_dist_threshold: + sfm_dists.append(d_sfm) + gps_dists.append(d_gps) + + sfm_dists = np.array(sfm_dists) + gps_dists = np.array(gps_dists) + + if len(gps_dists) == 0: + raise ValueError("Not enough valid GPS pairs for scale estimation.") + + scale = np.median(gps_dists / sfm_dists) + return scale, sfm_dists, gps_dists + + +def estimate_scale_ransac(camera_centers, gps_xyz, camera_image_names, threshold=0.05, iterations=1000, min_dist=1.0): + scales = [] + pairs = [] + + # Build all valid distance pairs + for i, j in combinations(range(len(camera_centers)), 2): + file_name_i = camera_image_names[i] + file_name_j = camera_image_names[j] + d_sfm = np.linalg.norm(camera_centers[i] - camera_centers[j]) + d_gps = np.linalg.norm(np.array(gps_xyz[file_name_i]) - np.array(gps_xyz[file_name_j])) + if d_gps > min_dist: + scale = d_gps / d_sfm + scales.append(scale) + pairs.append((i, j)) + + scales = np.array(scales) + pairs = np.array(pairs) + + best_inliers = [] + best_scale = None + + for _ in range(iterations): + # Sample one index randomly + idx = random.randint(0, len(scales) - 1) + candidate_scale = scales[idx] + + # Compute residuals for all + residuals = np.abs(scales - candidate_scale) + + # Inliers: within the threshold + inliers = np.where(residuals < threshold)[0] + + if len(inliers) > len(best_inliers): + best_inliers = inliers + best_scale = np.median(scales[inliers]) + + return best_scale, len(best_inliers), len(scales) + +def extract_image_names(image_paths): + return [path.split('/')[-1] for path in image_paths] + + + +def collect_gps_data(data_folder): + records = [] + for fname in sorted(os.listdir(data_folder)): + if fname.lower().endswith(('.jpg', '.jpeg', '.png')): + full_path = os.path.join(data_folder, fname) + try: + gps_data = parse_gps_info(full_path) + if gps_data: + parsed_gps_data = gps_data + records.append({ + 'filename': fname, + 'latitude': parsed_gps_data['latitude'], + 'longitude': parsed_gps_data['longitude'], + 'altitude': parsed_gps_data['altitude'], + 'timestamp': parsed_gps_data['timestamp'], + 'date': parsed_gps_data['date'] + }) + except Exception as e: + print(f"Warning: {fname} - {e}") + return pd.DataFrame(records) + + +def run_mast3r(args): + model = AsymmetricMASt3R.from_pretrained(args.weights_path).to(args.device) + chkpt_tag = hash_md5(args.weights_path) + + def get_context(tmp_dir): + return tempfile.TemporaryDirectory(suffix='_mast3r_context') if tmp_dir is None \ + else nullcontext(tmp_dir) + with get_context(args.tmp_dir) as tmpdirname: + cache_path = os.path.join(tmpdirname, chkpt_tag) + os.makedirs(cache_path, exist_ok=True) + inputfiles = [os.path.join(args.folder_path, f) for f in os.listdir(args.folder_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))] + scene_state, outfile = get_reconstructed_scene(tmpdirname, 
False, model, + args.retrieval_model, args.device, args.silent, args.image_size, None, inputfiles, args.optim_level, args.lr1, args.niter1, args.lr2, args.niter2, args.min_conf_thr, args.matching_conf_thr, + True, False, args.clean_depth, + args.scenegraph_type, args.winsize, args.win_cyclic, args.refid, 0, args.shared_intrinsics) + return scene_state, outfile + +def add_parse_args(parser): + parser.add_argument('--folder_path', type=str, required=True) + parser.add_argument('--output_path', type=str, required=True) + parser.add_argument('--scale_method', type=str, required=True) + + parser.add_argument('--weights_path', type=str, required=True, help='Path to the core mast3rweights file') + parser.add_argument('--retrieval_model', type=str, required=True, help='Retrieval model weights path that is used to make image pairs') + parser.add_argument('--device', type=str, required=True, help='Device to run the model on') + parser.add_argument('--silent', type=bool, required=True, help='Whether to run the model silently') + parser.add_argument('--image_size', type=int, required=True, help='Image size') + parser.add_argument('--optim_level', type=int, required=True, help='Optimization level') + parser.add_argument('--lr1', type=float, required=True, help='Learning rate for the first refinement iteration stage') + parser.add_argument('--niter1', type=int, required=True, help='Number of iterations for the first refinement iteration stage') + parser.add_argument('--lr2', type=float, required=True, help='Learning rate for the second refinement iteration stage') + parser.add_argument('--niter2', type=int, required=True, help='Number of iterations for the second refinement iteration stage') + parser.add_argument('--min_conf_thr', type=float, required=True, help='Minimum confidence threshold') + parser.add_argument('--matching_conf_thr', type=float, required=True, help='Matching confidence threshold') + # parser.add_argument('--as_pointcloud', type=bool, required=True, help='Whether to output a pointcloud') + # parser.add_argument('--mask_sky', type=bool, required=True, help='Whether to mask the sky') + parser.add_argument('--clean_depth', type=bool, required=True, help='Whether to clean the depth') + parser.add_argument('--transparent_cams', type=bool, required=True, help='Whether to make the cameras transparent') + parser.add_argument('--cam_size', type=float, required=True, help='Camera size') + parser.add_argument('--scenegraph_type', type=str, required=True, help='Scenegraph type') + parser.add_argument('--winsize', type=int, required=True, help='Window size for sliding window pair making scenegraph_type') + parser.add_argument('--win_cyclic', type=bool, required=True, help='Whether to use a cyclic sliding window') + parser.add_argument('--refid', type=str, required=True, help='Reference image for retrieval') + # parser.add_argument('--TSDF_thresh', type=float, required=True, help='TSDF refinement threshold') + parser.add_argument('--shared_intrinsics', type=bool, required=True, help='Whether to use a shared intrinsics model') + + + return parser + +def scale_pointcloud_based_on_geotag(): + parser = argparse.ArgumentParser() + + # Add known args + parser.add_argument('--scene_path', type=str, help='This is the path to the scene file', required=False) + + # Parse known and unknown args + args, unknown = parser.parse_known_args() + + if not args.scene_path: + args_parser = get_args_parser() + args_parser = add_parse_args(args_parser) + args = args_parser.parse_args() + scene, outfile = 
run_mast3r(args) + scene.get_dense_pts3d() + data = scene + else: + args_parser = argparse.ArgumentParser() + args_parser = add_parse_args(args_parser) + args = args_parser.parse_args() + + with open(args.scene_path, 'rb') as f: + data = pickle.load(f) + + + cam2w = data.get_im_poses() + camera_centers = cam2w[:, :3, 3] # Extract translation component from [R|t] + df = collect_gps_data(args.folder_path) + image_gps_data = df.to_numpy() + gps_lookup = { + row[0]: [float(row[1]), float(row[2]), float(row[3])] + for row in image_gps_data + } + image_names = extract_image_names(data.img_paths) + xyz_lookup = gps_to_xyz(gps_lookup) + for method in ['ransac', 'median']: + if method == 'ransac': + scale, sfm_dists, gps_dists = estimate_scale_ransac(camera_centers, xyz_lookup, image_names) + else: + scale, sfm_dists, gps_dists = estimate_3d_scale_from_gps(camera_centers, xyz_lookup, image_names) + print(f"Estimated scale: {scale}") + convert_scene_output_to_ply(args.output_path, data, scale=scale, apply_y_flip=False) + +if __name__ == "__main__": + scale_pointcloud_based_on_geotag() \ No newline at end of file From a42bb4fdeed3f680acc3d24c9a68c42332771469 Mon Sep 17 00:00:00 2001 From: Henry-YiW Date: Thu, 15 May 2025 12:11:11 -0500 Subject: [PATCH 2/6] files cleaned --- .../geo_tagging}/geotag_from_files.py | 0 .../geo_tagging}/geotag_from_rosbag.py | 0 .../geo_tagging}/register_lidar_scans.py | 0 .../mast3r_3d_reconstruction/mast3r_runner.py | 16 ++-- .../scale_pointcloud_based_on_geotag.py | 80 ++++++++++--------- GEMstack/scripts/__init__.py | 0 6 files changed, 52 insertions(+), 44 deletions(-) rename GEMstack/{scripts => offboard/geo_tagging}/geotag_from_files.py (100%) rename GEMstack/{scripts => offboard/geo_tagging}/geotag_from_rosbag.py (100%) rename GEMstack/{scripts => offboard/geo_tagging}/register_lidar_scans.py (100%) delete mode 100644 GEMstack/scripts/__init__.py diff --git a/GEMstack/scripts/geotag_from_files.py b/GEMstack/offboard/geo_tagging/geotag_from_files.py similarity index 100% rename from GEMstack/scripts/geotag_from_files.py rename to GEMstack/offboard/geo_tagging/geotag_from_files.py diff --git a/GEMstack/scripts/geotag_from_rosbag.py b/GEMstack/offboard/geo_tagging/geotag_from_rosbag.py similarity index 100% rename from GEMstack/scripts/geotag_from_rosbag.py rename to GEMstack/offboard/geo_tagging/geotag_from_rosbag.py diff --git a/GEMstack/scripts/register_lidar_scans.py b/GEMstack/offboard/geo_tagging/register_lidar_scans.py similarity index 100% rename from GEMstack/scripts/register_lidar_scans.py rename to GEMstack/offboard/geo_tagging/register_lidar_scans.py diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py index 12cd9d7be..3cb511db6 100644 --- a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py +++ b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py @@ -23,12 +23,12 @@ from mast3r.retrieval.processor import Retriever import mast3r.utils.path_to_dust3r # noqa -from dust3r.dust3r.utils.geometry import inv, geotrf # noqa -from dust3r.dust3r.utils.image import load_images -from dust3r.dust3r.utils.device import to_numpy -from dust3r.dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes -from dust3r.dust3r.demo import get_args_parser as dust3r_get_args_parser -from dust3r.dust3r.cloud_opt.base_opt import clean_pointcloud +from dust3r.utils.geometry import inv, geotrf # noqa +from dust3r.utils.image import load_images +from 
dust3r.utils.device import to_numpy +from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes +from dust3r.demo import get_args_parser as dust3r_get_args_parser +from dust3r.cloud_opt.base_opt import clean_pointcloud import matplotlib.pyplot as pl import open3d as o3d @@ -314,9 +314,9 @@ def convert_scene_output_to_ply_impl(outfile, imgs, pts3d, mask, scale=1.0, appl pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(all_pts) pcd.colors = o3d.utility.Vector3dVector(all_colors) - + print('cwd', os.getcwd()) # Save to .ply - o3d.io.write_point_cloud(outfile, pcd) + o3d.io.write_point_cloud(os.path.join(os.getcwd(), outfile), pcd) if not silent: print(f"✅ Exported scaled point cloud to: {outfile}") diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py index 949779efd..542d75f93 100644 --- a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py +++ b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py @@ -2,8 +2,8 @@ from itertools import combinations from pathlib import Path import numpy as np -from mast3r.mast3r.model import AsymmetricMASt3R -from mast3r.mast3r.utils.misc import hash_md5 +from mast3r.model import AsymmetricMASt3R +from mast3r.utils.misc import hash_md5 from pyproj import Transformer import numpy as np from PIL import Image @@ -23,7 +23,7 @@ import open3d as o3d from scipy.spatial.transform import Rotation -from mast3r.mast3r.demo import get_args_parser, main_demo +from mast3r.demo import get_args_parser, main_demo import argparse from mast3r_runner import get_reconstructed_scene, convert_scene_output_to_ply @@ -238,34 +238,35 @@ def get_context(tmp_dir): args.scenegraph_type, args.winsize, args.win_cyclic, args.refid, 0, args.shared_intrinsics) return scene_state, outfile -def add_parse_args(parser): - parser.add_argument('--folder_path', type=str, required=True) - parser.add_argument('--output_path', type=str, required=True) - parser.add_argument('--scale_method', type=str, required=True) - - parser.add_argument('--weights_path', type=str, required=True, help='Path to the core mast3rweights file') - parser.add_argument('--retrieval_model', type=str, required=True, help='Retrieval model weights path that is used to make image pairs') - parser.add_argument('--device', type=str, required=True, help='Device to run the model on') - parser.add_argument('--silent', type=bool, required=True, help='Whether to run the model silently') - parser.add_argument('--image_size', type=int, required=True, help='Image size') - parser.add_argument('--optim_level', type=int, required=True, help='Optimization level') - parser.add_argument('--lr1', type=float, required=True, help='Learning rate for the first refinement iteration stage') - parser.add_argument('--niter1', type=int, required=True, help='Number of iterations for the first refinement iteration stage') - parser.add_argument('--lr2', type=float, required=True, help='Learning rate for the second refinement iteration stage') - parser.add_argument('--niter2', type=int, required=True, help='Number of iterations for the second refinement iteration stage') - parser.add_argument('--min_conf_thr', type=float, required=True, help='Minimum confidence threshold') - parser.add_argument('--matching_conf_thr', type=float, required=True, help='Matching confidence threshold') - # parser.add_argument('--as_pointcloud', type=bool, 
required=True, help='Whether to output a pointcloud') - # parser.add_argument('--mask_sky', type=bool, required=True, help='Whether to mask the sky') - parser.add_argument('--clean_depth', type=bool, required=True, help='Whether to clean the depth') - parser.add_argument('--transparent_cams', type=bool, required=True, help='Whether to make the cameras transparent') - parser.add_argument('--cam_size', type=float, required=True, help='Camera size') - parser.add_argument('--scenegraph_type', type=str, required=True, help='Scenegraph type') - parser.add_argument('--winsize', type=int, required=True, help='Window size for sliding window pair making scenegraph_type') - parser.add_argument('--win_cyclic', type=bool, required=True, help='Whether to use a cyclic sliding window') - parser.add_argument('--refid', type=str, required=True, help='Reference image for retrieval') - # parser.add_argument('--TSDF_thresh', type=float, required=True, help='TSDF refinement threshold') - parser.add_argument('--shared_intrinsics', type=bool, required=True, help='Whether to use a shared intrinsics model') +def add_parse_args(parser, is_scene_path=False): + parser.add_argument('--folder_path', type=str, required=True, help='Path to the folder containing the images') + parser.add_argument('--output_path', type=str, required=True, help='Path to the output file') + parser.add_argument('--scale_method', type=str, required=True, help='Method to use for scale estimation') + parser.add_argument('--scene_path', type=str, required=False, help='Path to the scene file') + if not is_scene_path: + parser.add_argument('--weights_path', type=str, required=True, help='Path to the core mast3rweights file') + # parser.add_argument('--retrieval_model', type=str, required=True, help='Retrieval model weights path that is used to make image pairs') + # parser.add_argument('--device', type=str, required=True, help='Device to run the model on') + # parser.add_argument('--silent', type=bool, required=True, help='Whether to run the model silently') + # parser.add_argument('--image_size', type=int, required=True, help='Image size') + parser.add_argument('--optim_level', type=int, required=True, help='Optimization level') + parser.add_argument('--lr1', type=float, required=True, help='Learning rate for the first refinement iteration stage') + parser.add_argument('--niter1', type=int, required=True, help='Number of iterations for the first refinement iteration stage') + parser.add_argument('--lr2', type=float, required=True, help='Learning rate for the second refinement iteration stage') + parser.add_argument('--niter2', type=int, required=True, help='Number of iterations for the second refinement iteration stage') + parser.add_argument('--min_conf_thr', type=float, required=True, help='Minimum confidence threshold') + parser.add_argument('--matching_conf_thr', type=float, required=True, help='Matching confidence threshold') + # parser.add_argument('--as_pointcloud', type=bool, required=True, help='Whether to output a pointcloud') + # parser.add_argument('--mask_sky', type=bool, required=True, help='Whether to mask the sky') + parser.add_argument('--clean_depth', type=bool, required=True, help='Whether to clean the depth') + parser.add_argument('--transparent_cams', type=bool, required=True, help='Whether to make the cameras transparent') + parser.add_argument('--cam_size', type=float, required=True, help='Camera size') + parser.add_argument('--scenegraph_type', type=str, required=True, help='Scenegraph type') + 
parser.add_argument('--winsize', type=int, required=True, help='Window size for sliding window pair making scenegraph_type') + parser.add_argument('--win_cyclic', type=bool, required=True, help='Whether to use a cyclic sliding window') + parser.add_argument('--refid', type=str, required=True, help='Reference image for retrieval') + # parser.add_argument('--TSDF_thresh', type=float, required=True, help='TSDF refinement threshold') + parser.add_argument('--shared_intrinsics', type=bool, required=True, help='Whether to use a shared intrinsics model') return parser @@ -281,16 +282,16 @@ def scale_pointcloud_based_on_geotag(): if not args.scene_path: args_parser = get_args_parser() - args_parser = add_parse_args(args_parser) + args_parser = add_parse_args(args_parser, is_scene_path=False) args = args_parser.parse_args() scene, outfile = run_mast3r(args) scene.get_dense_pts3d() data = scene else: args_parser = argparse.ArgumentParser() - args_parser = add_parse_args(args_parser) + args_parser = add_parse_args(args_parser, is_scene_path=True) args = args_parser.parse_args() - + print('cwd', os.getcwd()) with open(args.scene_path, 'rb') as f: data = pickle.load(f) @@ -305,11 +306,18 @@ def scale_pointcloud_based_on_geotag(): } image_names = extract_image_names(data.img_paths) xyz_lookup = gps_to_xyz(gps_lookup) + scale = 1.0 for method in ['ransac', 'median']: if method == 'ransac': - scale, sfm_dists, gps_dists = estimate_scale_ransac(camera_centers, xyz_lookup, image_names) + # print('camera_centers', camera_centers, ) + # print('xyz_lookup', xyz_lookup) + # print('image_names', image_names) + scale, sfm_dists, gps_dists = estimate_scale_ransac(camera_centers.cpu().numpy(), xyz_lookup, image_names) + print('scale', scale) + print('sfm_dists', sfm_dists) + print('gps_dists', gps_dists) else: - scale, sfm_dists, gps_dists = estimate_3d_scale_from_gps(camera_centers, xyz_lookup, image_names) + scale, sfm_dists, gps_dists = estimate_3d_scale_from_gps(camera_centers.cpu().numpy(), xyz_lookup, image_names) print(f"Estimated scale: {scale}") convert_scene_output_to_ply(args.output_path, data, scale=scale, apply_y_flip=False) diff --git a/GEMstack/scripts/__init__.py b/GEMstack/scripts/__init__.py deleted file mode 100644 index e69de29bb..000000000 From 3c439787a7eacff83c9067951c1afa375c30057d Mon Sep 17 00:00:00 2001 From: Henry-YiW Date: Thu, 15 May 2025 22:37:17 -0500 Subject: [PATCH 3/6] using the extrinsic and intrinsic matrices from the global settings --- .../lidar_colorization/colorization.py | 98 ++++++++++++++----- 1 file changed, 71 insertions(+), 27 deletions(-) diff --git a/GEMstack/offboard/lidar_colorization/colorization.py b/GEMstack/offboard/lidar_colorization/colorization.py index f7033d804..019ab65e5 100644 --- a/GEMstack/offboard/lidar_colorization/colorization.py +++ b/GEMstack/offboard/lidar_colorization/colorization.py @@ -6,11 +6,13 @@ import cv2 import open3d as o3d +from ...utils import settings + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--folder_path", type=str, required=True) - parser.add_argument("--output_path", type=str, required=True) - parser.add_argument("--camera_types", type=str, required=True) + parser.add_argument("--folder_path", type=str, required=True, help="Path to the folder containing the point clouds and images") + parser.add_argument("--output_path", type=str, required=True, help="Path to save the output point cloud") + parser.add_argument("--camera_types", type=str, required=True, help="Comma-separated list of supported 
camera types (fr, rr, fl, rl) to colorize") return parser.parse_args() @@ -24,14 +26,10 @@ def save_ply_with_open3d(points, filename): def get_camera_extrinsic_matrix(camera_type): if camera_type == "front_right" or camera_type == "fr": - # From your file - rotation = np.array([ - [-0.7168464770690616, -0.10046018208578958, 0.6899557088168523], - [-0.6970911725372957, 0.12308618950445319, -0.7063382243117325], - [-0.01396515249660048, -0.9872981017750231, -0.15826380744561577] - ]) + # From the settings + rotation = np.array(settings.get("calibration.front_right_camera.extrinsics.rotation")) - translation = np.array([1.8861563355156226, -0.7733611068168774, 1.6793040225335112]) + translation = np.array(settings.get("calibration.front_right_camera.extrinsics.position")) # Build 4x4 homogeneous transformation matrix fr_to_vehicle = np.eye(4) @@ -39,31 +37,44 @@ def get_camera_extrinsic_matrix(camera_type): fr_to_vehicle[:3, 3] = translation return fr_to_vehicle elif camera_type == "rear_right" or camera_type == "rr": - # From your file - rotation = np.array([ - [-0.7359657309159472, 0.15986191414426415, -0.6578743127098735], - [0.6768157805459531, 0.14993386619459964, -0.7207220233709469], - [-0.016578363047300385, -0.9756864271752846, -0.21854325362408236] - ]) + # From the settings + rotation = np.array(settings.get("calibration.rear_right_camera.extrinsics.rotation")) - translation = np.array([0.11419591502518789, -0.6896311735924415, 1.711181163333824]) + translation = np.array(settings.get("calibration.rear_right_camera.extrinsics.position")) # Build 4x4 homogeneous transformation matrix rr_to_vehicle = np.eye(4) rr_to_vehicle[:3, :3] = rotation rr_to_vehicle[:3, 3] = translation return rr_to_vehicle + elif camera_type == "front_left" or camera_type == "fl": + rotation = np.array(settings.get("calibration.front_left_camera.extrinsics.rotation")) + translation = np.array(settings.get("calibration.front_left_camera.extrinsics.position")) + + # Build 4x4 homogeneous transformation matrix + fl_to_vehicle = np.eye(4) + fl_to_vehicle[:3, :3] = rotation + fl_to_vehicle[:3, 3] = translation + return fl_to_vehicle + elif camera_type == "rear_left" or camera_type == "rl": + rotation = np.array(settings.get("calibration.rear_left_camera.extrinsics.rotation")) + translation = np.array(settings.get("calibration.rear_left_camera.extrinsics.position")) + + # Build 4x4 homogeneous transformation matrix + rl_to_vehicle = np.eye(4) + rl_to_vehicle[:3, :3] = rotation + rl_to_vehicle[:3, 3] = translation + return rl_to_vehicle else: raise ValueError(f"Camera type {camera_type} not supported") def get_camera_intrinsic_matrix(camera_type): if camera_type == "front_right" or camera_type == "fr": # Provided intrinsics - focal = [1176.2554468073797, 1175.1456876174707] # fx, fy - center = [966.4326452411585, 608.5803255934914] # cx, cy - fr_cam_distort = [-0.2701363254469883, 0.16439325523243875, -0.001607207824773341, -7.412467081891699e-05, - -0.06199397580030171] - skew = 0 # assume no skew + focal = settings.get("calibration.front_right_camera.intrinsics.focal") # fx, fy + center = settings.get("calibration.front_right_camera.intrinsics.center") # cx, cy + fr_cam_distort = settings.get("calibration.front_right_camera.intrinsics.distort") + skew = settings.get("calibration.front_right_camera.intrinsics.skew") fx, fy = focal cx, cy = center @@ -77,11 +88,10 @@ def get_camera_intrinsic_matrix(camera_type): return fr_cam_K, np.array(fr_cam_distort) elif camera_type == "rear_right" or camera_type == "rr": # 
Provided intrinsics
-        focal = [1162.3787554048329, 1162.855381183851]  # fx, fy
-        center = [956.2663906909728, 569.2039945552984]  # cx, cy
-        rr_cam_distort = [-0.25040910859151444, 0.1109210921906881, -0.00041247665414900384, 0.0008205455176671751,
-                          -0.026395952816984845]
-        skew = 0  # assume no skew
+        focal = settings.get("calibration.rear_right_camera.intrinsics.focal")  # fx, fy
+        center = settings.get("calibration.rear_right_camera.intrinsics.center")  # cx, cy
+        rr_cam_distort = settings.get("calibration.rear_right_camera.intrinsics.distort")
+        skew = settings.get("calibration.rear_right_camera.intrinsics.skew")

         fx, fy = focal
         cx, cy = center
@@ -93,6 +103,40 @@ def get_camera_intrinsic_matrix(camera_type):
             [0, 0, 1]
         ])
         return rr_cam_K, np.array(rr_cam_distort)
+    elif camera_type == "front_left" or camera_type == "fl":
+        # Provided intrinsics
+        focal = settings.get("calibration.front_left_camera.intrinsics.focal")  # fx, fy
+        center = settings.get("calibration.front_left_camera.intrinsics.center")  # cx, cy
+        fl_cam_distort = settings.get("calibration.front_left_camera.intrinsics.distort")
+        skew = settings.get("calibration.front_left_camera.intrinsics.skew")
+
+        fx, fy = focal
+        cx, cy = center
+
+        # Build K matrix
+        fl_cam_K = np.array([
+            [fx, skew, cx],
+            [0, fy, cy],
+            [0, 0, 1]
+        ])
+        return fl_cam_K, np.array(fl_cam_distort)
+    elif camera_type == "rear_left" or camera_type == "rl":
+        # Provided intrinsics
+        focal = settings.get("calibration.rear_left_camera.intrinsics.focal")  # fx, fy
+        center = settings.get("calibration.rear_left_camera.intrinsics.center")  # cx, cy
+        rl_cam_distort = settings.get("calibration.rear_left_camera.intrinsics.distort")
+        skew = settings.get("calibration.rear_left_camera.intrinsics.skew")
+
+        fx, fy = focal
+        cx, cy = center
+
+        # Build K matrix
+        rl_cam_K = np.array([
+            [fx, skew, cx],
+            [0, fy, cy],
+            [0, 0, 1]
+        ])
+        return rl_cam_K, np.array(rl_cam_distort)
     else:
         raise ValueError(f"Camera type {camera_type} not supported")

From 585a3a62ad082aa51e6bc569c94cb5043019aa89 Mon Sep 17 00:00:00 2001
From: Henry-YiW
Date: Thu, 15 May 2025 22:40:49 -0500
Subject: [PATCH 4/6] use the extrinsic and intrinsic matrices from the global
 settings and remove the hard-coded calibration data from the code

---
 .../lidar_colorization/colorization.py | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/GEMstack/offboard/lidar_colorization/colorization.py b/GEMstack/offboard/lidar_colorization/colorization.py
index 019ab65e5..1528d633f 100644
--- a/GEMstack/offboard/lidar_colorization/colorization.py
+++ b/GEMstack/offboard/lidar_colorization/colorization.py
@@ -17,6 +17,9 @@ def parse_args():


 def save_ply_with_open3d(points, filename):
+    '''
+    Save the point cloud to a PLY file using Open3D.
+    '''
     pc = o3d.geometry.PointCloud()
     pc.points = o3d.utility.Vector3dVector(points[:, :3])
     if points.shape[1] == 6:
@@ -25,6 +28,9 @@ def save_ply_with_open3d(points, filename):
     o3d.io.write_point_cloud(filename, pc)

 def get_camera_extrinsic_matrix(camera_type):
+    '''
+    Get the camera extrinsic matrix for the given camera type.
+    '''
     if camera_type == "front_right" or camera_type == "fr":
         # From the settings
         rotation = np.array(settings.get("calibration.front_right_camera.extrinsics.rotation"))
@@ -69,6 +75,9 @@ def get_camera_extrinsic_matrix(camera_type):
         raise ValueError(f"Camera type {camera_type} not supported")

 def get_camera_intrinsic_matrix(camera_type):
+    '''
+    Get the camera intrinsic matrix for the given camera type.
+    '''
     if camera_type == "front_right" or camera_type == "fr":
         # Provided intrinsics
         focal = settings.get("calibration.front_right_camera.intrinsics.focal")  # fx, fy
@@ -141,15 +150,14 @@ def get_camera_intrinsic_matrix(camera_type):
         raise ValueError(f"Camera type {camera_type} not supported")

 def get_lidar_extrinsic_matrix(lidar_type):
+    '''
+    Get the lidar extrinsic matrix for the given lidar type.
+    '''
     if lidar_type == "ouster":
-        # From your file
-        rotation = np.array([
-            [1,0,0],
-            [0,1,0],
-            [0,0,1]
-        ])
+        # From the settings
+        rotation = np.array(settings.get("calibration.top_lidar.rotation"))

-        translation = np.array([1.10,0,2.03])
+        translation = np.array(settings.get("calibration.top_lidar.position"))

         # Build 4x4 homogeneous transformation matrix
         ouster_to_vehicle = np.eye(4)

From c72b7ecc11b896c3f553db5a4da635468f945a49 Mon Sep 17 00:00:00 2001
From: Henry-YiW
Date: Fri, 16 May 2025 01:56:20 -0500
Subject: [PATCH 5/6] hard-coded values cleaned up, with new README files added

---
 .../offboard/lidar_colorization/README.MD     |  87 ++++++++
 .../mast3r_3d_reconstruction/README.MD        |  18 --
 .../mast3r_3d_reconstruction/README.md        | 138 ++++++++++++
 .../mast3r_3d_reconstruction/mast3r.yml       | 208 ++++++++++++++++++
 .../mast3r_3d_reconstruction/mast3r_runner.py |  47 ++--
 .../scale_pointcloud_based_on_geotag.py       | 108 +++++----
 6 files changed, 517 insertions(+), 89 deletions(-)
 create mode 100644 GEMstack/offboard/lidar_colorization/README.MD
 delete mode 100644 GEMstack/offboard/mast3r_3d_reconstruction/README.MD
 create mode 100644 GEMstack/offboard/mast3r_3d_reconstruction/README.md
 create mode 100644 GEMstack/offboard/mast3r_3d_reconstruction/mast3r.yml

diff --git a/GEMstack/offboard/lidar_colorization/README.MD b/GEMstack/offboard/lidar_colorization/README.MD
new file mode 100644
index 000000000..20e7f662b
--- /dev/null
+++ b/GEMstack/offboard/lidar_colorization/README.MD
@@ -0,0 +1,87 @@
+# Point Cloud Colorizer from Multi-View Images
+
+This script projects LiDAR point clouds into multi-view camera images and colorizes the 3D points using RGB values interpolated from the corresponding camera views. It supports multiple camera types and exports both raw and colorized point clouds in PLY format.
+
+## Features
+
+- Projects LiDAR point clouds into camera frames using calibrated extrinsic and intrinsic parameters
+- Handles undistortion of input images using OpenCV
+- Samples RGB values at subpixel precision from multi-view images
+- Merges color information into a unified point cloud and saves it as `.ply`
+- Supports multiple camera types: front-right (`fr`), rear-right (`rr`), front-left (`fl`), rear-left (`rl`)
+
+## Installation
+
+Make sure you have the following dependencies installed:
+
+```bash
+pip install numpy opencv-python open3d
+```
+
+### Usage
+
+```bash
+python path/to/colorization.py \
+    --folder_path /path/to/your/pointcloud_and_image_data \
+    --output_path /path/to/save/output \
+    --camera_types fr,rr
+```
+
+#### 1. Arguments
+
+- `--folder_path`: Directory containing `.b` files (NumPy arrays) with point cloud data and the corresponding images (`{camera_type}_{index}.png`)
+- `--output_path`: Directory where the resulting `.ply` point clouds will be saved
+- `--camera_types`: Comma-separated list of camera types to project and colorize from. Supported values: `fr`, `rr`, `fl`, `rl`
+
+#### 2. Data Format Requirements
+- Point cloud files: `.b` files that store N x 3 or N x 6 NumPy arrays of 3D points (with optional RGB). Point clouds saved as NumPy arrays can simply be renamed to `.b` and used with this script; a minimal example is sketched below.
+
+- Images: RGB `.png` images named like `fr_001.png`, `rr_001.png`, etc., located in the same directory as the `.b` files
+
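+For reference, a minimal sketch of producing a compatible `.b` file (the array
+and filename below are placeholders, not part of this repository):
+
+```python
+import numpy as np
+
+points = np.zeros((100, 3), dtype=np.float32)  # your N x 3 (or N x 6) array
+
+# np.save() appends ".npy" to bare string paths, so write through a file
+# object to keep the ".b" extension that colorization.py expects.
+with open("pointcloud_001.b", "wb") as f:
+    np.save(f, points)
+```
+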
+#### 3. Calibration
+
+Calibration parameters must be accessible via the settings module:
+
+- Extrinsic:
+
+  ```python
+  settings.get("calibration.<camera_name>.extrinsics.rotation")
+  settings.get("calibration.<camera_name>.extrinsics.position")
+  ```
+  ```python
+  settings.get("calibration.top_lidar.rotation")
+  settings.get("calibration.top_lidar.position")
+  ```
+
+- Intrinsic:
+
+  ```python
+  settings.get("calibration.<camera_name>.intrinsics.focal")
+  settings.get("calibration.<camera_name>.intrinsics.center")
+  settings.get("calibration.<camera_name>.intrinsics.distort")
+  settings.get("calibration.<camera_name>.intrinsics.skew")
+  ```
+
+#### 4. Outputs
+
+For each `.b` file:
+
+- `pure_pointcloud_<filename>.ply`: raw point cloud without color
+- `colored_pointcloud_<filename>.ply`: point cloud with interpolated RGB from the visible camera views
+
+### Example
+
+```bash
+python colorization.py \
+    --folder_path data/session1 \
+    --output_path data/colored_output \
+    --camera_types fr,rr
+```
+
+## Notes
+
+- Only the 3D points that are visible in at least one selected camera are assigned color.
+- If no image exists for a particular view or the projection fails, that camera is skipped.
+- Requires accurate and synchronized calibration between the LiDAR and camera systems.
+
+Author: Henry Yi
+License: MIT
diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/README.MD b/GEMstack/offboard/mast3r_3d_reconstruction/README.MD
deleted file mode 100644
index 9977c4a2a..000000000
--- a/GEMstack/offboard/mast3r_3d_reconstruction/README.MD
+++ /dev/null
@@ -1,18 +0,0 @@
-# Scaled 3D Scene Reconstruction from Images
-
-This repository provides code for generating a **scaled 3D reconstructed scene** from a set of input images, using [MASt3R](https://github.com/naver/mast3r.git) as the core image matching and reconstruction backend.
-
-## 🧩 Dependency
-
-This project relies on the official [MASt3R repository](https://github.com/naver/mast3r.git) by NAVER Labs Europe.
-Please follow their installation instructions to set up the environment and dependencies.
-
-## 🛠️ Installation
-
-1. Clone this repository and the MASt3R submodule:
-
-```bash
-git clone https://github.com/your-username/your-project.git
-cd your-project
-git clone --recursive https://github.com/naver/mast3r.git
-```
\ No newline at end of file
diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/README.md b/GEMstack/offboard/mast3r_3d_reconstruction/README.md
new file mode 100644
index 000000000..12237e9ae
--- /dev/null
+++ b/GEMstack/offboard/mast3r_3d_reconstruction/README.md
@@ -0,0 +1,138 @@
+# MASt3R-Based 3D Scene Reconstruction with GPS-Based Scaling
+
+This script performs 3D scene reconstruction from multi-view images using [MASt3R](https://github.com/naver/mast3r), then scales the reconstructed scene using GPS data embedded in the image EXIF metadata.
+
+The scaled point cloud is saved in `.ply` format, optionally with TSDF filtering and depth cleaning.
+
+---
+
+## Features
+
+- Runs the MASt3R pipeline to reconstruct a 3D point cloud from a set of input images
+- Extracts GPS metadata (latitude, longitude, altitude) from image EXIF data
+- Converts GPS coordinates into 3D Cartesian space using `pyproj`
+- Estimates real-world scale using:
+  - **RANSAC-based robust ratio estimation**
+  - **Median ratio estimation**
+- Exports a `.ply` point cloud with real-world scale
+
+---
+
+## Installation
+
+Install the required packages (this assumes `mast3r` is cloned and the checkpoint files are downloaded).
+
+The simplest way to install is to use the exported `mast3r.yml` file, which creates a conda environment named `mast3r`.
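+For example (standard conda commands; run them from the directory that contains the yml):
+
+```bash
+conda env create -f mast3r.yml
+conda activate mast3r
+```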
+
+Then use the following command to install `faiss-gpu`:
+
+```bash
+conda install -c pytorch faiss-gpu
+```
+
+Make sure you have:
+- `open3d`
+- `numpy`, `opencv-python`, `torch`, `Pillow`, `pyproj`, `pandas`, `scipy`
+
+And that `mast3r` is correctly installed and importable.
+
+---
+
+## 🚀 Usage
+
+### Basic usage (with reconstruction)
+
+```bash
+python scale_pointcloud_based_on_geotag.py \
+    --weights path/to/mast3r/core/checkpoint \
+    --retrieval_model path/to/mast3r/retrieval/checkpoint \
+    --folder_path path/to/images \
+    --output_path output/output_scene.ply \
+    --scale_method ransac \
+    --scenegraph_type retrieval \
+    --winsize 20 \
+    --refid 10
+```
+
+### Usage (with a precomputed scene)
+
+Use this to directly obtain a scaled point cloud from the geolocation info tagged in the images:
+
+```bash
+python scale_pointcloud_based_on_geotag.py \
+    --scene_path path/to/scene.pkl \
+    --folder_path path/to/images \
+    --output_path output/output_scene.ply \
+    --scale_method median
+```
+
+---
+
+## 🧠 Arguments
+
+### Required (basic run)
+
+| Argument         | Description                                             |
+|------------------|---------------------------------------------------------|
+| `--folder_path`  | Path to the folder containing input images (JPG, PNG)   |
+| `--output_path`  | Output file path for the scaled `.ply` point cloud      |
+| `--scale_method` | Method to estimate scale (`ransac` or `median`)         |
+
+### Optional
+
+| Argument         | Description                                                  |
+|------------------|--------------------------------------------------------------|
+| `--scene_path`   | Path to a precomputed MASt3R scene `.pkl` file               |
+| `--crs_from`     | Input coordinate system (default: `EPSG:4979`, lat/lon/alt)  |
+| `--crs_to`       | Target coordinate system (default: `EPSG:32616`, UTM)        |
+
+### MASt3R-specific args (if no `--scene_path` is given)
+
+You can also configure MASt3R parameters such as:
+
+| Argument              | Description                                                           |
+|-----------------------|-----------------------------------------------------------------------|
+| `--optim_level`       | Optimization level (default: `refine+depth`)                          |
+| `--lr1`, `--niter1`   | Learning rate and iteration count for stage 1 (defaults: 0.07, 300)   |
+| `--lr2`, `--niter2`   | Learning rate and iteration count for stage 2 (defaults: 0.01, 300)   |
+| `--min_conf_thr`      | Minimum confidence threshold (default: 1.5)                           |
+| `--matching_conf_thr` | Matching confidence threshold (optional)                              |
+| `--scenegraph_type`   | Graph type: `retrieval`, `swin`, `complete`, etc.                     |
+| `--winsize`           | Window size for `swin` or `logwin` (optional)                         |
+| `--clean_depth`       | Clean depth maps (default: `True`)                                    |
+| `--TSDF_thresh`       | TSDF threshold for filtering (optional)                               |
+| `--refid`             | Reference image (optional). With the `retrieval` scenegraph type, this indicates the number of neighboring images |
+
+---
+
+## Scale Estimation
+
+The script computes the relative scale between the reconstructed camera centers and the GPS-based 3D coordinates. Two methods are supported:
+
+- `ransac`: Robust estimate using inlier filtering
+- `median`: Simple median ratio of pairwise distances (see the sketch after the notes below)
+
+---
+
+## Output
+
+- A `.ply` point cloud at the specified `output_path`, scaled using the estimated GPS-to-reconstruction ratio
+- The point cloud preserves the geometry from MASt3R and optionally applies depth/TSDF filtering
+
+---
+
+## Notes
+
+- Input images **must contain valid EXIF GPS tags**.
+- If no `--scene_path` is given, the full MASt3R pipeline will run (can be slow).
+- You can reuse a `scene.pkl` from MASt3R by supplying `--scene_path`.
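+
+For reference, the `median` method described above boils down to the following
+sketch (the function and variable names are illustrative, not this script's API;
+it assumes the camera centers and GPS points are aligned arrays):
+
+```python
+from itertools import combinations
+
+import numpy as np
+
+def median_scale(sfm_centers, gps_xyz, min_dist=1.0):
+    """Median ratio of GPS to SfM pairwise camera distances (illustrative)."""
+    ratios = []
+    for i, j in combinations(range(len(sfm_centers)), 2):
+        d_sfm = np.linalg.norm(sfm_centers[i] - sfm_centers[j])
+        d_gps = np.linalg.norm(gps_xyz[i] - gps_xyz[j])
+        if d_gps > min_dist and d_sfm > 1e-9:  # skip near-degenerate pairs
+            ratios.append(d_gps / d_sfm)
+    return float(np.median(ratios))
+```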
+ +--- + +## 📚 Reference + +- [MASt3R GitHub](https://github.com/naver/mast3r) +- [pyproj](https://pyproj4.github.io/pyproj/stable/) + +--- + +**Author**: Henry Yi +**License**: MIT diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r.yml b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r.yml new file mode 100644 index 000000000..7a22b4bf0 --- /dev/null +++ b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r.yml @@ -0,0 +1,208 @@ +name: mast3r +channels: + - pytorch + - nvidia + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - blas=1.0=mkl + - brotli-python=1.0.9=py311h6a678d5_9 + - bzip2=1.0.8=h5eee18b_6 + - c-ares=1.19.1=h5eee18b_0 + - ca-certificates=2025.2.25=h06a4308_0 + - certifi=2025.1.31=py311h06a4308_0 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - cmake=3.14.0=h52cb24c_0 + - cuda-cudart=12.4.127=0 + - cuda-cupti=12.4.127=0 + - cuda-libraries=12.4.1=0 + - cuda-nvrtc=12.4.127=0 + - cuda-nvtx=12.4.127=0 + - cuda-opencl=12.8.90=0 + - cuda-runtime=12.4.1=0 + - cuda-version=12.8=3 + - expat=2.7.1=h6a678d5_0 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.17.0=py311h06a4308_0 + - freetype=2.13.3=h4a9f257_0 + - giflib=5.2.2=h5eee18b_0 + - gmp=6.3.0=h6a678d5_0 + - gmpy2=2.2.1=py311h5eee18b_0 + - gnutls=3.6.15=he1e5248_0 + - idna=3.7=py311h06a4308_0 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - jinja2=3.1.6=py311h06a4308_0 + - jpeg=9e=h5eee18b_3 + - krb5=1.20.1=h143b758_1 + - lame=3.100=h7b6447c_0 + - lcms2=2.16=h92b89f2_1 + - ld_impl_linux-64=2.40=h12ee557_0 + - lerc=4.0.0=h6a678d5_0 + - libcublas=12.4.5.8=0 + - libcufft=11.2.1.3=0 + - libcufile=1.13.1.3=0 + - libcurand=10.3.9.90=0 + - libcurl=8.12.1=hc9e6f67_0 + - libcusolver=11.6.1.9=0 + - libcusparse=12.3.1.170=0 + - libdeflate=1.22=h5eee18b_0 + - libedit=3.1.20230828=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h5eee18b_3 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libnghttp2=1.57.0=h2d74bed_0 + - libnpp=12.2.5.30=0 + - libnvfatbin=12.8.90=0 + - libnvjitlink=12.4.127=0 + - libnvjpeg=12.3.1.117=0 + - libpng=1.6.39=h5eee18b_0 + - libssh2=1.11.1=h251f7ec_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.7.0=hde9077f_0 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=1.41.5=h5eee18b_0 + - libwebp=1.3.2=h9f374a3_1 + - libwebp-base=1.3.2=h5eee18b_1 + - llvm-openmp=14.0.6=h9e868ea_0 + - lz4-c=1.9.4=h6a678d5_1 + - markupsafe=3.0.2=py311h5eee18b_0 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py311h5eee18b_2 + - mkl_fft=1.3.11=py311h5eee18b_0 + - mkl_random=1.2.8=py311ha02d727_0 + - mpc=1.3.1=h5eee18b_0 + - mpfr=4.2.1=h5eee18b_0 + - mpmath=1.3.0=py311h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.4.2=py311h06a4308_0 + - numpy=2.0.1=py311h08b1b3b_1 + - numpy-base=2.0.1=py311hf175353_1 + - ocl-icd=2.3.2=h5eee18b_1 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.5.2=h0d4d230_1 + - openssl=3.0.16=h5eee18b_0 + - pillow=11.1.0=py311hac6e08b_1 + - pip=25.0=py311h06a4308_0 + - pysocks=1.7.1=py311h06a4308_0 + - python=3.11.11=he870216_0 + - pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0 + - pytorch-cuda=12.4=hc786d27_7 + - pytorch-mutex=1.0=cuda + - pyyaml=6.0.2=py311h5eee18b_0 + - readline=8.2=h5eee18b_0 + - requests=2.32.3=py311h06a4308_1 + - rhash=1.4.3=hdbd6064_0 + - setuptools=75.8.0=py311h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tbb=2021.8.0=hdb19cb5_0 + - tk=8.6.14=h39e8969_0 + - torchtriton=3.1.0=py311 + - 
torchvision=0.20.1=py311_cu124 + - typing_extensions=4.12.2=py311h06a4308_0 + - urllib3=2.3.0=py311h06a4308_0 + - wheel=0.45.1=py311h06a4308_0 + - xz=5.6.4=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.6=hc292b87_0 + - pip: + - absl-py==2.2.2 + - aiofiles==24.1.0 + - annotated-types==0.7.0 + - anyio==4.9.0 + - asmk==0.1.1 + - bracex==2.5.post1 + - build==1.2.2.post1 + - clarabel==0.10.0 + - click==8.1.8 + - contourpy==1.3.2 + - cvxpy==1.6.5 + - cycler==0.12.1 + - cython==3.0.12 + - einops==0.8.1 + - fastapi==0.115.12 + - ffmpy==0.5.0 + - fonttools==4.57.0 + - freetype-py==2.5.1 + - fsspec==2025.3.2 + - gradio==5.27.0 + - gradio-client==1.9.0 + - groovy==0.1.2 + - grpcio==1.71.0 + - h11==0.16.0 + - httpcore==1.0.9 + - httpx==0.28.1 + - huggingface-hub==0.30.2 + - imageio==2.37.0 + - joblib==1.4.2 + - kapture==1.1.10 + - kapture-localization==1.1.10 + - kiwisolver==1.4.8 + - llvmlite==0.44.0 + - markdown==3.8 + - markdown-it-py==3.0.0 + - matplotlib==3.10.1 + - mdurl==0.1.2 + - numba==0.61.2 + - numpy-quaternion==2024.0.8 + - opencv-python==4.11.0.86 + - open3d==0.19.0 + - orjson==3.10.16 + - osqp==1.0.3 + - packaging==25.0 + - pandas==2.2.3 + - piexif==1.1.3 + - pillow-heif==0.22.0 + - poselib==2.0.4 + - protobuf==6.30.2 + - pyaml==25.1.0 + - pycolmap==3.11.1 + - pydantic==2.11.3 + - pydantic-core==2.33.1 + - pydub==0.25.1 + - pyglet==1.5.31 + - pygments==2.19.1 + - pyopengl==3.1.0 + - pyparsing==3.2.3 + - pyproject-hooks==1.2.0 + - pyrender==0.1.45 + - python-dateutil==2.9.0.post0 + - python-multipart==0.0.20 + - pytz==2025.2 + - pyproj==3.7.1 + - rich==14.0.0 + - roma==1.5.2.1 + - ruff==0.11.7 + - safehttpx==0.1.6 + - safetensors==0.5.3 + - scikit-learn==1.6.1 + - scipy==1.15.2 + - scs==3.2.7.post2 + - semantic-version==2.10.0 + - shellingham==1.5.4 + - six==1.17.0 + - sniffio==1.3.1 + - starlette==0.46.2 + - sympy==1.13.1 + - tabulate==0.9.0 + - tensorboard==2.19.0 + - tensorboard-data-server==0.7.2 + - threadpoolctl==3.6.0 + - tomlkit==0.13.2 + - tqdm==4.67.1 + - trimesh==4.6.8 + - typer==0.15.2 + - typing-inspection==0.4.0 + - tzdata==2025.2 + - uvicorn==0.34.2 + - wcmatch==10.0 + - websockets==15.0.1 + - werkzeug==3.1.3 +prefix: /u/hyi1/.conda/envs/mast3r diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py index 3cb511db6..0f4cb3f75 100644 --- a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py +++ b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py @@ -1,10 +1,3 @@ -#!/usr/bin/env python3 -# Copyright (C) 2024-present Naver Corporation. All rights reserved. -# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
-# -# -------------------------------------------------------- -# sparse gradio demo functions -# -------------------------------------------------------- import math import gradio import os @@ -301,10 +294,7 @@ def convert_scene_output_to_ply_impl(outfile, imgs, pts3d, mask, scale=1.0, appl # Concatenate across all views all_pts = np.concatenate(all_pts, axis=0) - print(all_pts.shape) all_colors = np.concatenate(all_colors, axis=0) - print(all_colors.shape) - print(all_colors[5]) apply_y_flip = True if apply_y_flip: @@ -314,19 +304,26 @@ def convert_scene_output_to_ply_impl(outfile, imgs, pts3d, mask, scale=1.0, appl pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(all_pts) pcd.colors = o3d.utility.Vector3dVector(all_colors) - print('cwd', os.getcwd()) # Save to .ply - o3d.io.write_point_cloud(os.path.join(os.getcwd(), outfile), pcd) + final_outfile = os.path.join(os.getcwd(), outfile) + os.makedirs(os.path.dirname(final_outfile), exist_ok=True) + o3d.io.write_point_cloud(final_outfile, pcd) if not silent: print(f"✅ Exported scaled point cloud to: {outfile}") return outfile -def convert_scene_output_to_ply(outfile, data, scale=1.0, apply_y_flip=False, min_conf_thr=1.5, clean=True): - confs = data.confs_dense_clean if clean else data.confs_dense - msk = to_numpy([c > min_conf_thr for c in confs]) +def convert_scene_output_to_ply(outfile, data, scale=1.0, apply_y_flip=False, min_conf_thr=1.5, clean=True, TSDF_thresh=0): imgs = to_numpy(data.imgs) - dense_pts3d = to_numpy(data.pts3d_dense) + if TSDF_thresh > 0: + tsdf = TSDFPostProcess(data, TSDF_thresh=TSDF_thresh) + dense_pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean)) + msk = to_numpy([c > min_conf_thr for c in confs]) + else : + confs = data.confs_dense_clean if clean else data.confs_dense + dense_pts3d = to_numpy(data.pts3d_dense) + msk = to_numpy([c > min_conf_thr for c in confs]) + # 3D pointcloud from depthmap, poses and intrinsics return convert_scene_output_to_ply_impl(outfile, imgs, dense_pts3d, msk, scale=scale, apply_y_flip=apply_y_flip) def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=False, mask_sky=False, @@ -400,7 +397,6 @@ def get_reconstructed_scene(outdir, gradio_delete_cache, model, retrieval_model, elif scenegraph_type == "retrieval": scene_graph_params.append(str(winsize)) # Na scene_graph_params.append(str(refid)) # k - if scenegraph_type in ["swin", "logwin"] and not win_cyclic: scene_graph_params.append('noncyclic') scene_graph = '-'.join(scene_graph_params) @@ -433,16 +429,17 @@ def get_reconstructed_scene(outdir, gradio_delete_cache, model, retrieval_model, model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device, opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics, matching_conf_thr=matching_conf_thr, **kw) - if current_scene_state is not None and \ - not current_scene_state.should_delete and \ - current_scene_state.outfile_name is not None: - outfile_name = current_scene_state.outfile_name - else: - outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir) + # if current_scene_state is not None and \ + # not current_scene_state.should_delete and \ + # current_scene_state.outfile_name is not None: + # outfile_name = current_scene_state.outfile_name + # else: + # outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir) # scene_state = SparseGAState(scene, gradio_delete_cache, cache_dir, outfile_name) # outfile = get_3D_model_from_scene(silent, scene_state, min_conf_thr, as_pointcloud, mask_sky, # 
clean_depth, transparent_cams, cam_size, TSDF_thresh) - outfile = convert_scene_output_to_ply(outfile_name, scene, scale=1.0, apply_y_flip=False, min_conf_thr=min_conf_thr, clean=clean_depth) - return scene, outfile + scene.get_dense_pts3d() + # outfile = convert_scene_output_to_ply(outfile_name, scene, scale=1.0, apply_y_flip=False, min_conf_thr=min_conf_thr, clean=clean_depth) + return scene diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py index 542d75f93..dd1915ced 100644 --- a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py +++ b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py @@ -72,11 +72,9 @@ def get_gps_from_exif(image_path): gps_info = {} for tag, value in exif.items(): decoded = TAGS.get(tag) - # print(decoded) if decoded == "GPSInfo": for t in value: sub_decoded = GPSTAGS.get(t) - # print(sub_decoded) gps_info[sub_decoded] = value[t] return gps_info @@ -112,9 +110,9 @@ def parse_gps_info(image_path): 'date': datestamp } -def gps_to_xyz(gps_lookup): +def gps_to_xyz(gps_lookup, crs_from, crs_to): xyz_lookup = {} - transformer = Transformer.from_crs("EPSG:4979", "EPSG:32616", always_xy=True) # includes altitude + transformer = Transformer.from_crs(crs_from, crs_to, always_xy=True) # includes altitude for image_name, (lat, lon, alt) in gps_lookup.items(): xyz = transformer.transform(lon, lat, alt) xyz_lookup[image_name] = xyz @@ -138,8 +136,6 @@ def estimate_3d_scale_from_gps(camera_centers, gps_xyz, camera_image_names, min_ file_name_i = camera_image_names[i] file_name_j = camera_image_names[j] d_sfm = np.linalg.norm(camera_centers[i] - camera_centers[j]) - # print(file_name_i, file_name_j) - # print(gps_xyz[file_name_i], gps_xyz[file_name_j]) d_gps = np.linalg.norm(np.array(gps_xyz[file_name_i]) - np.array(gps_xyz[file_name_j])) if d_gps > min_dist_threshold: @@ -222,8 +218,12 @@ def collect_gps_data(data_folder): def run_mast3r(args): - model = AsymmetricMASt3R.from_pretrained(args.weights_path).to(args.device) - chkpt_tag = hash_md5(args.weights_path) + if args.weights is not None: + weights_path = args.weights + else: + weights_path = "naver/" + args.model_name + model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device) + chkpt_tag = hash_md5(weights_path) def get_context(tmp_dir): return tempfile.TemporaryDirectory(suffix='_mast3r_context') if tmp_dir is None \ @@ -232,41 +232,64 @@ def get_context(tmp_dir): cache_path = os.path.join(tmpdirname, chkpt_tag) os.makedirs(cache_path, exist_ok=True) inputfiles = [os.path.join(args.folder_path, f) for f in os.listdir(args.folder_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))] - scene_state, outfile = get_reconstructed_scene(tmpdirname, False, model, + scene_state = get_reconstructed_scene(tmpdirname, False, model, args.retrieval_model, args.device, args.silent, args.image_size, None, inputfiles, args.optim_level, args.lr1, args.niter1, args.lr2, args.niter2, args.min_conf_thr, args.matching_conf_thr, True, False, args.clean_depth, args.scenegraph_type, args.winsize, args.win_cyclic, args.refid, 0, args.shared_intrinsics) - return scene_state, outfile + return scene_state def add_parse_args(parser, is_scene_path=False): parser.add_argument('--folder_path', type=str, required=True, help='Path to the folder containing the images') parser.add_argument('--output_path', type=str, required=True, help='Path to the output file') 
parser.add_argument('--scale_method', type=str, required=True, help='Method to use for scale estimation') parser.add_argument('--scene_path', type=str, required=False, help='Path to the scene file') + parser.add_argument('--crs_from', type=str, required=False, default='EPSG:4979', help='EPSG code of the input CRS') + parser.add_argument('--crs_to', type=str, required=False, default='EPSG:32616', help='EPSG code of the output CRS') if not is_scene_path: - parser.add_argument('--weights_path', type=str, required=True, help='Path to the core mast3rweights file') - # parser.add_argument('--retrieval_model', type=str, required=True, help='Retrieval model weights path that is used to make image pairs') - # parser.add_argument('--device', type=str, required=True, help='Device to run the model on') + # parser.add_argument('--retrieval_model', type=str, required=False, default=None, help='Retrieval model weights path that is used to make image pairs') + # parser.add_argument('--device', type=str, required=False, default='cuda:0', help='Device to run the model on') # parser.add_argument('--silent', type=bool, required=True, help='Whether to run the model silently') # parser.add_argument('--image_size', type=int, required=True, help='Image size') - parser.add_argument('--optim_level', type=int, required=True, help='Optimization level') - parser.add_argument('--lr1', type=float, required=True, help='Learning rate for the first refinement iteration stage') - parser.add_argument('--niter1', type=int, required=True, help='Number of iterations for the first refinement iteration stage') - parser.add_argument('--lr2', type=float, required=True, help='Learning rate for the second refinement iteration stage') - parser.add_argument('--niter2', type=int, required=True, help='Number of iterations for the second refinement iteration stage') - parser.add_argument('--min_conf_thr', type=float, required=True, help='Minimum confidence threshold') - parser.add_argument('--matching_conf_thr', type=float, required=True, help='Matching confidence threshold') + parser.add_argument('--optim_level', type=str, required=False, default='refine+depth', choices=['coarse', 'refine', 'refine+depth'], help='Optimization level') + parser.add_argument('--lr1', type=float, required=False, default=0.07, help='Learning rate for the first refinement iteration stage') + parser.add_argument('--niter1', type=int, required=False, default=300, help='Number of iterations for the first refinement iteration stage') + parser.add_argument('--lr2', type=float, required=False, default=0.01, help='Learning rate for the second refinement iteration stage') + parser.add_argument('--niter2', type=int, required=False, default=300, help='Number of iterations for the second refinement iteration stage') + parser.add_argument('--min_conf_thr', type=float, required=False, default=1.5, help='Minimum confidence threshold') + parser.add_argument('--matching_conf_thr', type=float, required=False, default=0., help='Matching confidence threshold') # parser.add_argument('--as_pointcloud', type=bool, required=True, help='Whether to output a pointcloud') # parser.add_argument('--mask_sky', type=bool, required=True, help='Whether to mask the sky') - parser.add_argument('--clean_depth', type=bool, required=True, help='Whether to clean the depth') - parser.add_argument('--transparent_cams', type=bool, required=True, help='Whether to make the cameras transparent') - parser.add_argument('--cam_size', type=float, required=True, help='Camera size') - 
parser.add_argument('--scenegraph_type', type=str, required=True, help='Scenegraph type') - parser.add_argument('--winsize', type=int, required=True, help='Window size for sliding window pair making scenegraph_type') - parser.add_argument('--win_cyclic', type=bool, required=True, help='Whether to use a cyclic sliding window') - parser.add_argument('--refid', type=str, required=True, help='Reference image for retrieval') - # parser.add_argument('--TSDF_thresh', type=float, required=True, help='TSDF refinement threshold') - parser.add_argument('--shared_intrinsics', type=bool, required=True, help='Whether to use a shared intrinsics model') + parser.add_argument('--clean_depth', type=bool, required=False, default=True, help='Whether to clean the depth') + # parser.add_argument('--transparent_cams', type=bool, required=True, help='Whether to make the cameras transparent') + # parser.add_argument('--cam_size', type=float, required=True, help='Camera size') + + available_scenegraph_type = [ + ("complete: all possible image pairs", "complete"), + ("swin: sliding window", "swin"), + ("logwin: sliding window with long range", "logwin"), + ("oneref: match one image with all", "oneref"), + ("retrieval: connect views based on similarity", "retrieval") + ] + + # Extract the actual values for argparse + scenegraph_choices = [val for _, val in available_scenegraph_type] + + # Construct help string + help_msg = "Type of scene graph:\n" + "\n".join([f" {val}: {desc.split(':', 1)[1].strip()}" for desc, val in available_scenegraph_type]) + + parser.add_argument( + '--scenegraph_type', + type=str, + choices=scenegraph_choices, + required=False, + default='complete', + help=help_msg + ) + parser.add_argument('--winsize', type=int, required=False, default=1, help='Window size for sliding window pair making scenegraph_type') + parser.add_argument('--win_cyclic', type=bool, required=False, default=False, help='Whether to use a cyclic sliding window') + parser.add_argument('--refid', type=str, required=False, default=0, help='Reference image for retrieval. 
For retrieval scenegraph_type, this indicates the number of neighbors.')
+    parser.add_argument('--TSDF_thresh', type=float, required=False, default=0, help='TSDF refinement threshold')
+    parser.add_argument('--shared_intrinsics', type=bool, required=False, default=False, help='Whether to use a shared intrinsics model')

     return parser

@@ -284,14 +307,12 @@ def scale_pointcloud_based_on_geotag():
         args_parser = get_args_parser()
         args_parser = add_parse_args(args_parser, is_scene_path=False)
         args = args_parser.parse_args()
-        scene, outfile = run_mast3r(args)
-        scene.get_dense_pts3d()
+        scene = run_mast3r(args)
         data = scene
     else:
         args_parser = argparse.ArgumentParser()
         args_parser = add_parse_args(args_parser, is_scene_path=True)
         args = args_parser.parse_args()
-        print('cwd', os.getcwd())

         with open(args.scene_path, 'rb') as f:
             data = pickle.load(f)
@@ -305,21 +326,16 @@ def scale_pointcloud_based_on_geotag():
         for row in image_gps_data
     }
     image_names = extract_image_names(data.img_paths)
-    xyz_lookup = gps_to_xyz(gps_lookup)
+    xyz_lookup = gps_to_xyz(gps_lookup, args.crs_from, args.crs_to)
     scale = 1.0
-    for method in ['ransac', 'median']:
-        if method == 'ransac':
-            # print('camera_centers', camera_centers, )
-            # print('xyz_lookup', xyz_lookup)
-            # print('image_names', image_names)
-            scale, sfm_dists, gps_dists = estimate_scale_ransac(camera_centers.cpu().numpy(), xyz_lookup, image_names)
-            print('scale', scale)
-            print('sfm_dists', sfm_dists)
-            print('gps_dists', gps_dists)
-        else:
-            scale, sfm_dists, gps_dists = estimate_3d_scale_from_gps(camera_centers.cpu().numpy(), xyz_lookup, image_names)
-    print(f"Estimated scale: {scale}")
-    convert_scene_output_to_ply(args.output_path, data, scale=scale, apply_y_flip=False)
+    if args.scale_method == 'ransac':
+        scale, sfm_dists, gps_dists = estimate_scale_ransac(camera_centers.cpu().numpy(), xyz_lookup, image_names)
+    elif args.scale_method == 'median':
+        scale, sfm_dists, gps_dists = estimate_3d_scale_from_gps(camera_centers.cpu().numpy(), xyz_lookup, image_names)
+    else:
+        raise ValueError(f"Invalid scale method: {args.scale_method}")
+    print(f"Estimated scale: {scale}")
+    convert_scene_output_to_ply(args.output_path, data, scale=scale, apply_y_flip=False, min_conf_thr=args.min_conf_thr, clean=args.clean_depth, TSDF_thresh=args.TSDF_thresh)

 if __name__ == "__main__":
     scale_pointcloud_based_on_geotag()
\ No newline at end of file

From 9ccba46bc626e9d907a227f54f3c789eedc8650f Mon Sep 17 00:00:00 2001
From: Henry-YiW
Date: Fri, 16 May 2025 02:04:09 -0500
Subject: [PATCH 6/6] code cleaned and documentation added

---
 .../mast3r_3d_reconstruction/mast3r_runner.py | 47 +++++-----------
 .../scale_pointcloud_based_on_geotag.py       | 55 ++++++++++++++++---
 2 files changed, 60 insertions(+), 42 deletions(-)

diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py
index 0f4cb3f75..403ea8ed3 100644
--- a/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py
+++ b/GEMstack/offboard/mast3r_3d_reconstruction/mast3r_runner.py
@@ -50,6 +50,9 @@ def get_args_parser():


 class SparseGA():
+    '''
+    Sparse global alignment scene, i.e., the reconstructed point cloud container.
+    '''
     def __init__(self, img_paths, pairs_in, res_fine, anchors, canonical_paths=None):
         def fetch_img(im):
             def torgb(x): return (x[0].permute(1, 2, 0).numpy() * .5 + .5).clip(min=0., max=1.)
@@ -87,6 +90,9 @@ def get_sparse_pts3d(self):
         return self.pts3d

     def get_dense_pts3d(self, clean_depth=True, subsample=8):
+        '''
+        Get dense 3D points.
+ ''' assert self.canonical_paths, 'cache_path is required for dense 3d points' device = self.cam2w.device confs = [] @@ -223,6 +229,9 @@ def sparse_global_alignment(imgs, pairs_in, cache_path, model, subsample=8, desc def _convert_scene_output_to_glb(outfile, imgs, pts3d, mask, focals, cams2world, cam_size=0.05, cam_color=None, as_pointcloud=False, transparent_cams=False, silent=False): + ''' + Convert scene output to GLB file. + ''' assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) pts3d = to_numpy(pts3d) imgs = to_numpy(imgs) @@ -275,6 +284,7 @@ def convert_scene_output_to_ply_impl(outfile, imgs, pts3d, mask, scale=1.0, appl pts3d (list of np.ndarray): 3D points per view, shape (H * W, 3). mask (list of np.ndarray): Boolean masks indicating valid points per view (H, W). scale (float): Scale factor to apply to the 3D points. + apply_y_flip (bool): If True, apply a y-axis flip to the 3D points. silent (bool): If False, print export message. Returns: @@ -314,6 +324,9 @@ def convert_scene_output_to_ply_impl(outfile, imgs, pts3d, mask, scale=1.0, appl return outfile def convert_scene_output_to_ply(outfile, data, scale=1.0, apply_y_flip=False, min_conf_thr=1.5, clean=True, TSDF_thresh=0): + ''' + Convert scene output to PLY file. This is to filter out points that are not visible in the images either using TSDF or normal confidence thresholding. + ''' imgs = to_numpy(data.imgs) if TSDF_thresh > 0: tsdf = TSDFPostProcess(data, TSDF_thresh=TSDF_thresh) @@ -353,28 +366,6 @@ def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=F return _convert_scene_output_to_glb(outfile, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud, transparent_cams=transparent_cams, cam_size=cam_size, silent=silent) -def sort_images_from_longest_endpoint(D_square, data_length): - D_square = D_square.copy() - # Find the two farthest points - i, j = np.unravel_index(np.argmax(D_square), D_square.shape) - start_idx = i # or j — either works - - # Greedy traversal using the precomputed distance matrix - N = data_length - visited = np.zeros(N, dtype=bool) - visited[start_idx] = True - path = [start_idx] - - current_idx = start_idx - for _ in range(N - 1): - dists = D_square[current_idx] - dists[visited] = np.inf # Ignore visited - next_idx = np.argmin(dists) - path.append(next_idx) - visited[next_idx] = True - current_idx = next_idx - return path - def get_reconstructed_scene(outdir, gradio_delete_cache, model, retrieval_model, device, silent, image_size, current_scene_state, filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr, as_pointcloud, mask_sky, clean_depth, @@ -429,17 +420,7 @@ def get_reconstructed_scene(outdir, gradio_delete_cache, model, retrieval_model, model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device, opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics, matching_conf_thr=matching_conf_thr, **kw) - # if current_scene_state is not None and \ - # not current_scene_state.should_delete and \ - # current_scene_state.outfile_name is not None: - # outfile_name = current_scene_state.outfile_name - # else: - # outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir) - - # scene_state = SparseGAState(scene, gradio_delete_cache, cache_dir, outfile_name) - # outfile = get_3D_model_from_scene(silent, scene_state, min_conf_thr, as_pointcloud, mask_sky, - # clean_depth, transparent_cams, cam_size, TSDF_thresh) + scene.get_dense_pts3d() - # outfile = 
convert_scene_output_to_ply(outfile_name, scene, scale=1.0, apply_y_flip=False, min_conf_thr=min_conf_thr, clean=clean_depth) return scene diff --git a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py index dd1915ced..6e851a025 100644 --- a/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py +++ b/GEMstack/offboard/mast3r_3d_reconstruction/scale_pointcloud_based_on_geotag.py @@ -61,10 +61,16 @@ def to_cpu(x): return todevice(x, 'cpu') def to_cuda(x): return todevice(x, 'cuda') def dms_to_decimal(d, m, s, ref): + ''' + Convert degrees, minutes, seconds to decimal degrees. + ''' dd = d + m / 60 + s / 3600 return -dd if ref in ['S', 'W'] else dd def get_gps_from_exif(image_path): + ''' + Get GPS information from an image file. + ''' img = Image.open(image_path) exif = img._getexif() if not exif: @@ -80,6 +86,9 @@ def get_gps_from_exif(image_path): return gps_info def parse_gps_info(image_path): + ''' + Parse GPS information from an image file. + ''' gps_info = get_gps_from_exif(image_path) # Latitude lat_ref = gps_info.get('GPSLatitudeRef', 'N') @@ -111,6 +120,9 @@ def parse_gps_info(image_path): } def gps_to_xyz(gps_lookup, crs_from, crs_to): + ''' + Convert GPS coordinates to xyz coordinates. + ''' xyz_lookup = {} transformer = Transformer.from_crs(crs_from, crs_to, always_xy=True) # includes altitude for image_name, (lat, lon, alt) in gps_lookup.items(): @@ -125,6 +137,8 @@ def estimate_3d_scale_from_gps(camera_centers, gps_xyz, camera_image_names, min_ Inputs: camera_centers: (N, 3) array in MASt3r units (arbitrary scale) gps_xyz: (N, 3) array in meters [x, y, z] from lat/lon/alt + camera_image_names: list of camera image names + min_dist_threshold: minimum distance threshold for valid GPS pairs Returns: scale: estimated meters-per-unit scale factor """ @@ -153,6 +167,19 @@ def estimate_3d_scale_from_gps(camera_centers, gps_xyz, camera_image_names, min_ def estimate_scale_ransac(camera_centers, gps_xyz, camera_image_names, threshold=0.05, iterations=1000, min_dist=1.0): + ''' + Estimate scale factor between MASt3r's camera centers and GPS 3D coordinates using RANSAC. + + Inputs: + camera_centers: (N, 3) array in MASt3r units (arbitrary scale) + gps_xyz: (N, 3) array in meters [x, y, z] from lat/lon/alt + camera_image_names: list of camera image names + threshold: threshold for inliers + iterations: number of RANSAC iterations + min_dist: minimum distance threshold for valid GPS pairs + Returns: + scale: estimated meters-per-unit scale factor + ''' scales = [] pairs = [] @@ -191,11 +218,17 @@ def estimate_scale_ransac(camera_centers, gps_xyz, camera_image_names, threshold return best_scale, len(best_inliers), len(scales) def extract_image_names(image_paths): + ''' + Extract image names from a list of image paths. + ''' return [path.split('/')[-1] for path in image_paths] def collect_gps_data(data_folder): + ''' + Collect GPS data with extra metadata from a folder of images. + ''' records = [] for fname in sorted(os.listdir(data_folder)): if fname.lower().endswith(('.jpg', '.jpeg', '.png')): @@ -218,6 +251,9 @@ def collect_gps_data(data_folder): def run_mast3r(args): + ''' + Run MASt3R on a folder of images. 
+ ''' if args.weights is not None: weights_path = args.weights else: @@ -246,10 +282,6 @@ def add_parse_args(parser, is_scene_path=False): parser.add_argument('--crs_from', type=str, required=False, default='EPSG:4979', help='EPSG code of the input CRS') parser.add_argument('--crs_to', type=str, required=False, default='EPSG:32616', help='EPSG code of the output CRS') if not is_scene_path: - # parser.add_argument('--retrieval_model', type=str, required=False, default=None, help='Retrieval model weights path that is used to make image pairs') - # parser.add_argument('--device', type=str, required=False, default='cuda:0', help='Device to run the model on') - # parser.add_argument('--silent', type=bool, required=True, help='Whether to run the model silently') - # parser.add_argument('--image_size', type=int, required=True, help='Image size') parser.add_argument('--optim_level', type=str, required=False, default='refine+depth', choices=['coarse', 'refine', 'refine+depth'], help='Optimization level') parser.add_argument('--lr1', type=float, required=False, default=0.07, help='Learning rate for the first refinement iteration stage') parser.add_argument('--niter1', type=int, required=False, default=300, help='Number of iterations for the first refinement iteration stage') @@ -257,11 +289,7 @@ def add_parse_args(parser, is_scene_path=False): parser.add_argument('--niter2', type=int, required=False, default=300, help='Number of iterations for the second refinement iteration stage') parser.add_argument('--min_conf_thr', type=float, required=False, default=1.5, help='Minimum confidence threshold') parser.add_argument('--matching_conf_thr', type=float, required=False, default=0., help='Matching confidence threshold') - # parser.add_argument('--as_pointcloud', type=bool, required=True, help='Whether to output a pointcloud') - # parser.add_argument('--mask_sky', type=bool, required=True, help='Whether to mask the sky') parser.add_argument('--clean_depth', type=bool, required=False, default=True, help='Whether to clean the depth') - # parser.add_argument('--transparent_cams', type=bool, required=True, help='Whether to make the cameras transparent') - # parser.add_argument('--cam_size', type=float, required=True, help='Camera size') available_scenegraph_type = [ ("complete: all possible image pairs", "complete"), @@ -295,6 +323,9 @@ def add_parse_args(parser, is_scene_path=False): return parser def scale_pointcloud_based_on_geotag(): + ''' + Scale a pointcloud based on GPS data. If no scene file is provided, MASt3R will be run to generate a scene file. 
+ ''' parser = argparse.ArgumentParser() # Add known args @@ -316,9 +347,11 @@ def scale_pointcloud_based_on_geotag(): with open(args.scene_path, 'rb') as f: data = pickle.load(f) - + # Get camera centers cam2w = data.get_im_poses() camera_centers = cam2w[:, :3, 3] # Extract translation component from [R|t] + + # Collect GPS data df = collect_gps_data(args.folder_path) image_gps_data = df.to_numpy() gps_lookup = { @@ -327,6 +360,8 @@ def scale_pointcloud_based_on_geotag(): } image_names = extract_image_names(data.img_paths) xyz_lookup = gps_to_xyz(gps_lookup, args.crs_from, args.crs_to) + + # Estimate scale scale = 1.0 if args.scale_method == 'ransac': scale, sfm_dists, gps_dists = estimate_scale_ransac(camera_centers.cpu().numpy(), xyz_lookup, image_names) @@ -335,6 +370,8 @@ def scale_pointcloud_based_on_geotag(): else: raise ValueError(f"Invalid scale method: {args.scale_method}") print(f"Estimated scale: {scale}") + + # Convert scene output to PLY convert_scene_output_to_ply(args.output_path, data, scale=scale, apply_y_flip=False, min_conf_thr=args.min_conf_thr, clean=args.clean_depth, TSDF_thresh=args.TSDF_thresh) if __name__ == "__main__":