From 306a9dbaa4391f0b86855a44ef4e008dbca7725b Mon Sep 17 00:00:00 2001 From: Yuqi Wu Date: Tue, 1 Jul 2025 11:09:43 -0400 Subject: [PATCH 1/2] Add Modified_code folder with updated BIOT scripts --- src/BIOT/Modified_code/BIOT_CPU_modified.py | 267 ++++++++++++++++++ src/BIOT/Modified_code/BIOT_GPU_modified.py | 296 ++++++++++++++++++++ src/BIOT/Modified_code/TorchL1.py | 252 +++++++++++++++++ 3 files changed, 815 insertions(+) create mode 100644 src/BIOT/Modified_code/BIOT_CPU_modified.py create mode 100644 src/BIOT/Modified_code/BIOT_GPU_modified.py create mode 100644 src/BIOT/Modified_code/TorchL1.py diff --git a/src/BIOT/Modified_code/BIOT_CPU_modified.py b/src/BIOT/Modified_code/BIOT_CPU_modified.py new file mode 100644 index 0000000..0875788 --- /dev/null +++ b/src/BIOT/Modified_code/BIOT_CPU_modified.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Functions and classes for the BIOT module + +""" +import pandas as pd +import numpy as np +import argparse +import time +import math + +from scipy.stats import mannwhitneyu +from sklearn.linear_model import Lasso +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import KFold +from sklearn.preprocessing import StandardScaler + +############################################################################### +# Functions and classes +def Get_W_Lasso (X, Y, lam, fit_intercept = False, mode='cpu'): + k = Y.shape[1] + d = X.shape[1] + W = np.zeros((d, k)) + w0 = np.zeros(k) + if fit_intercept: + mode = 'cpu' + print('.', end='', flush=True) + for dim_index in range(k): + model = Lasso(alpha = lam, fit_intercept = fit_intercept, max_iter = 5000, tol = 1e-6) + model.fit(X = X, y = Y[:, dim_index]) + W[:, dim_index] = model.coef_ + if fit_intercept: + w0[dim_index] = model.intercept_ + return W, w0 + +def calc_max_lam (X, Y): + n = X.shape[0] + sc = StandardScaler() + X_norm = sc.fit_transform(X) + sc = StandardScaler(with_std=False) + Y_norm = sc.fit_transform(Y) + max_lam = np.max(np.absolute(X_norm.T @ Y_norm))/n + + return max_lam + +def normalize(X, mean, std): + return (X - mean) / std + +def Global_L1_Norm (W): + k = W.shape[1] + norm_val = 0 + for dim_index in range(k): + norm_val += np.linalg.norm(W[:, dim_index], ord = 1) + + return norm_val + +def BIOT_Crit (X, Y, R, W, w0, lam): + n = X.shape[0] + diffs = (Y @ R) - (np.tile(w0, (n, 1)) + (X @ W)) + LS = np.linalg.norm(diffs)**2 + L1 = Global_L1_Norm(W) + + crit = ((1/(2*n)) * LS) + (lam * L1) + + return crit + +def BIOT(X, Y, lam, max_iter=200, eps=1e-6, rotation=False, R=None, fit_intercept=False, mode='cpu'): + print('lambda:', lam) + R = np.eye(6) + d = X.shape[1] + n = X.shape[0] + lam_norm = lam / np.sqrt(d) + + + W, w0 = Get_W_Lasso(X=X, Y=Y, lam=lam_norm, fit_intercept=fit_intercept, mode=mode) + + diff = math.inf + iter_index = 0 + crit_list = [math.inf] + + while iter_index < max_iter and diff > eps: + # Compute De matrix + intercept_matrix = np.tile(w0, (n, 1)) # shape (n, m) + De = (1 / (2 * n)) * Y.T @ (intercept_matrix + X @ W) # shape (m, m) + + # SVD + U, S, Vh = np.linalg.svd(De, full_matrices=False) + + if rotation: + sv = S.copy() + smallest = np.argmin(sv) + sv[smallest] = np.sign(np.linalg.det(U @ Vh)) + sv[sv != sv[smallest]] = 1 + R = U @ np.diag(sv) @ Vh + else: + R = U @ Vh + + # Update W + YR = Y @ R + W, w0 = Get_W_Lasso(X=X, Y=YR, lam=lam_norm, fit_intercept=fit_intercept, mode=mode) + + # Convergence check + crit_list.append(BIOT_Crit(X=X, Y=Y, R=R, W=W, 
w0=w0, lam=lam_norm)) + diff = abs(crit_list[iter_index] - crit_list[iter_index + 1]) + iter_index += 1 + + # Sparsity + num_nonzero = np.sum(W != 0) + total_elements = W.size + ratio = num_nonzero / total_elements + + return R, W, w0, ratio + +def standardize_features(train, test): + mean = train.mean(axis=0) + std = train.std(axis=0, ddof=1) + std[std == 0] = 1.0 + train_std = (train - mean) / std + test_std = (test - mean) / std + return train_std, test_std + +def center_targets(train, test): + mean = train.mean(axis=0) + train_centered = train - mean + test_centered = test - mean + return train_centered, test_centered + + +def CV_BIOT(Fe, X, feature_names, lam_list, fit_intercept=False, + num_folds=10, random_state=1, R=None, rotation=False, scoring='neg_mean_squared_error', mode='cpu'): + + start_all = time.time() + kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state) + results = [] + mse_matrix = [] + + for lam in lam_list: + fold_mse = [] + fold_sparsity = [] + + for train_idx, test_idx in kf.split(Fe): + X_train = X[train_idx, :] + X_test = X[test_idx, :] + Fe_train = Fe[train_idx, :] + Fe_test = Fe[test_idx, :] + + Fe_train, Fe_test = standardize_features(Fe_train, Fe_test) + X_train, X_test = center_targets(X_train, X_test) + + R_, W_, w0_, ratio = BIOT(X=Fe_train, Y=X_train, lam=lam) + + intercept = np.tile(w0_, (Fe_test.shape[0], 1)) + X_pred = (intercept + Fe_test @ W_) @ R_.T + + mse = mean_squared_error(X_test, X_pred) / 2 + fold_mse.append(mse) + fold_sparsity.append(ratio) + + avg_mse = np.mean(fold_mse) + avg_ratio = np.mean(fold_sparsity) + results.append((lam, avg_mse, avg_ratio)) + mse_matrix.append(fold_mse) + + end_all = time.time() + print(f"CV BIOT completed in {end_all - start_all:.2f}s") + + df = pd.DataFrame(results, columns=["Lambda", "AvgMSE", "AvgSparsity"]) + search_array = np.array(mse_matrix).T + print("Search array (fold x lambda):") + print(search_array) + # best_idx = df["AvgMSE"].idxmin() + + # Wilcoxon rank-sum test + mean_mse_per_lambda = search_array.mean(axis=0) + min_idx = mean_mse_per_lambda.argmin() + minimum_mse = search_array[:, min_idx] + pval = 0 + best_idx = 0 + + for idx in range(min_idx + 1, len(lam_list)): + current_mse = search_array[:, idx] + pval = 1 + if np.sum(minimum_mse - np.abs(current_mse)) != 0: + result = mannwhitneyu(minimum_mse, current_mse, alternative='two-sided', method='exact', use_continuity=True) + pval = result[1] + print(f"{idx}\t{pval:.4f}") + if pval < 0.05: + break + best_idx = idx + + print(f"The lambda value that produces the sparsest model not significantly different from the model with the lowest mse is at index: {best_idx}") + + # Final BIOT on full data + start_final = time.time() + Fe_norm, _= standardize_features(Fe, Fe) + X_norm, _ = center_targets(X, X) + + R, W, w0, ratio = BIOT( + X=Fe_norm, Y=X_norm, lam=lam_list[best_idx], + max_iter=1, eps=1e-6, + rotation=rotation, R=None, + fit_intercept=fit_intercept + ) + + end_final = time.time() + print(f"Final BIOT time: {end_final - start_final:.2f}s") + + return best_idx, R, W, w0, ratio, df + +########################################################################################################### +parser = argparse.ArgumentParser() +parser.add_argument('-e','--embeddingFile', help='csv file containing the embeddings to be predicted', default="embeddings2.csv") +parser.add_argument('-p','--predictorFile', help='csv file containing the predictors to use to predict the embeddings', default="features2.csv") 
+parser.add_argument('-d','--dataPath', help='common path for the embedding and predictor files', default="./datasets") +parser.add_argument('-o','--outputPath', help='path to location in which to store the output files', default="./output") +parser.add_argument('-s','--suffix', help='suffix to identify output files from those in other runs', default="best") +parser.add_argument('-l','--numLambdas', help='number of lamba values to calculate', type=int, default=10) +parser.add_argument('-m','--min', help='minimum lambda value', type=float, default=.0001) +parser.add_argument('-x','--max', help='maximum lambda value', type=float, default=3.5) +parser.add_argument('-f','--numFolds', help='maximum lambda value', type=int, default=10) +parser.add_argument('-a','--random_state', help='random state to use', type=int, default=1) +parser.add_argument('-c','--scoring', help='scoring algorithm to use in CV', default='neg_mean_squared_error') +parser.add_argument('-r','--rotation', help='Force rotation not orthogonal transformation', action="store_true") +parser.add_argument('-t','--fit_intercept', help='Force lasso to fit intercept', action="store_true") +parser.add_argument('-u','--mode', help='Processing mode (cpu or gpu)', default='cpu') + +args = parser.parse_args() + +embeddingPath = f"{args.dataPath}/{args.embeddingFile}" +predictorPath = f"{args.dataPath}/{args.predictorFile}" +X = pd.read_csv(embeddingPath,header=None).values +Fe = pd.read_csv(predictorPath,header=0).values + +lam_list = np.geomspace(args.min, args.max, args.numLambdas) +print('Lambda values:') +print(lam_list) + +random_state = 1 +R = None + +if isinstance(Fe, pd.DataFrame): + feature_names = Fe.columns +else: + feature_names = np.array(range(Fe.shape[1])) + +best_idx, R, W, w0, ratio, df = CV_BIOT (Fe, X, feature_names, lam_list, fit_intercept = args.fit_intercept, num_folds=args.numFolds, random_state = args.random_state, R = R, rotation = args.rotation, scoring = args.scoring, mode=args.mode) + +Weights = pd.DataFrame(W) +Weights.index = feature_names +weightfile = f"{args.outputPath}/weights_{args.suffix}.csv" +Weights.to_csv(weightfile) + +print(f"Sparsity ratio: {ratio:.4f}") + +rotationfile = f"{args.outputPath}/rotation_{args.suffix}.csv" +Rotation = pd.DataFrame(R) +Rotation.to_csv(rotationfile) + +interceptfile = f"{args.outputPath}/intercepts_{args.suffix}.csv" +Intercepts = pd.DataFrame(w0) +Intercepts.to_csv(interceptfile) + +dffile = f"{args.outputPath}/avgMSE_{args.suffix}.csv" +df.to_csv(dffile, index=False) + diff --git a/src/BIOT/Modified_code/BIOT_GPU_modified.py b/src/BIOT/Modified_code/BIOT_GPU_modified.py new file mode 100644 index 0000000..f82e939 --- /dev/null +++ b/src/BIOT/Modified_code/BIOT_GPU_modified.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Functions and classes for the BIOT module +""" +import warnings +warnings.filterwarnings("ignore") + +import time +import pandas as pd +import numpy as np +import torch +from TorchL1 import TorchL1 +from sklearn.metrics import mean_squared_error +from sklearn.linear_model import Lasso +from sklearn.preprocessing import StandardScaler +from sklearn.compose import TransformedTargetRegressor +import math +from scipy.stats import mannwhitneyu + +from sklearn.base import BaseEstimator, RegressorMixin +from sklearn.utils.validation import check_array, check_is_fitted + +from sklearn.model_selection import KFold +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import Pipeline + +from sklearn.preprocessing 
import StandardScaler +import argparse + +############################################################################### +# Functions and classes +def Get_W_Lasso (X, Y, lam, fit_intercept = False, mode='cuda'): + device = torch.device("cuda") + + k = Y.shape[1] + d = X.shape[1] + w0 = torch.zeros(k).double().to(device) + + if fit_intercept: + print(f"Do not support this as of now.") + exit() + print('.', end='', flush=True) + + model = TorchL1(alpha=lam, max_iter=5000, tol=1e-6) + model.coef_ = torch.zeros(d, k).double().to(device) + model.alpha *= Y.shape[0] + model.fit(X=X, y=Y) + + return model.coef_, w0 + +def calc_max_lam (X, Y): + + n = X.shape[0] + sc = StandardScaler() + X_norm = sc.fit_transform(X) + sc = StandardScaler(with_std=False) + Y_norm = sc.fit_transform(Y) + max_lam = np.max(np.absolute(X_norm.T @ Y_norm))/n + + return max_lam + +def normalize(X, mean, std): + return (X - mean) / std + +def Global_L1_Norm (W): + + k = W.shape[1] + norm_val = 0 + for dim_index in range(k): + norm_val += torch.norm(W[:, dim_index], p=1) + + return norm_val + +def BIOT_Crit (X, Y, R, W, w0, lam): + + n = X.shape[0] + diffs = (Y @ R) - (w0.unsqueeze(0).repeat(n, 1) + (X @ W)) + LS = torch.norm(diffs)**2 + L1 = Global_L1_Norm(W) + crit = ((1 / (2 * n)) * LS) + (lam * L1) + + return crit + +def BIOT (X, Y, lam, max_iter = 200, eps = 1e-6, rotation = False, R = None, fit_intercept = False, mode='cuda'): + print('lambda: ' + str(lam)) + d = X.shape[1] + n = X.shape[0] + lam_norm = lam/np.sqrt(d) + + # If R is provided, get Lasso solution only + if R is not None: + YR = Y @ R + W, w0 = Get_W_Lasso(X = X, Y = YR, lam = lam_norm, fit_intercept=fit_intercept, mode=mode) + + # Otherwise, run BIOT iterations + else: + W, w0 = Get_W_Lasso(X = X, Y = Y, lam = lam_norm, fit_intercept=fit_intercept, mode=mode) + + diff = math.inf + iter_index = 0 + crit_list = [math.inf] + + while iter_index < max_iter and diff > eps: + + De = (1/(2*n)) * Y.T @ (w0.unsqueeze(0).repeat(n, 1) + (X @ W)) + U, S, Vh = torch.linalg.svd(De) + + # rotation matrix desired (counterclockwise) + if rotation: + sv = S + smallest = torch.argmin(sv) + sv = sv.clone() + sv[smallest] = torch.sign(torch.det(torch.mm(U, Vh.T))) + sv[sv != sv[smallest]] = 1 + + # Construct the rotation matrix + R = torch.mm(U, torch.mm(torch.diag(sv), Vh.T)) + else: + R = U @ Vh + + YR = Y @ R + W, w0 = Get_W_Lasso(X = X, Y = YR, lam = lam_norm, fit_intercept = fit_intercept, mode=mode) + + crit_list.append(BIOT_Crit(X = X, Y = Y, R = R, W = W, w0 = w0, lam = lam_norm)) + diff = torch.absolute(crit_list[iter_index] - crit_list[iter_index + 1]) + iter_index += 1 + + num_nonzero = (W != 0).sum().item() + total_elements = W.numel() + ratio = num_nonzero / total_elements + + return R, W, w0, ratio + +def standardize_features(train, test): + mean = train.mean(axis=0) + std = train.std(axis=0, ddof=1) + std[std == 0] = 1.0 + train_std = (train - mean) / std + test_std = (test - mean) / std + return train_std, test_std + +def center_targets(train, test): + mean = train.mean(axis=0) + train_centered = train - mean + test_centered = test - mean + return train_centered, test_centered + +def CV_BIOT(Fe, X, feature_names, lam_list, fit_intercept=False, + num_folds=10, random_state=1, R=None, rotation=False, scoring='neg_mean_squared_error', mode='cuda'): + + start_all = time.time() + + kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state) + results = [] + mse_matrix = [] + + for lam in lam_list: + fold_mse = [] + fold_sparsity = [] + + for 
train_idx, test_idx in kf.split(Fe): + X_train = X[train_idx, :] + X_test = X[test_idx, :] + Fe_train = Fe[train_idx, :] + Fe_test = Fe[test_idx, :] + + Fe_train, Fe_test = standardize_features(Fe_train, Fe_test) + X_train, X_test = center_targets(X_train, X_test) + + Fe_train_tensor = torch.tensor(Fe_train).to(mode) + X_train_tensor = torch.tensor(X_train).to(mode) + + R_, W_, w0_, ratio = BIOT( + X=Fe_train_tensor, Y=X_train_tensor, lam=lam, rotation=rotation, R=R, + fit_intercept=fit_intercept, mode=mode) + + W_cpu = W_.cpu().numpy() + w0_cpu = w0_.cpu().numpy() + R_cpu = R_.cpu().numpy() + + intercept = np.tile(w0_cpu, (Fe_test.shape[0], 1)) + X_pred = (intercept + Fe_test @ W_cpu) @ R_cpu.T + + mse = mean_squared_error(X_test, X_pred) / 2 + fold_mse.append(mse) + fold_sparsity.append(ratio) + + avg_mse = np.mean(fold_mse) + avg_ratio = np.mean(fold_sparsity) + results.append((lam, avg_mse, avg_ratio)) + mse_matrix.append(fold_mse) + + end_all = time.time() + print(f"CV BIOT completed in {end_all - start_all:.2f}s") + + df = pd.DataFrame(results, columns=["Lambda", "AvgMSE", "AvgSparsity"]) + search_array = np.array(mse_matrix).T + print("Search array (fold x lambda):") + print(search_array) + # best_idx = df["AvgMSE"].idxmin() + + # Wilcoxon rank-sum test + mean_mse_per_lambda = search_array.mean(axis=0) + min_idx = mean_mse_per_lambda.argmin() + minimum_mse = search_array[:, min_idx] + pval = 0 + best_idx = 0 + + for idx in range(min_idx + 1, len(lam_list)): + current_mse = search_array[:, idx] + pval = 1 + if np.sum(minimum_mse - np.abs(current_mse)) != 0: + result = mannwhitneyu(minimum_mse, current_mse, alternative='two-sided', method='exact', use_continuity=True) + pval = result[1] + print(f"{idx}\t{pval:.4f}") + if pval < 0.05: + break + best_idx = idx + + print(f"The lambda value that produces the sparsest model not significantly different from the model with the lowest mse is at index: {best_idx}") + + # Final BIOT on full data + start_final = time.time() + Fe_norm, _= standardize_features(Fe, Fe) + X_norm, _ = center_targets(X, X) + + X_norm = torch.tensor(X_norm).to(mode) + Fe_norm = torch.tensor(Fe_norm).to(mode) + + R, W, w0, ratio= BIOT (Fe_norm, X_norm, lam_list[best_idx], max_iter = 1, eps = 1e-6, rotation = False, R = None, fit_intercept = False,mode='gpu') + + end_final = time.time() + print(f"Final BIOT time: {end_final - start_final:.2f}s") + + return best_idx, R, W, w0, ratio, df + +######################################################################################################## +parser = argparse.ArgumentParser() +parser.add_argument('-e','--embeddingFile', help='csv file containing the embeddings to be predicted', default="embeddings.csv") +parser.add_argument('-p','--predictorFile', help='csv file containing the predictors to use to predict the embeddings', default="features.csv") +parser.add_argument('-d','--dataPath', help='common path for the embedding and predictor files', default="/raid/home/rajivratn/hemant_rajivratn/ets/BIOT/gpu/datasets/") +parser.add_argument('-o','--outputPath', help='path to location in which to store the output files', default="./output") +parser.add_argument('-s','--suffix', help='suffix to identify output files from those in other runs', default="best") +parser.add_argument('-l','--numLambdas', help='number of lamba values to calculate', type=int, default=10) +parser.add_argument('-m','--min', help='minimum lambda value', type=float, default=.0001) +parser.add_argument('-x','--max', help='maximum lambda value', type=float, 
default=3.5) +parser.add_argument('-f','--numFolds', help='maximum lambda value', type=int, default=10) +parser.add_argument('-a','--random_state', help='random state to use', type=int, default=1) +parser.add_argument('-c','--scoring', help='scoring algorithm to use in CV', default='neg_mean_squared_error') +parser.add_argument('-r','--rotation', help='Force rotation not orthogonal transformation', action="store_true") +parser.add_argument('-t','--fit_intercept', help='Force lasso to fit intercept', action="store_true") +parser.add_argument('-u','--mode', help='Processing mode (cpu or gpu)', default='cuda') + +args = parser.parse_args() + +embeddingPath = f"{args.dataPath}/{args.embeddingFile}" +predictorPath = f"{args.dataPath}/{args.predictorFile}" +Y = pd.read_csv(embeddingPath,header=None).values +X = pd.read_csv(predictorPath,header=0).values + +print(f"Check on big dataset") +print('Embedding shape:', Y.shape) +print('Predictor shape:', X.shape) + +lam_list = np.geomspace(args.min, args.max, args.numLambdas) +print('Lambda values:') +print(lam_list) + +random_state = 1 +R = None + +if isinstance(X, pd.DataFrame): + feature_names = X.columns +else: + feature_names = np.array(range(X.shape[1])) + +best_idx, R, W, w0, ratio, df = CV_BIOT (X, Y, feature_names, lam_list, fit_intercept = args.fit_intercept, num_folds=args.numFolds, random_state = args.random_state, R = R, rotation = args.rotation, scoring = args.scoring, mode=args.mode) + +Weights = pd.DataFrame(W.cpu().numpy()) +Weights.index = feature_names +weightfile = f"{args.outputPath}/weights_{args.suffix}.csv" +Weights.to_csv(weightfile) +print(f"Sparsity ratio: {ratio:.4f}") + +rotationfile = f"{args.outputPath}/rotation_{args.suffix}.csv" +Rotation = pd.DataFrame(R.cpu().numpy()) +Rotation.to_csv(rotationfile) + +interceptfile = f"{args.outputPath}/intercepts_{args.suffix}.csv" +Intercepts = pd.DataFrame(w0.cpu().numpy()) +Intercepts.to_csv(interceptfile) + +dffile = f"{args.outputPath}/avgMSE_{args.suffix}.csv" +df.to_csv(dffile, index=False) + diff --git a/src/BIOT/Modified_code/TorchL1.py b/src/BIOT/Modified_code/TorchL1.py new file mode 100644 index 0000000..716833f --- /dev/null +++ b/src/BIOT/Modified_code/TorchL1.py @@ -0,0 +1,252 @@ +import torch + + +#@torch.jit.script +class TorchL1: + def __init__(self, alpha=1.0, max_iter=1000, tol=1e-6): + self.alpha = alpha + self.max_iter = max_iter + self.tol = tol + self.coef_ = None + + def fit(self, X, y): + """ + Fit model using coordinate descent for L1-regularized linear regression. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Training data. + y : array-like of shape (n_samples,) or (n_samples, n_targets) + Target values. + + Returns + ------- + self : object + Fitted estimator. 
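+
+        Notes
+        -----
+        A minimal usage sketch (the shapes, alpha value, and CUDA device
+        are illustrative; ``coef_`` must be initialized before fitting,
+        which is how BIOT_GPU.py drives this class):
+
+        >>> import torch
+        >>> X = torch.randn(100, 20, dtype=torch.float64, device="cuda")
+        >>> Y = torch.randn(100, 6, dtype=torch.float64, device="cuda")
+        >>> model = TorchL1(alpha=0.1, max_iter=5000, tol=1e-6)
+        >>> model.coef_ = torch.zeros(20, 6, dtype=torch.float64, device="cuda")
+        >>> model.fit(X=X, y=Y)
+        >>> W = model.coef_  # learned (n_features, n_targets) weights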
+ """ + # self._coordinate_descent(X, y, self.alpha, self.coef_) + # self._fista(X, y, self.alpha, self.coef_) + self._fasta(X, y, self.alpha, self.coef_) + + def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, + backtrack=True, record_iterates=False, + stepsize_shrink=0.5, eps_n=1e-15, tol=1e-10): + n, p = X.shape + _, m = Y.shape + device = X.device + + # x0 = torch.zeros((p, m), device=device) + x0 = W + x1 = x0.clone() + d1 = x1.clone() + + f = lambda B: 0.5 * torch.norm(X @ B - Y, p='fro')**2 + gradf = lambda B: X.T @ (X @ B - Y) + g = lambda B: lam * torch.norm(B, p=1) + proxg = lambda B, tau: torch.sign(B) * torch.clamp(torch.abs(B) - tau * lam, min=0.0) + + residual = torch.zeros(max_iter) + normalized_resid = torch.zeros(max_iter) + taus = torch.zeros(max_iter) + fvals = torch.zeros(max_iter) + objective = torch.zeros(max_iter + 1) + + f1 = f(d1) + gradf1 = gradf(d1) + fvals[0] = f1 + objective[0] = f1 + g(x0) + min_obj = objective[0].item() + best_beta = x1.clone() + + if record_iterates: + iterates = torch.zeros((p, max_iter + 1)) + iterates[:, 0] = x1.squeeze() + else: + iterates = None + + for i in range(max_iter): + x0 = x1.clone() + gradf0 = gradf1.clone() + tau0 = tau1 + # print (tau0) + + x1hat = x0 - tau0 * gradf0 + x1 = proxg(x1hat, tau0) + Dx = x1 - x0 + d1 = x1.clone() + f1 = f(d1) + + if backtrack: + if i > 0: + M = torch.max(fvals[max(i - w, 0):i]) + else: + M = fvals[0] + # print (M) + + prev_f1 = f1 + + backtrack_count = 0 + prop = (f1 - 1e-12 > M + torch.sum(Dx * gradf0) + 0.5 * torch.norm(Dx)**2 / tau0) + + while prop and backtrack_count < 20: + tau0 *= stepsize_shrink + x1hat = x0 - tau0 * gradf0 + x1 = proxg(x1hat, tau0) + d1 = x1.clone() + f1 = f(d1) + Dx = x1 - x0 + backtrack_count += 1 + prop = (f1 - 1e-12 > M + torch.sum(Dx * gradf0) + 0.5 * torch.norm(Dx)**2 / tau0) + + taus[i] = tau0 + residual[i] = torch.norm(Dx) / tau0 + normalizer = max( + torch.norm(gradf0).item(), + torch.norm(x1 - x1hat).item() / tau0 + ) + eps_n + normalized_resid[i] = residual[i].item() / normalizer + fvals[i] = f1 + objective[i + 1] = f1 + g(x1) + + if objective[i + 1] < min_obj: + best_beta = x1.clone() + min_obj = objective[i + 1].item() + + if record_iterates: + iterates[:, i + 1] = x1.squeeze() + + gradf1 = gradf(d1) + Dg = gradf1 + (x1hat - x0) / tau0 + dotprod = torch.sum(Dx * Dg) + + if abs(dotprod.item()) < 1e-15: + break + + tau_s = torch.norm(Dx)**2 / dotprod + tau_m = dotprod / (torch.norm(Dg)**2 + eps_n) + tau_m = max(tau_m.item(), 0) + + tau1 = tau_m if 2 * tau_m > tau_s else tau_s - 0.5 * tau_m + if tau1 <= 0 or not torch.isfinite(torch.as_tensor(tau1)): + tau1 = tau0 * 1.5 + + if torch.max(torch.abs(Dx)) < tol * (torch.max(torch.abs(x1))): + break + + self.coef_ = best_beta + + def _fista(self, X, Y, lam, W, max_iter=1000, tol=1e-10): + """ + FISTA algorithm for L1-regularized regression (multi-output). + + Parameters + ---------- + X : torch.Tensor, shape (n_samples, n_features) + Y : torch.Tensor, shape (n_samples, n_targets) + lam : float + L1 regularization strength. + W : torch.Tensor, shape (n_features, n_targets) + Initial weights. 
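+
+        Notes
+        -----
+        Each iteration takes a proximal gradient step with fixed step size
+        1/L, where L = ||X||_2^2 (squared spectral norm), followed by a
+        Nesterov momentum update:
+
+            W_{k+1} = soft(Z_k - grad f(Z_k) / L, lam / L)
+            t_{k+1} = (1 + sqrt(1 + 4 t_k^2)) / 2
+            Z_{k+1} = W_{k+1} + ((t_k - 1) / t_{k+1}) (W_{k+1} - W_k)
+
+        with f(Z) = ||X Z - Y||_F^2 / (2n) and the soft-thresholding
+        operator soft(v, c) = sign(v) * max(|v| - c, 0).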
+ """ + n, a = X.shape + _, b = Y.shape + + # Estimate Lipschitz constant L = ||X||^2 + L = torch.linalg.norm(X, ord=2).item() ** 2 + + W_old = W.clone() + Z = W.clone() + t = 1 + + for _ in range(max_iter): + W_prev = W.clone() + + # Gradient: Xᵀ (X Z - Y) + grad = X.T @ (X @ Z - Y) / n + + # Gradient step + temp = Z - grad / L + + # Soft-thresholding + W = torch.sign(temp) * torch.maximum(torch.abs(temp) - lam / L, torch.zeros_like(temp)) + + # FISTA update + t_next = 0.5 * (1 + torch.sqrt(torch.tensor(1 + 4 * t ** 2))) + Z = W + ((t - 1) / t_next) * (W - W_prev) + t = t_next + + # stopping criteria + if ( torch.max(torch.abs(W - W_old)) < tol * torch.max(torch.abs(W)) ): + # + break + + self.coef_ = W + + def _coordinate_descent(self, X, Y, lam, W, max_iter = 1000, tol = 1e-10): + """ + Lasso coordinate descent algorithm. + + Parameters + ---------- + X : torch.Tensor of shape (n_samples, n_features) + Training data. + Y : torch.Tensor of shape (n_samples, n_targets) + Target values. + lam : torch.Tensor + Regularization parameter (lambda). + W : torch.Tensor of shape (n_features, n_targets) + Coefficients/weights. + max_iter : int + Maximum number of iterations. + tol : float + Tolerance for stopping criteria. + + Returns + ------- + W : torch.Tensor + Updated weights. + """ + + _, a = X.shape + _, b = Y.shape + # W shape = (a, b) + + summation_ = torch.sum(X ** 2, dim=0) # shape = (a,) + + for _ in range(max_iter): + W_old = W.clone() + + for k in range(a): + + if summation_[k] == 0: + continue + + residual = Y - torch.matmul(X, W) + torch.matmul(X[:, k].unsqueeze(-1), W[k, :].unsqueeze(0)) # shape = (n, b) + + rho = torch.mv(residual.T, X[:, k]).unsqueeze(-1) # shape = (b,1) + W[k,:] = (torch.sign(rho) * torch.maximum( torch.abs(rho) - lam, torch.zeros_like(rho)) ).squeeze() / summation_[k] + + # stopping criteria + if ( torch.max(torch.abs(W - W_old)) < tol * torch.max(torch.abs(W)) ): + # + break + + self.coef_ = W + + def predict(self, X): + """ + Predict using the linear model. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array-like of shape (n_samples,) + Returns predicted values. 
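+
+        Notes
+        -----
+        This computes ``X @ self.W``, while ``fit`` stores the learned
+        weights in ``self.coef_``; assign ``model.W = model.coef_`` (or
+        use ``X @ model.coef_`` directly) before calling ``predict``.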
+ """ + + return torch.matmul(X, self.W) From da12883458b9d924db518e02d266e09365990c0b Mon Sep 17 00:00:00 2001 From: Yuqi Wu Date: Tue, 8 Jul 2025 10:18:11 -0400 Subject: [PATCH 2/2] Update python code --- src/BIOT/Modified_code/BIOT_CPU.py | 504 +++++++++++++++++++ src/BIOT/Modified_code/BIOT_CPU_modified.py | 267 ---------- src/BIOT/Modified_code/BIOT_GPU.py | 519 ++++++++++++++++++++ src/BIOT/Modified_code/BIOT_GPU_modified.py | 296 ----------- src/BIOT/Modified_code/TorchL1.py | 121 ++++- 5 files changed, 1118 insertions(+), 589 deletions(-) create mode 100644 src/BIOT/Modified_code/BIOT_CPU.py delete mode 100644 src/BIOT/Modified_code/BIOT_CPU_modified.py create mode 100644 src/BIOT/Modified_code/BIOT_GPU.py delete mode 100644 src/BIOT/Modified_code/BIOT_GPU_modified.py diff --git a/src/BIOT/Modified_code/BIOT_CPU.py b/src/BIOT/Modified_code/BIOT_CPU.py new file mode 100644 index 0000000..d685bb5 --- /dev/null +++ b/src/BIOT/Modified_code/BIOT_CPU.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Functions and classes for the BIOT module + +""" +import pandas as pd +import numpy as np +import argparse +import time +import math + +from scipy.stats import mannwhitneyu +from sklearn.linear_model import Lasso +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import KFold + +############################################################################### + +# Functions and classes + +def Get_W_Lasso (X, Y, lam, fit_intercept = False): + """ + Estimates a separate Lasso regression model for each column of Y. + + Parameters + ---------- + X : numpy.ndarray of shape (n_samples, n_features) + Feature matrix with d features (columns) used to predict Y. + + Y : numpy.ndarray of shape (n_samples, n_targets) + Target matrix with k target variables (embedding dimensions). + + lam : float + Regularization strength (L1 penalty). Higher values yield sparser solutions. + + fit_intercept : bool, optional (default=False) + If True, the model includes an intercept term. + + Returns + ------- + W : numpy.ndarray of shape (n_features, n_targets) + Coefficient matrix. Each column corresponds to the Lasso solution for one dimension of Y. + + w0 : numpy.ndarray of shape (n_targets,) + Intercept vector. Only non-zero if `fit_intercept=True`; otherwise all zeros. + """ + + k = Y.shape[1] + d = X.shape[1] + W = np.zeros((d, k)) + w0 = np.zeros(k) + if fit_intercept: + print(f"Do not support this as of now.") + exit() + print('.', end='', flush=True) + + for dim_index in range(k): + model = Lasso(alpha = lam, fit_intercept = fit_intercept, max_iter = 5000, tol = 1e-6) + model.fit(X = X, y = Y[:, dim_index]) + W[:, dim_index] = model.coef_ + if fit_intercept: + w0[dim_index] = model.intercept_ + return W, w0 + +def normalize(X, mean, std): + return (X - mean) / std + +def Global_L1_Norm (W): + """ + Compute the global L1 norm across all columns of the weight matrix W. + + This is the sum of the L1 norms (i.e., sum of absolute values) of each column in W. + + Parameters + ---------- + W : numpy.ndarray of shape (n_features, n_targets) + Weight matrix, where each column corresponds to the coefficients for one target. + + Returns + ------- + float + Total L1 norm summed across all columns of W. + """ + + k = W.shape[1] + norm_val = 0 + for dim_index in range(k): + norm_val += np.linalg.norm(W[:, dim_index], ord = 1) + + return norm_val + +def BIOT_Crit (X, Y, R, W, w0, lam): + """ + Compute the BIOT objective function value. 
+ + The BIOT criterion includes a least-squares loss term and an L1 regularization term + on the model weights. This function assumes an orthogonal transformation R has already + been applied to the embedding dimensions. + + Parameters + ---------- + X : numpy.ndarray of shape (n_samples, n_features) + Input feature matrix used to predict the rotated embedding. + + Y : numpy.ndarray of shape (n_samples, n_targets) + Original embedding matrix before rotation. + + R : numpy.ndarray of shape (n_targets, n_targets) + Orthogonal rotation matrix applied to Y. + + W : numpy.ndarray of shape (n_features, n_targets) + Weight matrix for the linear model (one target per column). + + w0 : numpy.ndarray of shape (n_targets,) + Intercept vector for each target dimension. + + lam : float + L1 regularization strength (sparsity penalty). + + Returns + ------- + float + The value of the BIOT objective function: + (1/2n) * ||Y R - (X W + w0)||^2 + λ * ||W||_1 + """ + + n = X.shape[0] + diffs = (Y @ R) - (np.tile(w0, (n, 1)) + (X @ W)) + LS = np.linalg.norm(diffs)**2 + L1 = Global_L1_Norm(W) + + crit = ((1/(2*n)) * LS) + (lam * L1) + + return crit + +def BIOT(X, Y, lam, max_iter=200, eps=1e-6, rotation=False, R=None, fit_intercept=False): + """ + Fit a BIOT (Bilinear Orthogonal Transformation) model using an alternating optimization procedure. + + The BIOT model finds a sparse linear mapping W from input features X to a rotated version of + the target embedding Y @ R, where R is an orthogonal (or rotation) matrix. + The optimization alternates between: + - Solving Lasso regression to estimate W (sparse weights) + - Updating R using orthogonal Procrustes alignment (via SVD) + + Parameters + ---------- + X : numpy.ndarray of shape (n_samples, n_features) + Input feature matrix. + + Y : numpy.ndarray of shape (n_samples, n_targets) + Target embedding matrix to be explained. + + lam : float + Sparsity hyperparameter. Controls the L1 penalty in Lasso regression. + + max_iter : int, optional (default=200) + Maximum number of alternating optimization steps. + + eps : float, optional (default=1e-6) + Convergence threshold for the objective function. Optimization stops when the + change in objective value is below this threshold. + + rotation : bool, optional (default=False) + If True, constrain the transformation matrix R to be a proper rotation matrix + (i.e., det(R) = +1). Otherwise, any orthogonal matrix is allowed. + + R : numpy.ndarray of shape (n_targets, n_targets), optional (default=None) + If provided, use this as a fixed orthogonal transformation matrix. + If None, R is initialized and optimized during training. + + fit_intercept : bool, optional (default=False) + If True, includes a bias term (intercept) in the Lasso regression. + + Returns + ------- + R : numpy.ndarray of shape (n_targets, n_targets) + Learned orthogonal (or rotation) matrix applied to Y. + + W : numpy.ndarray of shape (n_features, n_targets) + Sparse weight matrix mapping X to the rotated embedding Y @ R. + + w0 : numpy.ndarray of shape (n_targets,) + Intercept vector (zeros if fit_intercept=False). + + ratio : float + Sparsity ratio of W, i.e., proportion of non-zero elements. 
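+
+    Examples
+    --------
+    A minimal sketch on synthetic data (shapes, seed, and the lambda
+    value are illustrative only):
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng(0)
+    >>> Fe = rng.standard_normal((50, 10))  # predictors
+    >>> E = rng.standard_normal((50, 4))    # embedding to explain
+    >>> R, W, w0, ratio = BIOT(X=Fe, Y=E, lam=0.1)
+    >>> W.shape, R.shape                    # ((10, 4), (4, 4))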
+ """ + + print('lambda:', lam) + d = X.shape[1] + n = X.shape[0] + lam_norm = lam / np.sqrt(d) + + # If R is provided, get Lasso solution only + if R is not None: + YR = Y @ R + W, w0 = Get_W_Lasso(X = X, Y = YR, lam = lam_norm, fit_intercept=fit_intercept) + # Otherwise, run BIOT iterations + else: + # Init W + W, w0 = Get_W_Lasso(X=X, Y=Y, lam=lam_norm, fit_intercept=fit_intercept) + + diff = math.inf + iter_index = 0 + crit_list = [math.inf] + + while iter_index < max_iter and diff > eps: + + # UPDATE R + intercept_matrix = np.tile(w0, (n, 1)) + De = (1 / (2 * n)) * Y.T @ (intercept_matrix + X @ W) + U, S, Vh = np.linalg.svd(De, full_matrices=False) + + # rotation matrix desired (counterclockwise) + if rotation: + sv = S.copy() + smallest = np.argmin(sv) + sv[smallest] = np.sign(np.linalg.det(U @ Vh)) + sv[sv != sv[smallest]] = 1 + R = U @ np.diag(sv) @ Vh + # orthogonal transformation matrix desired + else: + R = U @ Vh + + # UPDATE W + YR = Y @ R + W, w0 = Get_W_Lasso(X=X, Y=YR, lam=lam_norm, fit_intercept=fit_intercept) + + # CHECK CONVERGENCE + crit_list.append(BIOT_Crit(X=X, Y=Y, R=R, W=W, w0=w0, lam=lam_norm)) + diff = abs(crit_list[iter_index] - crit_list[iter_index + 1]) + iter_index += 1 + + num_nonzero = np.sum(W != 0) + total_elements = W.size + ratio = num_nonzero / total_elements + + return R, W, w0, ratio + +def standardize_features(train, test): + """ + Standardize features in the training and test sets using the training set statistics. + + Each feature is transformed to have zero mean and unit variance, based on the + training data. Features with zero standard deviation are left unchanged (std set to 1.0). + + Parameters + ---------- + train : numpy.ndarray of shape (n_samples_train, n_features) + Training feature matrix. + + test : numpy.ndarray of shape (n_samples_test, n_features) + Test feature matrix to be standardized using training mean and std. + + Returns + ------- + train_std : numpy.ndarray of shape (n_samples_train, n_features) + Standardized training feature matrix. + + test_std : numpy.ndarray of shape (n_samples_test, n_features) + Standardized test feature matrix. + """ + + mean = train.mean(axis=0) + std = train.std(axis=0, ddof=1) + std[std == 0] = 1.0 + train_std = (train - mean) / std + test_std = (test - mean) / std + return train_std, test_std + +def center_targets(train, test): + """ + Center target variables in the training and test sets using the training set mean. + + Each target variable (column) is shifted to have zero mean in the training set. + The same mean is subtracted from the test set to ensure consistency. + + Parameters + ---------- + train : numpy.ndarray of shape (n_samples_train, n_targets) + Training target matrix. + + test : numpy.ndarray of shape (n_samples_test, n_targets) + Test target matrix to be centered using training means. + + Returns + ------- + train_centered : numpy.ndarray of shape (n_samples_train, n_targets) + Centered training targets. + + test_centered : numpy.ndarray of shape (n_samples_test, n_targets) + Centered test targets. + """ + + mean = train.mean(axis=0) + train_centered = train - mean + test_centered = test - mean + return train_centered, test_centered + + +def CV_BIOT(Fe, X, feature_names, lam_list, fit_intercept=False, + num_folds=10, random_state=1, R=None, rotation=False, scoring='neg_mean_squared_error'): + """ + Cross-validated BIOT. + + Performs K-fold cross-validation to select the best lambda for the BIOT model, + then fits a final BIOT model on the full dataset using the selected lambda. 
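+
+    Lambda selection starts from the lambda with the lowest mean fold MSE
+    and walks toward larger (sparser) lambdas, keeping each one whose
+    fold-wise MSEs are not significantly different from the best lambda's
+    under a two-sided Mann-Whitney U test, and stopping at the first
+    p < 0.05.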
+ + Parameters + ---------- + Fe : np.ndarray of shape (n_samples, n_features) + Predictor variables (input features) for BIOT. + + X : np.ndarray of shape (n_samples, n_targets) + Target embeddings to be explained (Y in BIOT). + + feature_names : list + List of feature names (used for returning weights as DataFrame). + + lam_list : list of float + List of lambda values (L1 penalties) to try in cross-validation. + + fit_intercept : bool, default=False + Whether to fit an intercept in the Lasso regressions. + + num_folds : int, default=10 + Number of folds in K-fold cross-validation. + + random_state : int, default=1 + Seed for reproducibility. + + R : np.ndarray or None + Optional fixed orthogonal transformation matrix. If None, R is optimized. + + rotation : bool, default=False + If True, constrain R to be a proper rotation matrix (det=+1). + + scoring : str, default='neg_mean_squared_error' + Scoring method for model selection. Currently unused (MSE is hardcoded). + + + Returns + ------- + best_idx : int + Index in `lam_list` of the selected lambda. + + R : np.ndarray of shape (n_targets, n_targets) + Final orthogonal transformation matrix. + + W : pandas.DataFrame of shape (n_features, n_targets) + Final weight matrix with feature names as index. + + w0 : np.ndarray of shape (n_targets,) + Intercept vector for the final model. + + ratio : float + Sparsity ratio (non-zero elements / total) of the final weight matrix. + + df : pandas.DataFrame + Cross-validation summary with columns: ["Lambda", "AvgMSE", "AvgSparsity"] + """ + + start_all = time.time() + kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state) + results = [] # Store (lambda, avg MSE, avg sparsity) + mse_matrix = [] # Store fold-wise MSEs for statistical testing + + for lam in lam_list: + fold_mse = [] + fold_sparsity = [] + + for train_idx, test_idx in kf.split(Fe): + # Split folds + X_train, X_test = X[train_idx], X[test_idx] + Fe_train, Fe_test = Fe[train_idx], Fe[test_idx] + + # Standardize Fe and center X (targets) + Fe_train, Fe_test = standardize_features(Fe_train, Fe_test) + X_train, X_test = center_targets(X_train, X_test) + + # Run BIOT on training fold + R_, W_, w0_, ratio = BIOT( + X=Fe_train, Y=X_train, lam=lam, + fit_intercept=fit_intercept, + R=R, rotation=rotation + ) + + # Predict on test fold + intercept = np.tile(w0_, (Fe_test.shape[0], 1)) + X_pred = (intercept + Fe_test @ W_) @ R_.T + + # Compute MSE (scaled by 1/2 for consistency with BIOT loss) + mse = mean_squared_error(X_test, X_pred) / 2 + fold_mse.append(mse) + fold_sparsity.append(ratio) + + # Store average results + avg_mse = np.mean(fold_mse) + avg_ratio = np.mean(fold_sparsity) + results.append((lam, avg_mse, avg_ratio)) + mse_matrix.append(fold_mse) + + end_all = time.time() + print(f"CV BIOT completed in {end_all - start_all:.2f}s") + + # Cross-validation summary + df = pd.DataFrame(results, columns=["Lambda", "AvgMSE", "AvgSparsity"]) + search_array = np.array(mse_matrix).T + print("Search array (fold x lambda):") + print(search_array) + + # Statistical test: select sparsest model not significantly worse than best + mean_mse_per_lambda = search_array.mean(axis=0) + min_idx = mean_mse_per_lambda.argmin() + minimum_mse = search_array[:, min_idx] + pval = 0 + best_idx = 0 + + for idx in range(min_idx + 1, len(lam_list)): + current_mse = search_array[:, idx] + pval = 1 + if np.sum(minimum_mse - np.abs(current_mse)) != 0: + result = mannwhitneyu(minimum_mse, current_mse, alternative='two-sided', method='exact', 
use_continuity=True) + pval = result[1] + print(f"{idx}\t{pval:.4f}") + if pval < 0.05: + break + best_idx = idx + print(f"The lambda value that produces the sparsest model not significantly different from the model with the lowest mse is at index: {best_idx}") + + # Fit final model using best lambda on full data + start_final = time.time() + Fe_norm, _= standardize_features(Fe, Fe) + X_norm, _ = center_targets(X, X) + + R, W, w0, ratio = BIOT( + X=Fe_norm, Y=X_norm, lam=lam_list[best_idx], + max_iter=200, eps=1e-6, + rotation=rotation, R=None, + fit_intercept=fit_intercept + ) + + end_final = time.time() + print(f"Final BIOT time: {end_final - start_final:.2f}s") + + return best_idx, R, W, w0, ratio, df + +########################################################################################################### +parser = argparse.ArgumentParser() +parser.add_argument('-e','--embeddingFile', help='csv file containing the embeddings to be predicted', default="embeddings2.csv") +parser.add_argument('-p','--predictorFile', help='csv file containing the predictors to use to predict the embeddings', default="features2.csv") +parser.add_argument('-d','--dataPath', help='common path for the embedding and predictor files', default="./datasets") +parser.add_argument('-o','--outputPath', help='path to location in which to store the output files', default="./output") +parser.add_argument('-s','--suffix', help='suffix to identify output files from those in other runs', default="best") +parser.add_argument('-l','--numLambdas', help='number of lamba values to calculate', type=int, default=10) +parser.add_argument('-m','--min', help='minimum lambda value', type=float, default=.0001) +parser.add_argument('-x','--max', help='maximum lambda value', type=float, default=3.5) +parser.add_argument('-f','--numFolds', help='maximum lambda value', type=int, default=10) +parser.add_argument('-a','--random_state', help='random state to use', type=int, default=1) +parser.add_argument('-c','--scoring', help='scoring algorithm to use in CV', default='neg_mean_squared_error') +parser.add_argument('-r','--rotation', help='Force rotation not orthogonal transformation', action="store_true") +parser.add_argument('-t','--fit_intercept', help='Force lasso to fit intercept', action="store_true") + +args = parser.parse_args() + +embeddingPath = f"{args.dataPath}/{args.embeddingFile}" +predictorPath = f"{args.dataPath}/{args.predictorFile}" +X = pd.read_csv(embeddingPath,header=None).values +Fe = pd.read_csv(predictorPath,header=0).values + +lam_list = np.geomspace(args.min, args.max, args.numLambdas) +print('Lambda values:') +print(lam_list) + +random_state = 1 +R = None + +if isinstance(Fe, pd.DataFrame): + feature_names = Fe.columns +else: + feature_names = np.array(range(Fe.shape[1])) + +best_idx, R, W, w0, ratio, df = CV_BIOT (Fe, X, feature_names, lam_list, fit_intercept = args.fit_intercept, num_folds=args.numFolds, random_state = args.random_state, R = R, rotation = args.rotation, scoring = args.scoring) + +Weights = pd.DataFrame(W) +Weights.index = feature_names +weightfile = f"{args.outputPath}/weights_{args.suffix}.csv" +Weights.to_csv(weightfile) + +rotationfile = f"{args.outputPath}/rotation_{args.suffix}.csv" +Rotation = pd.DataFrame(R) +Rotation.to_csv(rotationfile) + +interceptfile = f"{args.outputPath}/intercepts_{args.suffix}.csv" +Intercepts = pd.DataFrame(w0) +Intercepts.to_csv(interceptfile) + +dffile = f"{args.outputPath}/avgMSE_{args.suffix}.csv" +df.to_csv(dffile, index=False) +print(f"Sparsity ratio: 
{ratio:.4f}") + +# python3 src/BIOT/BIOT_CPU.py -e mite_6d_embedding.csv -p mite_F.csv -d test/inputs/Mite -o test/outputs -s mite_6d_cd_CPU_fixed -l 20 -m 0.0001 -f 5 -x 3.5 -a 42 -c neg_mean_squared_error| tee mite_6d_cd_CPU_fixed.txt \ No newline at end of file diff --git a/src/BIOT/Modified_code/BIOT_CPU_modified.py b/src/BIOT/Modified_code/BIOT_CPU_modified.py deleted file mode 100644 index 0875788..0000000 --- a/src/BIOT/Modified_code/BIOT_CPU_modified.py +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Functions and classes for the BIOT module - -""" -import pandas as pd -import numpy as np -import argparse -import time -import math - -from scipy.stats import mannwhitneyu -from sklearn.linear_model import Lasso -from sklearn.preprocessing import StandardScaler -from sklearn.metrics import mean_squared_error -from sklearn.model_selection import KFold -from sklearn.preprocessing import StandardScaler - -############################################################################### -# Functions and classes -def Get_W_Lasso (X, Y, lam, fit_intercept = False, mode='cpu'): - k = Y.shape[1] - d = X.shape[1] - W = np.zeros((d, k)) - w0 = np.zeros(k) - if fit_intercept: - mode = 'cpu' - print('.', end='', flush=True) - for dim_index in range(k): - model = Lasso(alpha = lam, fit_intercept = fit_intercept, max_iter = 5000, tol = 1e-6) - model.fit(X = X, y = Y[:, dim_index]) - W[:, dim_index] = model.coef_ - if fit_intercept: - w0[dim_index] = model.intercept_ - return W, w0 - -def calc_max_lam (X, Y): - n = X.shape[0] - sc = StandardScaler() - X_norm = sc.fit_transform(X) - sc = StandardScaler(with_std=False) - Y_norm = sc.fit_transform(Y) - max_lam = np.max(np.absolute(X_norm.T @ Y_norm))/n - - return max_lam - -def normalize(X, mean, std): - return (X - mean) / std - -def Global_L1_Norm (W): - k = W.shape[1] - norm_val = 0 - for dim_index in range(k): - norm_val += np.linalg.norm(W[:, dim_index], ord = 1) - - return norm_val - -def BIOT_Crit (X, Y, R, W, w0, lam): - n = X.shape[0] - diffs = (Y @ R) - (np.tile(w0, (n, 1)) + (X @ W)) - LS = np.linalg.norm(diffs)**2 - L1 = Global_L1_Norm(W) - - crit = ((1/(2*n)) * LS) + (lam * L1) - - return crit - -def BIOT(X, Y, lam, max_iter=200, eps=1e-6, rotation=False, R=None, fit_intercept=False, mode='cpu'): - print('lambda:', lam) - R = np.eye(6) - d = X.shape[1] - n = X.shape[0] - lam_norm = lam / np.sqrt(d) - - - W, w0 = Get_W_Lasso(X=X, Y=Y, lam=lam_norm, fit_intercept=fit_intercept, mode=mode) - - diff = math.inf - iter_index = 0 - crit_list = [math.inf] - - while iter_index < max_iter and diff > eps: - # Compute De matrix - intercept_matrix = np.tile(w0, (n, 1)) # shape (n, m) - De = (1 / (2 * n)) * Y.T @ (intercept_matrix + X @ W) # shape (m, m) - - # SVD - U, S, Vh = np.linalg.svd(De, full_matrices=False) - - if rotation: - sv = S.copy() - smallest = np.argmin(sv) - sv[smallest] = np.sign(np.linalg.det(U @ Vh)) - sv[sv != sv[smallest]] = 1 - R = U @ np.diag(sv) @ Vh - else: - R = U @ Vh - - # Update W - YR = Y @ R - W, w0 = Get_W_Lasso(X=X, Y=YR, lam=lam_norm, fit_intercept=fit_intercept, mode=mode) - - # Convergence check - crit_list.append(BIOT_Crit(X=X, Y=Y, R=R, W=W, w0=w0, lam=lam_norm)) - diff = abs(crit_list[iter_index] - crit_list[iter_index + 1]) - iter_index += 1 - - # Sparsity - num_nonzero = np.sum(W != 0) - total_elements = W.size - ratio = num_nonzero / total_elements - - return R, W, w0, ratio - -def standardize_features(train, test): - mean = train.mean(axis=0) - std = train.std(axis=0, 
ddof=1) - std[std == 0] = 1.0 - train_std = (train - mean) / std - test_std = (test - mean) / std - return train_std, test_std - -def center_targets(train, test): - mean = train.mean(axis=0) - train_centered = train - mean - test_centered = test - mean - return train_centered, test_centered - - -def CV_BIOT(Fe, X, feature_names, lam_list, fit_intercept=False, - num_folds=10, random_state=1, R=None, rotation=False, scoring='neg_mean_squared_error', mode='cpu'): - - start_all = time.time() - kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state) - results = [] - mse_matrix = [] - - for lam in lam_list: - fold_mse = [] - fold_sparsity = [] - - for train_idx, test_idx in kf.split(Fe): - X_train = X[train_idx, :] - X_test = X[test_idx, :] - Fe_train = Fe[train_idx, :] - Fe_test = Fe[test_idx, :] - - Fe_train, Fe_test = standardize_features(Fe_train, Fe_test) - X_train, X_test = center_targets(X_train, X_test) - - R_, W_, w0_, ratio = BIOT(X=Fe_train, Y=X_train, lam=lam) - - intercept = np.tile(w0_, (Fe_test.shape[0], 1)) - X_pred = (intercept + Fe_test @ W_) @ R_.T - - mse = mean_squared_error(X_test, X_pred) / 2 - fold_mse.append(mse) - fold_sparsity.append(ratio) - - avg_mse = np.mean(fold_mse) - avg_ratio = np.mean(fold_sparsity) - results.append((lam, avg_mse, avg_ratio)) - mse_matrix.append(fold_mse) - - end_all = time.time() - print(f"CV BIOT completed in {end_all - start_all:.2f}s") - - df = pd.DataFrame(results, columns=["Lambda", "AvgMSE", "AvgSparsity"]) - search_array = np.array(mse_matrix).T - print("Search array (fold x lambda):") - print(search_array) - # best_idx = df["AvgMSE"].idxmin() - - # Wilcoxon rank-sum test - mean_mse_per_lambda = search_array.mean(axis=0) - min_idx = mean_mse_per_lambda.argmin() - minimum_mse = search_array[:, min_idx] - pval = 0 - best_idx = 0 - - for idx in range(min_idx + 1, len(lam_list)): - current_mse = search_array[:, idx] - pval = 1 - if np.sum(minimum_mse - np.abs(current_mse)) != 0: - result = mannwhitneyu(minimum_mse, current_mse, alternative='two-sided', method='exact', use_continuity=True) - pval = result[1] - print(f"{idx}\t{pval:.4f}") - if pval < 0.05: - break - best_idx = idx - - print(f"The lambda value that produces the sparsest model not significantly different from the model with the lowest mse is at index: {best_idx}") - - # Final BIOT on full data - start_final = time.time() - Fe_norm, _= standardize_features(Fe, Fe) - X_norm, _ = center_targets(X, X) - - R, W, w0, ratio = BIOT( - X=Fe_norm, Y=X_norm, lam=lam_list[best_idx], - max_iter=1, eps=1e-6, - rotation=rotation, R=None, - fit_intercept=fit_intercept - ) - - end_final = time.time() - print(f"Final BIOT time: {end_final - start_final:.2f}s") - - return best_idx, R, W, w0, ratio, df - -########################################################################################################### -parser = argparse.ArgumentParser() -parser.add_argument('-e','--embeddingFile', help='csv file containing the embeddings to be predicted', default="embeddings2.csv") -parser.add_argument('-p','--predictorFile', help='csv file containing the predictors to use to predict the embeddings', default="features2.csv") -parser.add_argument('-d','--dataPath', help='common path for the embedding and predictor files', default="./datasets") -parser.add_argument('-o','--outputPath', help='path to location in which to store the output files', default="./output") -parser.add_argument('-s','--suffix', help='suffix to identify output files from those in other runs', default="best") 
-parser.add_argument('-l','--numLambdas', help='number of lamba values to calculate', type=int, default=10) -parser.add_argument('-m','--min', help='minimum lambda value', type=float, default=.0001) -parser.add_argument('-x','--max', help='maximum lambda value', type=float, default=3.5) -parser.add_argument('-f','--numFolds', help='maximum lambda value', type=int, default=10) -parser.add_argument('-a','--random_state', help='random state to use', type=int, default=1) -parser.add_argument('-c','--scoring', help='scoring algorithm to use in CV', default='neg_mean_squared_error') -parser.add_argument('-r','--rotation', help='Force rotation not orthogonal transformation', action="store_true") -parser.add_argument('-t','--fit_intercept', help='Force lasso to fit intercept', action="store_true") -parser.add_argument('-u','--mode', help='Processing mode (cpu or gpu)', default='cpu') - -args = parser.parse_args() - -embeddingPath = f"{args.dataPath}/{args.embeddingFile}" -predictorPath = f"{args.dataPath}/{args.predictorFile}" -X = pd.read_csv(embeddingPath,header=None).values -Fe = pd.read_csv(predictorPath,header=0).values - -lam_list = np.geomspace(args.min, args.max, args.numLambdas) -print('Lambda values:') -print(lam_list) - -random_state = 1 -R = None - -if isinstance(Fe, pd.DataFrame): - feature_names = Fe.columns -else: - feature_names = np.array(range(Fe.shape[1])) - -best_idx, R, W, w0, ratio, df = CV_BIOT (Fe, X, feature_names, lam_list, fit_intercept = args.fit_intercept, num_folds=args.numFolds, random_state = args.random_state, R = R, rotation = args.rotation, scoring = args.scoring, mode=args.mode) - -Weights = pd.DataFrame(W) -Weights.index = feature_names -weightfile = f"{args.outputPath}/weights_{args.suffix}.csv" -Weights.to_csv(weightfile) - -print(f"Sparsity ratio: {ratio:.4f}") - -rotationfile = f"{args.outputPath}/rotation_{args.suffix}.csv" -Rotation = pd.DataFrame(R) -Rotation.to_csv(rotationfile) - -interceptfile = f"{args.outputPath}/intercepts_{args.suffix}.csv" -Intercepts = pd.DataFrame(w0) -Intercepts.to_csv(interceptfile) - -dffile = f"{args.outputPath}/avgMSE_{args.suffix}.csv" -df.to_csv(dffile, index=False) - diff --git a/src/BIOT/Modified_code/BIOT_GPU.py b/src/BIOT/Modified_code/BIOT_GPU.py new file mode 100644 index 0000000..d234e94 --- /dev/null +++ b/src/BIOT/Modified_code/BIOT_GPU.py @@ -0,0 +1,519 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Functions and classes for the BIOT module +""" +import warnings +warnings.filterwarnings("ignore") + +import time +import pandas as pd +import numpy as np +import torch +import math +import argparse + +from TorchL1 import TorchL1 +from sklearn.metrics import mean_squared_error +from scipy.stats import mannwhitneyu +from sklearn.model_selection import KFold + +############################################################################### + +# Functions and classes + +def Get_W_Lasso (X, Y, lam, fit_intercept = False): + """ + Estimate multi-target Lasso regression weights using a GPU-accelerated TorchL1 model. + + Each column in Y is treated as a separate regression target, and a shared Lasso model + is fit across all targets simultaneously using coordinate descent (or other solver in TorchL1). + + Parameters + ---------- + X : torch.Tensor of shape (n_samples, n_features) + Feature matrix used to predict Y. Must be moved to GPU beforehand or will be done internally. + + Y : torch.Tensor of shape (n_samples, n_targets) + Target matrix, with each column representing one target dimension. 
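+        Must be on the same device and of the same dtype as X; the calling
+        code passes double-precision CUDA tensors.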
+ + lam : float + Regularization parameter (L1 penalty). Controls sparsity of the weight matrix. + Note: Will be scaled internally by the number of samples. + + fit_intercept : bool, optional (default=False) + If True, the model would fit an intercept. Currently not supported; the function will exit if True. + + Returns + ------- + coef_ : torch.Tensor of shape (n_features, n_targets) + Lasso regression coefficient matrix. + + w0 : torch.Tensor of shape (n_targets,) + Intercept vector (currently all zeros, as intercept fitting is not implemented). + """ + + device = torch.device("cuda") + + k = Y.shape[1] + d = X.shape[1] + w0 = torch.zeros(k).double().to(device) + if fit_intercept: + print(f"Do not support this as of now.") + exit() + print('.', end='', flush=True) + + model = TorchL1(alpha=lam, max_iter=5000, tol=1e-6) + model.coef_ = torch.zeros(d, k).double().to(device) + model.alpha *= Y.shape[0] + model.fit(X=X, y=Y) + return model.coef_, w0 + +def normalize(X, mean, std): + return (X - mean) / std + +def Global_L1_Norm (W): + """ + Compute the global L1 norm across all columns of the weight matrix W. + + This is the sum of the L1 norms (i.e., sum of absolute values) of each column in W. + + Parameters + ---------- + W : numpy.ndarray of shape (n_features, n_targets) + Weight matrix, where each column corresponds to the coefficients for one target. + + Returns + ------- + float + Total L1 norm summed across all columns of W. + """ + + k = W.shape[1] + norm_val = 0 + for dim_index in range(k): + norm_val += torch.norm(W[:, dim_index], p=1) + + return norm_val + +def BIOT_Crit (X, Y, R, W, w0, lam): + """ + Compute the BIOT objective function value. + + The BIOT criterion includes a least-squares loss term and an L1 regularization term + on the model weights. This function assumes an orthogonal transformation R has already + been applied to the embedding dimensions. + + Parameters + ---------- + X : numpy.ndarray of shape (n_samples, n_features) + Input feature matrix used to predict the rotated embedding. + + Y : numpy.ndarray of shape (n_samples, n_targets) + Original embedding matrix before rotation. + + R : numpy.ndarray of shape (n_targets, n_targets) + Orthogonal rotation matrix applied to Y. + + W : numpy.ndarray of shape (n_features, n_targets) + Weight matrix for the linear model (one target per column). + + w0 : numpy.ndarray of shape (n_targets,) + Intercept vector for each target dimension. + + lam : float + L1 regularization strength (sparsity penalty). + + Returns + ------- + float + The value of the BIOT objective function: + (1/2n) * ||Y R - (X W + w0)||^2 + λ * ||W||_1 + """ + + n = X.shape[0] + diffs = (Y @ R) - (w0.unsqueeze(0).repeat(n, 1) + (X @ W)) + LS = torch.norm(diffs)**2 + L1 = Global_L1_Norm(W) + crit = ((1 / (2 * n)) * LS) + (lam * L1) + + return crit + +def BIOT (X, Y, lam, max_iter = 200, eps = 1e-6, rotation = False, R = None, fit_intercept = False): + """ + Fit a BIOT (Bilinear Orthogonal Transformation) model using an alternating optimization + procedure on GPU with PyTorch. + + BIOT seeks a sparse linear mapping W that maps input features X to a rotated version of + the target matrix Y @ R, where R is an orthogonal (or rotation) matrix. The optimization + alternates between: + + - Solving multi-target Lasso regression to estimate W (sparse weights) + - Updating R via orthogonal Procrustes alignment using SVD + + This version is GPU-compatible and uses PyTorch tensors and operations throughout. 
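+
+    A minimal call sketch (shapes, lambda value, and explicit CUDA device
+    are illustrative; inputs are expected as double-precision GPU tensors):
+
+    >>> import torch
+    >>> Fe = torch.randn(50, 10, dtype=torch.float64, device="cuda")
+    >>> E = torch.randn(50, 4, dtype=torch.float64, device="cuda")
+    >>> R, W, w0, ratio = BIOT(X=Fe, Y=E, lam=0.1)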
+
+    Parameters
+    ----------
+    X : torch.Tensor of shape (n_samples, n_features)
+        Input feature matrix (must be on GPU).
+
+    Y : torch.Tensor of shape (n_samples, n_targets)
+        Target matrix to be explained (must be on GPU).
+
+    lam : float
+        Sparsity regularization parameter (L1 penalty). Internally scaled by 1 / sqrt(d).
+
+    max_iter : int, optional (default=200)
+        Maximum number of alternating optimization iterations.
+
+    eps : float, optional (default=1e-6)
+        Convergence threshold for the BIOT objective value.
+
+    rotation : bool, optional (default=False)
+        If True, constrain R to be a proper rotation matrix (i.e., det(R) = +1).
+        If False, R is any orthogonal matrix.
+
+    R : torch.Tensor of shape (n_targets, n_targets), optional (default=None)
+        If provided, R is used directly and not optimized. Otherwise, R is updated during training.
+
+    fit_intercept : bool, optional (default=False)
+        Whether to fit an intercept in the Lasso regression (must currently be False;
+        intercept fitting is not supported).
+
+    Returns
+    -------
+    R : torch.Tensor of shape (n_targets, n_targets)
+        Learned orthogonal or rotation matrix.
+
+    W : torch.Tensor of shape (n_features, n_targets)
+        Sparse regression coefficient matrix.
+
+    w0 : torch.Tensor of shape (n_targets,)
+        Intercept vector (zeros if fit_intercept=False).
+
+    ratio : float
+        Sparsity ratio of W, i.e., proportion of non-zero entries in W.
+    """
+
+    print('lambda: ' + str(lam))
+    d = X.shape[1]
+    n = X.shape[0]
+    lam_norm = lam/np.sqrt(d)
+
+    # If R is provided, get Lasso solution only
+    if R is not None:
+        YR = Y @ R
+        W, w0 = Get_W_Lasso(X = X, Y = YR, lam = lam_norm, fit_intercept=fit_intercept)
+    # Otherwise, run BIOT iterations
+    else:
+        W, w0 = Get_W_Lasso(X = X, Y = Y, lam = lam_norm, fit_intercept=fit_intercept)
+
+        diff = math.inf
+        iter_index = 0
+        crit_list = [math.inf]
+
+        while iter_index < max_iter and diff > eps:
+
+            # UPDATE R
+            De = (1/(2*n)) * Y.T @ (w0.unsqueeze(0).repeat(n, 1) + (X @ W))
+            U, S, Vh = torch.linalg.svd(De)
+
+            # proper rotation matrix desired (det(R) = +1)
+            if rotation:
+                sv = S.clone()
+                smallest = torch.argmin(sv)
+                sv[smallest] = torch.sign(torch.det(U @ Vh))
+                sv[sv != sv[smallest]] = 1
+                R = U @ torch.diag(sv) @ Vh
+            # orthogonal transformation matrix desired
+            else:
+                R = U @ Vh
+
+            # UPDATE W
+            YR = Y @ R
+            W, w0 = Get_W_Lasso(X = X, Y = YR, lam = lam_norm, fit_intercept = fit_intercept)
+
+            # CHECK CONVERGENCE
+            crit_list.append(BIOT_Crit(X = X, Y = Y, R = R, W = W, w0 = w0, lam = lam_norm))
+            diff = torch.absolute(crit_list[iter_index] - crit_list[iter_index + 1])
+            iter_index += 1
+
+    num_nonzero = (W != 0).sum().item()
+    total_elements = W.numel()
+    ratio = num_nonzero / total_elements
+
+    return R, W, w0, ratio
+
+def standardize_features(train, test):
+    """
+    Standardize features in the training and test sets using the training set statistics.
+
+    Each feature is transformed to have zero mean and unit variance, based on the
+    training data. Features with zero standard deviation are left unchanged (std set to 1.0).
+
+    Parameters
+    ----------
+    train : numpy.ndarray of shape (n_samples_train, n_features)
+        Training feature matrix.
+
+    test : numpy.ndarray of shape (n_samples_test, n_features)
+        Test feature matrix to be standardized using training mean and std.
+
+    Returns
+    -------
+    train_std : numpy.ndarray of shape (n_samples_train, n_features)
+        Standardized training feature matrix.
+
+    test_std : numpy.ndarray of shape (n_samples_test, n_features)
+        Standardized test feature matrix.
+    """
+
+    mean = train.mean(axis=0)
+    std = train.std(axis=0, ddof=1)
+    std[std == 0] = 1.0
+    train_std = (train - mean) / std
+    test_std = (test - mean) / std
+    return train_std, test_std
+
+def center_targets(train, test):
+    """
+    Center target variables in the training and test sets using the training set mean.
+
+    Each target variable (column) is shifted to have zero mean in the training set.
+    The same mean is subtracted from the test set to ensure consistency.
+
+    Parameters
+    ----------
+    train : numpy.ndarray of shape (n_samples_train, n_targets)
+        Training target matrix.
+
+    test : numpy.ndarray of shape (n_samples_test, n_targets)
+        Test target matrix to be centered using training means.
+
+    Returns
+    -------
+    train_centered : numpy.ndarray of shape (n_samples_train, n_targets)
+        Centered training targets.
+
+    test_centered : numpy.ndarray of shape (n_samples_test, n_targets)
+        Centered test targets.
+    """
+
+    mean = train.mean(axis=0)
+    train_centered = train - mean
+    test_centered = test - mean
+    return train_centered, test_centered
+
+def CV_BIOT(Fe, X, feature_names, lam_list, fit_intercept=False,
+            num_folds=10, random_state=1, R=None, rotation=False, scoring='neg_mean_squared_error'):
+    """
+    Cross-validated BIOT.
+
+    Performs K-fold cross-validation to select the best lambda for the BIOT model,
+    then fits a final BIOT model on the full dataset using the selected lambda.
+
+    Parameters
+    ----------
+    Fe : np.ndarray of shape (n_samples, n_features)
+        Predictor variables (input features) for BIOT.
+
+    X : np.ndarray of shape (n_samples, n_targets)
+        Target embeddings to be explained (Y in BIOT).
+
+    feature_names : list
+        Feature names. Not used inside this function; the caller attaches them to the
+        returned weight matrix.
+
+    lam_list : list of float
+        List of lambda values (L1 penalties) to try in cross-validation.
+
+    fit_intercept : bool, default=False
+        Whether to fit an intercept in the Lasso regressions.
+
+    num_folds : int, default=10
+        Number of folds in K-fold cross-validation.
+
+    random_state : int, default=1
+        Seed for reproducibility.
+
+    R : torch.Tensor or None
+        Optional fixed orthogonal transformation matrix. If None, R is optimized.
+
+    rotation : bool, default=False
+        If True, constrain R to be a proper rotation matrix (det=+1).
+
+    scoring : str, default='neg_mean_squared_error'
+        Scoring method for model selection. Currently unused (MSE is hardcoded).
+
+    Returns
+    -------
+    best_idx : int
+        Index in `lam_list` of the selected lambda.
+
+    R : torch.Tensor of shape (n_targets, n_targets)
+        Final orthogonal transformation matrix (on GPU).
+
+    W : torch.Tensor of shape (n_features, n_targets)
+        Final weight matrix (on GPU).
+
+    w0 : torch.Tensor of shape (n_targets,)
+        Intercept vector for the final model (on GPU).
+
+    ratio : float
+        Sparsity ratio (non-zero elements / total) of the final weight matrix.
+
+    df : pandas.DataFrame
+        Cross-validation summary with columns: ["Lambda", "AvgMSE", "AvgSparsity"]
+    """
+
+    start_all = time.time()
+    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
+    results = []      # Store (lambda, avg MSE, avg sparsity)
+    mse_matrix = []   # Store fold-wise MSEs for statistical testing
+
+    for lam in lam_list:
+        fold_mse = []
+        fold_sparsity = []
+
+        for train_idx, test_idx in kf.split(Fe):
+            # Split folds
+            X_train, X_test = X[train_idx], X[test_idx]
+            Fe_train, Fe_test = Fe[train_idx], Fe[test_idx]
+
+            # Standardize Fe and center X (targets)
+            Fe_train, Fe_test = standardize_features(Fe_train, Fe_test)
+            X_train, X_test = center_targets(X_train, X_test)
+
+            Fe_train_tensor = torch.tensor(Fe_train).to("cuda")
+            X_train_tensor = torch.tensor(X_train).to("cuda")
+
+            # Run BIOT on training fold
+            R_, W_, w0_, ratio = BIOT(
+                X=Fe_train_tensor, Y=X_train_tensor, lam=lam,
+                rotation=rotation, R=R,
+                fit_intercept=fit_intercept
+            )
+
+            W_cpu = W_.cpu().numpy()
+            w0_cpu = w0_.cpu().numpy()
+            R_cpu = R_.cpu().numpy()
+
+            # Predict on test fold
+            intercept = np.tile(w0_cpu, (Fe_test.shape[0], 1))
+            X_pred = (intercept + Fe_test @ W_cpu) @ R_cpu.T
+
+            # Compute MSE (scaled by 1/2 for consistency with BIOT loss)
+            mse = mean_squared_error(X_test, X_pred) / 2
+            fold_mse.append(mse)
+            fold_sparsity.append(ratio)
+
+        # Store average results
+        avg_mse = np.mean(fold_mse)
+        avg_ratio = np.mean(fold_sparsity)
+        results.append((lam, avg_mse, avg_ratio))
+        mse_matrix.append(fold_mse)
+
+    end_all = time.time()
+    print(f"CV BIOT completed in {end_all - start_all:.2f}s")
+
+    # Cross-validation summary
+    df = pd.DataFrame(results, columns=["Lambda", "AvgMSE", "AvgSparsity"])
+    search_array = np.array(mse_matrix).T
+    print("Search array (fold x lambda):")
+    print(search_array)
+
+    # Statistical test: select sparsest model not significantly worse than best
+    mean_mse_per_lambda = search_array.mean(axis=0)
+    min_idx = mean_mse_per_lambda.argmin()
+    minimum_mse = search_array[:, min_idx]
+    # Fall back to the lowest-MSE lambda if no sparser model qualifies
+    best_idx = min_idx
+
+    for idx in range(min_idx + 1, len(lam_list)):
+        current_mse = search_array[:, idx]
+        pval = 1
+        # Only run the test when the fold-wise MSEs actually differ
+        if np.any(minimum_mse != current_mse):
+            result = mannwhitneyu(minimum_mse, current_mse, alternative='two-sided', method='exact', use_continuity=True)
+            pval = result[1]
+        print(f"{idx}\t{pval:.4f}")
+        if pval < 0.05:
+            break
+        best_idx = idx
+    print(f"The lambda value that produces the sparsest model not significantly different from the model with the lowest mse is at index: {best_idx}")
+
+    # Fit final model using best lambda on full data
+    start_final = time.time()
+    Fe_norm, _ = standardize_features(Fe, Fe)
+    X_norm, _ = center_targets(X, X)
+    X_norm = torch.tensor(X_norm).to("cuda")
+    Fe_norm = torch.tensor(Fe_norm).to("cuda")
+
+    R, W, w0, ratio = BIOT(
+        X=Fe_norm, Y=X_norm, lam=lam_list[best_idx],
+        max_iter=200, eps=1e-6,
+        rotation=rotation, R=None,
+        fit_intercept=fit_intercept
+    )
+
+    end_final = time.time()
+    print(f"Final BIOT time: {end_final - start_final:.2f}s")
+
+    return best_idx, R, W, w0, ratio, df
+
+########################################################################################################
+parser = argparse.ArgumentParser()
+parser.add_argument('-e','--embeddingFile', help='csv file containing the embeddings to be predicted', default="embeddings.csv")
+parser.add_argument('-p','--predictorFile', help='csv file containing the predictors to use to predict the embeddings', default="features.csv")
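+# Note on expected inputs (inferred from the pd.read_csv calls below, stated here
+# as an assumption rather than a guarantee): the embedding CSV is read with
+# header=None, i.e. a plain numeric matrix, while the predictor CSV is read with
+# header=0, i.e. its first row holds the feature names used to label the weights.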
+parser.add_argument('-d','--dataPath', help='common path for the embedding and predictor files', default="/raid/home/rajivratn/hemant_rajivratn/ets/BIOT/gpu/datasets/")
+parser.add_argument('-o','--outputPath', help='path to location in which to store the output files', default="./output")
+parser.add_argument('-s','--suffix', help='suffix to identify output files from those in other runs', default="best")
+parser.add_argument('-l','--numLambdas', help='number of lambda values to calculate', type=int, default=10)
+parser.add_argument('-m','--min', help='minimum lambda value', type=float, default=.0001)
+parser.add_argument('-x','--max', help='maximum lambda value', type=float, default=3.5)
+parser.add_argument('-f','--numFolds', help='number of folds to use in cross-validation', type=int, default=10)
+parser.add_argument('-a','--random_state', help='random state to use', type=int, default=1)
+parser.add_argument('-c','--scoring', help='scoring algorithm to use in CV', default='neg_mean_squared_error')
+parser.add_argument('-r','--rotation', help='Force rotation not orthogonal transformation', action="store_true")
+parser.add_argument('-t','--fit_intercept', help='Force lasso to fit intercept', action="store_true")
+
+args = parser.parse_args()
+
+embeddingPath = f"{args.dataPath}/{args.embeddingFile}"
+predictorPath = f"{args.dataPath}/{args.predictorFile}"
+X = pd.read_csv(embeddingPath, header=None).values
+# Keep the predictor DataFrame so its header row can supply feature names
+Fe_df = pd.read_csv(predictorPath, header=0)
+feature_names = Fe_df.columns
+Fe = Fe_df.values
+
+lam_list = np.geomspace(args.min, args.max, args.numLambdas)
+print('Lambda values:')
+print(lam_list)
+
+random_state = 1
+R = None
+
+best_idx, R, W, w0, ratio, df = CV_BIOT (Fe, X, feature_names, lam_list, fit_intercept = args.fit_intercept, num_folds=args.numFolds, random_state = args.random_state, R = R, rotation = args.rotation, scoring = args.scoring)
+
+Weights = pd.DataFrame(W.cpu().numpy())
+Weights.index = feature_names
+weightfile = f"{args.outputPath}/weights_{args.suffix}.csv"
+Weights.to_csv(weightfile)
+
+rotationfile = f"{args.outputPath}/rotation_{args.suffix}.csv"
+Rotation = pd.DataFrame(R.cpu().numpy())
+Rotation.to_csv(rotationfile)
+
+interceptfile = f"{args.outputPath}/intercepts_{args.suffix}.csv"
+Intercepts = pd.DataFrame(w0.cpu().numpy())
+Intercepts.to_csv(interceptfile)
+
+dffile = f"{args.outputPath}/avgMSE_{args.suffix}.csv"
+df.to_csv(dffile, index=False)
+print(f"Sparsity ratio: {ratio:.4f}")
+
+# python3 src/BIOT/BIOT_GPU.py -e mite_6d_embedding.csv -p mite_F.csv -d test/inputs/Mite -o test/outputs -s mite_6d_fasta_fixed -l 20 -m 0.0001 -f 5 -x 3.5 -a 42 -c neg_mean_squared_error | tee mite_6d_fasta_fixed.txt
+
\ No newline at end of file
diff --git a/src/BIOT/Modified_code/BIOT_GPU_modified.py b/src/BIOT/Modified_code/BIOT_GPU_modified.py
deleted file mode 100644
index f82e939..0000000
--- a/src/BIOT/Modified_code/BIOT_GPU_modified.py
+++ /dev/null
@@ -1,296 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Functions and classes for the BIOT module
-"""
-import warnings
-warnings.filterwarnings("ignore")
-
-import time
-import pandas as pd
-import numpy as np
-import torch
-from TorchL1 import TorchL1
-from sklearn.metrics import mean_squared_error
-from sklearn.linear_model import Lasso
-from sklearn.preprocessing import StandardScaler
-from sklearn.compose import TransformedTargetRegressor
-import math
-from scipy.stats import mannwhitneyu
-
-from sklearn.base import BaseEstimator,
RegressorMixin -from sklearn.utils.validation import check_array, check_is_fitted - -from sklearn.model_selection import KFold -from sklearn.model_selection import GridSearchCV -from sklearn.pipeline import Pipeline - -from sklearn.preprocessing import StandardScaler -import argparse - -############################################################################### -# Functions and classes -def Get_W_Lasso (X, Y, lam, fit_intercept = False, mode='cuda'): - device = torch.device("cuda") - - k = Y.shape[1] - d = X.shape[1] - w0 = torch.zeros(k).double().to(device) - - if fit_intercept: - print(f"Do not support this as of now.") - exit() - print('.', end='', flush=True) - - model = TorchL1(alpha=lam, max_iter=5000, tol=1e-6) - model.coef_ = torch.zeros(d, k).double().to(device) - model.alpha *= Y.shape[0] - model.fit(X=X, y=Y) - - return model.coef_, w0 - -def calc_max_lam (X, Y): - - n = X.shape[0] - sc = StandardScaler() - X_norm = sc.fit_transform(X) - sc = StandardScaler(with_std=False) - Y_norm = sc.fit_transform(Y) - max_lam = np.max(np.absolute(X_norm.T @ Y_norm))/n - - return max_lam - -def normalize(X, mean, std): - return (X - mean) / std - -def Global_L1_Norm (W): - - k = W.shape[1] - norm_val = 0 - for dim_index in range(k): - norm_val += torch.norm(W[:, dim_index], p=1) - - return norm_val - -def BIOT_Crit (X, Y, R, W, w0, lam): - - n = X.shape[0] - diffs = (Y @ R) - (w0.unsqueeze(0).repeat(n, 1) + (X @ W)) - LS = torch.norm(diffs)**2 - L1 = Global_L1_Norm(W) - crit = ((1 / (2 * n)) * LS) + (lam * L1) - - return crit - -def BIOT (X, Y, lam, max_iter = 200, eps = 1e-6, rotation = False, R = None, fit_intercept = False, mode='cuda'): - print('lambda: ' + str(lam)) - d = X.shape[1] - n = X.shape[0] - lam_norm = lam/np.sqrt(d) - - # If R is provided, get Lasso solution only - if R is not None: - YR = Y @ R - W, w0 = Get_W_Lasso(X = X, Y = YR, lam = lam_norm, fit_intercept=fit_intercept, mode=mode) - - # Otherwise, run BIOT iterations - else: - W, w0 = Get_W_Lasso(X = X, Y = Y, lam = lam_norm, fit_intercept=fit_intercept, mode=mode) - - diff = math.inf - iter_index = 0 - crit_list = [math.inf] - - while iter_index < max_iter and diff > eps: - - De = (1/(2*n)) * Y.T @ (w0.unsqueeze(0).repeat(n, 1) + (X @ W)) - U, S, Vh = torch.linalg.svd(De) - - # rotation matrix desired (counterclockwise) - if rotation: - sv = S - smallest = torch.argmin(sv) - sv = sv.clone() - sv[smallest] = torch.sign(torch.det(torch.mm(U, Vh.T))) - sv[sv != sv[smallest]] = 1 - - # Construct the rotation matrix - R = torch.mm(U, torch.mm(torch.diag(sv), Vh.T)) - else: - R = U @ Vh - - YR = Y @ R - W, w0 = Get_W_Lasso(X = X, Y = YR, lam = lam_norm, fit_intercept = fit_intercept, mode=mode) - - crit_list.append(BIOT_Crit(X = X, Y = Y, R = R, W = W, w0 = w0, lam = lam_norm)) - diff = torch.absolute(crit_list[iter_index] - crit_list[iter_index + 1]) - iter_index += 1 - - num_nonzero = (W != 0).sum().item() - total_elements = W.numel() - ratio = num_nonzero / total_elements - - return R, W, w0, ratio - -def standardize_features(train, test): - mean = train.mean(axis=0) - std = train.std(axis=0, ddof=1) - std[std == 0] = 1.0 - train_std = (train - mean) / std - test_std = (test - mean) / std - return train_std, test_std - -def center_targets(train, test): - mean = train.mean(axis=0) - train_centered = train - mean - test_centered = test - mean - return train_centered, test_centered - -def CV_BIOT(Fe, X, feature_names, lam_list, fit_intercept=False, - num_folds=10, random_state=1, R=None, rotation=False, 
scoring='neg_mean_squared_error', mode='cuda'): - - start_all = time.time() - - kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state) - results = [] - mse_matrix = [] - - for lam in lam_list: - fold_mse = [] - fold_sparsity = [] - - for train_idx, test_idx in kf.split(Fe): - X_train = X[train_idx, :] - X_test = X[test_idx, :] - Fe_train = Fe[train_idx, :] - Fe_test = Fe[test_idx, :] - - Fe_train, Fe_test = standardize_features(Fe_train, Fe_test) - X_train, X_test = center_targets(X_train, X_test) - - Fe_train_tensor = torch.tensor(Fe_train).to(mode) - X_train_tensor = torch.tensor(X_train).to(mode) - - R_, W_, w0_, ratio = BIOT( - X=Fe_train_tensor, Y=X_train_tensor, lam=lam, rotation=rotation, R=R, - fit_intercept=fit_intercept, mode=mode) - - W_cpu = W_.cpu().numpy() - w0_cpu = w0_.cpu().numpy() - R_cpu = R_.cpu().numpy() - - intercept = np.tile(w0_cpu, (Fe_test.shape[0], 1)) - X_pred = (intercept + Fe_test @ W_cpu) @ R_cpu.T - - mse = mean_squared_error(X_test, X_pred) / 2 - fold_mse.append(mse) - fold_sparsity.append(ratio) - - avg_mse = np.mean(fold_mse) - avg_ratio = np.mean(fold_sparsity) - results.append((lam, avg_mse, avg_ratio)) - mse_matrix.append(fold_mse) - - end_all = time.time() - print(f"CV BIOT completed in {end_all - start_all:.2f}s") - - df = pd.DataFrame(results, columns=["Lambda", "AvgMSE", "AvgSparsity"]) - search_array = np.array(mse_matrix).T - print("Search array (fold x lambda):") - print(search_array) - # best_idx = df["AvgMSE"].idxmin() - - # Wilcoxon rank-sum test - mean_mse_per_lambda = search_array.mean(axis=0) - min_idx = mean_mse_per_lambda.argmin() - minimum_mse = search_array[:, min_idx] - pval = 0 - best_idx = 0 - - for idx in range(min_idx + 1, len(lam_list)): - current_mse = search_array[:, idx] - pval = 1 - if np.sum(minimum_mse - np.abs(current_mse)) != 0: - result = mannwhitneyu(minimum_mse, current_mse, alternative='two-sided', method='exact', use_continuity=True) - pval = result[1] - print(f"{idx}\t{pval:.4f}") - if pval < 0.05: - break - best_idx = idx - - print(f"The lambda value that produces the sparsest model not significantly different from the model with the lowest mse is at index: {best_idx}") - - # Final BIOT on full data - start_final = time.time() - Fe_norm, _= standardize_features(Fe, Fe) - X_norm, _ = center_targets(X, X) - - X_norm = torch.tensor(X_norm).to(mode) - Fe_norm = torch.tensor(Fe_norm).to(mode) - - R, W, w0, ratio= BIOT (Fe_norm, X_norm, lam_list[best_idx], max_iter = 1, eps = 1e-6, rotation = False, R = None, fit_intercept = False,mode='gpu') - - end_final = time.time() - print(f"Final BIOT time: {end_final - start_final:.2f}s") - - return best_idx, R, W, w0, ratio, df - -######################################################################################################## -parser = argparse.ArgumentParser() -parser.add_argument('-e','--embeddingFile', help='csv file containing the embeddings to be predicted', default="embeddings.csv") -parser.add_argument('-p','--predictorFile', help='csv file containing the predictors to use to predict the embeddings', default="features.csv") -parser.add_argument('-d','--dataPath', help='common path for the embedding and predictor files', default="/raid/home/rajivratn/hemant_rajivratn/ets/BIOT/gpu/datasets/") -parser.add_argument('-o','--outputPath', help='path to location in which to store the output files', default="./output") -parser.add_argument('-s','--suffix', help='suffix to identify output files from those in other runs', default="best") 
-parser.add_argument('-l','--numLambdas', help='number of lamba values to calculate', type=int, default=10) -parser.add_argument('-m','--min', help='minimum lambda value', type=float, default=.0001) -parser.add_argument('-x','--max', help='maximum lambda value', type=float, default=3.5) -parser.add_argument('-f','--numFolds', help='maximum lambda value', type=int, default=10) -parser.add_argument('-a','--random_state', help='random state to use', type=int, default=1) -parser.add_argument('-c','--scoring', help='scoring algorithm to use in CV', default='neg_mean_squared_error') -parser.add_argument('-r','--rotation', help='Force rotation not orthogonal transformation', action="store_true") -parser.add_argument('-t','--fit_intercept', help='Force lasso to fit intercept', action="store_true") -parser.add_argument('-u','--mode', help='Processing mode (cpu or gpu)', default='cuda') - -args = parser.parse_args() - -embeddingPath = f"{args.dataPath}/{args.embeddingFile}" -predictorPath = f"{args.dataPath}/{args.predictorFile}" -Y = pd.read_csv(embeddingPath,header=None).values -X = pd.read_csv(predictorPath,header=0).values - -print(f"Check on big dataset") -print('Embedding shape:', Y.shape) -print('Predictor shape:', X.shape) - -lam_list = np.geomspace(args.min, args.max, args.numLambdas) -print('Lambda values:') -print(lam_list) - -random_state = 1 -R = None - -if isinstance(X, pd.DataFrame): - feature_names = X.columns -else: - feature_names = np.array(range(X.shape[1])) - -best_idx, R, W, w0, ratio, df = CV_BIOT (X, Y, feature_names, lam_list, fit_intercept = args.fit_intercept, num_folds=args.numFolds, random_state = args.random_state, R = R, rotation = args.rotation, scoring = args.scoring, mode=args.mode) - -Weights = pd.DataFrame(W.cpu().numpy()) -Weights.index = feature_names -weightfile = f"{args.outputPath}/weights_{args.suffix}.csv" -Weights.to_csv(weightfile) -print(f"Sparsity ratio: {ratio:.4f}") - -rotationfile = f"{args.outputPath}/rotation_{args.suffix}.csv" -Rotation = pd.DataFrame(R.cpu().numpy()) -Rotation.to_csv(rotationfile) - -interceptfile = f"{args.outputPath}/intercepts_{args.suffix}.csv" -Intercepts = pd.DataFrame(w0.cpu().numpy()) -Intercepts.to_csv(interceptfile) - -dffile = f"{args.outputPath}/avgMSE_{args.suffix}.csv" -df.to_csv(dffile, index=False) - diff --git a/src/BIOT/Modified_code/TorchL1.py b/src/BIOT/Modified_code/TorchL1.py index 716833f..f3cc433 100644 --- a/src/BIOT/Modified_code/TorchL1.py +++ b/src/BIOT/Modified_code/TorchL1.py @@ -1,6 +1,6 @@ import torch - + #@torch.jit.script class TorchL1: def __init__(self, alpha=1.0, max_iter=1000, tol=1e-6): @@ -32,15 +32,58 @@ def fit(self, X, y): def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, backtrack=True, record_iterates=False, stepsize_shrink=0.5, eps_n=1e-15, tol=1e-10): - n, p = X.shape - _, m = Y.shape - device = X.device + """ + Solves a multi-target Lasso problem using the FASTA algorithm (Forward-Backward Splitting with Adaptive step size). + + Objective: + min_B (1/2) * ||X B - Y||_F^2 + lam * ||B||_1 + + Parameters + ---------- + X : torch.Tensor of shape (n_samples, n_features) + Input feature matrix. + + Y : torch.Tensor of shape (n_samples, n_targets) + Target matrix for regression. + + lam : float + L1 penalty (sparsity control). + + W : torch.Tensor of shape (n_features, n_targets) + Initialization of the coefficient matrix. + + tau1 : float, optional (default=10.0) + Initial step size. + + max_iter : int, optional (default=1000) + Maximum number of iterations. 
+ + w : int, optional (default=10) + Number of previous function values to consider for backtracking. + + backtrack : bool, optional (default=True) + Whether to use backtracking line search. + + record_iterates : bool, optional (default=False) + Whether to store the iterate history (useful for debugging/analysis). + + stepsize_shrink : float, optional (default=0.5) + Step size shrinkage factor during backtracking. + + eps_n : float, optional (default=1e-15) + Small constant to avoid division by zero. + + tol : float, optional (default=1e-10) + Convergence tolerance based on relative parameter change. + """ - # x0 = torch.zeros((p, m), device=device) + # Initialize variables + _, p = X.shape x0 = W x1 = x0.clone() d1 = x1.clone() - + + # Define loss and proximal operators f = lambda B: 0.5 * torch.norm(X @ B - Y, p='fro')**2 gradf = lambda B: X.T @ (X @ B - Y) g = lambda B: lam * torch.norm(B, p=1) @@ -51,14 +94,16 @@ def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, taus = torch.zeros(max_iter) fvals = torch.zeros(max_iter) objective = torch.zeros(max_iter + 1) - + + # Initial objective f1 = f(d1) gradf1 = gradf(d1) fvals[0] = f1 objective[0] = f1 + g(x0) min_obj = objective[0].item() best_beta = x1.clone() - + + # Optional: record all iterates if record_iterates: iterates = torch.zeros((p, max_iter + 1)) iterates[:, 0] = x1.squeeze() @@ -69,22 +114,21 @@ def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, x0 = x1.clone() gradf0 = gradf1.clone() tau0 = tau1 - # print (tau0) - + + # Forward step (gradient descent) x1hat = x0 - tau0 * gradf0 + # Backward step (proximal operator) x1 = proxg(x1hat, tau0) Dx = x1 - x0 d1 = x1.clone() f1 = f(d1) - + + # Optional backtracking line search if backtrack: if i > 0: M = torch.max(fvals[max(i - w, 0):i]) else: M = fvals[0] - # print (M) - - prev_f1 = f1 backtrack_count = 0 prop = (f1 - 1e-12 > M + torch.sum(Dx * gradf0) + 0.5 * torch.norm(Dx)**2 / tau0) @@ -99,6 +143,7 @@ def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, backtrack_count += 1 prop = (f1 - 1e-12 > M + torch.sum(Dx * gradf0) + 0.5 * torch.norm(Dx)**2 / tau0) + # Store diagnostics taus[i] = tau0 residual[i] = torch.norm(Dx) / tau0 normalizer = max( @@ -108,14 +153,16 @@ def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, normalized_resid[i] = residual[i].item() / normalizer fvals[i] = f1 objective[i + 1] = f1 + g(x1) - + + # Track best solution (objective-wise) if objective[i + 1] < min_obj: best_beta = x1.clone() min_obj = objective[i + 1].item() if record_iterates: iterates[:, i + 1] = x1.squeeze() - + + # Update step size using Barzilai–Borwein rule gradf1 = gradf(d1) Dg = gradf1 + (x1hat - x0) / tau0 dotprod = torch.sum(Dx * Dg) @@ -130,7 +177,8 @@ def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, tau1 = tau_m if 2 * tau_m > tau_s else tau_s - 0.5 * tau_m if tau1 <= 0 or not torch.isfinite(torch.as_tensor(tau1)): tau1 = tau0 * 1.5 - + + # Check for convergence if torch.max(torch.abs(Dx)) < tol * (torch.max(torch.abs(x1))): break @@ -138,19 +186,40 @@ def _fasta(self, X, Y, lam, W, tau1=10.0, max_iter=1000, w=10, def _fista(self, X, Y, lam, W, max_iter=1000, tol=1e-10): """ - FISTA algorithm for L1-regularized regression (multi-output). - + FISTA algorithm for solving the L1-regularized least squares problem (multi-target Lasso). 
+ + Solves the following optimization problem: + min_W (1/2n) * ||XW - Y||_F^2 + lam * ||W||_1 + + using the Fast Iterative Shrinkage-Thresholding Algorithm (FISTA), which accelerates + proximal gradient descent with a momentum term. + Parameters ---------- - X : torch.Tensor, shape (n_samples, n_features) - Y : torch.Tensor, shape (n_samples, n_targets) + X : torch.Tensor of shape (n_samples, n_features) + Design matrix. + + Y : torch.Tensor of shape (n_samples, n_targets) + Target matrix for regression. + lam : float - L1 regularization strength. - W : torch.Tensor, shape (n_features, n_targets) - Initial weights. + L1 regularization strength (controls sparsity). + + W : torch.Tensor of shape (n_features, n_targets) + Initial weight matrix (used as warm start). + + max_iter : int, optional (default=1000) + Maximum number of FISTA iterations. + + tol : float, optional (default=1e-10) + Tolerance for convergence. Optimization stops when relative change is small. + + Returns + ------- + None + The solution is stored in `self.coef_`, a tensor of shape (n_features, n_targets). """ - n, a = X.shape - _, b = Y.shape + n, _ = X.shape # Estimate Lipschitz constant L = ||X||^2 L = torch.linalg.norm(X, ord=2).item() ** 2
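+        # Illustrative sketch of the iteration this constant drives (the names Z, G,
+        # t_old, t_new, and W_old are placeholders, not variables defined here): with
+        # step size 1/L, each FISTA iteration takes a gradient step on the smooth term,
+        #     G = Z - (1.0 / L) * (X.T @ (X @ Z - Y))
+        # applies soft-thresholding as the proximal step for the L1 term,
+        #     W = torch.sign(G) * torch.clamp(G.abs() - lam / L, min=0)
+        # and then forms the Nesterov momentum combination
+        #     t_new = (1 + math.sqrt(1 + 4 * t_old**2)) / 2
+        #     Z = W + ((t_old - 1) / t_new) * (W - W_old)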