2 changes: 1 addition & 1 deletion README.md
@@ -46,4 +46,4 @@ example: ./run_pareto_analysis.sh 3cNWY5 wiki10m

Serve the webui on port 8000:

cd web-ui-new; python3 -m http.server
cd web-ui; python3 -m http.server
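
For reference, python3 -m http.server serves on port 8000 by default, which is what the README line above relies on. A minimal Python equivalent that pins the directory and port explicitly (a sketch for illustration, not part of this PR):

# Serve the web-ui directory on port 8000, equivalent to
# "cd web-ui; python3 -m http.server".
import functools
from http.server import HTTPServer, SimpleHTTPRequestHandler

handler = functools.partial(SimpleHTTPRequestHandler, directory="web-ui")
HTTPServer(("0.0.0.0", 8000), handler).serve_forever()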
107 changes: 50 additions & 57 deletions generate-combinations.py
@@ -59,104 +59,92 @@
else:
algo_variants[param] = value

# Generate all combinations of variants. For each combination, generate a hashed ID and a file with the
# name pattern <sweep>-<algo>-<hash>.json. The file should contain the invariants as-is, and the variants as the current combination.
if algo_variants:
# Separate efSearch from other variants if it exists
efSearch_values = None
efSearchScaleFactor_values = None
other_variant_keys = []
other_variant_values = []

for key, value in algo_variants.items():
if key == 'efSearch':
efSearch_values = value
elif key == 'efSearchScaleFactor':
efSearchScaleFactor_values = value
else:
other_variant_keys.append(key)
other_variant_values.append(value)

# Generate combinations with efSearch at the beginning (innermost loop)
if efSearch_values and other_variant_keys:
# Generate combinations of other parameters first

if (efSearch_values or efSearchScaleFactor_values) and other_variant_keys:
for other_combination in itertools.product(*other_variant_values):
other_variants = dict(zip(other_variant_keys, other_combination))
# Then iterate through efSearch values
for ef_index, ef_value in enumerate(efSearch_values):
search_values = efSearch_values if efSearch_values else efSearchScaleFactor_values
search_key = 'efSearch' if efSearch_values else 'efSearchScaleFactor'
for ef_index, ef_value in enumerate(search_values):
current_variants = other_variants.copy()
current_variants['efSearch'] = ef_value

# Skip if cagraIntermediateDegree < cagraGraphDegree
current_variants[search_key] = ef_value

if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth

if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
continue

# Generate hash only from other_variants (excluding efSearch)

base_hash = hashlib.md5(json.dumps(other_variants, sort_keys=True).encode()).hexdigest()[:8]
hash_id = f"{base_hash}-ef{ef_value}"
hash_id = f"{base_hash}-ef{ef_value}" if search_key == 'efSearch' else f"{base_hash}-efs{ef_value}"

config = algo_invariants.copy()
config.update(current_variants)

# For multiple efSearch combinations: subsequent ones skip indexing
if len(efSearch_values) > 1 and ef_index > 0:

if len(search_values) > 1 and ef_index > 0:
config['skipIndexing'] = True

# Set cleanIndexDirectory based on position

if ef_index == 0:
config['cleanIndexDirectory'] = False
elif ef_index == len(efSearch_values) - 1:
elif ef_index == len(search_values) - 1:
config['cleanIndexDirectory'] = True
else:
config['cleanIndexDirectory'] = False

# Use base_hash for index directory paths

if 'hnswIndexDirPath' in config:
config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
if 'cuvsIndexDirPath' in config:
config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
elif efSearch_values:
# Only efSearch values, no other variants
for ef_index, ef_value in enumerate(efSearch_values):
current_variants = {'efSearch': ef_value}
# Generate hash from empty dict since no other variants exist
elif efSearch_values or efSearchScaleFactor_values:
search_values = efSearch_values if efSearch_values else efSearchScaleFactor_values
search_key = 'efSearch' if efSearch_values else 'efSearchScaleFactor'
for ef_index, ef_value in enumerate(search_values):
current_variants = {search_key: ef_value}
base_hash = hashlib.md5(json.dumps({}, sort_keys=True).encode()).hexdigest()[:8]
hash_id = f"{base_hash}-ef{ef_value}"
hash_id = f"{base_hash}-ef{ef_value}" if search_key == 'efSearch' else f"{base_hash}-efs{ef_value}"

config = algo_invariants.copy()
config.update(current_variants)

# For multiple efSearch combinations: subsequent ones skip indexing
if len(efSearch_values) > 1 and ef_index > 0:

if len(search_values) > 1 and ef_index > 0:
config['skipIndexing'] = True

# Set cleanIndexDirectory based on position

if ef_index == 0:
config['cleanIndexDirectory'] = False
elif ef_index == len(efSearch_values) - 1:
elif ef_index == len(search_values) - 1:
config['cleanIndexDirectory'] = True
else:
config['cleanIndexDirectory'] = False

# Use base_hash for index directory paths

if 'hnswIndexDirPath' in config:
config['hnswIndexDirPath'] = f"hnswIndex-{base_hash}"
if 'cuvsIndexDirPath' in config:
config['cuvsIndexDirPath'] = f"cuvsIndex-{base_hash}"

filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
@@ -165,26 +153,21 @@
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")
else:
# No efSearch, use original logic
variant_keys = list(algo_variants.keys())
variant_values = list(algo_variants.values())
for combination in itertools.product(*variant_values):
current_variants = dict(zip(variant_keys, combination))

# Skip if cagraIntermediateDegree < cagraGraphDegree

if 'cagraIntermediateDegree' in current_variants and 'cagraGraphDegree' in current_variants:
if current_variants['cagraIntermediateDegree'] < current_variants['cagraGraphDegree']:
print(f"\t\tSkipping combination: cagraIntermediateDegree ({current_variants['cagraIntermediateDegree']}) < cagraGraphDegree ({current_variants['cagraGraphDegree']})")
continue

# Skip if hnswMaxConn > hnswBeamWidth

if 'hnswMaxConn' in current_variants and 'hnswBeamWidth' in current_variants:
if current_variants['hnswMaxConn'] > current_variants['hnswBeamWidth']:
print(f"\t\tSkipping combination: hnswMaxConn ({current_variants['hnswMaxConn']}) > hnswBeamWidth ({current_variants['hnswBeamWidth']})")
continue

hash_id = hashlib.md5(json.dumps(current_variants, sort_keys=True).encode()).hexdigest()[:8]

config = algo_invariants.copy()
config.update(current_variants)
filename = f"{algo}-{hash_id}.json"
@@ -194,6 +177,16 @@
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")


else:
hash_id = hashlib.md5(json.dumps({}, sort_keys=True).encode()).hexdigest()[:8]
config = algo_invariants.copy()
filename = f"{algo}-{hash_id}.json"
sweep_dir = f"{args.configs_dir}/{sweep}"
filepath = f"{sweep_dir}/{filename}"
os.makedirs(sweep_dir, exist_ok=True)
with open(filepath, 'w') as f:
json.dump(config, f, indent=2)
print(f"\tGenerated config file: {filepath}")


print("----------------------")
23 changes: 15 additions & 8 deletions plot_pareto.py
@@ -126,12 +126,16 @@ def create_plot_search(
# Sorting by mean y-value helps align plots with labels
def mean_y(algo):
points = np.array(all_data[algo], dtype=object)
if len(points) == 0 or points.ndim < 2:
return float('inf')
return -np.log(np.array(points[:, 3], dtype=np.float32)).mean()

# Find range for logit x-scale
min_x, max_x = 1, 0
for algo in sorted(all_data.keys(), key=mean_y):
points = np.array(all_data[algo], dtype=object)
if len(points) == 0 or points.ndim < 2:
continue
xs = points[:, 2]
ys = points[:, 3]
min_x = min([min_x] + [x for x in xs if x > 0])
@@ -226,10 +230,14 @@ def create_plot_build(
# Sorting by mean y-value helps align plots with labels
def mean_y(algo):
points = np.array(search_results[algo], dtype=object)
if len(points) == 0 or points.ndim < 2:
return float('inf')
return -np.log(np.array(points[:, 3], dtype=np.float32)).mean()

for pos, algo in enumerate(sorted(search_results.keys(), key=mean_y)):
points = np.array(search_results[algo], dtype=object)
if len(points) == 0 or points.ndim < 2:
continue
# x is recall, ls is algo_name, idxs is index_name
xs = points[:, 2]
ls = points[:, 0]
@@ -279,33 +287,32 @@ def mean_y(algo):
df = pd.DataFrame(data, index=index)
df.replace(0.0, np.nan, inplace=True)
df = df.dropna(how="all")

if df.empty or df.shape[1] == 0:
print(f"Skipping build plot: no data points in recall buckets >= 80%")
return

plt.figure(figsize=(12, 9))
ax = df.plot.bar(rot=0, color=colors)
fig = ax.get_figure()

# Add speedup annotations
if 'LUCENE_HNSW' in df.columns and 'CAGRA_HNSW' in df.columns:
y_max = ax.get_ylim()[1]

for i, bucket in enumerate(df.index):
lucene_time = df.loc[bucket, 'LUCENE_HNSW']
cagra_time = df.loc[bucket, 'CAGRA_HNSW']

if pd.notna(lucene_time) and pd.notna(cagra_time) and lucene_time > 0 and cagra_time > 0:
speedup = lucene_time / cagra_time
# Position annotations just above the bars, below subtitle
ax.text(i, y_max * 0.98, f'{speedup:.1f}x',
ha='center', va='bottom', fontsize=9, fontweight='bold',
bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.9, edgecolor='gray'))

print(f"writing build output to {fn_out}")
plt.title(
"Average Build Time within Recall Range "
f"for k={k} n_queries={n_queries}"
)
plt.title(f"Average Build Time within Recall Range for k={k} n_queries={n_queries}")
plt.suptitle(f"{dataset}")
plt.ylabel("Build Time (s)")
fig.savefig(fn_out)
plt.close()


def load_lines(results_path, result_files, method, index_key, mode, time_unit):
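
The new ndim guards in mean_y and the plotting loops handle a concrete numpy edge case: converting an empty result list yields a 1-D array, so column indexing such as points[:, 3] raises IndexError. A standalone illustration (not from the PR):

import numpy as np

empty = np.array([], dtype=object)  # an algorithm with no result rows
print(empty.ndim)  # 1 -> empty[:, 3] would raise IndexError

rows = np.array([["algo", "index", 0.95, 12.3]], dtype=object)
print(rows.ndim)  # 2 -> column slicing is safe
print(rows[:, 3])  # [12.3]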
20 changes: 6 additions & 14 deletions pom.xml
@@ -16,14 +16,6 @@
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>

<repositories>
<repository>
<id>searchscale-maven</id>
<name>SearchScale Maven</name>
<url>https://maven.searchscale.com/snapshots</url>
</repository>
</repositories>

<dependencyManagement>
<dependencies>
<dependency>
@@ -37,15 +29,15 @@
</dependencyManagement>

<dependencies>
<dependency>
<groupId>com.nvidia.cuvs.lucene</groupId>
<artifactId>cuvs-lucene</artifactId>
<version>25.10.0-33318-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.nvidia.cuvs.lucene</groupId>
<artifactId>cuvs-lucene</artifactId>
<version>25.10.0</version>
</dependency>
<dependency>
<groupId>com.nvidia.cuvs</groupId>
<artifactId>cuvs-java</artifactId>
<version>25.10.0-55985-SNAPSHOT</version>
<version>25.10.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>