Skip to content
Open
3 changes: 0 additions & 3 deletions data.json

This file was deleted.

74 changes: 30 additions & 44 deletions models.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,119 +29,119 @@
"args": ""
},
{
"name": "pyt_vllm_llama-3.1-8b",
"name": "pyt_vllm_llama-2-7b",
"url": "",
"data": "meta-llama/Llama-2-7b-chat-hf",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
"multiple_results": "perf_Llama-3.1-8B-Instruct.csv",
"multiple_results": "perf_Llama-2-7b-chat-hf.csv",
"tags": [
"pyt",
"vllm"
],
"timeout": -1,
"args":
"--model_repo meta-llama/Llama-3.1-8B-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
"--model_repo meta-llama/Llama-2-7b-chat-hf --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
},
{
"name": "pyt_vllm_llama-3.1-70b",
"name": "pyt_vllm_llama-2-70b",
"url": "",
"data": "meta-llama/Llama-2-70b-chat-hf",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
"multiple_results": "perf_Llama-3.1-70B-Instruct.csv",
"multiple_results": "perf_Llama-2-70b-chat-hf.csv",
"tags": [
"pyt",
"vllm"
],
"timeout": -1,
"args":
"--model_repo meta-llama/Llama-3.1-70B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
"--model_repo meta-llama/Llama-2-70b-chat-hf --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
},
{
"name": "pyt_vllm_llama-3.1-405b",
"name": "pyt_vllm_llama-3.1-8b",
"url": "",
"data": "meta-llama/Llama-3.1-8B-Instruct",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
"multiple_results": "perf_Llama-3.1-405B-Instruct.csv",
"multiple_results": "perf_Llama-3.1-8B-Instruct.csv",
"tags": [
"pyt",
"vllm"
],
"timeout": -1,
"args":
"--model_repo meta-llama/Llama-3.1-405B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
"--model_repo meta-llama/Llama-3.1-8B-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
},
{
"name": "pyt_vllm_llama-3.2-11b-vision-instruct",
"name": "pyt_vllm_llama-3.1-70b",
"url": "",
"data": "meta-llama/Llama-3.1-70B-Instruct",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
"multiple_results": "perf_Llama-3.2-11B-Vision-Instruct.csv",
"multiple_results": "perf_Llama-3.1-70B-Instruct.csv",
"tags": [
"pyt",
"vllm"
],
"timeout": -1,
"args":
"--model_repo meta-llama/Llama-3.2-11B-Vision-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
"--model_repo meta-llama/Llama-3.1-70B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
},
{
"name": "pyt_vllm_llama-2-7b",
"name": "pyt_vllm_llama-3.1-405b",
"url": "",
"data": "meta-llama/Llama-3.1-405B-Instruct",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
"multiple_results": "perf_Llama-2-7b-chat-hf.csv",
"multiple_results": "perf_Llama-3.1-405B-Instruct.csv",
"tags": [
"pyt",
"vllm"
],
"timeout": -1,
"args":
"--model_repo meta-llama/Llama-2-7b-chat-hf --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
"--model_repo meta-llama/Llama-3.1-405B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
},
{
"name": "pyt_vllm_llama-2-70b",
"name": "pyt_vllm_llama-3.2-11b-vision-instruct",
"url": "",
"data": "meta-llama/Llama-3.2-11B-Vision-Instruct",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
"multiple_results": "perf_Llama-2-70b-chat-hf.csv",
"multiple_results": "perf_Llama-3.2-11B-Vision-Instruct.csv",
"tags": [
"pyt",
"vllm"
],
"timeout": -1,
"args":
"--model_repo meta-llama/Llama-2-70b-chat-hf --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
"--model_repo meta-llama/Llama-3.2-11B-Vision-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
},
{
"name": "pyt_vllm_mixtral-8x7b",
"url": "",
"data": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -157,9 +157,9 @@
{
"name": "pyt_vllm_mixtral-8x22b",
"url": "",
"data": "mistralai/Mixtral-8x22B-Instruct-v0.1",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -175,9 +175,9 @@
{
"name": "pyt_vllm_mistral-7b",
"url": "",
"data": "mistralai/Mistral-7B-Instruct-v0.1",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -195,7 +195,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -213,7 +212,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -231,7 +229,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -249,7 +246,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -267,7 +263,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -285,7 +280,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -301,9 +295,9 @@
{
"name": "pyt_vllm_llama-3.1-8b_fp8",
"url": "",
"data": "amd/Llama-3.1-8B-Instruct-FP8-KV",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -319,9 +313,9 @@
{
"name": "pyt_vllm_llama-3.1-70b_fp8",
"url": "",
"data": "amd/Llama-3.1-70B-Instruct-FP8-KV",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -337,9 +331,9 @@
{
"name": "pyt_vllm_llama-3.1-405b_fp8",
"url": "",
"data": "amd/Llama-3.1-405B-Instruct-FP8-KV",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -357,7 +351,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -375,7 +368,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -393,7 +385,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -411,7 +402,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -429,7 +419,6 @@
"url": "",
"dockerfile": "docker/pyt_vllm",
"scripts": "scripts/vllm/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -447,7 +436,6 @@
"url": "",
"dockerfile": "docker/pytorch_train",
"scripts": "scripts/pytorch_train/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -464,7 +452,6 @@
"url": "",
"dockerfile": "docker/pytorch_train",
"scripts": "scripts/pytorch_train/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand All @@ -481,7 +468,6 @@
"url": "",
"dockerfile": "docker/pytorch_train",
"scripts": "scripts/pytorch_train/run.sh",
"data": "huggingface",
"n_gpus": "-1",
"owner": "mad.support@amd.com",
"training_precision": "",
Expand Down
6 changes: 6 additions & 0 deletions scripts/vllm/vllm_benchmark_report.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ model_org_name=(${model//// })
# Second component of the split repo id (e.g. "Llama-2-7b-chat-hf" from
# "meta-llama/Llama-2-7b-chat-hf"); the split into model_org_name happens
# just above this chunk — TODO confirm against full script.
model_name=${model_org_name[1]}
# Tensor-parallel degree; mirrors the requested GPU count ($numgpu).
tp=$numgpu

# Prefer a locally cached copy of the model data: when the MAD_DATAHOME
# environment variable is set to a non-empty value, point "model" at it
# instead of the remote Hugging Face repo id.
if [[ -n "$MAD_DATAHOME" ]]; then
  echo "Using data from $MAD_DATAHOME"
  model=$MAD_DATAHOME
fi

# perf configuration
# Disable the Triton flash-attention backend in vLLM — presumably to force
# the alternative (CK/ROCm) attention path on this hardware; TODO confirm.
export VLLM_USE_TRITON_FLASH_ATTN=0
# RCCL/NCCL channel-count floor; 112 is a hardware-specific tuning value
# for multi-GPU collectives — NOTE(review): verify against target platform.
export NCCL_MIN_NCHANNELS=112
Expand Down