diff --git a/data.json b/data.json
deleted file mode 100644
index 6abaa86..0000000
--- a/data.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "huggingface": {}
-}
diff --git a/models.json b/models.json
index 74cb8c6..7b73ddd 100644
--- a/models.json
+++ b/models.json
@@ -29,119 +29,119 @@
         "args": ""
     },
     {
-        "name": "pyt_vllm_llama-3.1-8b",
+        "name": "pyt_vllm_llama-2-7b",
         "url": "",
+        "data": "meta-llama/Llama-2-7b-chat-hf",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.1-8B-Instruct.csv",
+        "multiple_results": "perf_Llama-2-7b-chat-hf.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.1-8B-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-2-7b-chat-hf --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-3.1-70b",
+        "name": "pyt_vllm_llama-2-70b",
         "url": "",
+        "data": "meta-llama/Llama-2-70b-chat-hf",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.1-70B-Instruct.csv",
+        "multiple_results": "perf_Llama-2-70b-chat-hf.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.1-70B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-2-70b-chat-hf --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-3.1-405b",
+        "name": "pyt_vllm_llama-3.1-8b",
         "url": "",
+        "data": "meta-llama/Llama-3.1-8B-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.1-405B-Instruct.csv",
+        "multiple_results": "perf_Llama-3.1-8B-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.1-405B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.1-8B-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-3.2-11b-vision-instruct",
+        "name": "pyt_vllm_llama-3.1-70b",
         "url": "",
+        "data": "meta-llama/Llama-3.1-70B-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-3.2-11B-Vision-Instruct.csv",
+        "multiple_results": "perf_Llama-3.1-70B-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-3.2-11B-Vision-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.1-70B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-2-7b",
+        "name": "pyt_vllm_llama-3.1-405b",
         "url": "",
+        "data": "meta-llama/Llama-3.1-405B-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-2-7b-chat-hf.csv",
+        "multiple_results": "perf_Llama-3.1-405B-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-2-7b-chat-hf --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.1-405B-Instruct --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
     },
     {
-        "name": "pyt_vllm_llama-2-70b",
+        "name": "pyt_vllm_llama-3.2-11b-vision-instruct",
         "url": "",
+        "data": "meta-llama/Llama-3.2-11B-Vision-Instruct",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
-        "multiple_results": "perf_Llama-2-70b-chat-hf.csv",
+        "multiple_results": "perf_Llama-3.2-11B-Vision-Instruct.csv",
         "tags": [
             "pyt",
             "vllm"
         ],
         "timeout": -1,
         "args":
-            "--model_repo meta-llama/Llama-2-70b-chat-hf --test_option latency,throughput --num_gpu 8 --datatype float16 --tunableop off"
+            "--model_repo meta-llama/Llama-3.2-11B-Vision-Instruct --test_option latency,throughput --num_gpu 1 --datatype float16 --tunableop off"
     },
     {
         "name": "pyt_vllm_mixtral-8x7b",
         "url": "",
+        "data": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -157,9 +157,9 @@
     {
         "name": "pyt_vllm_mixtral-8x22b",
         "url": "",
+        "data": "mistralai/Mixtral-8x22B-Instruct-v0.1",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -175,9 +175,9 @@
     {
         "name": "pyt_vllm_mistral-7b",
         "url": "",
+        "data": "mistralai/Mistral-7B-Instruct-v0.1",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -195,7 +195,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -213,7 +212,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -231,7 +229,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -249,7 +246,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -267,7 +263,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -285,7 +280,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -301,9 +295,9 @@
     {
         "name": "pyt_vllm_llama-3.1-8b_fp8",
         "url": "",
+        "data": "amd/Llama-3.1-8B-Instruct-FP8-KV",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -319,9 +313,9 @@
     {
         "name": "pyt_vllm_llama-3.1-70b_fp8",
         "url": "",
+        "data": "amd/Llama-3.1-70B-Instruct-FP8-KV",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -337,9 +331,9 @@
     {
         "name": "pyt_vllm_llama-3.1-405b_fp8",
         "url": "",
+        "data": "amd/Llama-3.1-405B-Instruct-FP8-KV",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -357,7 +351,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -375,7 +368,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -393,7 +385,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -411,7 +402,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -429,7 +419,6 @@
         "url": "",
         "dockerfile": "docker/pyt_vllm",
         "scripts": "scripts/vllm/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -447,7 +436,6 @@
         "url": "",
         "dockerfile": "docker/pytorch_train",
         "scripts": "scripts/pytorch_train/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -464,7 +452,6 @@
         "url": "",
         "dockerfile": "docker/pytorch_train",
         "scripts": "scripts/pytorch_train/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
@@ -481,7 +468,6 @@
         "url": "",
         "dockerfile": "docker/pytorch_train",
         "scripts": "scripts/pytorch_train/run.sh",
-        "data": "huggingface",
         "n_gpus": "-1",
         "owner": "mad.support@amd.com",
         "training_precision": "",
diff --git a/scripts/vllm/vllm_benchmark_report.sh b/scripts/vllm/vllm_benchmark_report.sh
index c3b8b21..bd5f5d2 100755
--- a/scripts/vllm/vllm_benchmark_report.sh
+++ b/scripts/vllm/vllm_benchmark_report.sh
@@ -50,6 +50,12 @@
 model_org_name=(${model//// })
 model_name=${model_org_name[1]}
 tp=$numgpu
+# Use local data if present
+if [ -n "$MAD_DATAHOME" ]; then
+    echo "Using data from $MAD_DATAHOME"
+    model=$MAD_DATAHOME
+fi
+
 # perf configuration
 export VLLM_USE_TRITON_FLASH_ATTN=0
 export NCCL_MIN_NCHANNELS=112