From dd188b0c7b808796287cd7ca456afcd872ec630f Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Thu, 25 Jul 2024 10:40:54 -0700
Subject: [PATCH 01/35] Unpin transformers version

Signed-off-by: Logan Adams
---
 .github/workflows/nv-v100-legacy.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml
index f0598279..426996b3 100644
--- a/.github/workflows/nv-v100-legacy.yml
+++ b/.github/workflows/nv-v100-legacy.yml
@@ -36,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install git+https://github.com/microsoft/DeepSpeed.git
-          pip install git+https://github.com/huggingface/transformers.git@v4.42.4
+          pip install git+https://github.com/huggingface/transformers.git
           pip install -U accelerate
           ds_report

From 11839bc29c53fe83e759a4bb0a32a7421864ee55 Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Thu, 25 Jul 2024 13:29:13 -0700
Subject: [PATCH 02/35] Update model support (#429)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Ammar Ahmad Awan
Co-authored-by: Logan Adams
Signed-off-by: Logan Adams
---
 README.md | 18 ++++-----
 requirements/requirements-dev.txt | 3 ++
 tests/test_model_support.py | 63 ++++++++++++-------------------
 3 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index f666762a..476c7cc1 100644
--- a/README.md
+++ b/README.md
@@ -85,18 +85,18 @@ Under-the-hood MII is powered by [DeepSpeed-Inference](https://github.com/micros

 # Supported Models

-MII currently supports over 20,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures:
+MII currently supports over 37,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures:

 model family | size range | ~model count
 ------ | ------ | ------
-[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 300
-[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 19,000
-[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 900
-[mistral](https://huggingface.co/models?other=mistral) | 7B | 6,000
-[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 1,100
-[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 1,300
-[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 200
-[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 200
+[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 500
+[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 52,000
+[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 1,200
+[mistral](https://huggingface.co/models?other=mistral) | 7B | 23,000
+[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 2,900
+[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 2,100
+[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 1,500
+[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 500

 ## MII Legacy Model Support

diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index 1d69f875..4b7bb770 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -1,5 +1,8 @@
 clang-format==16.0.2
+einops
 pre-commit>=2.20.0
 pytest
 pytest-forked
 sentencepiece
+tiktoken
+transformers-stream-generator

diff --git a/tests/test_model_support.py b/tests/test_model_support.py
index be49044a..fb554206 100644
--- a/tests/test_model_support.py
+++ b/tests/test_model_support.py
@@ -11,25 +11,16 @@
     CheckpointEngineBase,
     HuggingFaceCheckpointEngine,
 )
-from transformers import AutoConfig, AutoModel, GenerationConfig
+from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig
 from typing import Iterable, Tuple


-class RandomWeightsCheckpointEngine(CheckpointEngineBase):
-
-    # When using AutoModel.from_config() to load the model, the layer names are
-    # often missing a prefix. We default to adding "model." as the prefix, but
-    # others can be specified here.
-    layer_prefix_map = {"falcon": "transformer."}
-
-    # When using AutoModel.from_config() to load the model, the lm_head layer is
-    # not generated. We default to populating this with the
-    # "embed_tokens.weight" layer, but others can be specified here.
-    lm_head_layer_map = {"falcon": "word_embeddings.weight"}
-
+class ZeroWeightsCheckpointEngine(CheckpointEngineBase):
+    """ Generates weight with all zeros for a given model for testing purposes. """
     def __init__(self, model_name_or_path: str, auth_token: str = None) -> None:
         self.model_name_or_path = model_name_or_path
-        self.model_config = AutoConfig.from_pretrained(self.model_name_or_path)
+        self.model_config = AutoConfig.from_pretrained(self.model_name_or_path,
+                                                       trust_remote_code=True)
         if hasattr(self.model_config, "max_position_embeddings"):
             self.model_config.max_seq_length = self.model_config.max_position_embeddings
         else:
@@ -40,37 +31,21 @@ def __init__(self, model_name_or_path: str, auth_token: str = None) -> None:
         except OSError:
             self.model_config.max_seq_length = 2048

-    def _get_layer_prefix(self) -> str:
-        for model_type, prefix in self.layer_prefix_map.items():
-            if model_type in self.model_name_or_path.lower():
-                return prefix
-        return "model."
-
-    def _get_lm_head_layer(self) -> str:
-        for model_type, layer in self.lm_head_layer_map.items():
-            if model_type in self.model_name_or_path.lower():
-                return layer
-        return "embed_tokens.weight"
-
     def parameters(self) -> Iterable[Tuple[str, torch.Tensor]]:
-        layer_prefix = self._get_layer_prefix()
-        lm_head_layer = self._get_lm_head_layer()
-
         # Load with meta device is faster
         with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
-            model = AutoModel.from_config(self.model_config)
+            model = AutoModelForCausalLM.from_config(self.model_config,
+                                                     trust_remote_code=True)

         for param_name, param in model.state_dict().items():
-            yield layer_prefix + param_name, torch.zeros(param.shape)
-            if param_name == lm_head_layer:
-                yield "lm_head.weight", torch.zeros(param.shape)
+            yield param_name, torch.zeros(param.shape)


 @pytest.fixture(scope="module", autouse=True)
 def inject_checkpoint_engine():
     # Inject the random weihts checkpoint engine
     deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = (
-        RandomWeightsCheckpointEngine)
+        ZeroWeightsCheckpointEngine)
     yield None
     # Restore the original checkpoint engine
     deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = (
@@ -81,16 +56,26 @@ def inject_checkpoint_engine():
     "model_name",
     [
         "tiiuae/falcon-7b",
+        "huggyllama/llama-7b",
         "NousResearch/Llama-2-7b-hf",
         "NousResearch/Hermes-2-Pro-Mistral-7B",
         "cloudyu/Mixtral_11Bx2_MoE_19B",
         "facebook/opt-125m",
+        "microsoft/phi-2",
+        "Qwen/Qwen-7B-Chat",
+        "Qwen/Qwen1.5-0.5B",
+    ],
+    ids=[
+        "falcon",
+        "llama",
+        "llama-2",
+        "mistral",
+        "mixtral",
+        "opt",
+        "phi-2",
+        "qwen",
+        "qwen-2"
     ],
-    ids=["falcon",
-         "llama",
-         "mistral",
-         "mixtral",
-         "opt"],
 )
 def test_model(pipeline, query):
     outputs = pipeline(query, max_new_tokens=16)

From 416d4cc85db727efeb47ec243e53a75e5e966cd9 Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Thu, 25 Jul 2024 13:34:47 -0700
Subject: [PATCH 03/35] Update version.txt (#511)

Signed-off-by: Logan Adams
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index abd41058..3a4036fb 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.4
+0.2.5

From 036b9d99b04019b3f86b30c5451c62b180f17485 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Thu, 25 Jul 2024 16:14:16 -0700
Subject: [PATCH 04/35] Test with transformers v4.43.0

Signed-off-by: Logan Adams
---
 .github/workflows/nv-v100-legacy.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml
index 426996b3..0eef4799 100644
--- a/.github/workflows/nv-v100-legacy.yml
+++ b/.github/workflows/nv-v100-legacy.yml
@@ -36,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install git+https://github.com/microsoft/DeepSpeed.git
-          pip install git+https://github.com/huggingface/transformers.git
+          pip install git+https://github.com/huggingface/transformers.git@v4.43.0
           pip install -U accelerate
           ds_report

From ba9c26f2b2b306b7af8b296d45f0a92bbd98fd23 Mon Sep 17 00:00:00 2001
From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Date: Mon, 5 Aug 2024 12:58:51 -0700
Subject: [PATCH 05/35] Fix scheduling for non-persistent pipeline (#515)

Signed-off-by: Logan Adams
---
 mii/batching/ragged_batching.py | 28 ++++++++++++++++------------
 mii/legacy/logging.py | 2 +-
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/mii/batching/ragged_batching.py b/mii/batching/ragged_batching.py
index 4e9583b9..5b37e5b8 100644
--- a/mii/batching/ragged_batching.py
+++ b/mii/batching/ragged_batching.py
@@ -101,13 +101,13 @@ def is_rank_0(self) -> bool:
         return self.local_rank == 0

     @profiler
-    def generate(self) -> None:
+    def generate(self) -> Union[None, bool]:
         """
         This is the main loop of FastGen: puts requests and gets generated results.
         """
         # 1. Get a batch of requests, broadcast to all ranks
-        scheduled_requests = self._bcast_requests()
+        scheduled_requests, force = self._bcast_requests()

         # 2. Flush for uids that are finished generating
         self.flush(scheduled_requests.requests_to_flush.uids)
@@ -121,7 +121,7 @@ def generate(self) -> None:

         # short circuit if not rank 0, only rank 0 does scheduling and postprocessing of logits
         if not self.is_rank_0:
-            return
+            return force

         # 4. Launch logit processing and token generation
         running_requests = scheduled_requests.requests_to_run
@@ -173,20 +173,22 @@ def _bcast_requests(self, force=False) -> RequestBatch:
         # the prompt tokens must be broadcast to all TP processes.
         if self.is_rank_0:
             if not self.scheduled_requests and not force:
-                return self.scheduled_requests
+                return self.scheduled_requests, force
             # Rank 0 gets batch of requests and broadcasts to other ranks
             data_dicts = self.scheduled_requests.to_msg_dicts()
-            json_data = ujson.dumps(data_dicts)
+            json_data = ujson.dumps({"data": data_dicts, "force": force})
             self.socket.send_string(json_data)
         else:
             try:
                 json_data = self.socket.recv_string()
-                data_dicts = ujson.loads(json_data)
+                recv_dict = ujson.loads(json_data)
+                data_dicts = recv_dict["data"]
+                force = recv_dict["force"]
                 self.scheduled_requests = RequestBatch.from_msg_dicts(data_dicts)
             except zmq.Again:
                 self.scheduled_requests = RequestBatch()

-        return self.scheduled_requests
+        return self.scheduled_requests, force

     def _reset_scheduler_bookkeeping(self) -> None:
         self.scheduled_requests = RequestBatch()
@@ -560,6 +562,7 @@ def __init__(self, all_rank_output: bool = False, *args, **kwargs) -> None:
         self.tid = threading.get_ident()
         self._all_rank_output = all_rank_output
         self._destroyed = False
+        get_accelerator().set_device(int(os.getenv("LOCAL_RANK", "0")))

     def __call__(self,
                  prompts: Union[str,
@@ -589,25 +592,26 @@ def __call__(self,
             request_kwargs = generate_kwargs.copy()
             self._put_request(uid, input, request_kwargs)

-        self.schedule_requests()
-
         if self.is_rank_0:
             # Rank 0 runs generate() until all responses are returned
             while uids_running:
-                self.generate()
                 while not self.result_queues[self.tid].empty():
                     uid, response = self._get_response()
                     outputs.append(response)
                     self._queue_flush_request(uid)
                     uids_complete_order.append(uid)
                     uids_running.remove(uid)
+                self.generate()

             # Ensure final flush requests broadcast and
             # kick ranks 1 -> n out of the while loop
             self._bcast_requests(force=True)
+            self.flush(self.scheduled_requests.requests_to_flush.uids)
+            self.scheduled_requests = RequestBatch()
         else:
             # Ranks 1 -> n just run generate() until there are no more requests
-            while self.scheduled_requests:
-                self.generate()
+            exit = False
+            while not exit:
+                exit = self.generate()

         outputs = [
             r for idx,

diff --git a/mii/legacy/logging.py b/mii/legacy/logging.py
index 1fcf2ac9..9fe62a20 100644
--- a/mii/legacy/logging.py
+++ b/mii/legacy/logging.py
@@ -42,4 +42,4 @@ def create_logger(name=None, level=logging.INFO):
         return logger_


-logger = LoggerFactory.create_logger(name="MII", level=logging.INFO)
+logger = LoggerFactory.create_logger(name="MII_legacy", level=logging.INFO)

From 24c42b5278a9f6cf705b8bdff0a48f504bfb67ac Mon Sep 17 00:00:00 2001
From: Richard Palethorpe
Date: Wed, 21 Aug 2024 17:35:10 +0100
Subject: [PATCH 06/35] Add Kubernetes health check route to REST server (#445)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Signed-off-by: Logan Adams
---
 mii/grpc_related/restful_gateway.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mii/grpc_related/restful_gateway.py b/mii/grpc_related/restful_gateway.py
index a5f1692b..5b93fea6 100644
--- a/mii/grpc_related/restful_gateway.py
+++ b/mii/grpc_related/restful_gateway.py
@@ -40,6 +40,10 @@ def terminate():
         threading.Thread(target=shutdown, args=(server_thread, )).start()
         return "Shutting down RESTful API gateway server"

+    @app.route("/healthz", methods=["GET"])
+    def healthz():
+        return "ok"
+
     api = Api(app)
     path = "/{}/{}".format(RESTFUL_API_PATH, deployment_name)
     api.add_resource(RestfulGatewayService, path)

From ab5b2ba234926352c72122b95d59bac13ee503b0 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Thu, 22 Aug 2024 15:49:24 -0700
Subject: [PATCH 07/35] Update in advance of pydantic PR

Signed-off-by: Logan Adams
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 3a4036fb..0d91a54c 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.5
+0.3.0

From b285e81de8dd866d183582e3a00a55ca684602fc Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Thu, 22 Aug 2024 15:52:00 -0700
Subject: [PATCH 08/35] Pydantic v2 migration (#423)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Abhishek Kulkarni
Co-authored-by: Logan Adams
Signed-off-by: Logan Adams
---
 docs/requirements.txt | 4 +-
 mii/api.py | 10 +-
 mii/backend/client.py | 2 +-
 mii/backend/server.py | 19 ++-
 mii/config.py | 125 ++++++++--------
 mii/legacy/client.py | 2 +-
 mii/legacy/config.py | 179 ++++++++++++------------
 mii/legacy/deployment.py | 18 +--
 mii/legacy/pydantic_v1.py | 16 ---
 mii/legacy/server.py | 15 +-
 mii/legacy/utils.py | 2 +-
 mii/pydantic_v1.py | 16 ---
 mii/score/generate.py | 2 +-
 requirements/requirements.txt | 4 +-
 tests/legacy/test_config.py | 10 +-
 tests/legacy/test_deployment_options.py | 4 +-
 tests/test_arg_parsing.py | 24 ++--
 17 files changed, 207 insertions(+), 245 deletions(-)
 delete mode 100644 mii/legacy/pydantic_v1.py
 delete mode 100644 mii/pydantic_v1.py

diff --git a/docs/requirements.txt b/docs/requirements.txt
index e2a2fd67..1afb6a65 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,6 @@
 asyncio
-autodoc_pydantic<2.0.0
-deepspeed>=0.13.0
+autodoc_pydantic>=2.0.0
+deepspeed>=0.15.0
 grpcio
 grpcio-tools
 sphinx==7.1.2

diff --git a/mii/api.py b/mii/api.py
index 77ed6e19..841f7624 100644
--- a/mii/api.py
+++ b/mii/api.py
@@ -39,7 +39,7 @@ def _parse_kwargs_to_model_config(
     # Fill model_config dict with relevant kwargs, store remaining kwargs in a new dict
     remaining_kwargs = {}
     for key, val in kwargs.items():
-        if key in ModelConfig.__dict__["__fields__"]:
+        if key in ModelConfig.model_fields.keys():
             if key in model_config:
                 assert (
                     model_config.get(key) == val
@@ -77,7 +77,7 @@ def _parse_kwargs_to_mii_config(
     # Fill mii_config dict with relevant kwargs, raise error on unknown kwargs
     for key, val in remaining_kwargs.items():
-        if key in MIIConfig.__dict__["__fields__"]:
+        if key in MIIConfig.model_fields.keys():
             if key in mii_config:
                 assert (
                     mii_config.get(key) == val
@@ -183,9 +183,9 @@ def serve(
         mii.aml_related.utils.generate_aml_scripts(
             acr_name=acr_name,
             deployment_name=mii_config.deployment_name,
-            model_name=mii_config.model_config.model,
-            task_name=mii_config.model_config.task,
-            replica_num=mii_config.model_config.replica_num,
+            model_name=mii_config.model_conf.model,
+            task_name=mii_config.model_conf.task,
+            replica_num=mii_config.model_conf.replica_num,
             instance_type=mii_config.instance_type,
             version=mii_config.version,
         )

diff --git a/mii/backend/client.py b/mii/backend/client.py
index cb4acc17..d946fce6 100644
--- a/mii/backend/client.py
+++ b/mii/backend/client.py
@@ -37,7 +37,7 @@ class MIIClient:
     """
     def __init__(self, mii_config: MIIConfig, host: str = "localhost") -> None:
         self.mii_config = mii_config
-        self.task = mii_config.model_config.task
+        self.task = mii_config.model_conf.task
         self.port = mii_config.port_number
         self.asyncio_loop = asyncio.get_event_loop()
         channel = create_channel(host, self.port)

diff --git a/mii/backend/server.py b/mii/backend/server.py
index 02e055d5..ac51a018 100644
--- a/mii/backend/server.py
+++ b/mii/backend/server.py
@@ -20,7 +20,7 @@ def config_to_b64_str(config: DeepSpeedConfigModel) -> str:
     # convert json str -> bytes
-    json_bytes = config.json().encode()
+    json_bytes = config.model_dump_json().encode()
     # base64 encoded bytes
     b64_config_bytes = base64.urlsafe_b64encode(json_bytes)
     # bytes -> str
@@ -31,7 +31,7 @@ class MIIServer:
     """Initialize the model, setup the server for the model"""
     def __init__(self, mii_config: MIIConfig) -> None:
-        self.task = mii_config.model_config.task
+        self.task = mii_config.model_conf.task
         self.port_number = mii_config.port_number

         if not os.path.isfile(mii_config.hostfile):
@@ -47,8 +47,7 @@ def __init__(self, mii_config: MIIConfig) -> None:
         # balancer process, each DeepSpeed model replica, and optionally the
         # REST API process)
         processes = self._initialize_service(mii_config)
-        self._wait_until_server_is_live(processes,
-                                        mii_config.model_config.replica_configs)
+        self._wait_until_server_is_live(processes, mii_config.model_conf.replica_configs)

     def _wait_until_server_is_live(self,
                                    processes: List[subprocess.Popen],
@@ -143,15 +142,15 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
         ]

         host_gpus = defaultdict(list)
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             host_gpus[repl_config.hostname].extend(repl_config.gpu_indices)

         use_multiple_hosts = len(
             set(repl_config.hostname
-                for repl_config in mii_config.model_config.replica_configs)) > 1
+                for repl_config in mii_config.model_conf.replica_configs)) > 1

         # Start replica instances
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             hostfile = tempfile.NamedTemporaryFile(delete=False)
             hostfile.write(
                 f"{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n"
@@ -161,7 +160,7 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
                 use_multiple_hosts)
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "MII server",
                     ds_launch_str=ds_launch_str,
                     server_args=server_args + [
@@ -175,7 +174,7 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
         # expected to assign one GPU to one process.
         processes.append(
             self._launch_server_process(
-                mii_config.model_config,
+                mii_config.model_conf,
                 "load balancer",
                 server_args=server_args + ["--load-balancer"],
             ))
@@ -183,7 +182,7 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
         if mii_config.enable_restful_api:
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "restful api gateway",
                     server_args=server_args + ["--restful-gateway"],
                 ))

diff --git a/mii/config.py b/mii/config.py
index 565cdbbc..a1cafb66 100644
--- a/mii/config.py
+++ b/mii/config.py
@@ -8,27 +8,18 @@
 from deepspeed.launcher.runner import DLTS_HOSTFILE, fetch_hostfile
 from deepspeed.inference import RaggedInferenceEngineConfig
+from deepspeed.runtime.config_utils import DeepSpeedConfigModel
+from pydantic import Field, model_validator, field_validator

 from mii.constants import DeploymentType, TaskType, ModelProvider
 from mii.errors import DeploymentNotFoundError
 from mii.modeling.tokenizers import MIITokenizerWrapper
-from mii.pydantic_v1 import BaseModel, Field, root_validator, validator, Extra
-from mii.utils import generate_deployment_name, get_default_task, import_score_file
+from mii.utils import generate_deployment_name, import_score_file

 DEVICE_MAP_DEFAULT = "auto"


-class MIIConfigModel(BaseModel):
-    class Config:
-        validate_all = True
-        validate_assignment = True
-        use_enum_values = True
-        allow_population_by_field_name = True
-        extra = "forbid"
-        arbitrary_types_allowed = True
-
-
-class GenerateParamsConfig(MIIConfigModel):
+class GenerateParamsConfig(DeepSpeedConfigModel):
     """
     Options for changing text-generation behavior.
     """
@@ -39,7 +30,7 @@ class GenerateParamsConfig(MIIConfigModel):
     max_length: int = 1024
     """ Maximum length of ``input_tokens`` + ``generated_tokens``. """

-    max_new_tokens: int = None
+    max_new_tokens: Optional[int] = None
     """
     Maximum number of new tokens generated. ``max_length`` takes precedent.
     """
     min_new_tokens: int = 0
@@ -68,24 +59,25 @@ class GenerateParamsConfig(MIIConfigModel):
     stop: List[str] = []
     """ List of strings to stop generation at."""

-    @validator("stop", pre=True)
+    @field_validator("stop", mode="before")
+    @classmethod
     def make_stop_string_list(cls, field_value: Union[str, List[str]]) -> List[str]:
         if isinstance(field_value, str):
             return [field_value]
         return field_value

-    @validator("stop")
+    @field_validator("stop")
+    @classmethod
     def sort_stop_strings(cls, field_value: List[str]) -> List[str]:
         return sorted(field_value)

-    @root_validator
-    def check_prompt_length(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        prompt_length = values.get("prompt_length")
-        max_length = values.get("max_length")
-        assert max_length > prompt_length, f"max_length ({max_length}) must be greater than prompt_length ({prompt_length})"
-        return values
+    @model_validator(mode="after")
+    def check_prompt_length(self) -> "GenerateParamsConfig":
+        assert self.max_length > self.prompt_length, f"max_length ({self.max_length}) must be greater than prompt_length ({self.prompt_length})"
+        return self

-    @root_validator
+    @model_validator(mode="before")
+    @classmethod
     def set_max_new_tokens(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         max_length = values.get("max_length")
         max_new_tokens = values.get("max_new_tokens")
@@ -94,19 +86,16 @@ def set_max_new_tokens(cls, values: Dict[str, Any]) -> Dict[str, Any]:
             values["max_new_tokens"] = max_length - prompt_length
         return values

-    class Config:
-        extra = Extra.forbid
-

-class ReplicaConfig(MIIConfigModel):
+class ReplicaConfig(DeepSpeedConfigModel):
     hostname: str = ""
     tensor_parallel_ports: List[int] = []
-    torch_dist_port: int = None
+    torch_dist_port: Optional[int] = None
     gpu_indices: List[int] = []
-    zmq_port: int = None
+    zmq_port: Optional[int] = None


-class ModelConfig(MIIConfigModel):
+class ModelConfig(DeepSpeedConfigModel):
     model_name_or_path: str
     """
     Model name or path of the model to HuggingFace model to be deployed.
     """
@@ -192,8 +181,9 @@ class ModelConfig(MIIConfigModel):
     def provider(self) -> ModelProvider:
         return ModelProvider.HUGGING_FACE

-    @validator("device_map", pre=True)
-    def make_device_map_dict(cls, v):
+    @field_validator("device_map", mode="before")
+    @classmethod
+    def make_device_map_dict(cls, v: Any) -> Dict:
         if isinstance(v, int):
             return {"localhost": [[v]]}
         if isinstance(v, list) and isinstance(v[0], int):
@@ -202,36 +192,36 @@ def make_device_map_dict(cls, v):
             return {"localhost": v}
         return v

-    @root_validator
+    @model_validator(mode="before")
+    @classmethod
     def auto_fill_values(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        assert values.get("model_name_or_path"), "model_name_or_path must be provided"
         if not values.get("tokenizer"):
             values["tokenizer"] = values.get("model_name_or_path")
-        if not values.get("task"):
-            values["task"] = get_default_task(values.get("model_name_or_path"))
+        #if not values.get("task"):
+        #    values["task"] = get_default_task(values.get("model_name_or_path"))
+        values["task"] = TaskType.TEXT_GENERATION
         return values

-    @root_validator
-    def propagate_tp_size(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        tensor_parallel = values.get("tensor_parallel")
-        values.get("inference_engine_config").tensor_parallel.tp_size = tensor_parallel
-        return values
-
-    @root_validator
-    def propagate_quantization_mode(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        quantization_mode = values.get("quantization_mode")
-        values.get(
-            "inference_engine_config").quantization.quantization_mode = quantization_mode
-        return values
+    @model_validator(mode="after")
+    def propagate_tp_size(self) -> "ModelConfig":
+        self.inference_engine_config.tensor_parallel.tp_size = self.tensor_parallel
+        return self

-    @root_validator
-    def check_replica_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        num_replica_config = len(values.get("replica_configs"))
+    @model_validator(mode="after")
+    def check_replica_config(self) -> "ModelConfig":
+        num_replica_config = len(self.replica_configs)
         if num_replica_config > 0:
-            assert num_replica_config == values.get("replica_num"), "Number of replica configs must match replica_num"
-        return values
+            assert num_replica_config == self.replica_num, "Number of replica configs must match replica_num"
+        return self
+
+    @model_validator(mode="after")
+    def propagate_quantization_mode(self) -> "ModelConfig":
+        self.inference_engine_config.quantization.quantization_mode = self.quantization_mode
+        return self


-class MIIConfig(MIIConfigModel):
+class MIIConfig(DeepSpeedConfigModel):
     deployment_name: str = ""
     """
     Name of the deployment. Used as an identifier for obtaining a inference
@@ -245,7 +235,7 @@ class MIIConfig(MIIConfigModel):
     * `AML` will generate the assets necessary to deploy on AML resources.
     """

-    model_config: ModelConfig
+    model_conf: ModelConfig = Field(alias="model_config")
     """
     Configuration for the deployed model(s).
     """
@@ -290,17 +280,18 @@ class MIIConfig(MIIConfigModel):
     """
     AML instance type to use when create AML deployment assets.
""" - @root_validator(skip_on_failure=True) - def AML_name_valid(cls, values: Dict[str, Any]) -> Dict[str, Any]: - if values.get("deployment_type") == DeploymentType.AML: + @model_validator(mode="after") + def AML_name_valid(self) -> "MIIConfig": + if self.deployment_type == DeploymentType.AML: allowed_chars = set(string.ascii_lowercase + string.ascii_uppercase + string.digits + "-") assert ( - set(values.get("deployment_name")) <= allowed_chars + set(self.deployment_name) <= allowed_chars ), "AML deployment names can only contain a-z, A-Z, 0-9, and '-'." - return values + return self - @root_validator(skip_on_failure=True) + @model_validator(mode="before") + @classmethod def check_deployment_name(cls, values: Dict[str, Any]) -> Dict[str, Any]: deployment_name = values.get("deployment_name") if not deployment_name: @@ -311,14 +302,14 @@ def check_deployment_name(cls, values: Dict[str, Any]) -> Dict[str, Any]: return values def generate_replica_configs(self) -> None: - if self.model_config.replica_configs: + if self.model_conf.replica_configs: return - torch_dist_port = self.model_config.torch_dist_port - tensor_parallel = self.model_config.tensor_parallel + torch_dist_port = self.model_conf.torch_dist_port + tensor_parallel = self.model_conf.tensor_parallel replica_pool = _allocate_devices(self.hostfile, tensor_parallel, - self.model_config.replica_num, - self.model_config.device_map) + self.model_conf.replica_num, + self.model_conf.device_map) replica_configs = [] for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled @@ -332,10 +323,10 @@ def generate_replica_configs(self) -> None: tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=replica_torch_dist_port, gpu_indices=gpu_indices, - zmq_port=self.model_config.zmq_port_number + i, + zmq_port=self.model_conf.zmq_port_number + i, )) - self.model_config.replica_configs = replica_configs + self.model_conf.replica_configs = replica_configs def _allocate_devices(hostfile_path: str, diff --git a/mii/legacy/client.py b/mii/legacy/client.py index 0a03d810..2f299eb1 100644 --- a/mii/legacy/client.py +++ b/mii/legacy/client.py @@ -37,7 +37,7 @@ def mii_query_handle(deployment_name): return MIINonPersistentClient(task, deployment_name) mii_config = _get_mii_config(deployment_name) - return MIIClient(mii_config.model_config.task, + return MIIClient(mii_config.model_conf.task, "localhost", # TODO: This can probably be removed mii_config.port_number) diff --git a/mii/legacy/config.py b/mii/legacy/config.py index 793c976f..e149cc7a 100644 --- a/mii/legacy/config.py +++ b/mii/legacy/config.py @@ -5,20 +5,21 @@ import torch import os import string +from pydantic import field_validator, model_validator, Field from typing import List, Optional, Dict, Any -import mii.legacy as mii -from .constants import DeploymentType, TaskType, ModelProvider, MII_MODEL_PATH_DEFAULT -from .pydantic_v1 import validator, root_validator, Field from deepspeed.runtime.config_utils import DeepSpeedConfigModel from deepspeed.inference.config import DtypeEnum from deepspeed.launcher.runner import DLTS_HOSTFILE, fetch_hostfile +import mii.legacy as mii +from .constants import DeploymentType, TaskType, ModelProvider, MII_MODEL_PATH_DEFAULT + class ReplicaConfig(DeepSpeedConfigModel): hostname: str = "" tensor_parallel_ports: List[int] = [] - torch_dist_port: int = None + torch_dist_port: Optional[int] = None gpu_indices: List[int] = [] @@ -39,7 +40,7 @@ class ModelConfig(DeepSpeedConfigModel): 'text-to-image']`` 
""" - dtype: DtypeEnum = DtypeEnum.fp32 + dtype: torch.dtype = torch.float32 """ Desired model data type, will convert model to this type. Supported target types: `torch.half`, `torch.float`, `torch.int8` (for BLOOM models) @@ -102,9 +103,12 @@ class ModelConfig(DeepSpeedConfigModel): hf_auth_token: Optional[str] = Field( None, - deprecated=True, - deprecated_msg= - "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation.", + json_schema_extra={ + "deprecated": + True, + "deprecated_msg": + "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation." + }, ) """ HuggingFace authentication token for accessing models. Will be propagated @@ -113,9 +117,12 @@ class ModelConfig(DeepSpeedConfigModel): trust_remote_code: bool = Field( False, - deprecated=True, - deprecated_msg= - "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation.", + json_schema_extra={ + "deprecated": + True, + "deprecated_msg": + "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation." + }, ) """ HuggingFace `tranformer.pipeline` option for `trust_remote_code`. @@ -168,15 +175,13 @@ class ModelConfig(DeepSpeedConfigModel): the input and output tokens. Please consider increasing it to the required token-length required for your use-case. """ - class Config: - json_encoders = {torch.dtype: lambda x: str(x)} - @property def provider(self): return mii.utils.get_provider(self.model, self.task) - @validator("checkpoint_dict") - def checkpoint_dict_valid(cls, field_value, values): + @field_validator("checkpoint_dict", mode="after") + @classmethod + def checkpoint_dict_valid(cls, field_value): if field_value is None: return field_value for k in ["checkpoints", "version", "type", "base_dir"]: @@ -184,51 +189,56 @@ def checkpoint_dict_valid(cls, field_value, values): raise ValueError(f"Missing key={k} in checkpoint_dict") return field_value - @validator("deploy_rank", pre=True) - def deploy_rank_to_list(cls, field_value, values): + @field_validator("deploy_rank", mode="before") + @classmethod + def deploy_rank_to_list(cls, field_value): if field_value and not isinstance(field_value, list): field_value = [field_value] return field_value - @root_validator - def zero_or_meta(cls, values): - if values.get("enable_zero"): - assert not values.get( - "meta_tensor" - ), "ZeRO-Inference does not support meta tensors." - return values + @field_validator("dtype", mode="before") + def validate_dtype(cls, field_value, values): + if isinstance(field_value, str): + return DtypeEnum.from_str(field_value).value[0] + if isinstance(field_value, torch.dtype): + return field_value + raise TypeError(f"Invalid type for dtype: {type(field_value)}") - @root_validator - def bloom_model_valid(cls, values): - if "bigscience/bloom" in values.get("model"): + @model_validator(mode="after") + def zero_or_meta(self): + if self.enable_zero: + assert not self.meta_tensor, "ZeRO-Inference does not support meta tensors." + return self + + @model_validator(mode="after") + def bloom_model_valid(self): + if "bigscience/bloom" in self.model: # TODO: SHould be albe to use DtypeEnum here - assert values.get("dtype") in [ + assert self.dtype in [ torch.int8, torch.float16, ], "Bloom models only support fp16/int8." - assert not values.get( - "enable_cuda_graph" - ), "Bloom models do not support CUDA Graph." 
-        return values
+            assert not self.enable_cuda_graph, "Bloom models do not support CUDA Graph."
+        return self

-    @root_validator
-    def deploy_rank_valid(cls, values):
-        tensor_parallel = values.get("tensor_parallel")
-        deploy_rank = values.get("deploy_rank")
+    @model_validator(mode="after")
+    def deploy_rank_valid(self):
+        deploy_rank = self.deploy_rank

         # if deploy rank is not given, default to align with TP value
         if deploy_rank is None:
-            deploy_rank = list(range(tensor_parallel))
+            deploy_rank = list(range(self.tensor_parallel))

         # number of ranks provided must be equal to TP size, DP is handled outside MII currently
-        assert tensor_parallel == len(
+        assert self.tensor_parallel == len(
             deploy_rank
-        ), f"{len(deploy_rank)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {tensor_parallel}"
+        ), f"{len(deploy_rank)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {self.tensor_parallel}"

-        values["deploy_rank"] = deploy_rank
-        return values
+        self.__dict__["deploy_rank"] = deploy_rank
+        return self

-    @root_validator
+    @model_validator(mode="before")
+    @classmethod
     def set_model_path(cls, values):
         model_path = values.get("model_path")
         if not model_path:
@@ -249,54 +259,47 @@ def set_model_path(cls, values):
         values["model_path"] = model_path
         return values

-    @root_validator
-    def validate_model_and_task(cls, values):
-        task = values.get("task")
-        model = values.get("model")
-        if not values.get("skip_model_check"):
-            mii.utils.check_if_task_and_model_is_valid(task, model)
-            if values.get("enable_deepspeed"):
-                mii.utils.check_if_task_and_model_is_supported(task, model)
-            # Skip any future checks
-            values["skip_model_check"] = True
-        return values
+    @model_validator(mode="after")
+    def validate_model_and_task(self):
+        if not self.skip_model_check:
+            mii.utils.check_if_task_and_model_is_valid(self.task, self.model)
+            mii.utils.check_if_task_and_model_is_supported(self.task, self.model)
+        return self

-    @root_validator
-    def meta_tensor_or_sys_mem(cls, values):
-        if values.get("meta_tensor") and values.get("load_with_sys_mem"):
+    @model_validator(mode="after")
+    def meta_tensor_or_sys_mem(self):
+        if self.meta_tensor and self.load_with_sys_mem:
             raise ValueError(
                 "`meta_tensor` and `load_with_sys_mem` cannot be active at the same time."
             )
-        return values
-
-    @root_validator
-    def sys_mem_and_diffusers(cls, values):
-        if values.get("load_with_sys_mem"):
-            model = values.get("model")
-            task = values.get("task")
-            assert not (mii.utils.get_provider(model, task) == ModelProvider.DIFFUSERS), "`load_with_sys_mem` is not support with Stable Diffusion"
-        return values
-
-    @root_validator
-    def zero_dtype_valid(cls, values):
-        if values.get("enable_zero"):
-            if values.get("ds_config").get("fp16", {}).get("enabled", False):
+        return self
+
+    @model_validator(mode="after")
+    def sys_mem_and_diffusers(self):
+        if self.load_with_sys_mem:
+            assert not (mii.utils.get_provider(self.model, self.task) == ModelProvider.DIFFUSERS), "`load_with_sys_mem` is not support with Stable Diffusion"
+        return self
+
+    @model_validator(mode="after")
+    def zero_dtype_valid(self):
+        if self.enable_zero:
+            if self.ds_config.get("fp16", {}).get("enabled", False):
                 # TODO: We should be able to use DtypeEnum instead of torch.float
                 assert (
-                    values.get("dtype") == torch.float16
+                    self.dtype == torch.float16
                 ), "ZeRO FP16 enabled, `dtype` must be set to `torch.float16`"
             else:
                 assert (
-                    values.get("dtype") == torch.float32
+                    self.dtype == torch.float32
                 ), "ZeRO FP16 disabled, `dtype` must be set to `torch.float32`"
-        return values
+        return self

-    @root_validator
-    def deepspeed_or_zero(cls, values):
+    @model_validator(mode="after")
+    def deepspeed_or_zero(self):
         assert not (
-            values.get("enable_deepspeed") and values.get("enable_zero")
+            self.enable_deepspeed and self.enable_zero
         ), "DeepSpeed and ZeRO cannot both be enabled, select only one"
-        return values
+        return self


 class MIIConfig(DeepSpeedConfigModel):
@@ -314,7 +317,7 @@ class MIIConfig(DeepSpeedConfigModel):
     * `AML` will generate the assets necessary to deploy on AML resources.
     """

-    model_config: ModelConfig
+    model_conf: ModelConfig
     """
     Configuration for the deployed model(s).
     """
@@ -349,23 +352,23 @@ class MIIConfig(DeepSpeedConfigModel):
     """
     AML instance type to use when create AML deployment assets.
     """

-    @root_validator(skip_on_failure=True)
-    def AML_name_valid(cls, values):
-        if values.get("deployment_type") == DeploymentType.AML:
+    @model_validator(mode="after")
+    def AML_name_valid(self):
+        if self.deployment_type == DeploymentType.AML:
             allowed_chars = set(string.ascii_lowercase + string.ascii_uppercase +
                                 string.digits + "-")
             assert (
-                set(values.get("deployment_name")) <= allowed_chars
+                set(self.deployment_name) <= allowed_chars
             ), "AML deployment names can only contain a-z, A-Z, 0-9, and '-'."
-        return values
+        return self

     def generate_replica_configs(self):
         # TODO: refactor this function
         hostfile = self.hostfile
         port_number = self.port_number
-        torch_dist_port = self.model_config.torch_dist_port
-        tensor_parallel = self.model_config.tensor_parallel
-        replica_num = self.model_config.replica_num
+        torch_dist_port = self.model_conf.torch_dist_port
+        tensor_parallel = self.model_conf.tensor_parallel
+        replica_num = self.model_conf.replica_num
         replica_pool = _allocate_processes(hostfile, tensor_parallel, replica_num)
         replica_configs = []
         for i, (hostname, gpu_indices) in enumerate(replica_pool):
@@ -382,7 +385,7 @@ def generate_replica_configs(self):
                     gpu_indices=gpu_indices,
                 ))

-        self.model_config.replica_configs = replica_configs
+        self.model_conf.replica_configs = replica_configs


 def _allocate_processes(hostfile_path, tensor_parallel, replica_num):

diff --git a/mii/legacy/deployment.py b/mii/legacy/deployment.py
index 59954901..b8b0753f 100644
--- a/mii/legacy/deployment.py
+++ b/mii/legacy/deployment.py
@@ -37,7 +37,7 @@ def support_legacy_api(
     }
     # TODO do this in a single for loop
     for key, val in mii_config.items():
-        if key not in MIIConfig.__dict__["__fields__"]:
+        if key not in MIIConfig.fields.keys():
             model_config[key] = val
     mii_config = {
         k: v
@@ -68,10 +68,10 @@ def deploy(
         model_config, mii_config = support_legacy_api(*args, **kwargs)

     mii_config["deployment_name"] = deployment_name
-    mii_config["model_config"] = model_config
+    mii_config["model_conf"] = model_config
     mii_config = mii.config.MIIConfig(**mii_config)

-    if mii_config.model_config.enable_deepspeed:
+    if mii_config.model_conf.enable_deepspeed:
         logger.info(
             "************* MII is using DeepSpeed Optimizations to accelerate your model *************"
         )
@@ -100,9 +100,9 @@ def _deploy_aml(mii_config):
     mii.aml_related.utils.generate_aml_scripts(
         acr_name=acr_name,
         deployment_name=mii_config.deployment_name,
-        model_name=mii_config.model_config.model,
-        task_name=mii_config.model_config.task,
-        replica_num=mii_config.model_config.replica_num,
+        model_name=mii_config.model_conf.model,
+        task_name=mii_config.model_conf.task,
+        replica_num=mii_config.model_conf.replica_num,
         instance_type=mii_config.instance_type,
         version=mii_config.version,
     )
@@ -115,10 +115,10 @@ def _deploy_aml(mii_config):
 def _deploy_nonpersistent(mii_config):
     assert (
         int(os.getenv("WORLD_SIZE", "1"))
-        == mii_config.model_config.tensor_parallel
+        == mii_config.model_conf.tensor_parallel
     ), "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `"
     deployment_name = mii_config.deployment_name
     mii.non_persistent_models[deployment_name] = (
-        load_models(mii_config.model_config),
-        mii_config.model_config.task,
+        load_models(mii_config.model_conf),
+        mii_config.model_conf.task,
     )

diff --git a/mii/legacy/pydantic_v1.py b/mii/legacy/pydantic_v1.py
deleted file mode 100644
index 6aba072a..00000000
--- a/mii/legacy/pydantic_v1.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Pydantic v1 compatibility module.
-
-Pydantic v2 introduced breaking changes that hinder its adoption:
-https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to
-migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module
-as a pydantic-version-agnostic alias for pydantic's v1 API.
-"""
-
-try:
-    from pydantic.v1 import *  # noqa: F401
-except ImportError:
-    from pydantic import *  # noqa: F401

diff --git a/mii/legacy/server.py b/mii/legacy/server.py
index 8a66f3ec..75ba24fe 100644
--- a/mii/legacy/server.py
+++ b/mii/legacy/server.py
@@ -28,7 +28,7 @@ class MIIServer:
     """Initialize the model, setup the server for the model under model_path"""
     def __init__(self, mii_config):

-        self.task = mii_config.model_config.task
+        self.task = mii_config.model_conf.task

         self.num_gpus = get_num_gpus(mii_config)
         assert self.num_gpus > 0, "GPU count must be greater than 0"
@@ -44,8 +44,7 @@ def __init__(self, mii_config):
         mii_config.generate_replica_configs()

         processes = self._initialize_service(mii_config)
-        self._wait_until_server_is_live(processes,
-                                        mii_config.model_config.replica_configs)
+        self._wait_until_server_is_live(processes, mii_config.model_conf.replica_configs)

     def _wait_until_server_is_live(self, processes, deployment):
         for process, repl_config in zip(processes, deployment):
@@ -128,11 +127,11 @@ def _initialize_service(self, mii_config):
         ]

         host_gpus = defaultdict(list)
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             host_gpus[repl_config.hostname].extend(repl_config.gpu_indices)

         # Start replica instances
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             hostfile = tempfile.NamedTemporaryFile(delete=False)
             hostfile.write(
                 f"{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n"
             )
             ds_launch_str = self._generate_ds_launch_str(repl_config, hostfile.name)
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "MII server",
                     ds_launch_str=ds_launch_str,
                     server_args=server_args +
@@ -153,7 +152,7 @@ def _initialize_service(self, mii_config):
         # expected to assign one GPU to one process.
         processes.append(
             self._launch_server_process(
-                mii_config.model_config,
+                mii_config.model_conf,
                 "load balancer",
                 server_args=server_args + ["--load-balancer"],
             ))
@@ -161,7 +160,7 @@ def _initialize_service(self, mii_config):
         if mii_config.enable_restful_api:
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "restful api gateway",
                     server_args=server_args + ["--restful-gateway"],
                 ))

diff --git a/mii/legacy/utils.py b/mii/legacy/utils.py
index f1a7cb59..8d574ad9 100644
--- a/mii/legacy/utils.py
+++ b/mii/legacy/utils.py
@@ -179,7 +179,7 @@ def extract_query_dict(task, request_dict):


 def get_num_gpus(mii_config):
-    num_gpus = mii_config.model_config.tensor_parallel
+    num_gpus = mii_config.model_conf.tensor_parallel

     assert (
         torch.cuda.device_count() >= num_gpus

diff --git a/mii/pydantic_v1.py b/mii/pydantic_v1.py
deleted file mode 100644
index 6aba072a..00000000
--- a/mii/pydantic_v1.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Pydantic v1 compatibility module.
-
-Pydantic v2 introduced breaking changes that hinder its adoption:
-https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to
-migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module
-as a pydantic-version-agnostic alias for pydantic's v1 API.
-"""
-
-try:
-    from pydantic.v1 import *  # noqa: F401
-except ImportError:
-    from pydantic import *  # noqa: F401

diff --git a/mii/score/generate.py b/mii/score/generate.py
index a34a96c6..978a635b 100644
--- a/mii/score/generate.py
+++ b/mii/score/generate.py
@@ -19,7 +19,7 @@ def create_score_file(mii_config):
         score_src = fd.read()

     # update score file w. global config dict
-    config_dict = mii_config.dict()
+    config_dict = mii_config.model_dump()
     source_with_config = f"{score_src}\n"
     source_with_config += f"mii_config = {pprint.pformat(config_dict, indent=4)}"

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index b4191e29..11cf6b83 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,12 +1,12 @@
 accelerate
 asyncio
-deepspeed>=0.14.0
+deepspeed>=0.15.0
 deepspeed-kernels
 Flask-RESTful
 grpcio
 grpcio-tools
 Pillow
-pydantic
+pydantic>=2.0.0
 pyzmq
 safetensors
 torch

diff --git a/tests/legacy/test_config.py b/tests/legacy/test_config.py
index bc2ca1fd..f99b2524 100644
--- a/tests/legacy/test_config.py
+++ b/tests/legacy/test_config.py
@@ -6,24 +6,24 @@
 import pytest

 import mii.legacy as mii
-from mii.legacy import pydantic_v1
+from pydantic import ValidationError


 @pytest.mark.parametrize("port_number", [12345])
 @pytest.mark.parametrize("tensor_parallel", [4])
 def test_base_configs(deployment_name, mii_config, model_config):
     mii_config["deployment_name"] = deployment_name
-    mii_config["model_config"] = model_config
+    mii_config["model_conf"] = model_config
     mii_config = mii.config.MIIConfig(**mii_config)

     assert mii_config.port_number == 12345
-    assert mii_config.model_config.tensor_parallel == 4
+    assert mii_config.model_conf.tensor_parallel == 4


 @pytest.mark.parametrize("port_number", ["fail"])
 @pytest.mark.parametrize("tensor_parallel", [3.5])
 def test_base_configs_literalfail(deployment_name, mii_config, model_config):
-    with pytest.raises(pydantic_v1.ValidationError):
+    with pytest.raises(ValidationError):
         mii_config["deployment_name"] = deployment_name
-        mii_config["model_config"] = model_config
+        mii_config["model_conf"] = model_config
         mii_config = mii.config.MIIConfig(**mii_config)

diff --git a/tests/legacy/test_deployment_options.py b/tests/legacy/test_deployment_options.py
index e60ebcd7..2cda7a6f 100644
--- a/tests/legacy/test_deployment_options.py
+++ b/tests/legacy/test_deployment_options.py
@@ -7,7 +7,7 @@
 import json
 import requests
 import mii.legacy as mii
-from mii.legacy import pydantic_v1
+from pydantic import ValidationError


 @pytest.mark.deepspeed
@@ -81,7 +81,7 @@ def test_zero_config(deployment, query):


 @pytest.mark.deepspeed
-@pytest.mark.parametrize("expected_failure", [pydantic_v1.ValidationError])
+@pytest.mark.parametrize("expected_failure", [ValidationError])
 @pytest.mark.parametrize(
     "enable_deepspeed, enable_zero, dtype",
     [(True,

diff --git a/tests/test_arg_parsing.py b/tests/test_arg_parsing.py
index 640512ae..957b1eeb 100644
--- a/tests/test_arg_parsing.py
+++ b/tests/test_arg_parsing.py
@@ -5,31 +5,33 @@

 import pytest

+from pydantic import ValidationError
+
 from mii.api import _parse_kwargs_to_model_config, _parse_kwargs_to_mii_config
 from mii.errors import UnknownArgument


 def test_model_name_or_path():
     # model_name_or_path is required
-    with pytest.raises(ValueError):
+    with pytest.raises(ValidationError):
         _parse_kwargs_to_mii_config()

-    with pytest.raises(ValueError):
+    with pytest.raises(ValidationError):
         _parse_kwargs_to_model_config()

     # passing model_name_or_path as positional arg
     mii_config = _parse_kwargs_to_mii_config("test")
-    assert mii_config.model_config.model_name_or_path == "test"
+    assert mii_config.model_conf.model_name_or_path == "test"

     model_config, _ = _parse_kwargs_to_model_config("test")
     assert model_config.model_name_or_path == "test"

     # passing model_name_or_path in model_config
     mii_config = _parse_kwargs_to_mii_config(model_config={"model_name_or_path": "test"})
-    assert mii_config.model_config.model_name_or_path == "test"
+    assert mii_config.model_conf.model_name_or_path == "test"

     mii_config = _parse_kwargs_to_mii_config(
         mii_config={"model_config": {
             "model_name_or_path": "test"
         }})
-    assert mii_config.model_config.model_name_or_path == "test"
+    assert mii_config.model_conf.model_name_or_path == "test"

     model_config, _ = _parse_kwargs_to_model_config(
         model_config={"model_name_or_path": "test"}
     )
@@ -53,8 +55,8 @@ def test_only_kwargs():
     mii_config = _parse_kwargs_to_mii_config("test",
                                              tensor_parallel=2,
                                              enable_restful_api=True)
-    assert mii_config.model_config.model_name_or_path == "test"
-    assert mii_config.model_config.tensor_parallel == 2
+    assert mii_config.model_conf.model_name_or_path == "test"
+    assert mii_config.model_conf.tensor_parallel == 2
     assert mii_config.enable_restful_api is True

     model_config, _ = _parse_kwargs_to_model_config("test", tensor_parallel=2)
@@ -70,8 +72,8 @@ def test_only_config_dicts():
             "tensor_parallel": 2
         },
     )
-    assert mii_config.model_config.model_name_or_path == "test"
-    assert mii_config.model_config.tensor_parallel == 2
+    assert mii_config.model_conf.model_name_or_path == "test"
+    assert mii_config.model_conf.tensor_parallel == 2
     assert mii_config.enable_restful_api is True

     mii_config = _parse_kwargs_to_mii_config(
@@ -82,8 +84,8 @@ def test_only_config_dicts():
                 "tensor_parallel": 2
             },
         })
-    assert mii_config.model_config.model_name_or_path == "test"
-    assert mii_config.model_config.tensor_parallel == 2
+    assert mii_config.model_conf.model_name_or_path == "test"
+    assert mii_config.model_conf.tensor_parallel == 2
     assert mii_config.enable_restful_api is True

     model_config, _ = _parse_kwargs_to_model_config(

From 3ab2d05c58ea55bc987ce26bbfa002afec8d286e Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Thu, 22 Aug 2024 15:53:45 -0700
Subject: [PATCH 09/35] Update version.txt after 0.3.0 release (#520)

Co-authored-by: loadams
Signed-off-by: Logan Adams
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 0d91a54c..9e11b32f 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.0
+0.3.1

From 9bc5f966977fc669eadfb6d07952fcee93345ea9 Mon Sep 17 00:00:00 2001
From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Date: Thu, 22 Aug 2024 16:39:05 -0700
Subject: [PATCH 10/35] Update supported model list (#519)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Signed-off-by: Logan Adams
---
 README.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 476c7cc1..d4e2d3c9 100644
--- a/README.md
+++ b/README.md
@@ -89,14 +89,16 @@ MII currently supports over 37,000 models across eight popular model architectur

 model family | size range | ~model count
 ------ | ------ | ------
-[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 500
-[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 52,000
-[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 1,200
-[mistral](https://huggingface.co/models?other=mistral) | 7B | 23,000
-[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 2,900
-[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 2,100
-[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 1,500
-[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 500
+[Falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 600
+[Llama](https://huggingface.co/models?other=llama) | 7B - 65B | 57,000
+[Llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 1,200
+[Llama-3](https://huggingface.co/models?other=llama-3) | 8B - 405B | 1,600
+[Mistral](https://huggingface.co/models?other=mistral) | 7B | 23,000
+[Mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 2,900
+[OPT](https://huggingface.co/models?other=opt) | 0.1B - 66B | 2,200
+[Phi-2](https://huggingface.co/models?other=phi) | 2.7B | 1,500
+[Qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 500
+[Qwen2](https://huggingface.co/models?other=qwen2) | 0.5B - 72B | 3700

 ## MII Legacy Model Support

From 7185697604897181508dd54c474df71c0e4851b5 Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <11399+adk9@users.noreply.github.com>
Date: Tue, 3 Sep 2024 13:21:19 -0700
Subject: [PATCH 11/35] Enable streaming option in the OpenAI API server (#480)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Logan Adams
Signed-off-by: Logan Adams
---
 mii/entrypoints/api_server.py | 27 +++---
 mii/entrypoints/data_models.py | 2 +-
 mii/entrypoints/openai_api_server.py | 140 +++++++++++++--------------
 requirements/requirements.txt | 3 +
 4 files changed, 87 insertions(+), 85 deletions(-)

diff --git a/mii/entrypoints/api_server.py b/mii/entrypoints/api_server.py
index 2a2bc84f..aac16b81 100644
--- a/mii/entrypoints/api_server.py
+++ b/mii/entrypoints/api_server.py
@@ -6,13 +6,14 @@
 import json
 import grpc
 import argparse
+from typing import AsyncGenerator

 # Third-party imports
 import uvicorn
 import mii
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response
+from fastapi.responses import StreamingResponse, JSONResponse, Response
 from mii.grpc_related.proto.modelresponse_pb2_grpc import ModelResponseStub
 from mii.grpc_related.proto import modelresponse_pb2
 from mii.utils import kwarg_dict_to_proto
@@ -81,18 +82,18 @@ async def generate(request: CompletionRequest) -> Response:

     # Streaming case
     if request.stream:
-        return JSONResponse({"error": "Streaming is not yet supported."},
-                            status_code=400)
-        # async def StreamResults() -> AsyncGenerator[bytes, None]:
-        #     # Send an empty chunk to start the stream and prevent timeout
-        #     yield ""
-        #     async for response_chunk in stub.GeneratorReplyStream(requestData):
-        #         # Send the response chunk
-        #         responses = [obj.response for obj in response_chunk.response]
-        #         dataOut = {"text": responses}
-        #         yield f"data: {json.dumps(dataOut)}\n\n"
-        #     yield f"data: [DONE]\n\n"
-        # return StreamingResponse(StreamResults(), media_type="text/event-stream")
+
+        async def StreamResults() -> AsyncGenerator[bytes, None]:
+            # Send an empty chunk to start the stream and prevent timeout
+            yield ""
+            async for response_chunk in stub.GeneratorReplyStream(requestData):
+                # Send the response chunk
+                responses = [obj.response for obj in response_chunk.response]
+                dataOut = {"text": responses}
+                yield f"data: {json.dumps(dataOut)}\n\n"
+            yield f"data: [DONE]\n\n"
+
+        return StreamingResponse(StreamResults(),
media_type="text/event-stream") # Non-streaming case responseData = await stub.GeneratorReply(requestData) diff --git a/mii/entrypoints/data_models.py b/mii/entrypoints/data_models.py index 9bba1342..190e486c 100644 --- a/mii/entrypoints/data_models.py +++ b/mii/entrypoints/data_models.py @@ -9,7 +9,7 @@ import time import shortuuid -from pydantic import BaseModel, BaseSettings, Field +from mii.pydantic_v1 import BaseModel, BaseSettings, Field class ErrorResponse(BaseModel): diff --git a/mii/entrypoints/openai_api_server.py b/mii/entrypoints/openai_api_server.py index 26f42be2..c8df3d6c 100644 --- a/mii/entrypoints/openai_api_server.py +++ b/mii/entrypoints/openai_api_server.py @@ -10,14 +10,14 @@ import argparse import json import os -from typing import Optional, List, Union +from typing import AsyncGenerator, Optional, List, Union from transformers import AutoTokenizer import codecs from fastapi import FastAPI, Depends, HTTPException, Response from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse +from fastapi.responses import StreamingResponse, JSONResponse from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer import shortuuid @@ -31,16 +31,16 @@ from .data_models import ( ChatCompletionRequest, ChatCompletionResponse, - # ChatCompletionResponseStreamChoice, - # ChatCompletionStreamResponse, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, ChatMessage, ChatCompletionResponseChoice, CompletionRequest, CompletionResponse, CompletionResponseChoice, - # DeltaMessage, - # CompletionResponseStreamChoice, - # CompletionStreamResponse, + DeltaMessage, + CompletionResponseStreamChoice, + CompletionStreamResponse, ErrorResponse, ModelCard, ModelList, @@ -202,42 +202,41 @@ async def create_chat_completion(request: ChatCompletionRequest): # Streaming case if request.stream: - return create_error_response( - ErrorCode.VALIDATION_TYPE_ERROR, - f"Streaming is not yet supported.", - ) - # async def StreamResults() -> AsyncGenerator[bytes, None]: - # # First chunk with role - # firstChoices = [] - # for _ in range(request.n): - # firstChoice = ChatCompletionResponseStreamChoice( - # index=len(firstChoices), - # delta=DeltaMessage(role=response_role), - # finish_reason=None, - # ) - # firstChoices.append(firstChoice) - - # chunk = ChatCompletionStreamResponse( - # id=id, choices=firstChoices, model=app_settings.model_id - # ) - # yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" - # async for response_chunk in stub.GeneratorReplyStream(requestData): - # streamChoices = [] - - # for c in response_chunk.response: - # choice = ChatCompletionResponseStreamChoice( - # index=len(streamChoices), - # delta=DeltaMessage(content=c.response), - # finish_reason=None if c.finish_reason == "none" else c.finish_reason, - # ) - # streamChoices.append(choice) - - # chunk = ChatCompletionStreamResponse( - # id=id, choices=streamChoices, model=app_settings.model_id - # ) - # yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" - # yield "data: [DONE]\n\n" - # return StreamingResponse(StreamResults(), media_type="text/event-stream") + + async def StreamResults() -> AsyncGenerator[bytes, None]: + # First chunk with role + firstChoices = [] + for _ in range(request.n): + firstChoice = ChatCompletionResponseStreamChoice( + index=len(firstChoices), + delta=DeltaMessage(role=response_role), + finish_reason=None, + ) + firstChoices.append(firstChoice) + + 
chunk = ChatCompletionStreamResponse(id=id, + choices=firstChoices, + model=app_settings.model_id) + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + async for response_chunk in stub.GeneratorReplyStream(requestData): + streamChoices = [] + + for c in response_chunk.response: + choice = ChatCompletionResponseStreamChoice( + index=len(streamChoices), + delta=DeltaMessage(content=c.response), + finish_reason=None + if c.finish_reason == "none" else c.finish_reason, + ) + streamChoices.append(choice) + + chunk = ChatCompletionStreamResponse(id=id, + choices=streamChoices, + model=app_settings.model_id) + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(StreamResults(), media_type="text/event-stream") # Non-streaming case responseData = await stub.GeneratorReply(requestData) @@ -330,34 +329,33 @@ async def create_completion(request: CompletionRequest): id = f"cmpl-{shortuuid.random()}" # Streaming case if request.stream: - return create_error_response( - ErrorCode.VALIDATION_TYPE_ERROR, - f"Streaming is not yet supported.", - ) - # async def StreamResults() -> AsyncGenerator[bytes, None]: - # # Send an empty chunk to start the stream and prevent timeout - # yield "" - # async for response_chunk in stub.GeneratorReplyStream(requestData): - # streamChoices = [] - - # for c in response_chunk.response: - # choice = CompletionResponseStreamChoice( - # index=len(streamChoices), - # text=c.response, - # logprobs=None, - # finish_reason=None if c.finish_reason == "none" else c.finish_reason, - # ) - # streamChoices.append(choice) - - # chunk = CompletionStreamResponse( - # id=id, - # object="text_completion", - # choices=streamChoices, - # model=app_settings.model_id, - # ) - # yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" - # yield "data: [DONE]\n\n" - # return StreamingResponse(StreamResults(), media_type="text/event-stream") + + async def StreamResults() -> AsyncGenerator[bytes, None]: + # Send an empty chunk to start the stream and prevent timeout + yield "" + async for response_chunk in stub.GeneratorReplyStream(requestData): + streamChoices = [] + + for c in response_chunk.response: + choice = CompletionResponseStreamChoice( + index=len(streamChoices), + text=c.response, + logprobs=None, + finish_reason=None + if c.finish_reason == "none" else c.finish_reason, + ) + streamChoices.append(choice) + + chunk = CompletionStreamResponse( + id=id, + object="text_completion", + choices=streamChoices, + model=app_settings.model_id, + ) + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(StreamResults(), media_type="text/event-stream") # Non-streaming case responseData = await stub.GeneratorReply(requestData) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 11cf6b83..8ca8791c 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -2,6 +2,8 @@ accelerate asyncio deepspeed>=0.15.0 deepspeed-kernels +fastapi +fastchat Flask-RESTful grpcio grpcio-tools @@ -9,6 +11,7 @@ Pillow pydantic>=2.0.0 pyzmq safetensors +shortuuid torch transformers ujson From d2c93b3a9446ffc285c9e873cc543efef2b5fdd3 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 11 Sep 2024 07:49:42 -0700 Subject: [PATCH 12/35] Fix missing pydantic updates in legacy mii code (#524) Signed-off-by: Logan Adams --- mii/legacy/deployment.py | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mii/legacy/deployment.py b/mii/legacy/deployment.py index b8b0753f..78610b17 100644 --- a/mii/legacy/deployment.py +++ b/mii/legacy/deployment.py @@ -37,12 +37,12 @@ def support_legacy_api( } # TODO do this in a single for loop for key, val in mii_config.items(): - if key not in MIIConfig.fields.keys(): + if key not in MIIConfig.model_fields.keys(): model_config[key] = val mii_config = { k: v for k, - v in mii_config.items() if k in MIIConfig.__dict__["__fields__"] + v in mii_config.items() if k in MIIConfig.model_fields.keys() } mii_config["version"] = version mii_config["deployment_type"] = deployment_type From a5b443fec51505f7136e3ae40c302830b748719e Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:38:26 -0700 Subject: [PATCH 13/35] Update docker container version (#533) Signed-off-by: Logan Adams --- .github/workflows/nv-a6000-fastgen.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-a6000-fastgen.yml b/.github/workflows/nv-a6000-fastgen.yml index 80bca5ae..cae5e844 100644 --- a/.github/workflows/nv-a6000-fastgen.yml +++ b/.github/workflows/nv-a6000-fastgen.yml @@ -18,7 +18,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:24.03-py3 ports: - 80 options: --gpus all --shm-size "8G" From 87e9b0d77df90936e8874fd5bf78c65a27bf25c0 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:12:30 -0700 Subject: [PATCH 14/35] Update CODEOWNERS (#535) Signed-off-by: Logan Adams --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 29171a01..82efda6e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @jeffra @mrwyattii @awan-10 @samyam +* @tohtana @tjruwase @awan-10 @loadams From 2b08ace7730a81a3adcf11e8893c4c6903d6ae51 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:14:22 -0700 Subject: [PATCH 15/35] Update labels to acquire new runners (#534) Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 0eef4799..2949ac6e 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir /blob/torch_cache/ torch --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir /blob/torch_cache/ torch --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From 4a3467ef0e604a0b8a94e269f864df500019dd16 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 29 Oct 2024 11:30:14 -0700 Subject: [PATCH 16/35] Test with latest transformers Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 
2949ac6e..01d12de4 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git@v4.43.0 + pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From 6079f9c4d3319f5830c80974b1f2726fc2677cd2 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:30:57 -0700 Subject: [PATCH 17/35] Update path triggers that were incorrect before (#537) Signed-off-by: Logan Adams --- .github/workflows/nv-a6000-fastgen.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-a6000-fastgen.yml b/.github/workflows/nv-a6000-fastgen.yml index cae5e844..0b9da000 100644 --- a/.github/workflows/nv-a6000-fastgen.yml +++ b/.github/workflows/nv-a6000-fastgen.yml @@ -8,7 +8,7 @@ on: paths-ignore: - 'mii/legacy/**' - 'tests/legacy/**' - - '.github/workflows/nv-torch-latest-v100.yml' + - '.github/workflows/nv-v100-legacy.yml' concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 61c326a494548f5d8e8ae4fef1bb664361e7dede Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:56:16 -0700 Subject: [PATCH 18/35] Update clang-format version to match DeepSpeed (#538) Signed-off-by: Logan Adams --- requirements/requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 4b7bb770..88d0d08e 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,4 +1,4 @@ -clang-format==16.0.2 +clang-format==18.1.3 einops pre-commit>=2.20.0 pytest From 7fe9106aa44d7f06175d5cac3dd28a630ae5c393 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:57:33 -0700 Subject: [PATCH 19/35] Update version.txt (#539) Signed-off-by: Logan Adams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 9e11b32f..d15723fb 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.1 +0.3.2 From 6c29dc6a4047c1ed0a5a11f10745d870c260c888 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 31 Oct 2024 14:28:41 -0700 Subject: [PATCH 20/35] Test pinning to 4.43.4 Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 01d12de4..346e7f18 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git + pip install git+https://github.com/huggingface/transformers.git@v4.43.4 pip install -U accelerate ds_report From 87e1d873a719b4e9d9c08b4d33a6b914cb683dd1 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 31 Oct 2024 16:39:55 -0700 Subject: [PATCH 21/35] Update to latest transformers Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 346e7f18..01d12de4 100644 --- 
a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git@v4.43.4 + pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From ab92351eee3c3eaa15d446c830d54d76960e215f Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 13:03:18 -0800 Subject: [PATCH 22/35] Update to version where only the zero-shot-image-classification fails Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- mii/legacy/models/providers/diffusers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 01d12de4..346e7f18 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git + pip install git+https://github.com/huggingface/transformers.git@v4.43.4 pip install -U accelerate ds_report diff --git a/mii/legacy/models/providers/diffusers.py b/mii/legacy/models/providers/diffusers.py index 15973d0e..b75ad3a4 100644 --- a/mii/legacy/models/providers/diffusers.py +++ b/mii/legacy/models/providers/diffusers.py @@ -19,10 +19,11 @@ def diffusers_provider(model_config: ModelConfig): kwargs["torch_dtype"] = torch.float16 kwargs["revision"] = "fp16" + kwargs["device"] = torch.device(f"cuda:{local_rank}") pipeline = attempt_load(DiffusionPipeline.from_pretrained, model_config.model, model_config.model_path, kwargs=kwargs) - pipeline = pipeline.to(f"cuda:{local_rank}") + #pipeline = pipeline.to(f"cuda:{local_rank}") pipeline.set_progress_bar_config(disable=True) return pipeline From 6525bd3744d75ffc532b80387a9bf29df716892c Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 14:55:36 -0800 Subject: [PATCH 23/35] Revert certain changes Signed-off-by: Logan Adams --- mii/legacy/models/providers/diffusers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mii/legacy/models/providers/diffusers.py b/mii/legacy/models/providers/diffusers.py index b75ad3a4..15973d0e 100644 --- a/mii/legacy/models/providers/diffusers.py +++ b/mii/legacy/models/providers/diffusers.py @@ -19,11 +19,10 @@ def diffusers_provider(model_config: ModelConfig): kwargs["torch_dtype"] = torch.float16 kwargs["revision"] = "fp16" - kwargs["device"] = torch.device(f"cuda:{local_rank}") pipeline = attempt_load(DiffusionPipeline.from_pretrained, model_config.model, model_config.model_path, kwargs=kwargs) - #pipeline = pipeline.to(f"cuda:{local_rank}") + pipeline = pipeline.to(f"cuda:{local_rank}") pipeline.set_progress_bar_config(disable=True) return pipeline From efb14e04605e7763bd6272614c5b8bc679cf4c94 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 15:15:28 -0800 Subject: [PATCH 24/35] Add other debugging Signed-off-by: Logan Adams --- mii/legacy/method_table.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mii/legacy/method_table.py b/mii/legacy/method_table.py index 520e9a1c..91ec616b 100644 --- a/mii/legacy/method_table.py +++ b/mii/legacy/method_table.py @@ -252,6 +252,10 @@ def unpack_request_from_proto(self, request): def run_inference(self, inference_pipeline, args, kwargs): image, candidate_labels = args + 
print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) + import torch + kwargs["torch_dtype"] = torch.float16 + print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) return inference_pipeline(image, candidate_labels=candidate_labels, **kwargs) From 3a0c215454d363cefc6c8cc44c598425a297229b Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 15:16:46 -0800 Subject: [PATCH 25/35] pre-commit Signed-off-by: Logan Adams --- mii/legacy/method_table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mii/legacy/method_table.py b/mii/legacy/method_table.py index 91ec616b..d190a6a3 100644 --- a/mii/legacy/method_table.py +++ b/mii/legacy/method_table.py @@ -252,10 +252,10 @@ def unpack_request_from_proto(self, request): def run_inference(self, inference_pipeline, args, kwargs): image, candidate_labels = args - print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) + print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) import torch kwargs["torch_dtype"] = torch.float16 - print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) + print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) return inference_pipeline(image, candidate_labels=candidate_labels, **kwargs) From 6fc890be1d01b5d56eef9be7f8d4888c91f41482 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 09:54:09 -0800 Subject: [PATCH 26/35] Confirm replacement of inference_pipeline.model with engine causes problems Signed-off-by: Logan Adams --- mii/legacy/method_table.py | 4 ---- mii/legacy/models/load_models.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/mii/legacy/method_table.py b/mii/legacy/method_table.py index d190a6a3..520e9a1c 100644 --- a/mii/legacy/method_table.py +++ b/mii/legacy/method_table.py @@ -252,10 +252,6 @@ def unpack_request_from_proto(self, request): def run_inference(self, inference_pipeline, args, kwargs): image, candidate_labels = args - print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) - import torch - kwargs["torch_dtype"] = torch.float16 - print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) return inference_pipeline(image, candidate_labels=candidate_labels, **kwargs) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index cfbf455f..9a37fcaa 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -75,7 +75,7 @@ def load_models(model_config): if model_config.profile_model_time: engine.profile_model_time() if hasattr(inference_pipeline, "model"): - inference_pipeline.model = engine + #inference_pipeline.model = engine elif model_config.enable_zero: ds_config = DeepSpeedConfig(model_config.ds_config) From 71331eb9702192076460b7136abdea515add940e Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 09:57:20 -0800 Subject: [PATCH 27/35] Formatting Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index 9a37fcaa..7a6d55da 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -74,7 +74,7 @@ def load_models(model_config): config=inf_config) if model_config.profile_model_time: engine.profile_model_time() - if hasattr(inference_pipeline, "model"): + #if hasattr(inference_pipeline, "model"): #inference_pipeline.model 
= engine elif model_config.enable_zero: From 123d16baaffb056587ae9b57223215ce7c12aa16 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 09:57:42 -0800 Subject: [PATCH 28/35] yapf Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index 7a6d55da..e65653f6 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -75,7 +75,7 @@ def load_models(model_config): if model_config.profile_model_time: engine.profile_model_time() #if hasattr(inference_pipeline, "model"): - #inference_pipeline.model = engine + #inference_pipeline.model = engine elif model_config.enable_zero: ds_config = DeepSpeedConfig(model_config.ds_config) From 6e57003d411e4c6f4fa81fe004883a22e6f68985 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 11:01:00 -0800 Subject: [PATCH 29/35] Update code Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index e65653f6..bbb7e37b 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -74,8 +74,10 @@ def load_models(model_config): config=inf_config) if model_config.profile_model_time: engine.profile_model_time() - #if hasattr(inference_pipeline, "model"): - #inference_pipeline.model = engine + if hasattr(inference_pipeline, "model"): + engine._parameters = inference_pipeline.model._parameters + engine.training = inference_pipeline.model.training + inference_pipeline.model = engine elif model_config.enable_zero: ds_config = DeepSpeedConfig(model_config.ds_config) From 936e2b14d8a56a9caa41a08f4e8dd13bba75700e Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 14:02:47 -0800 Subject: [PATCH 30/35] Skip zero-shot tests for now Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 2 -- tests/legacy/test_local_deployment.py | 11 ----------- tests/legacy/test_non_persistent_deployment.py | 11 ----------- 3 files changed, 24 deletions(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index bbb7e37b..cfbf455f 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -75,8 +75,6 @@ def load_models(model_config): if model_config.profile_model_time: engine.profile_model_time() if hasattr(inference_pipeline, "model"): - engine._parameters = inference_pipeline.model._parameters - engine.training = inference_pipeline.model.training inference_pipeline.model = engine elif model_config.enable_zero: diff --git a/tests/legacy/test_local_deployment.py b/tests/legacy/test_local_deployment.py index 531036f6..69bac328 100644 --- a/tests/legacy/test_local_deployment.py +++ b/tests/legacy/test_local_deployment.py @@ -53,17 +53,6 @@ "query": "DeepSpeed is the greatest" }, ), - ( - "zero-shot-image-classification", - "openai/clip-vit-base-patch32", - { - "image": - "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", - "candidate_labels": ["animals", - "humans", - "landscape"] - }, - ), ], ) def test_single_GPU(deployment, query): diff --git a/tests/legacy/test_non_persistent_deployment.py b/tests/legacy/test_non_persistent_deployment.py index ed2b13fb..fe5309d6 100644 --- a/tests/legacy/test_non_persistent_deployment.py +++ b/tests/legacy/test_non_persistent_deployment.py @@ -55,17 +55,6 @@ "query": "DeepSpeed is the 
greatest" }, ), - ( - "zero-shot-image-classification", - "openai/clip-vit-base-patch32", - { - "image": - "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", - "candidate_labels": ["animals", - "humans", - "landscape"], - }, - ), ], ) def test_single_GPU(deployment, query): From 75b90c82222a753dfcd117653126e6970466ac99 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 15:07:22 -0800 Subject: [PATCH 31/35] Unpin transformers Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 346e7f18..01d12de4 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git@v4.43.4 + pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From 825c072f670f7a2044f9ae38208a07d0bc1e4b40 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 19 Nov 2024 16:54:00 -0800 Subject: [PATCH 32/35] Test branch working around Bloom errors Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 01d12de4..cee04591 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -35,7 +35,7 @@ jobs: - name: Install dependencies run: | - pip install git+https://github.com/microsoft/DeepSpeed.git + pip install git+https://github.com/microsoft/DeepSpeed.git@lekurile/bloom_v_check pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From d71a7b0ca762a3fd892f326c1048897f352e49e1 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:55:51 -0800 Subject: [PATCH 33/35] Update CODEOWNERS file (#552) Signed-off-by: Logan Adams --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 82efda6e..3cc2320e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @tohtana @tjruwase @awan-10 @loadams +* @tohtana @tjruwase @loadams From 508906af4d0c3fcd8fcde9eb2a77bde7565bd34f Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 7 Feb 2025 09:33:26 -0800 Subject: [PATCH 34/35] Update contributing language on README for CLA->DCO (#554) Signed-off-by: Logan Adams --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d4e2d3c9..86f1a24c 100644 --- a/README.md +++ b/README.md @@ -321,13 +321,14 @@ Users can also control the generation characteristics for individual prompts (i. # Contributing -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. +This project welcomes contributions and suggestions. -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). 
Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +DeepSpeed-MII has adopted the [DCO](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin). All deepspeedai repos require a DCO. +(DeepSpeed previously used a CLA which is being replaced with DCO). + +DCO is provided by including a sign-off-by line in commit messages. Using the `-s` flag for `git commit` will automatically append this line. +For example, running `git commit -s -m 'commit info.'` will produce a commit that has the message `commit info. Signed-off-by: My Name .` +The DCO bot will ensure commits are signed with an email address that matches the commit author before they are eligible to be merged. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or From 3c9c7069cdd6ab503304bcf1e5975d39764c2e1a Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:10:20 -0800 Subject: [PATCH 35/35] Update references due to DeepSpeed* GH repo move (#553) Signed-off-by: Logan Adams --- .github/workflows/nv-a6000-fastgen.yml | 2 +- .github/workflows/nv-v100-legacy.yml | 2 +- README.md | 24 +++++++++---------- docs/source/index.rst | 10 ++++---- docs/source/install.rst | 4 ++-- examples/README.md | 2 +- mii/aml_related/templates.py | 4 ++-- mii/legacy/README.md | 8 +++---- mii/legacy/aml_related/templates.py | 4 ++-- mii/legacy/docs/GPT-NeoX.md | 4 ++-- .../examples/benchmark/txt2img/README.md | 8 +++---- mii/legacy/examples/local/chat/README.md | 2 +- .../local/chat/chat-server-example.py | 2 +- setup.py | 4 ++-- 14 files changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/workflows/nv-a6000-fastgen.yml b/.github/workflows/nv-a6000-fastgen.yml index 0b9da000..7363979d 100644 --- a/.github/workflows/nv-a6000-fastgen.yml +++ b/.github/workflows/nv-a6000-fastgen.yml @@ -41,7 +41,7 @@ jobs: python -m pip install . - name: Install deepspeed run: | - git clone --depth=1 https://github.com/microsoft/DeepSpeed + git clone --depth=1 https://github.com/deepspeedai/DeepSpeed cd DeepSpeed python -m pip install . 
ds_report diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index cee04591..4e77036b 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -35,7 +35,7 @@ jobs: - name: Install dependencies run: | - pip install git+https://github.com/microsoft/DeepSpeed.git@lekurile/bloom_v_check + pip install git+https://github.com/deepspeedai/DeepSpeed.git@lekurile/bloom_v_check pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report diff --git a/README.md b/README.md index 86f1a24c..70e3cef0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -[![Formatting](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg?branch=main)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml) -[![nv-v100-legacy](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml/badge.svg?branch=main)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml) -[![nv-a6000-fastgen](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml/badge.svg?branch=main)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml) -[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) +[![Formatting](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg?branch=main)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml) +[![nv-v100-legacy](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml/badge.svg?branch=main)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml) +[![nv-a6000-fastgen](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml/badge.svg?branch=main)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml) +[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/deepspeedai/DeepSpeed/blob/master/LICENSE) [![PyPI version](https://badge.fury.io/py/deepspeed-mii.svg)](https://pypi.org/project/deepspeed-mii/) @@ -12,8 +12,8 @@ ## Latest News -* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) -* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) +* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) +* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) * [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](mii/legacy/examples/benchmark/txt2img) * [2022/10] [Announcing DeepSpeed Model Implementations for Inference (MII)](https://www.deepspeed.ai/2022/10/10/mii.html) @@ -33,7 +33,7 @@ Introducing MII, an open-source Python library designed by DeepSpeed to democratize powerful model inference with a focus on high-throughput, low latency, and cost-effectiveness. 
-* MII features include blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor parallelism, and high-performance CUDA kernels to support fast high throughput text-generation for LLMs such as Llama-2-70B, Mixtral (MoE) 8x7B, and Phi-2. The latest updates in v0.2 add new model families, performance optimizations, and feature enhancements. MII now delivers up to 2.5 times higher effective throughput compared to leading systems such as vLLM. For detailed performance results please see our [latest DeepSpeed-FastGen blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) and [DeepSpeed-FastGen release blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen). +* MII features include blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor parallelism, and high-performance CUDA kernels to support fast high throughput text-generation for LLMs such as Llama-2-70B, Mixtral (MoE) 8x7B, and Phi-2. The latest updates in v0.2 add new model families, performance optimizations, and feature enhancements. MII now delivers up to 2.5 times higher effective throughput compared to leading systems such as vLLM. For detailed performance results please see our [latest DeepSpeed-FastGen blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) and [DeepSpeed-FastGen release blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen).
@@ -58,7 +58,7 @@ MII provides accelerated text-generation inference through the use of four key t * Dynamic SplitFuse * High Performance CUDA Kernels -For a deeper dive into understanding these features please [refer to our blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) which also includes a detailed performance analysis. +For a deeper dive into understanding these features please [refer to our blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) which also includes a detailed performance analysis. ## MII Legacy @@ -78,14 +78,14 @@ In the past, MII introduced several [key performance optimizations](https://www.
-Figure 1: MII architecture, showing how MII automatically optimizes OSS models using DS-Inference before deploying them. DeepSpeed-FastGen optimizations in the figure have been published in [our blog post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen). +Figure 1: MII architecture, showing how MII automatically optimizes OSS models using DS-Inference before deploying them. DeepSpeed-FastGen optimizations in the figure have been published in [our blog post](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen). -Under-the-hood MII is powered by [DeepSpeed-Inference](https://github.com/microsoft/deepspeed). Based on the model architecture, model size, batch size, and available hardware resources, MII automatically applies the appropriate set of system optimizations to minimize latency and maximize throughput. +Under-the-hood MII is powered by [DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed). Based on the model architecture, model size, batch size, and available hardware resources, MII automatically applies the appropriate set of system optimizations to minimize latency and maximize throughput. # Supported Models -MII currently supports over 37,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures: +MII currently supports over 37,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/deepspeedai/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures: model family | size range | ~model count ------ | ------ | ------ @@ -120,7 +120,7 @@ The fasest way to get started is with our [PyPI release of DeepSpeed-MII](https: pip install deepspeed-mii ``` -For ease of use and significant reduction in lengthy compile times that many projects require in this space we distribute a pre-compiled python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source). +For ease of use and significant reduction in lengthy compile times that many projects require in this space we distribute a pre-compiled python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels). 
We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/deepspeedai/DeepSpeed-Kernels#source). ## Non-Persistent Pipeline diff --git a/docs/source/index.rst b/docs/source/index.rst index 813f232c..8099898e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -14,15 +14,15 @@ democratize powerful model inference with a focus on high-throughput, low latency, and cost-effectiveness. MII v0.1 introduced several features as part of our `DeepSpeed-FastGen release -`_ +`_ such as blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor parallelism, and high-performance CUDA kernels to support fast high throughput text-generation with LLMs. The latest version of MII delivers up to 2.5 times higher effective throughput compared to leading systems such as vLLM. For detailed performance results please see our `DeepSpeed-FastGen release blog -`_ +`_ and the `latest DeepSpeed-FastGen blog -`_. +`_. MII-Legacy ---------- @@ -32,9 +32,9 @@ We first `announced MII `_ in of DeepSpeed-FastGen. MII-Legacy, which covers all prior releases up to v0.0.9, provides support for running inference for a wide variety of language model tasks. We also support accelerating `text2image models like Stable Diffusion -`_. +`_. For more details on our previous releases please see our `legacy APIs -`_. +`_. Contents diff --git a/docs/source/install.rst b/docs/source/install.rst index 523c9c1a..ee16f0ff 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -19,11 +19,11 @@ pip to install from source: .. code-block:: console - (.venv) $ pip install git+https://github.com/Microsoft/DeepSpeed-MII.git + (.venv) $ pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git Or you can clone the repository and install: .. code-block:: console - (.venv) $ git clone https://github.com/Microsoft/DeepSpeed-MII.git + (.venv) $ git clone https://github.com/deepspeedai/DeepSpeed-MII.git (.venv) $ pip install ./DeepSpeed-MII diff --git a/examples/README.md b/examples/README.md index 4efb2155..334840a2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,2 +1,2 @@ # MII Examples -Please see [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii) for a few examples on using MII. +Please see [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) for a few examples on using MII. 
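The README hunk above keeps pointing readers at `pip install deepspeed-mii` and the "Non-Persistent Pipeline" section. As a quick illustration of that entry point, a minimal sketch follows; it assumes the public `mii.pipeline` API, and the model name and generation arguments are placeholders chosen for the example rather than values taken from this patch series.

```python
# Minimal non-persistent pipeline sketch; the model name and arguments are illustrative.
import mii

# Build an in-process text-generation pipeline (no persistent server is started).
pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")

# Generate completions for a small batch of prompts.
responses = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
for response in responses:
    print(response)
```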
diff --git a/mii/aml_related/templates.py b/mii/aml_related/templates.py index 71f1cb44..33805628 100644 --- a/mii/aml_related/templates.py +++ b/mii/aml_related/templates.py @@ -165,8 +165,8 @@ RUN /opt/miniconda/envs/amlenv/bin/pip install torch torchvision --index-url https://download.pytorch.org/whl/cu113 && \ /opt/miniconda/envs/amlenv/bin/pip install -r "$BUILD_DIR/requirements.txt" && \ /opt/miniconda/envs/amlenv/bin/pip install azureml-inference-server-http && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed.git && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed-MII.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git && \ /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/huggingface/transformers.git diff --git a/mii/legacy/README.md b/mii/legacy/README.md index ed949a1c..041c9516 100644 --- a/mii/legacy/README.md +++ b/mii/legacy/README.md @@ -1,6 +1,6 @@ - -[![Formatting](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml) -[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) + +[![Formatting](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml) +[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/deepspeedai/DeepSpeed/blob/master/LICENSE) [![PyPI version](https://badge.fury.io/py/deepspeed-mii.svg)](https://pypi.org/project/deepspeed-mii/) @@ -195,7 +195,7 @@ result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=Tr ``` -You can find a complete example [here]("https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/non_persistent") +You can find a complete example [here]("https://github.com/deepspeedai/DeepSpeed-MII/tree/main/examples/non_persistent") Any HTTP client can be used to call the APIs. 
An example of using curl is: ```bash diff --git a/mii/legacy/aml_related/templates.py b/mii/legacy/aml_related/templates.py index 71f1cb44..33805628 100644 --- a/mii/legacy/aml_related/templates.py +++ b/mii/legacy/aml_related/templates.py @@ -165,8 +165,8 @@ RUN /opt/miniconda/envs/amlenv/bin/pip install torch torchvision --index-url https://download.pytorch.org/whl/cu113 && \ /opt/miniconda/envs/amlenv/bin/pip install -r "$BUILD_DIR/requirements.txt" && \ /opt/miniconda/envs/amlenv/bin/pip install azureml-inference-server-http && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed.git && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed-MII.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git && \ /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/huggingface/transformers.git diff --git a/mii/legacy/docs/GPT-NeoX.md b/mii/legacy/docs/GPT-NeoX.md index dafcc736..6e495ca7 100644 --- a/mii/legacy/docs/GPT-NeoX.md +++ b/mii/legacy/docs/GPT-NeoX.md @@ -18,7 +18,7 @@ source ./MII-GPT-NeoX/bin/activate ## Install MII ```bash -git clone https://github.com/microsoft/DeepSpeed-MII.git +git clone https://github.com/deepspeedai/DeepSpeed-MII.git cd DeepSpeed-MII pip install .[local] pip install . @@ -26,7 +26,7 @@ pip install . ## Install DeepSpeed-GPT-NeoX ```bash -git clone -b ds-updates https://github.com/microsoft/deepspeed-gpt-neox.git +git clone -b ds-updates https://github.com/deepspeedai/DeepSpeed-gpt-neox.git cd deepspeed-gpt-neox pip install -r requirements/requirements-inference.txt pip install . diff --git a/mii/legacy/examples/benchmark/txt2img/README.md b/mii/legacy/examples/benchmark/txt2img/README.md index ad4f769d..469afb91 100644 --- a/mii/legacy/examples/benchmark/txt2img/README.md +++ b/mii/legacy/examples/benchmark/txt2img/README.md @@ -5,7 +5,7 @@ -In this tutorial you will learn how to deploy [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion-v1-4) with state-of-the-art performance optimizations from [DeepSpeed Inference](https://github.com/microsoft/deepspeed) and [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii). In addition to deploying we will perform several performance evaluations. +In this tutorial you will learn how to deploy [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion-v1-4) with state-of-the-art performance optimizations from [DeepSpeed Inference](https://github.com/deepspeedai/DeepSpeed) and [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-mii). In addition to deploying we will perform several performance evaluations. The performance results above utilized NVIDIA GPUs from Azure: [ND96amsr\_A100\_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nda100-v4-series) (NVIDIA A100-80GB) and [ND96asr\_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nda100-v4-series) (A100-40GB). We have also used MII-Public with NVIDIA RTX-A6000 GPUs and will include those results at a future date. @@ -36,9 +36,9 @@ DeepSpeed-MII will automatically inject a wide range of optimizations from DeepS 6. Partial UNet INT8 quantization via [ZeroQuant](https://arxiv.org/abs/2206.01861) 7. 
Exploitation of coarse grained computation sparsity -The first four optimizations are available via MII-Public, while the rest are available via MII-Azure ([see here to read more about MII-Public and MII-Azure](https://github.com/microsoft/deepspeed-mii#mii-public-and-mii-azure)). In the rest of this tutorial, we will show how you can deploy Stable Diffusion with both MII-Public and MII-Azure. +The first four optimizations are available via MII-Public, while the rest are available via MII-Azure ([see here to read more about MII-Public and MII-Azure](https://github.com/deepspeedai/DeepSpeed-mii#mii-public-and-mii-azure)). In the rest of this tutorial, we will show how you can deploy Stable Diffusion with both MII-Public and MII-Azure. -Keep an eye on the [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) repo and this tutorial for further updates and a deeper dive into these and future performance optimizations. +Keep an eye on the [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-mii) repo and this tutorial for further updates and a deeper dive into these and future performance optimizations. ## Environment and dependency setup @@ -49,7 +49,7 @@ pip install deepspeed[sd] deepspeed-mii ``` > **Note** -> The DeepSpeed version used in the rest of this tutorial uses [this branch](https://github.com/microsoft/DeepSpeed/pull/2491) which will be merged into master and released as part of DeepSpeed v0.7.5 later this week. +> The DeepSpeed version used in the rest of this tutorial uses [this branch](https://github.com/deepspeedai/DeepSpeed/pull/2491) which will be merged into master and released as part of DeepSpeed v0.7.5 later this week. In order to check your DeepSpeed install is setup correctly run `ds_report` from your command line. This will show what versions of DeepSpeed, PyTorch, and nvcc will be used at runtime. The bottom half of `ds_report` is show below for our setup: diff --git a/mii/legacy/examples/local/chat/README.md b/mii/legacy/examples/local/chat/README.md index 4bc48639..55684def 100644 --- a/mii/legacy/examples/local/chat/README.md +++ b/mii/legacy/examples/local/chat/README.md @@ -8,7 +8,7 @@ The scripts in this folder provide a complete example of a multi-turn conversati Starting the server for your chat application requires nothing special. Just make sure that the model supports `text-generation` and is trained for conversations. -The example script uses [AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed), which was trained using [DeepSpeed-Chat](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md). +The example script uses [AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed), which was trained using [DeepSpeed-Chat](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md). 
```python name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" diff --git a/mii/legacy/examples/local/chat/chat-server-example.py b/mii/legacy/examples/local/chat/chat-server-example.py index ae8d2ac0..ec484f91 100644 --- a/mii/legacy/examples/local/chat/chat-server-example.py +++ b/mii/legacy/examples/local/chat/chat-server-example.py @@ -7,7 +7,7 @@ mii_configs = {'tensor_parallel': 1} # This checkpoint is create using DeepSpeed-Chat -# https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md +# https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" print(f"Deploying {name}...") diff --git a/setup.py b/setup.py index 415e6df7..aecebcf6 100644 --- a/setup.py +++ b/setup.py @@ -85,8 +85,8 @@ def command_exists(cmd): author_email='deepspeed-mii@microsoft.com', url='http://deepspeed.ai', project_urls={ - 'Documentation': 'https://github.com/microsoft/DeepSpeed-MII', - 'Source': 'https://github.com/microsoft/DeepSpeed-MII', + 'Documentation': 'https://github.com/deepspeedai/DeepSpeed-MII', + 'Source': 'https://github.com/deepspeedai/DeepSpeed-MII', }, install_requires=install_requires, extras_require=extras_require,
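The streaming support restored earlier in this series (PATCH 11) has the OpenAI-compatible endpoints emit Server-Sent Events: each chunk is written as a `data: {...}` line and the stream terminates with `data: [DONE]`. A minimal client sketch for consuming that stream is shown below; the host, port, route, and model id are assumptions for illustration and will differ in a real deployment.

```python
# Minimal SSE client sketch for the streaming chat endpoint; URL, route, and model id are hypothetical.
import json
import requests

url = "http://localhost:8000/v1/chat/completions"  # assumed host/port/route
payload = {
    "model": "mistralai/Mistral-7B-Instruct-v0.1",  # assumed model id
    "messages": [{"role": "user", "content": "DeepSpeed is"}],
    "stream": True,
}

with requests.post(url, json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # Each SSE event arrives as a single "data: ..." line; blank lines separate events.
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":  # final marker emitted by the server
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
```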