From dd188b0c7b808796287cd7ca456afcd872ec630f Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Thu, 25 Jul 2024 10:40:54 -0700
Subject: [PATCH 01/35] Unpin transformers version

Signed-off-by: Logan Adams
---
 .github/workflows/nv-v100-legacy.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml
index f0598279..426996b3 100644
--- a/.github/workflows/nv-v100-legacy.yml
+++ b/.github/workflows/nv-v100-legacy.yml
@@ -36,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install git+https://github.com/microsoft/DeepSpeed.git
-          pip install git+https://github.com/huggingface/transformers.git@v4.42.4
+          pip install git+https://github.com/huggingface/transformers.git
           pip install -U accelerate
           ds_report

From 11839bc29c53fe83e759a4bb0a32a7421864ee55 Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Thu, 25 Jul 2024 13:29:13 -0700
Subject: [PATCH 02/35] Update model support (#429)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Ammar Ahmad Awan
Co-authored-by: Logan Adams
Signed-off-by: Logan Adams
---
 README.md | 18 ++++-----
 requirements/requirements-dev.txt | 3 ++
 tests/test_model_support.py | 63 ++++++++++++-------------------
 3 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index f666762a..476c7cc1 100644
--- a/README.md
+++ b/README.md
@@ -85,18 +85,18 @@ Under-the-hood MII is powered by [DeepSpeed-Inference](https://github.com/micros

 # Supported Models

-MII currently supports over 20,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures:
+MII currently supports over 37,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures:

 model family | size range | ~model count
 ------ | ------ | ------
-[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 300
-[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 19,000
-[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 900
-[mistral](https://huggingface.co/models?other=mistral) | 7B | 6,000
-[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 1,100
-[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 1,300
-[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 200
-[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 200
+[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 500
+[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 52,000
+[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 1,200
+[mistral](https://huggingface.co/models?other=mistral) | 7B | 23,000
+[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 2,900
+[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 2,100
+[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 1,500
+[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 500

 ## MII Legacy Model Support

diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index 1d69f875..4b7bb770 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -1,5 +1,8 @@
 clang-format==16.0.2
+einops
 pre-commit>=2.20.0
 pytest
 pytest-forked
 sentencepiece
+tiktoken
+transformers-stream-generator

diff --git a/tests/test_model_support.py b/tests/test_model_support.py
index be49044a..fb554206 100644
--- a/tests/test_model_support.py
+++ b/tests/test_model_support.py
@@ -11,25 +11,16 @@
     CheckpointEngineBase,
     HuggingFaceCheckpointEngine,
 )
-from transformers import AutoConfig, AutoModel, GenerationConfig
+from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig
 from typing import Iterable, Tuple


-class RandomWeightsCheckpointEngine(CheckpointEngineBase):
-
-    # When using AutoModel.from_config() to load the model, the layer names are
-    # often missing a prefix. We default to adding "model." as the prefix, but
-    # others can be specified here.
-    layer_prefix_map = {"falcon": "transformer."}
-
-    # When using AutoModel.from_config() to load the model, the lm_head layer is
-    # not generated. We default to populating this with the
-    # "embed_tokens.weight" layer, but others can be specified here.
-    lm_head_layer_map = {"falcon": "word_embeddings.weight"}
-
+class ZeroWeightsCheckpointEngine(CheckpointEngineBase):
+    """ Generates weight with all zeros for a given model for testing purposes. """
     def __init__(self, model_name_or_path: str, auth_token: str = None) -> None:
         self.model_name_or_path = model_name_or_path
-        self.model_config = AutoConfig.from_pretrained(self.model_name_or_path)
+        self.model_config = AutoConfig.from_pretrained(self.model_name_or_path,
+                                                       trust_remote_code=True)
         if hasattr(self.model_config, "max_position_embeddings"):
             self.model_config.max_seq_length = self.model_config.max_position_embeddings
         else:
@@ -40,37 +31,21 @@ def __init__(self, model_name_or_path: str, auth_token: str = None) -> None:
         except OSError:
             self.model_config.max_seq_length = 2048

-    def _get_layer_prefix(self) -> str:
-        for model_type, prefix in self.layer_prefix_map.items():
-            if model_type in self.model_name_or_path.lower():
-                return prefix
-        return "model."
-
-    def _get_lm_head_layer(self) -> str:
-        for model_type, layer in self.lm_head_layer_map.items():
-            if model_type in self.model_name_or_path.lower():
-                return layer
-        return "embed_tokens.weight"
-
     def parameters(self) -> Iterable[Tuple[str, torch.Tensor]]:
-        layer_prefix = self._get_layer_prefix()
-        lm_head_layer = self._get_lm_head_layer()
-
         # Load with meta device is faster
         with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
-            model = AutoModel.from_config(self.model_config)
+            model = AutoModelForCausalLM.from_config(self.model_config,
+                                                     trust_remote_code=True)

         for param_name, param in model.state_dict().items():
-            yield layer_prefix + param_name, torch.zeros(param.shape)
-            if param_name == lm_head_layer:
-                yield "lm_head.weight", torch.zeros(param.shape)
+            yield param_name, torch.zeros(param.shape)


 @pytest.fixture(scope="module", autouse=True)
 def inject_checkpoint_engine():
     # Inject the random weihts checkpoint engine
     deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = (
-        RandomWeightsCheckpointEngine)
+        ZeroWeightsCheckpointEngine)
     yield None
     # Restore the original checkpoint engine
     deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = (
@@ -81,16 +56,26 @@ def inject_checkpoint_engine():
     "model_name",
     [
         "tiiuae/falcon-7b",
+        "huggyllama/llama-7b",
         "NousResearch/Llama-2-7b-hf",
         "NousResearch/Hermes-2-Pro-Mistral-7B",
         "cloudyu/Mixtral_11Bx2_MoE_19B",
         "facebook/opt-125m",
+        "microsoft/phi-2",
+        "Qwen/Qwen-7B-Chat",
+        "Qwen/Qwen1.5-0.5B",
+    ],
+    ids=[
+        "falcon",
+        "llama",
+        "llama-2",
+        "mistral",
+        "mixtral",
+        "opt",
+        "phi-2",
+        "qwen",
+        "qwen-2"
     ],
-    ids=["falcon",
-         "llama",
-         "mistral",
-         "mixtral",
-         "opt"],
 )
 def test_model(pipeline, query):
     outputs = pipeline(query, max_new_tokens=16)

From 416d4cc85db727efeb47ec243e53a75e5e966cd9 Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Thu, 25 Jul 2024 13:34:47 -0700
Subject: [PATCH 03/35] Update version.txt (#511)

Signed-off-by: Logan Adams
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index abd41058..3a4036fb 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.4
+0.2.5

From 036b9d99b04019b3f86b30c5451c62b180f17485 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Thu, 25 Jul 2024 16:14:16 -0700
Subject: [PATCH 04/35] Test with transformers v4.43.0

Signed-off-by: Logan Adams
---
 .github/workflows/nv-v100-legacy.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml
index 426996b3..0eef4799 100644
--- a/.github/workflows/nv-v100-legacy.yml
+++ b/.github/workflows/nv-v100-legacy.yml
@@ -36,7 +36,7 @@ jobs:
       - name: Install dependencies
         run: |
           pip install git+https://github.com/microsoft/DeepSpeed.git
-          pip install git+https://github.com/huggingface/transformers.git
+          pip install git+https://github.com/huggingface/transformers.git@v4.43.0
           pip install -U accelerate
           ds_report

From ba9c26f2b2b306b7af8b296d45f0a92bbd98fd23 Mon Sep 17 00:00:00 2001
From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Date: Mon, 5 Aug 2024 12:58:51 -0700
Subject: [PATCH 05/35] Fix scheduling for non-persistent pipeline (#515)

Signed-off-by: Logan Adams
---
 mii/batching/ragged_batching.py | 28 ++++++++++++++++------------
 mii/legacy/logging.py | 2 +-
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/mii/batching/ragged_batching.py b/mii/batching/ragged_batching.py
index 4e9583b9..5b37e5b8 100644
--- a/mii/batching/ragged_batching.py
+++ b/mii/batching/ragged_batching.py
@@ -101,13 +101,13 @@ def is_rank_0(self) -> bool:
         return self.local_rank == 0

     @profiler
-    def generate(self) -> None:
+    def generate(self) -> Union[None, bool]:
         """
         This is the main loop of FastGen: puts requests and gets generated results.
         """
         # 1. Get a batch of requests, broadcast to all ranks
-        scheduled_requests = self._bcast_requests()
+        scheduled_requests, force = self._bcast_requests()

         # 2. Flush for uids that are finished generating
         self.flush(scheduled_requests.requests_to_flush.uids)
@@ -121,7 +121,7 @@ def generate(self) -> None:

         # short circuit if not rank 0, only rank 0 does scheduling and postprocessing of logits
         if not self.is_rank_0:
-            return
+            return force

         # 4. Launch logit processing and token generation
         running_requests = scheduled_requests.requests_to_run
@@ -173,20 +173,22 @@ def _bcast_requests(self, force=False) -> RequestBatch:
         # the prompt tokens must be broadcast to all TP processes.
         if self.is_rank_0:
             if not self.scheduled_requests and not force:
-                return self.scheduled_requests
+                return self.scheduled_requests, force
             # Rank 0 gets batch of requests and broadcasts to other ranks
             data_dicts = self.scheduled_requests.to_msg_dicts()
-            json_data = ujson.dumps(data_dicts)
+            json_data = ujson.dumps({"data": data_dicts, "force": force})
             self.socket.send_string(json_data)
         else:
             try:
                 json_data = self.socket.recv_string()
-                data_dicts = ujson.loads(json_data)
+                recv_dict = ujson.loads(json_data)
+                data_dicts = recv_dict["data"]
+                force = recv_dict["force"]
                 self.scheduled_requests = RequestBatch.from_msg_dicts(data_dicts)
             except zmq.Again:
                 self.scheduled_requests = RequestBatch()

-        return self.scheduled_requests
+        return self.scheduled_requests, force

     def _reset_scheduler_bookkeeping(self) -> None:
         self.scheduled_requests = RequestBatch()
@@ -560,6 +562,7 @@ def __init__(self, all_rank_output: bool = False, *args, **kwargs) -> None:
         self.tid = threading.get_ident()
         self._all_rank_output = all_rank_output
         self._destroyed = False
+        get_accelerator().set_device(int(os.getenv("LOCAL_RANK", "0")))

     def __call__(self,
                  prompts: Union[str,
@@ -589,25 +592,26 @@ def __call__(self,
             request_kwargs = generate_kwargs.copy()
             self._put_request(uid, input, request_kwargs)

-        self.schedule_requests()
-
         if self.is_rank_0:
             # Rank 0 runs generate() until all responses are returned
             while uids_running:
-                self.generate()
                 while not self.result_queues[self.tid].empty():
                     uid, response = self._get_response()
                     outputs.append(response)
                     self._queue_flush_request(uid)
                     uids_complete_order.append(uid)
                     uids_running.remove(uid)
+                self.generate()

             # Ensure final flush requests broadcast and
             # kick ranks 1 -> n out of the while loop
             self._bcast_requests(force=True)
+            self.flush(self.scheduled_requests.requests_to_flush.uids)
+            self.scheduled_requests = RequestBatch()
         else:
             # Ranks 1 -> n just run generate() until there are no more requests
-            while self.scheduled_requests:
-                self.generate()
+            exit = False
+            while not exit:
+                exit = self.generate()

         outputs = [
             r for idx,

diff --git a/mii/legacy/logging.py b/mii/legacy/logging.py
index 1fcf2ac9..9fe62a20 100644
--- a/mii/legacy/logging.py
+++ b/mii/legacy/logging.py
@@ -42,4 +42,4 @@ def create_logger(name=None, level=logging.INFO):
         return logger_


-logger = LoggerFactory.create_logger(name="MII", level=logging.INFO)
+logger = LoggerFactory.create_logger(name="MII_legacy", level=logging.INFO)

From 24c42b5278a9f6cf705b8bdff0a48f504bfb67ac Mon Sep 17 00:00:00 2001
From: Richard Palethorpe
Date: Wed, 21 Aug 2024 17:35:10 +0100
Subject: [PATCH 06/35] Add Kubernetes health check route to REST server (#445)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Signed-off-by: Logan Adams
---
 mii/grpc_related/restful_gateway.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mii/grpc_related/restful_gateway.py b/mii/grpc_related/restful_gateway.py
index a5f1692b..5b93fea6 100644
--- a/mii/grpc_related/restful_gateway.py
+++ b/mii/grpc_related/restful_gateway.py
@@ -40,6 +40,10 @@ def terminate():
         threading.Thread(target=shutdown, args=(server_thread, )).start()
         return "Shutting down RESTful API gateway server"

+    @app.route("/healthz", methods=["GET"])
+    def healthz():
+        return "ok"
+
     api = Api(app)
     path = "/{}/{}".format(RESTFUL_API_PATH, deployment_name)
     api.add_resource(RestfulGatewayService, path)

From ab5b2ba234926352c72122b95d59bac13ee503b0 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Thu, 22 Aug 2024 15:49:24 -0700
Subject: [PATCH 07/35] Update in advance of pydantic PR

Signed-off-by: Logan Adams
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 3a4036fb..0d91a54c 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.2.5
+0.3.0

From b285e81de8dd866d183582e3a00a55ca684602fc Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Thu, 22 Aug 2024 15:52:00 -0700
Subject: [PATCH 08/35] Pydantic v2 migration (#423)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Abhishek Kulkarni
Co-authored-by: Logan Adams
Signed-off-by: Logan Adams
---
 docs/requirements.txt | 4 +-
 mii/api.py | 10 +-
 mii/backend/client.py | 2 +-
 mii/backend/server.py | 19 ++-
 mii/config.py | 125 ++++++++--------
 mii/legacy/client.py | 2 +-
 mii/legacy/config.py | 179 ++++++++++++------------
 mii/legacy/deployment.py | 18 +--
 mii/legacy/pydantic_v1.py | 16 ---
 mii/legacy/server.py | 15 +-
 mii/legacy/utils.py | 2 +-
 mii/pydantic_v1.py | 16 ---
 mii/score/generate.py | 2 +-
 requirements/requirements.txt | 4 +-
 tests/legacy/test_config.py | 10 +-
 tests/legacy/test_deployment_options.py | 4 +-
 tests/test_arg_parsing.py | 24 ++--
 17 files changed, 207 insertions(+), 245 deletions(-)
 delete mode 100644 mii/legacy/pydantic_v1.py
 delete mode 100644 mii/pydantic_v1.py

diff --git a/docs/requirements.txt b/docs/requirements.txt
index e2a2fd67..1afb6a65 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,6 @@
 asyncio
-autodoc_pydantic<2.0.0
-deepspeed>=0.13.0
+autodoc_pydantic>=2.0.0
+deepspeed>=0.15.0
 grpcio
 grpcio-tools
 sphinx==7.1.2

diff --git a/mii/api.py b/mii/api.py
index 77ed6e19..841f7624 100644
--- a/mii/api.py
+++ b/mii/api.py
@@ -39,7 +39,7 @@ def _parse_kwargs_to_model_config(
     # Fill model_config dict with relevant kwargs, store remaining kwargs in a new dict
     remaining_kwargs = {}
     for key, val in kwargs.items():
-        if key in ModelConfig.__dict__["__fields__"]:
+        if key in ModelConfig.model_fields.keys():
             if key in model_config:
                 assert (
                     model_config.get(key) == val
@@ -77,7 +77,7 @@ def _parse_kwargs_to_mii_config(
     # Fill mii_config dict with relevant kwargs, raise error on unknown kwargs
     for key, val in remaining_kwargs.items():
-        if key in MIIConfig.__dict__["__fields__"]:
+        if key in MIIConfig.model_fields.keys():
             if key in mii_config:
                 assert (
                     mii_config.get(key) == val
@@ -183,9 +183,9 @@ def serve(
         mii.aml_related.utils.generate_aml_scripts(
             acr_name=acr_name,
             deployment_name=mii_config.deployment_name,
-            model_name=mii_config.model_config.model,
-            task_name=mii_config.model_config.task,
-            replica_num=mii_config.model_config.replica_num,
+            model_name=mii_config.model_conf.model,
+            task_name=mii_config.model_conf.task,
+            replica_num=mii_config.model_conf.replica_num,
             instance_type=mii_config.instance_type,
             version=mii_config.version,
         )

diff --git a/mii/backend/client.py b/mii/backend/client.py
index cb4acc17..d946fce6 100644
--- a/mii/backend/client.py
+++ b/mii/backend/client.py
@@ -37,7 +37,7 @@ class MIIClient:
     """
     def __init__(self, mii_config: MIIConfig, host: str = "localhost") -> None:
         self.mii_config = mii_config
-        self.task = mii_config.model_config.task
+        self.task = mii_config.model_conf.task
         self.port = mii_config.port_number
         self.asyncio_loop = asyncio.get_event_loop()
         channel = create_channel(host, self.port)

diff --git a/mii/backend/server.py b/mii/backend/server.py
index 02e055d5..ac51a018 100644
--- a/mii/backend/server.py
+++ b/mii/backend/server.py
@@ -20,7 +20,7 @@ def config_to_b64_str(config: DeepSpeedConfigModel) -> str:
     # convert json str -> bytes
-    json_bytes = config.json().encode()
+    json_bytes = config.model_dump_json().encode()
     # base64 encoded bytes
     b64_config_bytes = base64.urlsafe_b64encode(json_bytes)
     # bytes -> str
@@ -31,7 +31,7 @@ class MIIServer:
     """Initialize the model, setup the server for the model"""
     def __init__(self, mii_config: MIIConfig) -> None:
-        self.task = mii_config.model_config.task
+        self.task = mii_config.model_conf.task
         self.port_number = mii_config.port_number

         if not os.path.isfile(mii_config.hostfile):
@@ -47,8 +47,7 @@ def __init__(self, mii_config: MIIConfig) -> None:
         # balancer process, each DeepSpeed model replica, and optionally the
         # REST API process)
         processes = self._initialize_service(mii_config)
-        self._wait_until_server_is_live(processes,
-                                        mii_config.model_config.replica_configs)
+        self._wait_until_server_is_live(processes, mii_config.model_conf.replica_configs)

     def _wait_until_server_is_live(self,
                                    processes: List[subprocess.Popen],
@@ -143,15 +142,15 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
         ]

         host_gpus = defaultdict(list)
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             host_gpus[repl_config.hostname].extend(repl_config.gpu_indices)

         use_multiple_hosts = len(
             set(repl_config.hostname
-                for repl_config in mii_config.model_config.replica_configs)) > 1
+                for repl_config in mii_config.model_conf.replica_configs)) > 1

         # Start replica instances
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             hostfile = tempfile.NamedTemporaryFile(delete=False)
             hostfile.write(
                 f"{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n"
@@ -161,7 +160,7 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
                 use_multiple_hosts)
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "MII server",
                     ds_launch_str=ds_launch_str,
                     server_args=server_args + [
@@ -175,7 +174,7 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
         # expected to assign one GPU to one process.
         processes.append(
             self._launch_server_process(
-                mii_config.model_config,
+                mii_config.model_conf,
                 "load balancer",
                 server_args=server_args + ["--load-balancer"],
             ))
@@ -183,7 +182,7 @@ def _initialize_service(self, mii_config: MIIConfig) -> List[subprocess.Popen]:
         if mii_config.enable_restful_api:
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "restful api gateway",
                     server_args=server_args + ["--restful-gateway"],
                 ))

diff --git a/mii/config.py b/mii/config.py
index 565cdbbc..a1cafb66 100644
--- a/mii/config.py
+++ b/mii/config.py
@@ -8,27 +8,18 @@
 from deepspeed.launcher.runner import DLTS_HOSTFILE, fetch_hostfile
 from deepspeed.inference import RaggedInferenceEngineConfig
+from deepspeed.runtime.config_utils import DeepSpeedConfigModel
+from pydantic import Field, model_validator, field_validator

 from mii.constants import DeploymentType, TaskType, ModelProvider
 from mii.errors import DeploymentNotFoundError
 from mii.modeling.tokenizers import MIITokenizerWrapper
-from mii.pydantic_v1 import BaseModel, Field, root_validator, validator, Extra
-from mii.utils import generate_deployment_name, get_default_task, import_score_file
+from mii.utils import generate_deployment_name, import_score_file

 DEVICE_MAP_DEFAULT = "auto"


-class MIIConfigModel(BaseModel):
-    class Config:
-        validate_all = True
-        validate_assignment = True
-        use_enum_values = True
-        allow_population_by_field_name = True
-        extra = "forbid"
-        arbitrary_types_allowed = True
-
-
-class GenerateParamsConfig(MIIConfigModel):
+class GenerateParamsConfig(DeepSpeedConfigModel):
     """
     Options for changing text-generation behavior.
     """
@@ -39,7 +30,7 @@ class GenerateParamsConfig(MIIConfigModel):
     max_length: int = 1024
     """ Maximum length of ``input_tokens`` + ``generated_tokens``. """

-    max_new_tokens: int = None
+    max_new_tokens: Optional[int] = None
     """
     Maximum number of new tokens generated. ``max_length`` takes precedent.
     """
     min_new_tokens: int = 0
@@ -68,24 +59,25 @@ class GenerateParamsConfig(MIIConfigModel):
     stop: List[str] = []
     """ List of strings to stop generation at."""

-    @validator("stop", pre=True)
+    @field_validator("stop", mode="before")
+    @classmethod
     def make_stop_string_list(cls, field_value: Union[str, List[str]]) -> List[str]:
         if isinstance(field_value, str):
             return [field_value]
         return field_value

-    @validator("stop")
+    @field_validator("stop")
+    @classmethod
     def sort_stop_strings(cls, field_value: List[str]) -> List[str]:
         return sorted(field_value)

-    @root_validator
-    def check_prompt_length(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        prompt_length = values.get("prompt_length")
-        max_length = values.get("max_length")
-        assert max_length > prompt_length, f"max_length ({max_length}) must be greater than prompt_length ({prompt_length})"
-        return values
+    @model_validator(mode="after")
+    def check_prompt_length(self) -> "GenerateParamsConfig":
+        assert self.max_length > self.prompt_length, f"max_length ({self.max_length}) must be greater than prompt_length ({self.prompt_length})"
+        return self

-    @root_validator
+    @model_validator(mode="before")
+    @classmethod
     def set_max_new_tokens(cls, values: Dict[str, Any]) -> Dict[str, Any]:
         max_length = values.get("max_length")
         max_new_tokens = values.get("max_new_tokens")
@@ -94,19 +86,16 @@ def set_max_new_tokens(cls, values: Dict[str, Any]) -> Dict[str, Any]:
             values["max_new_tokens"] = max_length - prompt_length
         return values

-    class Config:
-        extra = Extra.forbid
-

-class ReplicaConfig(MIIConfigModel):
+class ReplicaConfig(DeepSpeedConfigModel):
     hostname: str = ""
     tensor_parallel_ports: List[int] = []
-    torch_dist_port: int = None
+    torch_dist_port: Optional[int] = None
     gpu_indices: List[int] = []
-    zmq_port: int = None
+    zmq_port: Optional[int] = None


-class ModelConfig(MIIConfigModel):
+class ModelConfig(DeepSpeedConfigModel):
     model_name_or_path: str
     """
     Model name or path of the model to HuggingFace model to be deployed.
     """
@@ -192,8 +181,9 @@ class ModelConfig(MIIConfigModel):
     def provider(self) -> ModelProvider:
         return ModelProvider.HUGGING_FACE

-    @validator("device_map", pre=True)
-    def make_device_map_dict(cls, v):
+    @field_validator("device_map", mode="before")
+    @classmethod
+    def make_device_map_dict(cls, v: Any) -> Dict:
         if isinstance(v, int):
             return {"localhost": [[v]]}
         if isinstance(v, list) and isinstance(v[0], int):
@@ -202,36 +192,36 @@ def make_device_map_dict(cls, v):
             return {"localhost": v}
         return v

-    @root_validator
+    @model_validator(mode="before")
+    @classmethod
     def auto_fill_values(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        assert values.get("model_name_or_path"), "model_name_or_path must be provided"
         if not values.get("tokenizer"):
             values["tokenizer"] = values.get("model_name_or_path")
-        if not values.get("task"):
-            values["task"] = get_default_task(values.get("model_name_or_path"))
+        #if not values.get("task"):
+        #    values["task"] = get_default_task(values.get("model_name_or_path"))
+        values["task"] = TaskType.TEXT_GENERATION
         return values

-    @root_validator
-    def propagate_tp_size(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        tensor_parallel = values.get("tensor_parallel")
-        values.get("inference_engine_config").tensor_parallel.tp_size = tensor_parallel
-        return values
-
-    @root_validator
-    def propagate_quantization_mode(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        quantization_mode = values.get("quantization_mode")
-        values.get(
-            "inference_engine_config").quantization.quantization_mode = quantization_mode
-        return values
+    @model_validator(mode="after")
+    def propagate_tp_size(self) -> "ModelConfig":
+        self.inference_engine_config.tensor_parallel.tp_size = self.tensor_parallel
+        return self

-    @root_validator
-    def check_replica_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        num_replica_config = len(values.get("replica_configs"))
+    @model_validator(mode="after")
+    def check_replica_config(self) -> "ModelConfig":
+        num_replica_config = len(self.replica_configs)
         if num_replica_config > 0:
-            assert num_replica_config == values.get("replica_num"), "Number of replica configs must match replica_num"
-        return values
+            assert num_replica_config == self.replica_num, "Number of replica configs must match replica_num"
+        return self
+
+    @model_validator(mode="after")
+    def propagate_quantization_mode(self) -> "ModelConfig":
+        self.inference_engine_config.quantization.quantization_mode = self.quantization_mode
+        return self


-class MIIConfig(MIIConfigModel):
+class MIIConfig(DeepSpeedConfigModel):
     deployment_name: str = ""
     """
     Name of the deployment. Used as an identifier for obtaining a inference
@@ -245,7 +235,7 @@ class MIIConfig(MIIConfigModel):
     * `AML` will generate the assets necessary to deploy on AML resources.
     """

-    model_config: ModelConfig
+    model_conf: ModelConfig = Field(alias="model_config")
     """
     Configuration for the deployed model(s).
     """
@@ -290,17 +280,18 @@ class MIIConfig(MIIConfigModel):
     """
     AML instance type to use when create AML deployment assets.
""" - @root_validator(skip_on_failure=True) - def AML_name_valid(cls, values: Dict[str, Any]) -> Dict[str, Any]: - if values.get("deployment_type") == DeploymentType.AML: + @model_validator(mode="after") + def AML_name_valid(self) -> "MIIConfig": + if self.deployment_type == DeploymentType.AML: allowed_chars = set(string.ascii_lowercase + string.ascii_uppercase + string.digits + "-") assert ( - set(values.get("deployment_name")) <= allowed_chars + set(self.deployment_name) <= allowed_chars ), "AML deployment names can only contain a-z, A-Z, 0-9, and '-'." - return values + return self - @root_validator(skip_on_failure=True) + @model_validator(mode="before") + @classmethod def check_deployment_name(cls, values: Dict[str, Any]) -> Dict[str, Any]: deployment_name = values.get("deployment_name") if not deployment_name: @@ -311,14 +302,14 @@ def check_deployment_name(cls, values: Dict[str, Any]) -> Dict[str, Any]: return values def generate_replica_configs(self) -> None: - if self.model_config.replica_configs: + if self.model_conf.replica_configs: return - torch_dist_port = self.model_config.torch_dist_port - tensor_parallel = self.model_config.tensor_parallel + torch_dist_port = self.model_conf.torch_dist_port + tensor_parallel = self.model_conf.tensor_parallel replica_pool = _allocate_devices(self.hostfile, tensor_parallel, - self.model_config.replica_num, - self.model_config.device_map) + self.model_conf.replica_num, + self.model_conf.device_map) replica_configs = [] for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled @@ -332,10 +323,10 @@ def generate_replica_configs(self) -> None: tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=replica_torch_dist_port, gpu_indices=gpu_indices, - zmq_port=self.model_config.zmq_port_number + i, + zmq_port=self.model_conf.zmq_port_number + i, )) - self.model_config.replica_configs = replica_configs + self.model_conf.replica_configs = replica_configs def _allocate_devices(hostfile_path: str, diff --git a/mii/legacy/client.py b/mii/legacy/client.py index 0a03d810..2f299eb1 100644 --- a/mii/legacy/client.py +++ b/mii/legacy/client.py @@ -37,7 +37,7 @@ def mii_query_handle(deployment_name): return MIINonPersistentClient(task, deployment_name) mii_config = _get_mii_config(deployment_name) - return MIIClient(mii_config.model_config.task, + return MIIClient(mii_config.model_conf.task, "localhost", # TODO: This can probably be removed mii_config.port_number) diff --git a/mii/legacy/config.py b/mii/legacy/config.py index 793c976f..e149cc7a 100644 --- a/mii/legacy/config.py +++ b/mii/legacy/config.py @@ -5,20 +5,21 @@ import torch import os import string +from pydantic import field_validator, model_validator, Field from typing import List, Optional, Dict, Any -import mii.legacy as mii -from .constants import DeploymentType, TaskType, ModelProvider, MII_MODEL_PATH_DEFAULT -from .pydantic_v1 import validator, root_validator, Field from deepspeed.runtime.config_utils import DeepSpeedConfigModel from deepspeed.inference.config import DtypeEnum from deepspeed.launcher.runner import DLTS_HOSTFILE, fetch_hostfile +import mii.legacy as mii +from .constants import DeploymentType, TaskType, ModelProvider, MII_MODEL_PATH_DEFAULT + class ReplicaConfig(DeepSpeedConfigModel): hostname: str = "" tensor_parallel_ports: List[int] = [] - torch_dist_port: int = None + torch_dist_port: Optional[int] = None gpu_indices: List[int] = [] @@ -39,7 +40,7 @@ class ModelConfig(DeepSpeedConfigModel): 'text-to-image']`` 
""" - dtype: DtypeEnum = DtypeEnum.fp32 + dtype: torch.dtype = torch.float32 """ Desired model data type, will convert model to this type. Supported target types: `torch.half`, `torch.float`, `torch.int8` (for BLOOM models) @@ -102,9 +103,12 @@ class ModelConfig(DeepSpeedConfigModel): hf_auth_token: Optional[str] = Field( None, - deprecated=True, - deprecated_msg= - "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation.", + json_schema_extra={ + "deprecated": + True, + "deprecated_msg": + "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation." + }, ) """ HuggingFace authentication token for accessing models. Will be propagated @@ -113,9 +117,12 @@ class ModelConfig(DeepSpeedConfigModel): trust_remote_code: bool = Field( False, - deprecated=True, - deprecated_msg= - "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation.", + json_schema_extra={ + "deprecated": + True, + "deprecated_msg": + "Parameter will be removed. Please use the `pipeline_kwargs` field to pass kwargs to the HuggingFace pipeline creation." + }, ) """ HuggingFace `tranformer.pipeline` option for `trust_remote_code`. @@ -168,15 +175,13 @@ class ModelConfig(DeepSpeedConfigModel): the input and output tokens. Please consider increasing it to the required token-length required for your use-case. """ - class Config: - json_encoders = {torch.dtype: lambda x: str(x)} - @property def provider(self): return mii.utils.get_provider(self.model, self.task) - @validator("checkpoint_dict") - def checkpoint_dict_valid(cls, field_value, values): + @field_validator("checkpoint_dict", mode="after") + @classmethod + def checkpoint_dict_valid(cls, field_value): if field_value is None: return field_value for k in ["checkpoints", "version", "type", "base_dir"]: @@ -184,51 +189,56 @@ def checkpoint_dict_valid(cls, field_value, values): raise ValueError(f"Missing key={k} in checkpoint_dict") return field_value - @validator("deploy_rank", pre=True) - def deploy_rank_to_list(cls, field_value, values): + @field_validator("deploy_rank", mode="before") + @classmethod + def deploy_rank_to_list(cls, field_value): if field_value and not isinstance(field_value, list): field_value = [field_value] return field_value - @root_validator - def zero_or_meta(cls, values): - if values.get("enable_zero"): - assert not values.get( - "meta_tensor" - ), "ZeRO-Inference does not support meta tensors." - return values + @field_validator("dtype", mode="before") + def validate_dtype(cls, field_value, values): + if isinstance(field_value, str): + return DtypeEnum.from_str(field_value).value[0] + if isinstance(field_value, torch.dtype): + return field_value + raise TypeError(f"Invalid type for dtype: {type(field_value)}") - @root_validator - def bloom_model_valid(cls, values): - if "bigscience/bloom" in values.get("model"): + @model_validator(mode="after") + def zero_or_meta(self): + if self.enable_zero: + assert not self.meta_tensor, "ZeRO-Inference does not support meta tensors." + return self + + @model_validator(mode="after") + def bloom_model_valid(self): + if "bigscience/bloom" in self.model: # TODO: SHould be albe to use DtypeEnum here - assert values.get("dtype") in [ + assert self.dtype in [ torch.int8, torch.float16, ], "Bloom models only support fp16/int8." - assert not values.get( - "enable_cuda_graph" - ), "Bloom models do not support CUDA Graph." 
-        return values
+            assert not self.enable_cuda_graph, "Bloom models do not support CUDA Graph."
+        return self

-    @root_validator
-    def deploy_rank_valid(cls, values):
-        tensor_parallel = values.get("tensor_parallel")
-        deploy_rank = values.get("deploy_rank")
+    @model_validator(mode="after")
+    def deploy_rank_valid(self):
+        deploy_rank = self.deploy_rank

         # if deploy rank is not given, default to align with TP value
         if deploy_rank is None:
-            deploy_rank = list(range(tensor_parallel))
+            deploy_rank = list(range(self.tensor_parallel))

         # number of ranks provided must be equal to TP size, DP is handled outside MII currently
-        assert tensor_parallel == len(
+        assert self.tensor_parallel == len(
             deploy_rank
-        ), f"{len(deploy_rank)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {tensor_parallel}"
+        ), f"{len(deploy_rank)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {self.tensor_parallel}"

-        values["deploy_rank"] = deploy_rank
-        return values
+        self.__dict__["deploy_rank"] = deploy_rank
+        return self

-    @root_validator
+    @model_validator(mode="before")
+    @classmethod
     def set_model_path(cls, values):
         model_path = values.get("model_path")
         if not model_path:
@@ -249,54 +259,47 @@ def set_model_path(cls, values):
         values["model_path"] = model_path
         return values

-    @root_validator
-    def validate_model_and_task(cls, values):
-        task = values.get("task")
-        model = values.get("model")
-        if not values.get("skip_model_check"):
-            mii.utils.check_if_task_and_model_is_valid(task, model)
-            if values.get("enable_deepspeed"):
-                mii.utils.check_if_task_and_model_is_supported(task, model)
-            # Skip any future checks
-            values["skip_model_check"] = True
-        return values
+    @model_validator(mode="after")
+    def validate_model_and_task(self):
+        if not self.skip_model_check:
+            mii.utils.check_if_task_and_model_is_valid(self.task, self.model)
+            mii.utils.check_if_task_and_model_is_supported(self.task, self.model)
+        return self

-    @root_validator
-    def meta_tensor_or_sys_mem(cls, values):
-        if values.get("meta_tensor") and values.get("load_with_sys_mem"):
+    @model_validator(mode="after")
+    def meta_tensor_or_sys_mem(self):
+        if self.meta_tensor and self.load_with_sys_mem:
             raise ValueError(
                 "`meta_tensor` and `load_with_sys_mem` cannot be active at the same time."
             )
-        return values
-
-    @root_validator
-    def sys_mem_and_diffusers(cls, values):
-        if values.get("load_with_sys_mem"):
-            model = values.get("model")
-            task = values.get("task")
-            assert not (mii.utils.get_provider(model, task) == ModelProvider.DIFFUSERS), "`load_with_sys_mem` is not support with Stable Diffusion"
-        return values
-
-    @root_validator
-    def zero_dtype_valid(cls, values):
-        if values.get("enable_zero"):
-            if values.get("ds_config").get("fp16", {}).get("enabled", False):
+        return self
+
+    @model_validator(mode="after")
+    def sys_mem_and_diffusers(self):
+        if self.load_with_sys_mem:
+            assert not (mii.utils.get_provider(self.model, self.task) == ModelProvider.DIFFUSERS), "`load_with_sys_mem` is not support with Stable Diffusion"
+        return self
+
+    @model_validator(mode="after")
+    def zero_dtype_valid(self):
+        if self.enable_zero:
+            if self.ds_config.get("fp16", {}).get("enabled", False):
                 # TODO: We should be able to use DtypeEnum instead of torch.float
                 assert (
-                    values.get("dtype") == torch.float16
+                    self.dtype == torch.float16
                 ), "ZeRO FP16 enabled, `dtype` must be set to `torch.float16`"
             else:
                 assert (
-                    values.get("dtype") == torch.float32
+                    self.dtype == torch.float32
                 ), "ZeRO FP16 disabled, `dtype` must be set to `torch.float32`"
-        return values
+        return self

-    @root_validator
-    def deepspeed_or_zero(cls, values):
+    @model_validator(mode="after")
+    def deepspeed_or_zero(self):
         assert not (
-            values.get("enable_deepspeed") and values.get("enable_zero")
+            self.enable_deepspeed and self.enable_zero
         ), "DeepSpeed and ZeRO cannot both be enabled, select only one"
-        return values
+        return self


 class MIIConfig(DeepSpeedConfigModel):
@@ -314,7 +317,7 @@ class MIIConfig(DeepSpeedConfigModel):
     * `AML` will generate the assets necessary to deploy on AML resources.
     """

-    model_config: ModelConfig
+    model_conf: ModelConfig
     """
     Configuration for the deployed model(s).
     """
@@ -349,23 +352,23 @@ class MIIConfig(DeepSpeedConfigModel):
     """
     AML instance type to use when create AML deployment assets.
     """

-    @root_validator(skip_on_failure=True)
-    def AML_name_valid(cls, values):
-        if values.get("deployment_type") == DeploymentType.AML:
+    @model_validator(mode="after")
+    def AML_name_valid(self):
+        if self.deployment_type == DeploymentType.AML:
             allowed_chars = set(string.ascii_lowercase + string.ascii_uppercase +
                                 string.digits + "-")
             assert (
-                set(values.get("deployment_name")) <= allowed_chars
+                set(self.deployment_name) <= allowed_chars
             ), "AML deployment names can only contain a-z, A-Z, 0-9, and '-'."
-        return values
+        return self

     def generate_replica_configs(self):
         # TODO: refactor this function
         hostfile = self.hostfile
         port_number = self.port_number
-        torch_dist_port = self.model_config.torch_dist_port
-        tensor_parallel = self.model_config.tensor_parallel
-        replica_num = self.model_config.replica_num
+        torch_dist_port = self.model_conf.torch_dist_port
+        tensor_parallel = self.model_conf.tensor_parallel
+        replica_num = self.model_conf.replica_num
         replica_pool = _allocate_processes(hostfile, tensor_parallel, replica_num)
         replica_configs = []
         for i, (hostname, gpu_indices) in enumerate(replica_pool):
@@ -382,7 +385,7 @@ def generate_replica_configs(self):
                     gpu_indices=gpu_indices,
                 ))

-        self.model_config.replica_configs = replica_configs
+        self.model_conf.replica_configs = replica_configs


 def _allocate_processes(hostfile_path, tensor_parallel, replica_num):

diff --git a/mii/legacy/deployment.py b/mii/legacy/deployment.py
index 59954901..b8b0753f 100644
--- a/mii/legacy/deployment.py
+++ b/mii/legacy/deployment.py
@@ -37,7 +37,7 @@ def support_legacy_api(
     }
     # TODO do this in a single for loop
     for key, val in mii_config.items():
-        if key not in MIIConfig.__dict__["__fields__"]:
+        if key not in MIIConfig.fields.keys():
             model_config[key] = val
     mii_config = {
         k: v
@@ -68,10 +68,10 @@ def deploy(
         model_config, mii_config = support_legacy_api(*args, **kwargs)

     mii_config["deployment_name"] = deployment_name
-    mii_config["model_config"] = model_config
+    mii_config["model_conf"] = model_config
     mii_config = mii.config.MIIConfig(**mii_config)

-    if mii_config.model_config.enable_deepspeed:
+    if mii_config.model_conf.enable_deepspeed:
         logger.info(
             "************* MII is using DeepSpeed Optimizations to accelerate your model *************"
         )
@@ -100,9 +100,9 @@ def _deploy_aml(mii_config):
     mii.aml_related.utils.generate_aml_scripts(
         acr_name=acr_name,
         deployment_name=mii_config.deployment_name,
-        model_name=mii_config.model_config.model,
-        task_name=mii_config.model_config.task,
-        replica_num=mii_config.model_config.replica_num,
+        model_name=mii_config.model_conf.model,
+        task_name=mii_config.model_conf.task,
+        replica_num=mii_config.model_conf.replica_num,
         instance_type=mii_config.instance_type,
         version=mii_config.version,
     )
@@ -115,10 +115,10 @@ def _deploy_aml(mii_config):
 def _deploy_nonpersistent(mii_config):
     assert (
         int(os.getenv("WORLD_SIZE", "1"))
-        == mii_config.model_config.tensor_parallel
+        == mii_config.model_conf.tensor_parallel
     ), "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `"
     deployment_name = mii_config.deployment_name
     mii.non_persistent_models[deployment_name] = (
-        load_models(mii_config.model_config),
-        mii_config.model_config.task,
+        load_models(mii_config.model_conf),
+        mii_config.model_conf.task,
     )

diff --git a/mii/legacy/pydantic_v1.py b/mii/legacy/pydantic_v1.py
deleted file mode 100644
index 6aba072a..00000000
--- a/mii/legacy/pydantic_v1.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Pydantic v1 compatibility module.
-
-Pydantic v2 introduced breaking changes that hinder its adoption:
-https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to
-migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module
-as a pydantic-version-agnostic alias for pydantic's v1 API.
-"""
-
-try:
-    from pydantic.v1 import *  # noqa: F401
-except ImportError:
-    from pydantic import *  # noqa: F401

diff --git a/mii/legacy/server.py b/mii/legacy/server.py
index 8a66f3ec..75ba24fe 100644
--- a/mii/legacy/server.py
+++ b/mii/legacy/server.py
@@ -28,7 +28,7 @@ class MIIServer:
     """Initialize the model, setup the server for the model under model_path"""
     def __init__(self, mii_config):

-        self.task = mii_config.model_config.task
+        self.task = mii_config.model_conf.task

         self.num_gpus = get_num_gpus(mii_config)
         assert self.num_gpus > 0, "GPU count must be greater than 0"
@@ -44,8 +44,7 @@ def __init__(self, mii_config):
         mii_config.generate_replica_configs()

         processes = self._initialize_service(mii_config)
-        self._wait_until_server_is_live(processes,
-                                        mii_config.model_config.replica_configs)
+        self._wait_until_server_is_live(processes, mii_config.model_conf.replica_configs)

     def _wait_until_server_is_live(self, processes, deployment):
         for process, repl_config in zip(processes, deployment):
@@ -128,11 +127,11 @@ def _initialize_service(self, mii_config):
         ]

         host_gpus = defaultdict(list)
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             host_gpus[repl_config.hostname].extend(repl_config.gpu_indices)

         # Start replica instances
-        for repl_config in mii_config.model_config.replica_configs:
+        for repl_config in mii_config.model_conf.replica_configs:
             hostfile = tempfile.NamedTemporaryFile(delete=False)
             hostfile.write(
                 f"{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n"
             )
             ds_launch_str = self._generate_ds_launch_str(repl_config, hostfile.name)
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "MII server",
                     ds_launch_str=ds_launch_str,
                     server_args=server_args +
@@ -153,7 +152,7 @@ def _initialize_service(self, mii_config):
         # expected to assign one GPU to one process.
         processes.append(
             self._launch_server_process(
-                mii_config.model_config,
+                mii_config.model_conf,
                 "load balancer",
                 server_args=server_args + ["--load-balancer"],
             ))
@@ -161,7 +160,7 @@ def _initialize_service(self, mii_config):
         if mii_config.enable_restful_api:
             processes.append(
                 self._launch_server_process(
-                    mii_config.model_config,
+                    mii_config.model_conf,
                     "restful api gateway",
                     server_args=server_args + ["--restful-gateway"],
                 ))

diff --git a/mii/legacy/utils.py b/mii/legacy/utils.py
index f1a7cb59..8d574ad9 100644
--- a/mii/legacy/utils.py
+++ b/mii/legacy/utils.py
@@ -179,7 +179,7 @@ def extract_query_dict(task, request_dict):


 def get_num_gpus(mii_config):
-    num_gpus = mii_config.model_config.tensor_parallel
+    num_gpus = mii_config.model_conf.tensor_parallel

     assert (
         torch.cuda.device_count() >= num_gpus

diff --git a/mii/pydantic_v1.py b/mii/pydantic_v1.py
deleted file mode 100644
index 6aba072a..00000000
--- a/mii/pydantic_v1.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Pydantic v1 compatibility module.
-
-Pydantic v2 introduced breaking changes that hinder its adoption:
-https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to
-migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module
-as a pydantic-version-agnostic alias for pydantic's v1 API.
-"""
-
-try:
-    from pydantic.v1 import *  # noqa: F401
-except ImportError:
-    from pydantic import *  # noqa: F401

diff --git a/mii/score/generate.py b/mii/score/generate.py
index a34a96c6..978a635b 100644
--- a/mii/score/generate.py
+++ b/mii/score/generate.py
@@ -19,7 +19,7 @@ def create_score_file(mii_config):
         score_src = fd.read()

     # update score file w. global config dict
-    config_dict = mii_config.dict()
+    config_dict = mii_config.model_dump()
     source_with_config = f"{score_src}\n"
     source_with_config += f"mii_config = {pprint.pformat(config_dict, indent=4)}"

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index b4191e29..11cf6b83 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,12 +1,12 @@
 accelerate
 asyncio
-deepspeed>=0.14.0
+deepspeed>=0.15.0
 deepspeed-kernels
 Flask-RESTful
 grpcio
 grpcio-tools
 Pillow
-pydantic
+pydantic>=2.0.0
 pyzmq
 safetensors
 torch

diff --git a/tests/legacy/test_config.py b/tests/legacy/test_config.py
index bc2ca1fd..f99b2524 100644
--- a/tests/legacy/test_config.py
+++ b/tests/legacy/test_config.py
@@ -6,24 +6,24 @@
 import pytest

 import mii.legacy as mii
-from mii.legacy import pydantic_v1
+from pydantic import ValidationError


 @pytest.mark.parametrize("port_number", [12345])
 @pytest.mark.parametrize("tensor_parallel", [4])
 def test_base_configs(deployment_name, mii_config, model_config):
     mii_config["deployment_name"] = deployment_name
-    mii_config["model_config"] = model_config
+    mii_config["model_conf"] = model_config
     mii_config = mii.config.MIIConfig(**mii_config)

     assert mii_config.port_number == 12345
-    assert mii_config.model_config.tensor_parallel == 4
+    assert mii_config.model_conf.tensor_parallel == 4


 @pytest.mark.parametrize("port_number", ["fail"])
 @pytest.mark.parametrize("tensor_parallel", [3.5])
 def test_base_configs_literalfail(deployment_name, mii_config, model_config):
-    with pytest.raises(pydantic_v1.ValidationError):
+    with pytest.raises(ValidationError):
         mii_config["deployment_name"] = deployment_name
-        mii_config["model_config"] = model_config
+        mii_config["model_conf"] = model_config
         mii_config = mii.config.MIIConfig(**mii_config)

diff --git a/tests/legacy/test_deployment_options.py b/tests/legacy/test_deployment_options.py
index e60ebcd7..2cda7a6f 100644
--- a/tests/legacy/test_deployment_options.py
+++ b/tests/legacy/test_deployment_options.py
@@ -7,7 +7,7 @@
 import json
 import requests
 import mii.legacy as mii
-from mii.legacy import pydantic_v1
+from pydantic import ValidationError


 @pytest.mark.deepspeed
@@ -81,7 +81,7 @@ def test_zero_config(deployment, query):


 @pytest.mark.deepspeed
-@pytest.mark.parametrize("expected_failure", [pydantic_v1.ValidationError])
+@pytest.mark.parametrize("expected_failure", [ValidationError])
 @pytest.mark.parametrize(
     "enable_deepspeed, enable_zero, dtype",
     [(True,

diff --git a/tests/test_arg_parsing.py b/tests/test_arg_parsing.py
index 640512ae..957b1eeb 100644
--- a/tests/test_arg_parsing.py
+++ b/tests/test_arg_parsing.py
@@ -5,31 +5,33 @@

 import pytest

+from pydantic import ValidationError
+
 from mii.api import _parse_kwargs_to_model_config, _parse_kwargs_to_mii_config
 from mii.errors import UnknownArgument


 def test_model_name_or_path():
     # model_name_or_path is required
-    with pytest.raises(ValueError):
+    with pytest.raises(ValidationError):
         _parse_kwargs_to_mii_config()

-    with pytest.raises(ValueError):
+    with pytest.raises(ValidationError):
         _parse_kwargs_to_model_config()

     # passing model_name_or_path as positional arg
     mii_config = _parse_kwargs_to_mii_config("test")
-    assert mii_config.model_config.model_name_or_path == "test"
+    assert mii_config.model_conf.model_name_or_path == "test"

     model_config, _ = _parse_kwargs_to_model_config("test")
     assert model_config.model_name_or_path == "test"

     # passing model_name_or_path in model_config
     mii_config = _parse_kwargs_to_mii_config(model_config={"model_name_or_path": "test"})
-    assert mii_config.model_config.model_name_or_path == "test"
+    assert mii_config.model_conf.model_name_or_path == "test"

     mii_config = _parse_kwargs_to_mii_config(
         mii_config={"model_config": {
             "model_name_or_path": "test"
         }})
-    assert mii_config.model_config.model_name_or_path == "test"
+    assert mii_config.model_conf.model_name_or_path == "test"

     model_config, _ = _parse_kwargs_to_model_config(
         model_config={"model_name_or_path": "test"}
     )
@@ -53,8 +55,8 @@ def test_only_kwargs():
     mii_config = _parse_kwargs_to_mii_config("test",
                                              tensor_parallel=2,
                                              enable_restful_api=True)
-    assert mii_config.model_config.model_name_or_path == "test"
-    assert mii_config.model_config.tensor_parallel == 2
+    assert mii_config.model_conf.model_name_or_path == "test"
+    assert mii_config.model_conf.tensor_parallel == 2
     assert mii_config.enable_restful_api is True

     model_config, _ = _parse_kwargs_to_model_config("test", tensor_parallel=2)
@@ -70,8 +72,8 @@ def test_only_config_dicts():
             "tensor_parallel": 2
         },
     )
-    assert mii_config.model_config.model_name_or_path == "test"
-    assert mii_config.model_config.tensor_parallel == 2
+    assert mii_config.model_conf.model_name_or_path == "test"
+    assert mii_config.model_conf.tensor_parallel == 2
     assert mii_config.enable_restful_api is True

     mii_config = _parse_kwargs_to_mii_config(
@@ -82,8 +84,8 @@ def test_only_config_dicts():
                 "tensor_parallel": 2
             },
         })
-    assert mii_config.model_config.model_name_or_path == "test"
-    assert mii_config.model_config.tensor_parallel == 2
+    assert mii_config.model_conf.model_name_or_path == "test"
+    assert mii_config.model_conf.tensor_parallel == 2
     assert mii_config.enable_restful_api is True

     model_config, _ = _parse_kwargs_to_model_config(

From 3ab2d05c58ea55bc987ce26bbfa002afec8d286e Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Thu, 22 Aug 2024 15:53:45 -0700
Subject: [PATCH 09/35] Update version.txt after 0.3.0 release (#520)

Co-authored-by: loadams
Signed-off-by: Logan Adams
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 0d91a54c..9e11b32f 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.0
+0.3.1

From 9bc5f966977fc669eadfb6d07952fcee93345ea9 Mon Sep 17 00:00:00 2001
From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Date: Thu, 22 Aug 2024 16:39:05 -0700
Subject: [PATCH 10/35] Update supported model list (#519)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Signed-off-by: Logan Adams
---
 README.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 476c7cc1..d4e2d3c9 100644
--- a/README.md
+++ b/README.md
@@ -89,14 +89,16 @@ MII currently supports over 37,000 models across eight popular model architectur

 model family | size range | ~model count
 ------ | ------ | ------
-[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 500
-[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 52,000
-[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 1,200
-[mistral](https://huggingface.co/models?other=mistral) | 7B | 23,000
-[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 2,900
-[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 2,100
-[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 1,500
-[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 500
+[Falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 600
+[Llama](https://huggingface.co/models?other=llama) | 7B - 65B | 57,000
+[Llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 1,200
+[Llama-3](https://huggingface.co/models?other=llama-3) | 8B - 405B | 1,600
+[Mistral](https://huggingface.co/models?other=mistral) | 7B | 23,000
+[Mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 2,900
+[OPT](https://huggingface.co/models?other=opt) | 0.1B - 66B | 2,200
+[Phi-2](https://huggingface.co/models?other=phi) | 2.7B | 1,500
+[Qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 500
+[Qwen2](https://huggingface.co/models?other=qwen2) | 0.5B - 72B | 3700

 ## MII Legacy Model Support

From 7185697604897181508dd54c474df71c0e4851b5 Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <11399+adk9@users.noreply.github.com>
Date: Tue, 3 Sep 2024 13:21:19 -0700
Subject: [PATCH 11/35] Enable streaming option in the OpenAI API server (#480)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Logan Adams
Signed-off-by: Logan Adams
---
 mii/entrypoints/api_server.py | 27 +++---
 mii/entrypoints/data_models.py | 2 +-
 mii/entrypoints/openai_api_server.py | 140 +++++++++++++--------------
 requirements/requirements.txt | 3 +
 4 files changed, 87 insertions(+), 85 deletions(-)

diff --git a/mii/entrypoints/api_server.py b/mii/entrypoints/api_server.py
index 2a2bc84f..aac16b81 100644
--- a/mii/entrypoints/api_server.py
+++ b/mii/entrypoints/api_server.py
@@ -6,13 +6,14 @@
 import json
 import grpc
 import argparse
+from typing import AsyncGenerator

 # Third-party imports
 import uvicorn
 import mii
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response
+from fastapi.responses import StreamingResponse, JSONResponse, Response
 from mii.grpc_related.proto.modelresponse_pb2_grpc import ModelResponseStub
 from mii.grpc_related.proto import modelresponse_pb2
 from mii.utils import kwarg_dict_to_proto
@@ -81,18 +82,18 @@ async def generate(request: CompletionRequest) -> Response:

     # Streaming case
     if request.stream:
-        return JSONResponse({"error": "Streaming is not yet supported."},
-                            status_code=400)
-        # async def StreamResults() -> AsyncGenerator[bytes, None]:
-        #     # Send an empty chunk to start the stream and prevent timeout
-        #     yield ""
-        #     async for response_chunk in stub.GeneratorReplyStream(requestData):
-        #         # Send the response chunk
-        #         responses = [obj.response for obj in response_chunk.response]
-        #         dataOut = {"text": responses}
-        #         yield f"data: {json.dumps(dataOut)}\n\n"
-        #     yield f"data: [DONE]\n\n"
-        # return StreamingResponse(StreamResults(), media_type="text/event-stream")
+
+        async def StreamResults() -> AsyncGenerator[bytes, None]:
+            # Send an empty chunk to start the stream and prevent timeout
+            yield ""
+            async for response_chunk in stub.GeneratorReplyStream(requestData):
+                # Send the response chunk
+                responses = [obj.response for obj in response_chunk.response]
+                dataOut = {"text": responses}
+                yield f"data: {json.dumps(dataOut)}\n\n"
+            yield f"data: [DONE]\n\n"
+
+        return StreamingResponse(StreamResults(),
media_type="text/event-stream") # Non-streaming case responseData = await stub.GeneratorReply(requestData) diff --git a/mii/entrypoints/data_models.py b/mii/entrypoints/data_models.py index 9bba1342..190e486c 100644 --- a/mii/entrypoints/data_models.py +++ b/mii/entrypoints/data_models.py @@ -9,7 +9,7 @@ import time import shortuuid -from pydantic import BaseModel, BaseSettings, Field +from mii.pydantic_v1 import BaseModel, BaseSettings, Field class ErrorResponse(BaseModel): diff --git a/mii/entrypoints/openai_api_server.py b/mii/entrypoints/openai_api_server.py index 26f42be2..c8df3d6c 100644 --- a/mii/entrypoints/openai_api_server.py +++ b/mii/entrypoints/openai_api_server.py @@ -10,14 +10,14 @@ import argparse import json import os -from typing import Optional, List, Union +from typing import AsyncGenerator, Optional, List, Union from transformers import AutoTokenizer import codecs from fastapi import FastAPI, Depends, HTTPException, Response from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse +from fastapi.responses import StreamingResponse, JSONResponse from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer import shortuuid @@ -31,16 +31,16 @@ from .data_models import ( ChatCompletionRequest, ChatCompletionResponse, - # ChatCompletionResponseStreamChoice, - # ChatCompletionStreamResponse, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, ChatMessage, ChatCompletionResponseChoice, CompletionRequest, CompletionResponse, CompletionResponseChoice, - # DeltaMessage, - # CompletionResponseStreamChoice, - # CompletionStreamResponse, + DeltaMessage, + CompletionResponseStreamChoice, + CompletionStreamResponse, ErrorResponse, ModelCard, ModelList, @@ -202,42 +202,41 @@ async def create_chat_completion(request: ChatCompletionRequest): # Streaming case if request.stream: - return create_error_response( - ErrorCode.VALIDATION_TYPE_ERROR, - f"Streaming is not yet supported.", - ) - # async def StreamResults() -> AsyncGenerator[bytes, None]: - # # First chunk with role - # firstChoices = [] - # for _ in range(request.n): - # firstChoice = ChatCompletionResponseStreamChoice( - # index=len(firstChoices), - # delta=DeltaMessage(role=response_role), - # finish_reason=None, - # ) - # firstChoices.append(firstChoice) - - # chunk = ChatCompletionStreamResponse( - # id=id, choices=firstChoices, model=app_settings.model_id - # ) - # yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" - # async for response_chunk in stub.GeneratorReplyStream(requestData): - # streamChoices = [] - - # for c in response_chunk.response: - # choice = ChatCompletionResponseStreamChoice( - # index=len(streamChoices), - # delta=DeltaMessage(content=c.response), - # finish_reason=None if c.finish_reason == "none" else c.finish_reason, - # ) - # streamChoices.append(choice) - - # chunk = ChatCompletionStreamResponse( - # id=id, choices=streamChoices, model=app_settings.model_id - # ) - # yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" - # yield "data: [DONE]\n\n" - # return StreamingResponse(StreamResults(), media_type="text/event-stream") + + async def StreamResults() -> AsyncGenerator[bytes, None]: + # First chunk with role + firstChoices = [] + for _ in range(request.n): + firstChoice = ChatCompletionResponseStreamChoice( + index=len(firstChoices), + delta=DeltaMessage(role=response_role), + finish_reason=None, + ) + firstChoices.append(firstChoice) + + 
chunk = ChatCompletionStreamResponse(id=id, + choices=firstChoices, + model=app_settings.model_id) + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + async for response_chunk in stub.GeneratorReplyStream(requestData): + streamChoices = [] + + for c in response_chunk.response: + choice = ChatCompletionResponseStreamChoice( + index=len(streamChoices), + delta=DeltaMessage(content=c.response), + finish_reason=None + if c.finish_reason == "none" else c.finish_reason, + ) + streamChoices.append(choice) + + chunk = ChatCompletionStreamResponse(id=id, + choices=streamChoices, + model=app_settings.model_id) + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(StreamResults(), media_type="text/event-stream") # Non-streaming case responseData = await stub.GeneratorReply(requestData) @@ -330,34 +329,33 @@ async def create_completion(request: CompletionRequest): id = f"cmpl-{shortuuid.random()}" # Streaming case if request.stream: - return create_error_response( - ErrorCode.VALIDATION_TYPE_ERROR, - f"Streaming is not yet supported.", - ) - # async def StreamResults() -> AsyncGenerator[bytes, None]: - # # Send an empty chunk to start the stream and prevent timeout - # yield "" - # async for response_chunk in stub.GeneratorReplyStream(requestData): - # streamChoices = [] - - # for c in response_chunk.response: - # choice = CompletionResponseStreamChoice( - # index=len(streamChoices), - # text=c.response, - # logprobs=None, - # finish_reason=None if c.finish_reason == "none" else c.finish_reason, - # ) - # streamChoices.append(choice) - - # chunk = CompletionStreamResponse( - # id=id, - # object="text_completion", - # choices=streamChoices, - # model=app_settings.model_id, - # ) - # yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" - # yield "data: [DONE]\n\n" - # return StreamingResponse(StreamResults(), media_type="text/event-stream") + + async def StreamResults() -> AsyncGenerator[bytes, None]: + # Send an empty chunk to start the stream and prevent timeout + yield "" + async for response_chunk in stub.GeneratorReplyStream(requestData): + streamChoices = [] + + for c in response_chunk.response: + choice = CompletionResponseStreamChoice( + index=len(streamChoices), + text=c.response, + logprobs=None, + finish_reason=None + if c.finish_reason == "none" else c.finish_reason, + ) + streamChoices.append(choice) + + chunk = CompletionStreamResponse( + id=id, + object="text_completion", + choices=streamChoices, + model=app_settings.model_id, + ) + yield f"data: {chunk.json(exclude_unset=True, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse(StreamResults(), media_type="text/event-stream") # Non-streaming case responseData = await stub.GeneratorReply(requestData) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 11cf6b83..8ca8791c 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -2,6 +2,8 @@ accelerate asyncio deepspeed>=0.15.0 deepspeed-kernels +fastapi +fastchat Flask-RESTful grpcio grpcio-tools @@ -9,6 +11,7 @@ Pillow pydantic>=2.0.0 pyzmq safetensors +shortuuid torch transformers ujson From d2c93b3a9446ffc285c9e873cc543efef2b5fdd3 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 11 Sep 2024 07:49:42 -0700 Subject: [PATCH 12/35] Fix missing pydantic updates in legacy mii code (#524) Signed-off-by: Logan Adams --- mii/legacy/deployment.py | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mii/legacy/deployment.py b/mii/legacy/deployment.py index b8b0753f..78610b17 100644 --- a/mii/legacy/deployment.py +++ b/mii/legacy/deployment.py @@ -37,12 +37,12 @@ def support_legacy_api( } # TODO do this in a single for loop for key, val in mii_config.items(): - if key not in MIIConfig.fields.keys(): + if key not in MIIConfig.model_fields.keys(): model_config[key] = val mii_config = { k: v for k, - v in mii_config.items() if k in MIIConfig.__dict__["__fields__"] + v in mii_config.items() if k in MIIConfig.model_fields.keys() } mii_config["version"] = version mii_config["deployment_type"] = deployment_type From a5b443fec51505f7136e3ae40c302830b748719e Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:38:26 -0700 Subject: [PATCH 13/35] Update docker container version (#533) Signed-off-by: Logan Adams --- .github/workflows/nv-a6000-fastgen.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-a6000-fastgen.yml b/.github/workflows/nv-a6000-fastgen.yml index 80bca5ae..cae5e844 100644 --- a/.github/workflows/nv-a6000-fastgen.yml +++ b/.github/workflows/nv-a6000-fastgen.yml @@ -18,7 +18,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:24.03-py3 ports: - 80 options: --gpus all --shm-size "8G" From 87e9b0d77df90936e8874fd5bf78c65a27bf25c0 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:12:30 -0700 Subject: [PATCH 14/35] Update CODEOWNERS (#535) Signed-off-by: Logan Adams --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 29171a01..82efda6e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @jeffra @mrwyattii @awan-10 @samyam +* @tohtana @tjruwase @awan-10 @loadams From 2b08ace7730a81a3adcf11e8893c4c6903d6ae51 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:14:22 -0700 Subject: [PATCH 15/35] Update labels to acquire new runners (#534) Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 0eef4799..2949ac6e 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir /blob/torch_cache/ torch --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir /blob/torch_cache/ torch --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From 4a3467ef0e604a0b8a94e269f864df500019dd16 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 29 Oct 2024 11:30:14 -0700 Subject: [PATCH 16/35] Test with latest transformers Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 
2949ac6e..01d12de4 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git@v4.43.0 + pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From 6079f9c4d3319f5830c80974b1f2726fc2677cd2 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:30:57 -0700 Subject: [PATCH 17/35] Update path triggers that were incorrect before (#537) Signed-off-by: Logan Adams --- .github/workflows/nv-a6000-fastgen.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-a6000-fastgen.yml b/.github/workflows/nv-a6000-fastgen.yml index cae5e844..0b9da000 100644 --- a/.github/workflows/nv-a6000-fastgen.yml +++ b/.github/workflows/nv-a6000-fastgen.yml @@ -8,7 +8,7 @@ on: paths-ignore: - 'mii/legacy/**' - 'tests/legacy/**' - - '.github/workflows/nv-torch-latest-v100.yml' + - '.github/workflows/nv-v100-legacy.yml' concurrency: group: ${{ github.workflow }}-${{ github.ref }} From 61c326a494548f5d8e8ae4fef1bb664361e7dede Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 30 Oct 2024 11:56:16 -0700 Subject: [PATCH 18/35] Update clang-format version to match DeepSpeed (#538) Signed-off-by: Logan Adams --- requirements/requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 4b7bb770..88d0d08e 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,4 +1,4 @@ -clang-format==16.0.2 +clang-format==18.1.3 einops pre-commit>=2.20.0 pytest From 7fe9106aa44d7f06175d5cac3dd28a630ae5c393 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:57:33 -0700 Subject: [PATCH 19/35] Update version.txt (#539) Signed-off-by: Logan Adams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 9e11b32f..d15723fb 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.1 +0.3.2 From 6c29dc6a4047c1ed0a5a11f10745d870c260c888 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 31 Oct 2024 14:28:41 -0700 Subject: [PATCH 20/35] Test pinning to 4.43.4 Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 01d12de4..346e7f18 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git + pip install git+https://github.com/huggingface/transformers.git@v4.43.4 pip install -U accelerate ds_report From 87e1d873a719b4e9d9c08b4d33a6b914cb683dd1 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 31 Oct 2024 16:39:55 -0700 Subject: [PATCH 21/35] Update to latest transformers Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 346e7f18..01d12de4 100644 --- 
a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git@v4.43.4 + pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From ab92351eee3c3eaa15d446c830d54d76960e215f Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 13:03:18 -0800 Subject: [PATCH 22/35] Update to version where only the zero-shot-image-classification fails Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- mii/legacy/models/providers/diffusers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 01d12de4..346e7f18 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git + pip install git+https://github.com/huggingface/transformers.git@v4.43.4 pip install -U accelerate ds_report diff --git a/mii/legacy/models/providers/diffusers.py b/mii/legacy/models/providers/diffusers.py index 15973d0e..b75ad3a4 100644 --- a/mii/legacy/models/providers/diffusers.py +++ b/mii/legacy/models/providers/diffusers.py @@ -19,10 +19,11 @@ def diffusers_provider(model_config: ModelConfig): kwargs["torch_dtype"] = torch.float16 kwargs["revision"] = "fp16" + kwargs["device"] = torch.device(f"cuda:{local_rank}") pipeline = attempt_load(DiffusionPipeline.from_pretrained, model_config.model, model_config.model_path, kwargs=kwargs) - pipeline = pipeline.to(f"cuda:{local_rank}") + #pipeline = pipeline.to(f"cuda:{local_rank}") pipeline.set_progress_bar_config(disable=True) return pipeline From 6525bd3744d75ffc532b80387a9bf29df716892c Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 14:55:36 -0800 Subject: [PATCH 23/35] Revert certain changes Signed-off-by: Logan Adams --- mii/legacy/models/providers/diffusers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mii/legacy/models/providers/diffusers.py b/mii/legacy/models/providers/diffusers.py index b75ad3a4..15973d0e 100644 --- a/mii/legacy/models/providers/diffusers.py +++ b/mii/legacy/models/providers/diffusers.py @@ -19,11 +19,10 @@ def diffusers_provider(model_config: ModelConfig): kwargs["torch_dtype"] = torch.float16 kwargs["revision"] = "fp16" - kwargs["device"] = torch.device(f"cuda:{local_rank}") pipeline = attempt_load(DiffusionPipeline.from_pretrained, model_config.model, model_config.model_path, kwargs=kwargs) - #pipeline = pipeline.to(f"cuda:{local_rank}") + pipeline = pipeline.to(f"cuda:{local_rank}") pipeline.set_progress_bar_config(disable=True) return pipeline From efb14e04605e7763bd6272614c5b8bc679cf4c94 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 15:15:28 -0800 Subject: [PATCH 24/35] Add other debugging Signed-off-by: Logan Adams --- mii/legacy/method_table.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mii/legacy/method_table.py b/mii/legacy/method_table.py index 520e9a1c..91ec616b 100644 --- a/mii/legacy/method_table.py +++ b/mii/legacy/method_table.py @@ -252,6 +252,10 @@ def unpack_request_from_proto(self, request): def run_inference(self, inference_pipeline, args, kwargs): image, candidate_labels = args + 
print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) + import torch + kwargs["torch_dtype"] = torch.float16 + print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) return inference_pipeline(image, candidate_labels=candidate_labels, **kwargs) From 3a0c215454d363cefc6c8cc44c598425a297229b Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 7 Nov 2024 15:16:46 -0800 Subject: [PATCH 25/35] pre-commit Signed-off-by: Logan Adams --- mii/legacy/method_table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mii/legacy/method_table.py b/mii/legacy/method_table.py index 91ec616b..d190a6a3 100644 --- a/mii/legacy/method_table.py +++ b/mii/legacy/method_table.py @@ -252,10 +252,10 @@ def unpack_request_from_proto(self, request): def run_inference(self, inference_pipeline, args, kwargs): image, candidate_labels = args - print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) + print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) import torch kwargs["torch_dtype"] = torch.float16 - print({"image":image, "candidate_labels":candidate_labels, "kwargs":kwargs}) + print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) return inference_pipeline(image, candidate_labels=candidate_labels, **kwargs) From 6fc890be1d01b5d56eef9be7f8d4888c91f41482 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 09:54:09 -0800 Subject: [PATCH 26/35] Confirm replacement of inference_pipeline.model with engine causes problems Signed-off-by: Logan Adams --- mii/legacy/method_table.py | 4 ---- mii/legacy/models/load_models.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/mii/legacy/method_table.py b/mii/legacy/method_table.py index d190a6a3..520e9a1c 100644 --- a/mii/legacy/method_table.py +++ b/mii/legacy/method_table.py @@ -252,10 +252,6 @@ def unpack_request_from_proto(self, request): def run_inference(self, inference_pipeline, args, kwargs): image, candidate_labels = args - print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) - import torch - kwargs["torch_dtype"] = torch.float16 - print({"image": image, "candidate_labels": candidate_labels, "kwargs": kwargs}) return inference_pipeline(image, candidate_labels=candidate_labels, **kwargs) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index cfbf455f..9a37fcaa 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -75,7 +75,7 @@ def load_models(model_config): if model_config.profile_model_time: engine.profile_model_time() if hasattr(inference_pipeline, "model"): - inference_pipeline.model = engine + #inference_pipeline.model = engine elif model_config.enable_zero: ds_config = DeepSpeedConfig(model_config.ds_config) From 71331eb9702192076460b7136abdea515add940e Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 09:57:20 -0800 Subject: [PATCH 27/35] Formatting Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index 9a37fcaa..7a6d55da 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -74,7 +74,7 @@ def load_models(model_config): config=inf_config) if model_config.profile_model_time: engine.profile_model_time() - if hasattr(inference_pipeline, "model"): + #if hasattr(inference_pipeline, "model"): #inference_pipeline.model 
= engine elif model_config.enable_zero: From 123d16baaffb056587ae9b57223215ce7c12aa16 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 09:57:42 -0800 Subject: [PATCH 28/35] yapf Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index 7a6d55da..e65653f6 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -75,7 +75,7 @@ def load_models(model_config): if model_config.profile_model_time: engine.profile_model_time() #if hasattr(inference_pipeline, "model"): - #inference_pipeline.model = engine + #inference_pipeline.model = engine elif model_config.enable_zero: ds_config = DeepSpeedConfig(model_config.ds_config) From 6e57003d411e4c6f4fa81fe004883a22e6f68985 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 11:01:00 -0800 Subject: [PATCH 29/35] Update code Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index e65653f6..bbb7e37b 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -74,8 +74,10 @@ def load_models(model_config): config=inf_config) if model_config.profile_model_time: engine.profile_model_time() - #if hasattr(inference_pipeline, "model"): - #inference_pipeline.model = engine + if hasattr(inference_pipeline, "model"): + engine._parameters = inference_pipeline.model._parameters + engine.training = inference_pipeline.model.training + inference_pipeline.model = engine elif model_config.enable_zero: ds_config = DeepSpeedConfig(model_config.ds_config) From 936e2b14d8a56a9caa41a08f4e8dd13bba75700e Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 14:02:47 -0800 Subject: [PATCH 30/35] Skip zero-shot tests for now Signed-off-by: Logan Adams --- mii/legacy/models/load_models.py | 2 -- tests/legacy/test_local_deployment.py | 11 ----------- tests/legacy/test_non_persistent_deployment.py | 11 ----------- 3 files changed, 24 deletions(-) diff --git a/mii/legacy/models/load_models.py b/mii/legacy/models/load_models.py index bbb7e37b..cfbf455f 100644 --- a/mii/legacy/models/load_models.py +++ b/mii/legacy/models/load_models.py @@ -75,8 +75,6 @@ def load_models(model_config): if model_config.profile_model_time: engine.profile_model_time() if hasattr(inference_pipeline, "model"): - engine._parameters = inference_pipeline.model._parameters - engine.training = inference_pipeline.model.training inference_pipeline.model = engine elif model_config.enable_zero: diff --git a/tests/legacy/test_local_deployment.py b/tests/legacy/test_local_deployment.py index 531036f6..69bac328 100644 --- a/tests/legacy/test_local_deployment.py +++ b/tests/legacy/test_local_deployment.py @@ -53,17 +53,6 @@ "query": "DeepSpeed is the greatest" }, ), - ( - "zero-shot-image-classification", - "openai/clip-vit-base-patch32", - { - "image": - "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", - "candidate_labels": ["animals", - "humans", - "landscape"] - }, - ), ], ) def test_single_GPU(deployment, query): diff --git a/tests/legacy/test_non_persistent_deployment.py b/tests/legacy/test_non_persistent_deployment.py index ed2b13fb..fe5309d6 100644 --- a/tests/legacy/test_non_persistent_deployment.py +++ b/tests/legacy/test_non_persistent_deployment.py @@ -55,17 +55,6 @@ "query": "DeepSpeed is the 
greatest" }, ), - ( - "zero-shot-image-classification", - "openai/clip-vit-base-patch32", - { - "image": - "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", - "candidate_labels": ["animals", - "humans", - "landscape"], - }, - ), ], ) def test_single_GPU(deployment, query): From 75b90c82222a753dfcd117653126e6970466ac99 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 8 Nov 2024 15:07:22 -0800 Subject: [PATCH 31/35] Unpin transformers Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 346e7f18..01d12de4 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -36,7 +36,7 @@ jobs: - name: Install dependencies run: | pip install git+https://github.com/microsoft/DeepSpeed.git - pip install git+https://github.com/huggingface/transformers.git@v4.43.4 + pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From 825c072f670f7a2044f9ae38208a07d0bc1e4b40 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 19 Nov 2024 16:54:00 -0800 Subject: [PATCH 32/35] Test branch working around Bloom errors Signed-off-by: Logan Adams --- .github/workflows/nv-v100-legacy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index 01d12de4..cee04591 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -35,7 +35,7 @@ jobs: - name: Install dependencies run: | - pip install git+https://github.com/microsoft/DeepSpeed.git + pip install git+https://github.com/microsoft/DeepSpeed.git@lekurile/bloom_v_check pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report From d71a7b0ca762a3fd892f326c1048897f352e49e1 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:55:51 -0800 Subject: [PATCH 33/35] Update CODEOWNERS file (#552) Signed-off-by: Logan Adams --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 82efda6e..3cc2320e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1 @@ -* @tohtana @tjruwase @awan-10 @loadams +* @tohtana @tjruwase @loadams From 508906af4d0c3fcd8fcde9eb2a77bde7565bd34f Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 7 Feb 2025 09:33:26 -0800 Subject: [PATCH 34/35] Update contributing language on README for CLA->DCO (#554) Signed-off-by: Logan Adams --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d4e2d3c9..86f1a24c 100644 --- a/README.md +++ b/README.md @@ -321,13 +321,14 @@ Users can also control the generation characteristics for individual prompts (i. # Contributing -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. +This project welcomes contributions and suggestions. -When you submit a pull request, a CLA bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., status check, comment). 
Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +DeepSpeed-MII has adopted the [DCO](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin). All deepspeedai repos require a DCO. +(DeepSpeed previously used a CLA which is being replaced with DCO). + +DCO is provided by including a sign-off-by line in commit messages. Using the `-s` flag for `git commit` will automatically append this line. +For example, running `git commit -s -m 'commit info.'` will produce a commit that has the message `commit info. Signed-off-by: My Name .` +The DCO bot will ensure commits are signed with an email address that matches the commit author before they are eligible to be merged. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or From 3c9c7069cdd6ab503304bcf1e5975d39764c2e1a Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:10:20 -0800 Subject: [PATCH 35/35] Update references due to DeepSpeed* GH repo move (#553) Signed-off-by: Logan Adams --- .github/workflows/nv-a6000-fastgen.yml | 2 +- .github/workflows/nv-v100-legacy.yml | 2 +- README.md | 24 +++++++++---------- docs/source/index.rst | 10 ++++---- docs/source/install.rst | 4 ++-- examples/README.md | 2 +- mii/aml_related/templates.py | 4 ++-- mii/legacy/README.md | 8 +++---- mii/legacy/aml_related/templates.py | 4 ++-- mii/legacy/docs/GPT-NeoX.md | 4 ++-- .../examples/benchmark/txt2img/README.md | 8 +++---- mii/legacy/examples/local/chat/README.md | 2 +- .../local/chat/chat-server-example.py | 2 +- setup.py | 4 ++-- 14 files changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/workflows/nv-a6000-fastgen.yml b/.github/workflows/nv-a6000-fastgen.yml index 0b9da000..7363979d 100644 --- a/.github/workflows/nv-a6000-fastgen.yml +++ b/.github/workflows/nv-a6000-fastgen.yml @@ -41,7 +41,7 @@ jobs: python -m pip install . - name: Install deepspeed run: | - git clone --depth=1 https://github.com/microsoft/DeepSpeed + git clone --depth=1 https://github.com/deepspeedai/DeepSpeed cd DeepSpeed python -m pip install . 
ds_report diff --git a/.github/workflows/nv-v100-legacy.yml b/.github/workflows/nv-v100-legacy.yml index cee04591..4e77036b 100644 --- a/.github/workflows/nv-v100-legacy.yml +++ b/.github/workflows/nv-v100-legacy.yml @@ -35,7 +35,7 @@ jobs: - name: Install dependencies run: | - pip install git+https://github.com/microsoft/DeepSpeed.git@lekurile/bloom_v_check + pip install git+https://github.com/deepspeedai/DeepSpeed.git@lekurile/bloom_v_check pip install git+https://github.com/huggingface/transformers.git pip install -U accelerate ds_report diff --git a/README.md b/README.md index 86f1a24c..70e3cef0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -[![Formatting](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg?branch=main)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml) -[![nv-v100-legacy](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml/badge.svg?branch=main)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml) -[![nv-a6000-fastgen](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml/badge.svg?branch=main)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml) -[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) +[![Formatting](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg?branch=main)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml) +[![nv-v100-legacy](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml/badge.svg?branch=main)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-v100-legacy.yml) +[![nv-a6000-fastgen](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml/badge.svg?branch=main)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/nv-a6000-fastgen.yml) +[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/deepspeedai/DeepSpeed/blob/master/LICENSE) [![PyPI version](https://badge.fury.io/py/deepspeed-mii.svg)](https://pypi.org/project/deepspeed-mii/) @@ -12,8 +12,8 @@ ## Latest News -* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) -* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) +* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) +* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) * [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](mii/legacy/examples/benchmark/txt2img) * [2022/10] [Announcing DeepSpeed Model Implementations for Inference (MII)](https://www.deepspeed.ai/2022/10/10/mii.html) @@ -33,7 +33,7 @@ Introducing MII, an open-source Python library designed by DeepSpeed to democratize powerful model inference with a focus on high-throughput, low latency, and cost-effectiveness. 
-* MII features include blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor parallelism, and high-performance CUDA kernels to support fast high throughput text-generation for LLMs such as Llama-2-70B, Mixtral (MoE) 8x7B, and Phi-2. The latest updates in v0.2 add new model families, performance optimizations, and feature enhancements. MII now delivers up to 2.5 times higher effective throughput compared to leading systems such as vLLM. For detailed performance results please see our [latest DeepSpeed-FastGen blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) and [DeepSpeed-FastGen release blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen). +* MII features include blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor parallelism, and high-performance CUDA kernels to support fast high throughput text-generation for LLMs such as Llama-2-70B, Mixtral (MoE) 8x7B, and Phi-2. The latest updates in v0.2 add new model families, performance optimizations, and feature enhancements. MII now delivers up to 2.5 times higher effective throughput compared to leading systems such as vLLM. For detailed performance results please see our [latest DeepSpeed-FastGen blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) and [DeepSpeed-FastGen release blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen).
@@ -58,7 +58,7 @@ MII provides accelerated text-generation inference through the use of four key t * Dynamic SplitFuse * High Performance CUDA Kernels -For a deeper dive into understanding these features please [refer to our blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) which also includes a detailed performance analysis. +For a deeper dive into understanding these features please [refer to our blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) which also includes a detailed performance analysis. ## MII Legacy @@ -78,14 +78,14 @@ In the past, MII introduced several [key performance optimizations](https://www.
-Figure 1: MII architecture, showing how MII automatically optimizes OSS models using DS-Inference before deploying them. DeepSpeed-FastGen optimizations in the figure have been published in [our blog post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen). +Figure 1: MII architecture, showing how MII automatically optimizes OSS models using DS-Inference before deploying them. DeepSpeed-FastGen optimizations in the figure have been published in [our blog post](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen). -Under-the-hood MII is powered by [DeepSpeed-Inference](https://github.com/microsoft/deepspeed). Based on the model architecture, model size, batch size, and available hardware resources, MII automatically applies the appropriate set of system optimizations to minimize latency and maximize throughput. +Under-the-hood MII is powered by [DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed). Based on the model architecture, model size, batch size, and available hardware resources, MII automatically applies the appropriate set of system optimizations to minimize latency and maximize throughput. # Supported Models -MII currently supports over 37,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures: +MII currently supports over 37,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/deepspeedai/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures: model family | size range | ~model count ------ | ------ | ------ @@ -120,7 +120,7 @@ The fasest way to get started is with our [PyPI release of DeepSpeed-MII](https: pip install deepspeed-mii ``` -For ease of use and significant reduction in lengthy compile times that many projects require in this space we distribute a pre-compiled python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source). +For ease of use and significant reduction in lengthy compile times that many projects require in this space we distribute a pre-compiled python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels). 
We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/deepspeedai/DeepSpeed-Kernels#source). ## Non-Persistent Pipeline diff --git a/docs/source/index.rst b/docs/source/index.rst index 813f232c..8099898e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -14,15 +14,15 @@ democratize powerful model inference with a focus on high-throughput, low latency, and cost-effectiveness. MII v0.1 introduced several features as part of our `DeepSpeed-FastGen release -`_ +`_ such as blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor parallelism, and high-performance CUDA kernels to support fast high throughput text-generation with LLMs. The latest version of MII delivers up to 2.5 times higher effective throughput compared to leading systems such as vLLM. For detailed performance results please see our `DeepSpeed-FastGen release blog -`_ +`_ and the `latest DeepSpeed-FastGen blog -`_. +`_. MII-Legacy ---------- @@ -32,9 +32,9 @@ We first `announced MII `_ in of DeepSpeed-FastGen. MII-Legacy, which covers all prior releases up to v0.0.9, provides support for running inference for a wide variety of language model tasks. We also support accelerating `text2image models like Stable Diffusion -`_. +`_. For more details on our previous releases please see our `legacy APIs -`_. +`_. Contents diff --git a/docs/source/install.rst b/docs/source/install.rst index 523c9c1a..ee16f0ff 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -19,11 +19,11 @@ pip to install from source: .. code-block:: console - (.venv) $ pip install git+https://github.com/Microsoft/DeepSpeed-MII.git + (.venv) $ pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git Or you can clone the repository and install: .. code-block:: console - (.venv) $ git clone https://github.com/Microsoft/DeepSpeed-MII.git + (.venv) $ git clone https://github.com/deepspeedai/DeepSpeed-MII.git (.venv) $ pip install ./DeepSpeed-MII diff --git a/examples/README.md b/examples/README.md index 4efb2155..334840a2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,2 +1,2 @@ # MII Examples -Please see [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii) for a few examples on using MII. +Please see [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) for a few examples on using MII. 
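The README hunk above keeps pointing readers at `pip install deepspeed-mii` and the "Non-Persistent Pipeline" section. As a quick illustration of that entry point, a minimal sketch follows; it assumes the public `mii.pipeline` API, and the model name and generation arguments are placeholders chosen for the example rather than values taken from this patch series.

```python
# Minimal non-persistent pipeline sketch; the model name and arguments are illustrative.
import mii

# Build an in-process text-generation pipeline (no persistent server is started).
pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")

# Generate completions for a small batch of prompts.
responses = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128)
for response in responses:
    print(response)
```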
diff --git a/mii/aml_related/templates.py b/mii/aml_related/templates.py index 71f1cb44..33805628 100644 --- a/mii/aml_related/templates.py +++ b/mii/aml_related/templates.py @@ -165,8 +165,8 @@ RUN /opt/miniconda/envs/amlenv/bin/pip install torch torchvision --index-url https://download.pytorch.org/whl/cu113 && \ /opt/miniconda/envs/amlenv/bin/pip install -r "$BUILD_DIR/requirements.txt" && \ /opt/miniconda/envs/amlenv/bin/pip install azureml-inference-server-http && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed.git && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed-MII.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git && \ /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/huggingface/transformers.git diff --git a/mii/legacy/README.md b/mii/legacy/README.md index ed949a1c..041c9516 100644 --- a/mii/legacy/README.md +++ b/mii/legacy/README.md @@ -1,6 +1,6 @@ - -[![Formatting](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg)](https://github.com/microsoft/DeepSpeed-MII/actions/workflows/formatting.yml) -[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) + +[![Formatting](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml/badge.svg)](https://github.com/deepspeedai/DeepSpeed-MII/actions/workflows/formatting.yml) +[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/deepspeedai/DeepSpeed/blob/master/LICENSE) [![PyPI version](https://badge.fury.io/py/deepspeed-mii.svg)](https://pypi.org/project/deepspeed-mii/) @@ -195,7 +195,7 @@ result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=Tr ``` -You can find a complete example [here]("https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/non_persistent") +You can find a complete example [here]("https://github.com/deepspeedai/DeepSpeed-MII/tree/main/examples/non_persistent") Any HTTP client can be used to call the APIs. 
An example of using curl is: ```bash diff --git a/mii/legacy/aml_related/templates.py b/mii/legacy/aml_related/templates.py index 71f1cb44..33805628 100644 --- a/mii/legacy/aml_related/templates.py +++ b/mii/legacy/aml_related/templates.py @@ -165,8 +165,8 @@ RUN /opt/miniconda/envs/amlenv/bin/pip install torch torchvision --index-url https://download.pytorch.org/whl/cu113 && \ /opt/miniconda/envs/amlenv/bin/pip install -r "$BUILD_DIR/requirements.txt" && \ /opt/miniconda/envs/amlenv/bin/pip install azureml-inference-server-http && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed.git && \ - /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/microsoft/DeepSpeed-MII.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed.git && \ + /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git && \ /opt/miniconda/envs/amlenv/bin/pip install git+https://github.com/huggingface/transformers.git diff --git a/mii/legacy/docs/GPT-NeoX.md b/mii/legacy/docs/GPT-NeoX.md index dafcc736..6e495ca7 100644 --- a/mii/legacy/docs/GPT-NeoX.md +++ b/mii/legacy/docs/GPT-NeoX.md @@ -18,7 +18,7 @@ source ./MII-GPT-NeoX/bin/activate ## Install MII ```bash -git clone https://github.com/microsoft/DeepSpeed-MII.git +git clone https://github.com/deepspeedai/DeepSpeed-MII.git cd DeepSpeed-MII pip install .[local] pip install . @@ -26,7 +26,7 @@ pip install . ## Install DeepSpeed-GPT-NeoX ```bash -git clone -b ds-updates https://github.com/microsoft/deepspeed-gpt-neox.git +git clone -b ds-updates https://github.com/deepspeedai/DeepSpeed-gpt-neox.git cd deepspeed-gpt-neox pip install -r requirements/requirements-inference.txt pip install . diff --git a/mii/legacy/examples/benchmark/txt2img/README.md b/mii/legacy/examples/benchmark/txt2img/README.md index ad4f769d..469afb91 100644 --- a/mii/legacy/examples/benchmark/txt2img/README.md +++ b/mii/legacy/examples/benchmark/txt2img/README.md @@ -5,7 +5,7 @@ -In this tutorial you will learn how to deploy [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion-v1-4) with state-of-the-art performance optimizations from [DeepSpeed Inference](https://github.com/microsoft/deepspeed) and [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii). In addition to deploying we will perform several performance evaluations. +In this tutorial you will learn how to deploy [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion-v1-4) with state-of-the-art performance optimizations from [DeepSpeed Inference](https://github.com/deepspeedai/DeepSpeed) and [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-mii). In addition to deploying we will perform several performance evaluations. The performance results above utilized NVIDIA GPUs from Azure: [ND96amsr\_A100\_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nda100-v4-series) (NVIDIA A100-80GB) and [ND96asr\_v4](https://learn.microsoft.com/en-us/azure/virtual-machines/nda100-v4-series) (A100-40GB). We have also used MII-Public with NVIDIA RTX-A6000 GPUs and will include those results at a future date. @@ -36,9 +36,9 @@ DeepSpeed-MII will automatically inject a wide range of optimizations from DeepS 6. Partial UNet INT8 quantization via [ZeroQuant](https://arxiv.org/abs/2206.01861) 7. 
Exploitation of coarse grained computation sparsity -The first four optimizations are available via MII-Public, while the rest are available via MII-Azure ([see here to read more about MII-Public and MII-Azure](https://github.com/microsoft/deepspeed-mii#mii-public-and-mii-azure)). In the rest of this tutorial, we will show how you can deploy Stable Diffusion with both MII-Public and MII-Azure. +The first four optimizations are available via MII-Public, while the rest are available via MII-Azure ([see here to read more about MII-Public and MII-Azure](https://github.com/deepspeedai/DeepSpeed-mii#mii-public-and-mii-azure)). In the rest of this tutorial, we will show how you can deploy Stable Diffusion with both MII-Public and MII-Azure. -Keep an eye on the [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) repo and this tutorial for further updates and a deeper dive into these and future performance optimizations. +Keep an eye on the [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-mii) repo and this tutorial for further updates and a deeper dive into these and future performance optimizations. ## Environment and dependency setup @@ -49,7 +49,7 @@ pip install deepspeed[sd] deepspeed-mii ``` > **Note** -> The DeepSpeed version used in the rest of this tutorial uses [this branch](https://github.com/microsoft/DeepSpeed/pull/2491) which will be merged into master and released as part of DeepSpeed v0.7.5 later this week. +> The DeepSpeed version used in the rest of this tutorial uses [this branch](https://github.com/deepspeedai/DeepSpeed/pull/2491) which will be merged into master and released as part of DeepSpeed v0.7.5 later this week. In order to check your DeepSpeed install is setup correctly run `ds_report` from your command line. This will show what versions of DeepSpeed, PyTorch, and nvcc will be used at runtime. The bottom half of `ds_report` is show below for our setup: diff --git a/mii/legacy/examples/local/chat/README.md b/mii/legacy/examples/local/chat/README.md index 4bc48639..55684def 100644 --- a/mii/legacy/examples/local/chat/README.md +++ b/mii/legacy/examples/local/chat/README.md @@ -8,7 +8,7 @@ The scripts in this folder provide a complete example of a multi-turn conversati Starting the server for your chat application requires nothing special. Just make sure that the model supports `text-generation` and is trained for conversations. -The example script uses [AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed), which was trained using [DeepSpeed-Chat](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md). +The example script uses [AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed), which was trained using [DeepSpeed-Chat](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md). 
```python name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" diff --git a/mii/legacy/examples/local/chat/chat-server-example.py b/mii/legacy/examples/local/chat/chat-server-example.py index ae8d2ac0..ec484f91 100644 --- a/mii/legacy/examples/local/chat/chat-server-example.py +++ b/mii/legacy/examples/local/chat/chat-server-example.py @@ -7,7 +7,7 @@ mii_configs = {'tensor_parallel': 1} # This checkpoint is create using DeepSpeed-Chat -# https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md +# https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" print(f"Deploying {name}...") diff --git a/setup.py b/setup.py index 415e6df7..aecebcf6 100644 --- a/setup.py +++ b/setup.py @@ -85,8 +85,8 @@ def command_exists(cmd): author_email='deepspeed-mii@microsoft.com', url='http://deepspeed.ai', project_urls={ - 'Documentation': 'https://github.com/microsoft/DeepSpeed-MII', - 'Source': 'https://github.com/microsoft/DeepSpeed-MII', + 'Documentation': 'https://github.com/deepspeedai/DeepSpeed-MII', + 'Source': 'https://github.com/deepspeedai/DeepSpeed-MII', }, install_requires=install_requires, extras_require=extras_require,
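The streaming support restored earlier in this series (PATCH 11) has the OpenAI-compatible endpoints emit Server-Sent Events: each chunk is written as a `data: {...}` line and the stream terminates with `data: [DONE]`. A minimal client sketch for consuming that stream is shown below; the host, port, route, and model id are assumptions for illustration and will differ in a real deployment.

```python
# Minimal SSE client sketch for the streaming chat endpoint; URL, route, and model id are hypothetical.
import json
import requests

url = "http://localhost:8000/v1/chat/completions"  # assumed host/port/route
payload = {
    "model": "mistralai/Mistral-7B-Instruct-v0.1",  # assumed model id
    "messages": [{"role": "user", "content": "DeepSpeed is"}],
    "stream": True,
}

with requests.post(url, json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        # Each SSE event arrives as a single "data: ..." line; blank lines separate events.
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":  # final marker emitted by the server
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0].get("delta", {})
        print(delta.get("content", ""), end="", flush=True)
```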