[Mirror] mtmd: Add DeepSeekOCR Support #66
base: master
Changes from all commits
@@ -712,6 +712,9 @@ def load_hparams(dir_model: Path, is_mistral_format: bool): | |
| if "thinker_config" in config: | ||
| # rename for Qwen2.5-Omni | ||
| config["text_config"] = config["thinker_config"]["text_config"] | ||
| if "language_config" in config: | ||
| # rename for DeepSeekOCR | ||
| config["text_config"] = config["language_config"] | ||
| if "lfm" in config: | ||
| # rename for LFM2-Audio | ||
| config["text_config"] = config["lfm"] | ||
|
|
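The rename above gives DeepSeek-OCR's nested `language_config` the same treatment as Qwen2.5-Omni's `thinker_config`: downstream code only ever looks at `text_config`. A minimal sketch of the idea, using a hypothetical trimmed-down config dict rather than the converter's actual data:

```python
# Hypothetical, trimmed-down HF config for a DeepSeekOCR-style checkpoint.
config = {
    "architectures": ["DeepseekOCRForCausalLM"],
    "language_config": {"hidden_size": 1280, "num_hidden_layers": 12},
    "vision_config": {"image_size": 1024, "patch_size": 16},
}

# Same normalization step as in load_hparams(): alias the nested section to
# the common "text_config" key so later code has a single lookup path.
if "language_config" in config:
    config["text_config"] = config["language_config"]

# Downstream code can now stay model-agnostic.
print(config["text_config"]["num_hidden_layers"])  # -> 12
```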
@@ -1692,7 +1695,7 @@ class MmprojModel(ModelBase): | |
| preprocessor_config: dict[str, Any] | ||
| global_config: dict[str, Any] | ||
|
|
||
| n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"] | ||
| n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers"] | ||
|
|
||
| has_vision_encoder: bool = True # by default | ||
| has_audio_encoder: bool = False | ||
|
|
@@ -5960,6 +5963,68 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter | |
| return [] # skip other tensors | ||
|
|
||
|
|
||
| @ModelBase.register("DeepseekOCRForCausalLM") | ||
| class DeepseekOCRVisionModel(MmprojModel): | ||
| def set_gguf_parameters(self): | ||
| super().set_gguf_parameters() | ||
| hparams = self.hparams | ||
| self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR) | ||
| # default values below are taken from HF tranformers code | ||
| self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) | ||
| self.gguf_writer.add_vision_use_gelu(True) | ||
| # calculate proj_scale_factor (used by tinygemma3 test model) | ||
| image_seq_length = self.preprocessor_config.get("image_seq_length", 256) | ||
| n_per_side = int(image_seq_length ** 0.5) | ||
| image_size = self.hparams["image_size"] | ||
| patch_size = self.hparams["patch_size"] | ||
| proj_scale_factor = (image_size // patch_size) // n_per_side | ||
| if proj_scale_factor > 0 and proj_scale_factor != 4: | ||
| # we only need to write this if it's not the default value | ||
| # in this case, we are converting a test model | ||
| self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) | ||
| # @bluebread: there's no window_size in config but just add it here anyway | ||
| self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14)) | ||
|
|
||
| # SAM configuration | ||
| sam_hparams = hparams['sam'] | ||
| self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers']) | ||
| self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width']) | ||
| self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads']) | ||
|
|
||
| def get_vision_config(self) -> dict[str, Any]: | ||
| vision_config: dict[str, Any] | None = self.global_config.get("vision_config") | ||
|
|
||
| if not vision_config: | ||
| raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") | ||
|
|
||
| vision_config['sam'] = vision_config['width']['sam_vit_b'] | ||
| vision_config.update(vision_config['width']['clip-l-14-224']) | ||
| vision_config['hidden_size'] = vision_config['width'] | ||
| vision_config['num_heads'] = vision_config['heads'] | ||
| vision_config['intermediate_size'] = vision_config['heads'] * 4 | ||
|
|
||
| return vision_config | ||
|
|
||
| def tensor_force_quant(self, name, new_name, bid, n_dims): | ||
| if ".embeddings." in name or 'pos_embed' in name: | ||
| return gguf.GGMLQuantizationType.F32 | ||
| if ".rel_pos_h" in name or '.rel_pos_w' in name: | ||
| return gguf.GGMLQuantizationType.F32 | ||
| return gguf.GGMLQuantizationType.F16 | ||
|
|
||
| def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: | ||
| # Only process vision-related tensors, skip language model tensors | ||
| # Vision components: sam_model, vision_model, projector, image_newline, view_seperator | ||
| # Language model components to skip: lm_head, embed_tokens, layers, norm | ||
| if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")): | ||
| return [] | ||
|
|
||
| if ".attn.rel_pos_h" in name or ".attn.rel_pos_w" in name: | ||
| return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)] | ||
|
|
||
| return [(self.map_tensor_name(name), data_torch)] | ||
|
|
||
|
|
||
| @ModelBase.register("Gemma3nForConditionalGeneration") | ||
| class Gemma3NModel(Gemma3Model): | ||
| model_arch = gguf.MODEL_ARCH.GEMMA3N | ||
|
|
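As a side note on the `proj_scale_factor` logic in `DeepseekOCRVisionModel.set_gguf_parameters` above: the key is only written when the computed value differs from the default of 4. A tiny worked example of the arithmetic, using hypothetical numbers (not claimed to be DeepSeek-OCR's actual config values):

```python
# Hypothetical preprocessor/vision values, purely to illustrate the formula.
image_seq_length = 256   # tokens per image after the projector
image_size = 1024        # input resolution in pixels per side
patch_size = 16          # ViT patch size

n_per_side = int(image_seq_length ** 0.5)            # 16 tokens per side
patches_per_side = image_size // patch_size          # 64 patches per side
proj_scale_factor = patches_per_side // n_per_side   # 64 // 16 = 4

# 4 is the default, so in this case set_gguf_parameters() would not write
# clip.vision.projector.scale_factor at all.
print(proj_scale_factor)  # -> 4
```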
@@ -7126,6 +7191,16 @@ def prepare_tensors(self): | |
| class DeepseekV2Model(TextModel): | ||
| model_arch = gguf.MODEL_ARCH.DEEPSEEK2 | ||
|
|
||
| def __init__(self, *args, **kwargs): | ||
| super().__init__(*args, **kwargs) | ||
| hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) | ||
| self.origin_hf_arch = hparams.get('architectures', [None])[0] | ||
|
|
||
| if self.origin_hf_arch == "DeepseekOCRForCausalLM": | ||
| self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR | ||
| self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] | ||
| self.gguf_writer.add_architecture() | ||
|
|
||
|
Comment on lines +7194 to +7203

DeepseekV2 OCR path: minor rope metadata duplication; rest looks good

The OCR-specific wiring in `__init__` (detecting `DeepseekOCRForCausalLM` and switching to `MODEL_ARCH.DEEPSEEK2OCR`) looks good.
One small issue: for non-OCR models, `add_rope_dimension_count` ends up being written twice, once inside the `if not is_ocr:` block and once unconditionally further down.

Proposed deduplication of `add_rope_dimension_count`

```diff
 def set_gguf_parameters(self):
     is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR)
@@
     # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-    if not is_ocr:
-        self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(kv_lora_rank)
-        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
-        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+    if not is_ocr:
+        self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(kv_lora_rank)
+        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
@@
-    self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
-    self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
-    self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
-
-    self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+    self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+    self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
+    self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+
+    # Common RoPE dim for both OCR and non-OCR variants
+    self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
```

The additions in

Also applies to: 7259-7274, 7279-7288, 7292-7295, 7307-7313
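For readers skimming the MLA metadata being discussed, here is the arithmetic behind the `add_key_length` / `add_value_length` calls, using DeepSeek-V2-style hyperparameters purely as an assumption (the OCR checkpoint's own values may differ):

```python
# Assumed DeepSeek-V2-style MLA hyperparameters (illustrative only).
kv_lora_rank = 512       # rank of the compressed KV latent
qk_rope_head_dim = 64    # rotary part of each query/key head
qk_nope_head_dim = 128   # non-rotary part of each query/key head
v_head_dim = 128         # value head size after decompression

# MQA-style cache layout: compressed latent plus the rotary key part.
key_length = kv_lora_rank + qk_rope_head_dim             # 576
value_length = kv_lora_rank                              # 512

# MLA layout after decompressing back to full multi-head attention.
key_length_mla = qk_nope_head_dim + qk_rope_head_dim     # 192
value_length_mla = v_head_dim                            # 128

print(key_length, value_length, key_length_mla, value_length_mla)
```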
||
| def set_vocab(self): | ||
| try: | ||
| self._set_vocab_gpt2() | ||
|
|
@@ -7181,30 +7256,41 @@ def set_vocab(self): | |
| raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") | ||
|
|
||
| def set_gguf_parameters(self): | ||
| is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR) | ||
|
|
||
| # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) | ||
| self.hparams["num_key_value_heads"] = 1 | ||
| if is_ocr: | ||
| self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0) | ||
| else: | ||
| # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) | ||
| self.hparams["num_key_value_heads"] = 1 | ||
|
|
||
| self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6) | ||
|
|
||
| super().set_gguf_parameters() | ||
| hparams = self.hparams | ||
|
|
||
| kv_lora_rank = hparams["kv_lora_rank"] if hparams.get("kv_lora_rank") is not None else 512 | ||
| routed_scaling_factor = hparams.get("routed_scaling_factor", 1.0) | ||
| norm_topk_prob = hparams.get("norm_topk_prob", False) | ||
| self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) | ||
| self.gguf_writer.add_vocab_size(hparams["vocab_size"]) | ||
| if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: | ||
| self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) | ||
| self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) | ||
| if "kv_lora_rank" in hparams and hparams["kv_lora_rank"] is not None: | ||
| self.gguf_writer.add_kv_lora_rank(kv_lora_rank) | ||
|
|
||
| # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA | ||
| self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"]) | ||
| self.gguf_writer.add_value_length(hparams["kv_lora_rank"]) | ||
| self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) | ||
| self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) | ||
| if not is_ocr: | ||
| self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"]) | ||
| self.gguf_writer.add_value_length(kv_lora_rank) | ||
| self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) | ||
| self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) | ||
| self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) | ||
|
|
||
| self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) | ||
| self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) | ||
| self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) | ||
| self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) | ||
| self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) | ||
| self.gguf_writer.add_expert_weights_scale(routed_scaling_factor) | ||
| self.gguf_writer.add_expert_weights_norm(norm_topk_prob) | ||
|
|
||
| self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) | ||
|
|
||
|
|
@@ -7218,7 +7304,12 @@ def set_gguf_parameters(self): | |
|
|
||
| def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: | ||
| # skip vision tensors and remove "language_model." for Kimi-VL | ||
| if "vision_tower" in name or "multi_modal_projector" in name: | ||
| if ("vision_" in name | ||
| or "multi_modal_projector" in name | ||
| or "image_newline" in name | ||
| or "model.projector" in name | ||
| or "sam_model" in name | ||
| or "view_seperator" in name): | ||
| return [] | ||
|
|
||
| if name.startswith("language_model."): | ||
|
|
||
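The widened skip list above mirrors the vision components handled by `DeepseekOCRVisionModel` (`sam_model`, `model.projector`, `image_newline`, `view_seperator`): the text-model conversion drops them so only the language weights land in the main GGUF. A small sketch of the same predicate in isolation (hypothetical helper name, not the converter's exact code):

```python
VISION_MARKERS = (
    "vision_",
    "multi_modal_projector",
    "image_newline",
    "model.projector",
    "sam_model",
    "view_seperator",  # spelling matches the checkpoint's tensor names
)

def is_vision_tensor(name: str) -> bool:
    """Return True for tensors the text-model converter should skip."""
    return any(marker in name for marker in VISION_MARKERS)

print(is_vision_tensor("sam_model.blocks.0.attn.qkv.weight"))      # True
print(is_vision_tensor("model.layers.0.self_attn.q_proj.weight"))  # False
```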
@@ -290,6 +290,7 @@ class ClipVision: | |
| IMAGE_MEAN = "clip.vision.image_mean" | ||
| IMAGE_STD = "clip.vision.image_std" | ||
| SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size" | ||
| WINDOW_SIZE = "clip.vision.window_size" | ||
| USE_GELU = "clip.use_gelu" | ||
| USE_SILU = "clip.use_silu" | ||
| N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl | ||
|
|
@@ -302,6 +303,11 @@ class Attention: | |
| class Projector: | ||
| SCALE_FACTOR = "clip.vision.projector.scale_factor" | ||
|
|
||
| class SAM: | ||
| BLOCK_COUNT = "clip.vision.sam.block_count" | ||
| EMBEDDING_LENGTH = "clip.vision.sam.embedding_length" | ||
| HEAD_COUNT = "clip.vision.sam.head_count" | ||
|
|
||
| class ClipAudio: | ||
| NUM_MEL_BINS = "clip.audio.num_mel_bins" | ||
| EMBEDDING_LENGTH = "clip.audio.embedding_length" | ||
|
|
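The converter calls `add_vision_sam_layers_count`, `add_vision_sam_embedding_length`, and `add_vision_sam_head_count`, whose definitions are not part of this excerpt. A sketch of what such helpers could look like, assuming they follow the one-liner pattern of the existing `add_vision_*` writers (method names and placement here are assumptions, not the PR's actual `gguf_writer.py` code):

```python
import gguf
from gguf.constants import Keys

class GGUFWriterWithSAM(gguf.GGUFWriter):
    # Assumed helpers mapping the new clip.vision.sam.* keys to uint32 values.
    def add_vision_sam_layers_count(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)

    def add_vision_sam_embedding_length(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value)

    def add_vision_sam_head_count(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.SAM.HEAD_COUNT, value)
```

In the PR itself these would presumably live directly on `GGUFWriter`; the subclass is only to keep the sketch self-contained.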
@@ -404,6 +410,7 @@ class MODEL_ARCH(IntEnum): | |
| ARCTIC = auto() | ||
| DEEPSEEK = auto() | ||
| DEEPSEEK2 = auto() | ||
| DEEPSEEK2OCR = auto() | ||
| CHATGLM = auto() | ||
| GLM4 = auto() | ||
| GLM4_MOE = auto() | ||
|
|
@@ -688,6 +695,22 @@ class MODEL_TENSOR(IntEnum): | |
| V_MM_GATE = auto() # cogvlm | ||
| V_TOK_BOI = auto() # cogvlm | ||
| V_TOK_EOI = auto() # cogvlm | ||
| V_SAM_POS_EMBD = auto() # Deepseek-OCR | ||
| V_SAM_PATCH_EMBD = auto() # Deepseek-OCR | ||
| V_SAM_PRE_NORM = auto() # Deepseek-OCR | ||
| V_SAM_POST_NORM = auto() # Deepseek-OCR | ||
| V_SAM_ATTN_POS_H = auto() # Deepseek-OCR | ||
| V_SAM_ATTN_POS_W = auto() # Deepseek-OCR | ||
| V_SAM_ATTN_QKV = auto() # Deepseek-OCR | ||
| V_SAM_ATTN_OUT = auto() # Deepseek-OCR | ||
| V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR | ||
| V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR | ||
| V_SAM_NECK = auto() # Deepseek-OCR | ||
| V_SAM_NET_2 = auto() # Deepseek-OCR | ||
| V_SAM_NET_3 = auto() # Deepseek-OCR | ||
| V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR | ||
| V_ENC_EMBD_VSEP = auto() # Deepseek-OCR | ||
|
|
||
| # audio (mtmd) | ||
| A_ENC_EMBD_POS = auto() | ||
| A_ENC_EMBD_NORM = auto() | ||
|
|
@@ -797,6 +820,7 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_ARCH.ARCTIC: "arctic", | ||
| MODEL_ARCH.DEEPSEEK: "deepseek", | ||
| MODEL_ARCH.DEEPSEEK2: "deepseek2", | ||
| MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr", | ||
| MODEL_ARCH.CHATGLM: "chatglm", | ||
| MODEL_ARCH.GLM4: "glm4", | ||
| MODEL_ARCH.GLM4_MOE: "glm4moe", | ||
|
|
@@ -1080,6 +1104,22 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_TENSOR.V_MM_GATE: "mm.gate", | ||
| MODEL_TENSOR.V_TOK_BOI: "v.boi", | ||
| MODEL_TENSOR.V_TOK_EOI: "v.eoi", | ||
| # DeepSeek-OCR SAM | ||
| MODEL_TENSOR.V_SAM_POS_EMBD: "v.sam.pos_embd", | ||
| MODEL_TENSOR.V_SAM_PATCH_EMBD: "v.sam.patch_embd", | ||
| MODEL_TENSOR.V_SAM_PRE_NORM: "v.sam.blk.{bid}.pre_ln", | ||
| MODEL_TENSOR.V_SAM_POST_NORM: "v.sam.blk.{bid}.post_ln", | ||
| MODEL_TENSOR.V_SAM_ATTN_POS_H: "v.sam.blk.{bid}.attn.pos_h", | ||
| MODEL_TENSOR.V_SAM_ATTN_POS_W: "v.sam.blk.{bid}.attn.pos_w", | ||
| MODEL_TENSOR.V_SAM_ATTN_QKV: "v.sam.blk.{bid}.attn.qkv", | ||
| MODEL_TENSOR.V_SAM_ATTN_OUT: "v.sam.blk.{bid}.attn.out", | ||
| MODEL_TENSOR.V_SAM_MLP_LIN_1: "v.sam.blk.{bid}.mlp.lin1", | ||
| MODEL_TENSOR.V_SAM_MLP_LIN_2: "v.sam.blk.{bid}.mlp.lin2", | ||
| MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}", | ||
| MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2", | ||
| MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", | ||
| MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR | ||
| MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR | ||
|
Typo: "seperator" should be "separator". The string value `"v.view_seperator"` carries the misspelling.

Note: This typo also appears elsewhere in the PR (e.g. the `view_seperator` checks in the converter's `modify_tensors`).

🔎 Proposed fix

```diff
- MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
+ MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_separator", # Deepseek-OCR
```
||
| # audio (mtmd) | ||
| # note: all audio tensor names must use prefix "a." or "mm.a." | ||
| MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", | ||
|
|
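As a reminder of how the per-block templates added above are consumed, the `{bid}` placeholder is filled with the SAM block index when concrete tensor names are produced. A minimal illustration, assuming the PR's additions are in place:

```python
import gguf

# Template for a per-block SAM tensor, carrying a {bid} placeholder.
template = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_SAM_ATTN_QKV]
print(template)                # -> v.sam.blk.{bid}.attn.qkv

# Concrete name for SAM block 3.
print(template.format(bid=3))  # -> v.sam.blk.3.attn.qkv
```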
@@ -1135,6 +1175,8 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_TENSOR.V_ENC_EMBD_PATCH, | ||
| MODEL_TENSOR.V_ENC_EMBD_NORM, | ||
| MODEL_TENSOR.V_ENC_EMBD_POS, | ||
| MODEL_TENSOR.V_ENC_EMBD_IMGNL, | ||
| MODEL_TENSOR.V_ENC_EMBD_VSEP, | ||
| MODEL_TENSOR.V_ENC_INPUT_NORM, | ||
| MODEL_TENSOR.V_ENC_ATTN_QKV, | ||
| MODEL_TENSOR.V_ENC_ATTN_Q, | ||
|
|
@@ -1178,6 +1220,19 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_TENSOR.V_MM_GATE, | ||
| MODEL_TENSOR.V_TOK_BOI, | ||
| MODEL_TENSOR.V_TOK_EOI, | ||
| MODEL_TENSOR.V_SAM_POS_EMBD, | ||
| MODEL_TENSOR.V_SAM_PATCH_EMBD, | ||
| MODEL_TENSOR.V_SAM_PRE_NORM, | ||
| MODEL_TENSOR.V_SAM_POST_NORM, | ||
| MODEL_TENSOR.V_SAM_ATTN_POS_H, | ||
| MODEL_TENSOR.V_SAM_ATTN_POS_W, | ||
| MODEL_TENSOR.V_SAM_ATTN_QKV, | ||
| MODEL_TENSOR.V_SAM_ATTN_OUT, | ||
| MODEL_TENSOR.V_SAM_MLP_LIN_1, | ||
| MODEL_TENSOR.V_SAM_MLP_LIN_2, | ||
| MODEL_TENSOR.V_SAM_NECK, | ||
| MODEL_TENSOR.V_SAM_NET_2, | ||
| MODEL_TENSOR.V_SAM_NET_3, | ||
| # audio | ||
| MODEL_TENSOR.A_ENC_EMBD_POS, | ||
| MODEL_TENSOR.A_ENC_EMBD_NORM, | ||
|
|
@@ -2362,7 +2417,41 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_TENSOR.ATTN_Q_B, | ||
| MODEL_TENSOR.ATTN_KV_A_MQA, | ||
| MODEL_TENSOR.ATTN_KV_B, | ||
| MODEL_TENSOR.ATTN_K, | ||
| MODEL_TENSOR.ATTN_K_B, | ||
| MODEL_TENSOR.ATTN_V, | ||
| MODEL_TENSOR.ATTN_V_B, | ||
| MODEL_TENSOR.ATTN_Q_A_NORM, | ||
| MODEL_TENSOR.ATTN_KV_A_NORM, | ||
| MODEL_TENSOR.ATTN_OUT, | ||
| MODEL_TENSOR.ATTN_ROT_EMBD, | ||
| MODEL_TENSOR.FFN_GATE_INP, | ||
| MODEL_TENSOR.FFN_NORM, | ||
| MODEL_TENSOR.FFN_GATE, | ||
| MODEL_TENSOR.FFN_DOWN, | ||
| MODEL_TENSOR.FFN_UP, | ||
| MODEL_TENSOR.FFN_GATE_EXP, | ||
| MODEL_TENSOR.FFN_DOWN_EXP, | ||
| MODEL_TENSOR.FFN_UP_EXP, | ||
| MODEL_TENSOR.FFN_GATE_SHEXP, | ||
| MODEL_TENSOR.FFN_DOWN_SHEXP, | ||
| MODEL_TENSOR.FFN_UP_SHEXP, | ||
| MODEL_TENSOR.FFN_EXP_PROBS_B, | ||
| ], | ||
| MODEL_ARCH.DEEPSEEK2OCR: [ | ||
| MODEL_TENSOR.TOKEN_EMBD, | ||
| MODEL_TENSOR.OUTPUT_NORM, | ||
| MODEL_TENSOR.OUTPUT, | ||
| MODEL_TENSOR.ROPE_FREQS, | ||
| MODEL_TENSOR.ATTN_NORM, | ||
| MODEL_TENSOR.ATTN_Q, | ||
| MODEL_TENSOR.ATTN_Q_A, | ||
| MODEL_TENSOR.ATTN_Q_B, | ||
| MODEL_TENSOR.ATTN_KV_A_MQA, | ||
| MODEL_TENSOR.ATTN_KV_B, | ||
| MODEL_TENSOR.ATTN_K, | ||
| MODEL_TENSOR.ATTN_K_B, | ||
| MODEL_TENSOR.ATTN_V, | ||
| MODEL_TENSOR.ATTN_V_B, | ||
| MODEL_TENSOR.ATTN_Q_A_NORM, | ||
| MODEL_TENSOR.ATTN_KV_A_NORM, | ||
|
|
@@ -3225,6 +3314,10 @@ class MODEL_TENSOR(IntEnum): | |
| MODEL_TENSOR.ROPE_FREQS, | ||
| MODEL_TENSOR.ATTN_ROT_EMBD, | ||
| ], | ||
| MODEL_ARCH.DEEPSEEK2OCR: [ | ||
| MODEL_TENSOR.ROPE_FREQS, | ||
| MODEL_TENSOR.ATTN_ROT_EMBD, | ||
| ], | ||
| MODEL_ARCH.CHATGLM: [ | ||
| MODEL_TENSOR.ROPE_FREQS, | ||
| ], | ||
|
|
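With the enum value and its name string registered, the new architecture resolves like any other entry in the arch tables. A small lookup example, assuming the additions above are applied:

```python
import gguf

arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
print(gguf.MODEL_ARCH_NAMES[arch])  # -> deepseek2-ocr

# The OCR arch registers a DeepSeek2-style tensor set of its own.
print(gguf.MODEL_TENSOR.ATTN_KV_A_MQA in gguf.MODEL_TENSORS[arch])  # -> True
```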
@@ -3414,6 +3507,7 @@ class VisionProjectorType: | |
| LIGHTONOCR = "lightonocr" | ||
| COGVLM = "cogvlm" | ||
| JANUS_PRO = "janus_pro" | ||
| DEEPSEEKOCR = "deepseekocr" | ||
| LFM2A = "lfm2a" # audio | ||
| GLM4V = "glm4v" | ||
|
|
||
|
|
||
🛠️ Refactor suggestion | 🟠 Major
DeepSeek-OCR: `intermediate_size` derivation in `get_vision_config` looks incorrect

`get_vision_config()` currently sets `intermediate_size = heads * 4`. For the released DeepSeek-OCR config, CLIP-L has `width = 1024`, `heads = 16`, and `mlp_ratio ≈ 3.7362`, so the MLP/FFN dim should be on the order of `hidden_size * mlp_ratio` (~3.8k), not `heads * 4` (64). That gives nonsense FFN metadata for the vision encoder and may break consumers that rely on GGUF metadata instead of inferring from weights.

I'd strongly recommend computing `intermediate_size` from the CLIP config instead of `heads * 4`, e.g.:

Proposed fix for DeepSeek-OCR vision config
```diff
 def get_vision_config(self) -> dict[str, Any]:
     vision_config: dict[str, Any] | None = self.global_config.get("vision_config")

     if not vision_config:
         raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")

-    vision_config['sam'] = vision_config['width']['sam_vit_b']
-    vision_config.update(vision_config['width']['clip-l-14-224'])
-    vision_config['hidden_size'] = vision_config['width']
-    vision_config['num_heads'] = vision_config['heads']
-    vision_config['intermediate_size'] = vision_config['heads'] * 4
+    # Extract SAM and CLIP-L configs from nested vision_config["width"]
+    width_cfg = vision_config["width"]
+    sam_cfg = width_cfg["sam_vit_b"]
+    clip_cfg = width_cfg["clip-l-14-224"]
+
+    # Promote SAM config and CLIP-L config to top level
+    vision_config["sam"] = sam_cfg
+    vision_config.update(clip_cfg)
+
+    # Normalize fields expected by MmprojModel / gguf_writer
+    vision_config["hidden_size"] = clip_cfg["width"]
+    vision_config["num_heads"] = clip_cfg["heads"]
+
+    mlp_ratio = vision_config.get("mlp_ratio")
+    if mlp_ratio is not None:
+        vision_config["intermediate_size"] = int(round(clip_cfg["width"] * mlp_ratio))
+    else:
+        # Fallback: standard ViT MLP ratio
+        vision_config["intermediate_size"] = clip_cfg["width"] * 4

     return vision_config
```

Static-analysis nits in this class
`tensor_force_quant(self, name, new_name, bid, n_dims)` doesn't use `new_name`, `bid`, or `n_dims`. `modify_tensors(self, data_torch, name, bid)` doesn't use `bid`. To keep the signature compatible with the base class while satisfying linters, consider explicitly deleting the unused arguments:
Suggested cleanups for unused parameters
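The collapsed suggestion is not shown in this mirror; one plausible shape for it, assuming the usual `del`-the-unused-argument idiom, is sketched below. It is meant to sit inside the existing `DeepseekOCRVisionModel` class in `convert_hf_to_gguf.py`, so `self`, `gguf`, `Tensor`, and `Iterable` come from that module; this is an illustration, not the reviewer's exact code.

```python
# Inside DeepseekOCRVisionModel (sketch):

def tensor_force_quant(self, name, new_name, bid, n_dims):
    del new_name, bid, n_dims  # unused, kept only for base-class compatibility
    if ".embeddings." in name or "pos_embed" in name:
        return gguf.GGMLQuantizationType.F32
    if ".rel_pos_h" in name or ".rel_pos_w" in name:
        return gguf.GGMLQuantizationType.F32
    return gguf.GGMLQuantizationType.F16

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    del bid  # unused, kept only for base-class compatibility
    if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
        return []
    if ".attn.rel_pos_h" in name or ".attn.rel_pos_w" in name:
        return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]
    return [(self.map_tensor_name(name), data_torch)]
```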
🧰 Tools
🪛 Ruff (0.14.10)
5998-5998: Avoid specifying long messages outside the exception class
(TRY003)