kakaocorp
/

kanana-1.5-v-3b-instruct

Image-Text-to-Text

Model card Files Files and versions

peterroh committed 14 days ago

Commit

38145a7

·

verified ·

1 Parent(s): 045081f

Update modeling.py

Files changed (1) hide show

modeling.py +8 -7

modeling.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from functools import partial
 import logging
 import re
 from typing import Optional, Tuple, Union
@@ -229,6 +230,8 @@ class CustomQwen2VLVE(Qwen2VisionTransformerPretrainedModel):
         hidden_states = self.patch_embed(pixel_values)
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
         cu_seqlens = torch.repeat_interleave(
             grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
@@ -238,20 +241,18 @@ class CustomQwen2VLVE(Qwen2VisionTransformerPretrainedModel):
         for blk in self.blocks:
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    blk.__call__,
                     hidden_states,
-                    cu_seqlens,
-                    rotary_pos_emb,
                 )
             else:
-                layer_outputs = blk(
                     hidden_states,
                     cu_seqlens=cu_seqlens,
                     rotary_pos_emb=rotary_pos_emb,
                 )
-            hidden_states = layer_outputs
         if output_hidden_states:
             encoder_states = encoder_states + (hidden_states,)

 from functools import partial
+import inspect
 import logging
 import re
 from typing import Optional, Tuple, Union
         hidden_states = self.patch_embed(pixel_values)
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
         cu_seqlens = torch.repeat_interleave(
             grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
         for blk in self.blocks:
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
+            if "position_embeddings" in inspect.signature(blk.forward).parameters:
+                hidden_states = blk(
                     hidden_states,
+                    cu_seqlens=cu_seqlens,
+                    position_embeddings=position_embeddings,
                 )
             else:
+                hidden_states = blk(
                     hidden_states,
                     cu_seqlens=cu_seqlens,
                     rotary_pos_emb=rotary_pos_emb,
                 )
         if output_hidden_states:
             encoder_states = encoder_states + (hidden_states,)