Commit · 3749258
Parent(s): 5c58d1e
Add WebGPU as an option
onnx/builder.py  +14 -12

@@ -31,7 +31,7 @@ def build_vision(args):
     url = "https://wallpaper.dog/large/10809054.jpg"
     image_4 = Image.open(requests.get(url, stream=True).raw)
     images = [image_1, image_2, image_3, image_4]
-    inputs = processor(prompt, images=images, return_tensors="pt").to(args.
+    inputs = processor(prompt, images=images, return_tensors="pt").to(args.device)
     inputs["input_image_embeds"] = inputs["input_image_embeds"].to(args.precision)
     inputs["image_attention_mask"] = inputs["image_attention_mask"].to(args.precision)

@@ -110,7 +110,7 @@ def build_vision(args):
         "--output_model", fpath_4,
         "--block_size", str(32),
     ]
-    if args.precision == torch.float32: cmd.extend(["--accuracy_level", str(4)])
+    if args.precision == torch.float32 or args.execution_provider == "webgpu": cmd.extend(["--accuracy_level", str(4)])
     subprocess.run(cmd)
     shutil.rmtree(temp_folder_3)

@@ -120,7 +120,7 @@ def build_speech(args):
     prompt = f"{user_prompt}<|audio_1|>\n<|audio_2|>\nWhat are the stories that these audios come from?{prompt_suffix}{assistant_prompt}"
     audio1 = soundfile.read(os.path.join(args.input, "examples", "what_is_the_traffic_sign_in_the_image.wav"))
     audio2 = soundfile.read(os.path.join(args.input, "examples", "what_is_shown_in_this_image.wav"))
-    inputs = processor(prompt, audios=[audio1, audio2], return_tensors="pt").to(args.
+    inputs = processor(prompt, audios=[audio1, audio2], return_tensors="pt").to(args.device)
     inputs["input_audio_embeds"] = inputs["input_audio_embeds"].to(args.precision)

     # TorchScript export
@@ -232,7 +232,7 @@ def build_speech(args):
         "--output_model", fpath_5,
         "--block_size", str(32),
     ]
-    if args.precision == torch.float32: cmd.extend(["--accuracy_level", str(4)])
+    if args.precision == torch.float32 or args.execution_provider == "webgpu": cmd.extend(["--accuracy_level", str(4)])
     subprocess.run(cmd)
     shutil.rmtree(temp_folder_4)

@@ -241,9 +241,9 @@ def build_embedding(args):
     # TorchScript export
     batch_size, sequence_length, num_image_tokens, num_audio_tokens = 2, 8, 2, 2
     inputs = {
-        "input_ids": torch.randint(low=0, high=config.vocab_size, size=(batch_size, sequence_length), device=args.
-        "image_features": torch.randn(num_image_tokens, config.hidden_size, device=args.
-        "audio_features": torch.randn(num_audio_tokens, config.hidden_size, device=args.
+        "input_ids": torch.randint(low=0, high=config.vocab_size, size=(batch_size, sequence_length), device=args.device, dtype=torch.int64),
+        "image_features": torch.randn(num_image_tokens, config.hidden_size, device=args.device, dtype=args.precision),
+        "audio_features": torch.randn(num_audio_tokens, config.hidden_size, device=args.device, dtype=args.precision),
     }
     inputs["input_ids"][0][0] = -1
     inputs["input_ids"][0][1] = -1
@@ -302,8 +302,9 @@ def build_text(args):
     extra_options = {
         "exclude_embeds": "true",
         "filename": "phi-4-mm-text.onnx",
+        "int4_algo_config": "k_quant_last",
     }
-    if args.precision == torch.float32: extra_options["int4_accuracy_level"] = 4
+    if args.precision == torch.float32 or args.execution_provider == "webgpu": extra_options["int4_accuracy_level"] = 4
     create_model(model_name, args.input, args.output, precision, args.execution_provider, args.cache_dir, **extra_options)


@@ -533,7 +534,7 @@ def build_quantized_adapters(args):
         "--output_model", fpath_3,
         "--block_size", str(32),
     ]
-    if args.precision == torch.float32: cmd.extend(["--accuracy_level", str(4)])
+    if args.precision == torch.float32 or args.execution_provider == "webgpu": cmd.extend(["--accuracy_level", str(4)])
     subprocess.run(cmd)

     filename = "phi-4-mm-qlora-speech.onnx"
@@ -544,7 +545,7 @@ def build_quantized_adapters(args):
         "--output_model", fpath_4,
         "--block_size", str(32),
     ]
-    if args.precision == torch.float32: cmd.extend(["--accuracy_level", str(4)])
+    if args.precision == torch.float32 or args.execution_provider == "webgpu": cmd.extend(["--accuracy_level", str(4)])
     subprocess.run(cmd)

     os.remove(fpath_1)
@@ -594,7 +595,7 @@ def get_args():
         "-e",
         "--execution_provider",
         required=True,
-        choices=["cpu", "cuda", "dml"],
+        choices=["cpu", "cuda", "dml", "webgpu"],
         help="Execution provider for Phi-4 multimodal components",
     )

@@ -608,6 +609,7 @@ def get_args():

     args = parser.parse_args()
     args.precision = torch.float16 if args.precision == "fp16" else torch.float32
+    setattr(args, "device", args.execution_provider.replace("dml", "cuda").replace("webgpu", "cuda"))
     return args

 if __name__ == "__main__":
@@ -618,7 +620,7 @@ if __name__ == "__main__":
     args = get_args()
     config = AutoConfig.from_pretrained(args.input, trust_remote_code=True)
     processor = AutoProcessor.from_pretrained(args.input, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(args.input, trust_remote_code=True, torch_dtype=args.precision).to(args.
+    model = AutoModelForCausalLM.from_pretrained(args.input, trust_remote_code=True, torch_dtype=args.precision).to(args.device)

     # Build model components
     build_vision(args)
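In short, the commit threads a new torch device attribute through the export paths and widens the int4 accuracy-level guard to cover WebGPU. A minimal sketch of those two rules is below; the helper names provider_to_device and use_int4_accuracy_level_4 are illustrative only and do not exist in builder.py.

import torch

def provider_to_device(execution_provider: str) -> str:
    # Mirrors the new setattr(...) line in get_args(): exports for both
    # "dml" and "webgpu" run on a CUDA torch device, while "cpu" and
    # "cuda" pass through unchanged.
    return execution_provider.replace("dml", "cuda").replace("webgpu", "cuda")

def use_int4_accuracy_level_4(precision: torch.dtype, execution_provider: str) -> bool:
    # Mirrors the repeated guard in the diff: the int4 quantization steps
    # request accuracy_level 4 when exporting in fp32 or targeting WebGPU.
    return precision == torch.float32 or execution_provider == "webgpu"

print(provider_to_device("webgpu"))                          # -> "cuda"
print(use_int4_accuracy_level_4(torch.float16, "webgpu"))    # -> True
print(use_int4_accuracy_level_4(torch.float32, "cpu"))       # -> True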