Spaces:

C4G-HKUST
/

AnyTalker

Running on Zero

App Files Community

C4G-HKUST committed 5 days ago

Commit

6c41e4a

1 Parent(s): 0c6b95b

feat: trim 4s

Browse files

Files changed (2) hide show

wan/audio2video_multiID.py +2 -2
wan/utils/infer_utils.py +8 -8

wan/audio2video_multiID.py CHANGED Viewed

@@ -199,7 +199,7 @@ class WanAF2V:
         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
-        trim_to_6s=False,  # Fast mode: trim audio to 4 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.
@@ -515,7 +515,7 @@ class WanAF2V:
             half_dtype=self.half_dtype,
             preprocess_audio=preprocess_audio,
             resample_audio=resample_audio,
-            trim_to_6s=trim_to_6s,
         )
         # Prepare audio_ref_features - new list mode

         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
+        trim_to_4s=False,  # Fast mode: trim audio to 4 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.
             half_dtype=self.half_dtype,
             preprocess_audio=preprocess_audio,
             resample_audio=resample_audio,
+            trim_to_4s=trim_to_4s,
         )
         # Prepare audio_ref_features - new list mode

wan/utils/infer_utils.py CHANGED Viewed

@@ -118,7 +118,7 @@ def process_audio_features(
     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
-    trim_to_6s=False,  # Fast mode: trim audio to 4 seconds
 ):
     """
     Process audio files and extract audio features.
@@ -203,8 +203,8 @@ def process_audio_features(
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
-            # Fast mode: trim to 4 seconds if trim_to_6s is True
-            if trim_to_6s:
                 # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
                 max_frames_4s = 97
                 if total_length > max_frames_4s:
@@ -281,7 +281,7 @@ def process_audio_features(
                     audio_feat_list.append(zero_audio_feat)
                     print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
         else:
-            # Pad mode: keep existing logic, but apply trim_to_6s if needed
             for i, audio_path in enumerate(audio_paths):
                 if audio_path and os.path.exists(audio_path):
                     print(f"Processing audio {i}: {audio_path}")
@@ -294,9 +294,9 @@ def process_audio_features(
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
-                        # Fast mode: if trim_to_6s, limit to 4 seconds
                         target_frames = F
-                        if trim_to_6s:
                             # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
                             max_frames_4s = 97
                             target_frames = min(F, max_frames_4s)
@@ -343,9 +343,9 @@ def process_audio_features(
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
-                # Fast mode: if trim_to_6s, limit to 4 seconds
                 target_frames = F
-                if trim_to_6s:
                     # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
                     max_frames_4s = 97
                     target_frames = min(F, max_frames_4s)

     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
+    trim_to_4s=False,  # Fast mode: trim audio to 4 seconds
 ):
     """
     Process audio files and extract audio features.
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
+            # Fast mode: trim to 4 seconds if trim_to_4s is True
+            if trim_to_4s:
                 # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
                 max_frames_4s = 97
                 if total_length > max_frames_4s:
                     audio_feat_list.append(zero_audio_feat)
                     print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
         else:
+            # Pad mode: keep existing logic, but apply trim_to_4s if needed
             for i, audio_path in enumerate(audio_paths):
                 if audio_path and os.path.exists(audio_path):
                     print(f"Processing audio {i}: {audio_path}")
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
+                        # Fast mode: if trim_to_4s, limit to 4 seconds
                         target_frames = F
+                        if trim_to_4s:
                             # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
                             max_frames_4s = 97
                             target_frames = min(F, max_frames_4s)
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
+                # Fast mode: if trim_to_4s, limit to 4 seconds
                 target_frames = F
+                if trim_to_4s:
                     # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
                     max_frames_4s = 97
                     target_frames = min(F, max_frames_4s)