In [None]:
import os
import sys
import torch
from transformers import pipeline, AutoModel
from transformers.pipelines import PIPELINE_REGISTRY

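# If your setup needs OPENAI_API_KEY, uncomment the next line for a placeholder or export your own key before importing vine_hf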
#os.environ['OPENAI_API_KEY'] = 'dummy-key'
from vine_hf import VineConfig, VineModel, VinePipeline

* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Register VinePipeline with the transformers pipeline registry under a
# custom task name, with VineModel as its PyTorch model class.
vine_task_name = "vine-video-understanding"
PIPELINE_REGISTRY.register_pipeline(
    vine_task_name,
    pipeline_class=VinePipeline,
    pt_model=VineModel,
    type="multimodal",
)

In [None]:
# Local checkpoint location. VineConfig takes the directory and the filename
# as separate arguments, so split the path once instead of repeating it.
pretrained_dir, pretrained_filename = os.path.split('/path/to/your/pretrained/model.pt')

vine_config = VineConfig(
    model_name="openai/clip-vit-base-patch32",
    # Local file example: use_hf_repo=False together with local_dir/local_filename
    use_hf_repo=False,
    local_dir=pretrained_dir,
    local_filename=pretrained_filename,
    segmentation_method="grounding_dino_sam2",
    visualize=True,
    visualization_dir="path/to/visualization/dir",
    debug_visualizations=True,
    device=0,  # Change to your desired device
)

In [None]:
# Build the model from the config first, then hand it to the pipeline along
# with the SAM2 and GroundingDINO config/checkpoint locations.
vine_model = VineModel(vine_config)

vine_pipeline = VinePipeline(
    model=vine_model,
    tokenizer=None,
    sam_config_path="path/to/sam2/configs/sam2_hiera_base_plus.yaml",
    sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
    gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
    gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
)

Loaded state type: <class 'collections.OrderedDict'>


In [6]:
# Candidate labels handed to the pipeline call: object categories,
# single-object (unary) actions, and two-object (binary) relations.
categorical_keywords = [
    "human",
    "dog",
    "frisbee",
]
unary_keywords = [
    "running",
    "jumping",
    "catching",
    "throwing",
]
binary_keywords = [
    "behind",
    "in front of",
    "next to",
    "chasing",
]

# Index pairs for which binary relations are requested.
# NOTE(review): presumably these index into categorical_keywords
# (0=human, 1=dog, 2=frisbee), which would make the pairs human-dog,
# human-frisbee and dog-frisbee -- confirm against VinePipeline.
object_pairs = [
    (0, 1),
    (0, 2),
    (1, 2),
]

In [7]:
# Video to run the demo on. The default is the author's machine-specific
# path; set the VINE_DEMO_VIDEO environment variable (or edit the default
# below) to point at your own file without touching the rest of the notebook.
demo_video_path = os.environ.get(
    "VINE_DEMO_VIDEO",
    "/home/kevinx/LASER/LASER/demo/videos/v1.mp4",  # Replace with your video file path
)

In [8]:
# Run the pipeline on the demo video and print the result summary.
#
# `results` is reset up front so that a failed run cannot leave a stale value
# from an earlier execution of this cell visible to later cells.
results = None
try:
    # Only the pipeline call is guarded: it is the step that depends on the
    # external segmentation models being installed and configured.
    results = vine_pipeline(
        demo_video_path,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method='grounding_dino_sam2',
        return_top_k=3,
        include_visualizations=False,
        debug_visualizations=False,
    )
except Exception as e:
    # Deliberately best-effort so the demo notebook keeps running; the
    # exception type is printed because str(e) alone is often uninformative
    # (e.g. for KeyError or NameError).
    print("Note: Full execution requires segmentation models to be properly set up.")
    print(f"Error: {type(e).__name__}: {e}")
else:
    # Reporting lives outside the try body so that a problem with the result
    # structure is not misreported as a segmentation-model setup issue.
    print("\nResults:")
    print(f"Summary: {results['summary']}")

Segmentation method: grounding_dino_sam2
Generating Grounding DINO + SAM2 masks...
<class 'int'>
✓ SAM2 models initialized successfully
<class 'int'>




final text_encoder_type: bert-base-uncased
✓ GroundingDINO model initialized successfully
Start detecting objects at time  05:08:58.178592


Detecting objects: 100%|██████████| 3/3 [00:01<00:00,  2.82it/s]


Finished detecting objects at time  05:08:59.250419
Loading inference state at time  05:08:59.544425
Number of frames:  3
None


Processing frames: 100%|██████████| 3/3 [00:00<00:00, 11.77it/s]


Annotated frames:  []
Find the most dense prompt at time  05:09:01.413703
Most dense frame: 0


Start propagating objects at time  05:09:01.416367
Pass count:  0


propagate in video: 100%|██████████| 3/3 [00:00<00:00, 20.20it/s]
propagate in video: 0it [00:00, ?it/s]


Most dense frame: 1


Pass count:  1


propagate in video: 100%|██████████| 3/3 [00:00<00:00, 19.25it/s]
propagate in video: 0it [00:00, ?it/s]


Most dense frame: 2


Pass count:  2


propagate in video: 100%|██████████| 3/3 [00:00<00:00, 25.92it/s]
propagate in video: 0it [00:00, ?it/s]


Most dense frame: -1



Results:
Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}


In [9]:
# Show the summary entry of the pipeline results again.
print("Summary:", results['summary'])

Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}
