YAML Metadata Warning: empty or missing yaml metadata in repo card (https://huggingface.co/docs/hub/model-cards#model-card-metadata)

ViCLIP - Vietnamese CLIP Text Encoder

This model is a Vietnamese adaptation of the CLIP text encoder, trained on Vietnamese data.

Model Description

  • Text encoder based on PhoBERT
  • Projection head to align with CLIP embedding space
  • Optimized for Vietnamese text understanding

Usage

import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer

class PhoCLIPTextModel(nn.Module):
    """Text encoder followed by a projection head.

    The encoder is loaded with ``AutoModel.from_pretrained``; the weights for
    the whole module (encoder and ``text_proj``) are then restored from a
    checkpoint file downloaded from the Hugging Face Hub.
    """

    def __init__(self, repo_id="kienhoang123/ViCLIP", filename="model.pt"):
        """Build the encoder and projection head and load their weights.

        Args:
            repo_id: Hub repository holding both the encoder and the
                checkpoint file.
            filename: Name of the checkpoint file inside ``repo_id``.

        Raises:
            KeyError: If the checkpoint has no ``text_proj.weight`` entry.
        """
        super().__init__()
        # Load text encoder
        self.text_encoder = AutoModel.from_pretrained(repo_id)

        # Download the checkpoint; map_location="cpu" lets a checkpoint that
        # was saved from a GPU be opened on a machine without one.
        checkpoint_path = hf_hub_download(repo_id=repo_id, filename=filename)
        state_dict = torch.load(checkpoint_path, map_location="cpu")

        # forward() calls self.text_proj, so the layer has to exist before
        # load_state_dict can copy the checkpoint weights into it.
        self.text_proj = self._build_text_proj(state_dict)
        self.load_state_dict(state_dict)

    @staticmethod
    def _build_text_proj(state_dict, prefix="text_proj"):
        """Create a linear layer sized from the checkpoint's projection weight.

        NOTE(review): assumes the projection head is a single ``nn.Linear``
        stored under ``<prefix>.weight`` / ``<prefix>.bias`` -- confirm
        against the training code.
        """
        weight_key = f"{prefix}.weight"
        if weight_key not in state_dict:
            raise KeyError(
                f"checkpoint has no '{weight_key}' entry; cannot size the "
                "text projection head"
            )
        out_features, in_features = state_dict[weight_key].shape
        has_bias = f"{prefix}.bias" in state_dict
        return nn.Linear(in_features, out_features, bias=has_bias)

    def forward(self, input_ids, attention_mask=None):
        """Return the projected hidden state of the first token.

        Args:
            input_ids: Token ids passed to the encoder as ``input_ids``.
            attention_mask: Optional mask passed to the encoder unchanged.

        Returns:
            The output of ``text_proj`` applied to
            ``last_hidden_state[:, 0, :]``.
        """
        # Get text embeddings
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # Position 0 of every sequence is used as the sentence representation.
        text_cls = text_outputs.last_hidden_state[:, 0, :]
        return self.text_proj(text_cls)

# Tokenizer published in the same Hub repository as the model
tokenizer = AutoTokenizer.from_pretrained("kienhoang123/ViCLIP")

# Tokenize one sentence, padded/truncated to exactly 77 tokens
text = "This is an example Vietnamese text"
inputs = tokenizer(
    text,
    padding="max_length",
    truncation=True,
    max_length=77,
    return_tensors="pt",
)

# Build the model and switch it to evaluation mode
model = PhoCLIPTextModel()
model.eval()

# Gradients are not needed for inference
with torch.no_grad():
    embedding = model(inputs.input_ids, inputs.attention_mask)
    # Scale each embedding to unit L2 norm along the last dimension
    normalized_embedding = F.normalize(embedding, p=2, dim=-1)
Downloads last month
6
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support