YAML Metadata Warning: empty or missing yaml metadata in repo card (https://huggingface.co/docs/hub/model-cards#model-card-metadata)

ViCLIP - Vietnamese CLIP Text Encoder

This model is a Vietnamese adaptation of the CLIP text encoder, trained on Vietnamese data.

Model Description

Text encoder based on PhoBERT
Projection head to align with the CLIP embedding space
Optimized for Vietnamese text understanding

Usage

import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer

class PhoCLIPTextModel(nn.Module):
    """Text encoder followed by a projection head.

    Constructing the model downloads the pretrained encoder and the
    ``model.pt`` checkpoint from the ``kienhoang123/ViCLIP`` repository.
    """

    def __init__(self):
        super().__init__()
        # Load text encoder
        self.text_encoder = AutoModel.from_pretrained("kienhoang123/ViCLIP")

        # Load text projection head
        checkpoint_path = hf_hub_download(
            repo_id="kienhoang123/ViCLIP", filename="model.pt"
        )
        # map_location="cpu" lets a checkpoint that was saved on a GPU load on
        # machines without one; the weights are copied into the module below.
        state_dict = torch.load(checkpoint_path, map_location="cpu")

        # The projection layer has to exist *before* load_state_dict(): strict
        # loading rejects keys that have no matching submodule, and forward()
        # reads self.text_proj.
        self.text_proj = self._build_text_proj(state_dict)
        self.load_state_dict(state_dict)

    @staticmethod
    def _build_text_proj(state_dict):
        """Create the projection layer with the sizes stored in *state_dict*.

        NOTE(review): this assumes the head is a single linear layer saved
        under ``text_proj.weight`` (and optionally ``text_proj.bias``) --
        confirm against the training code that produced ``model.pt``.

        Raises:
            KeyError: if ``text_proj.weight`` is missing or is not a 2-D tensor.
        """
        weight = state_dict.get("text_proj.weight")
        if weight is None or weight.dim() != 2:
            raise KeyError(
                "checkpoint does not contain a 2-D 'text_proj.weight' tensor; "
                "cannot build the projection head"
            )
        out_features, in_features = weight.shape
        return nn.Linear(
            in_features, out_features, bias="text_proj.bias" in state_dict
        )

    def forward(self, input_ids, attention_mask=None):
        """Return the projected embedding of the first token of each sequence.

        Args:
            input_ids: token ids passed to the text encoder.
            attention_mask: optional mask passed to the text encoder.

        Returns:
            The output of ``text_proj`` applied to the encoder's
            ``last_hidden_state`` at sequence position 0.
        """
        # Get text embeddings
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # Position 0 of every sequence is used as the sentence representation.
        text_cls = text_outputs.last_hidden_state[:, 0, :]
        text_proj = self.text_proj(text_cls)
        return text_proj

# Fetch the tokenizer published alongside the model weights
tokenizer = AutoTokenizer.from_pretrained("kienhoang123/ViCLIP")

# Tokenize one sentence, padding or truncating it to exactly 77 tokens
text = "This is an example Vietnamese text"
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding="max_length",
    max_length=77,
    truncation=True,
)

# Build the model and switch it to evaluation mode
model = PhoCLIPTextModel()
model.eval()

# Gradients are not needed for inference
with torch.no_grad():
    embedding = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
    # Scale each embedding to unit L2 norm along the last dimension
    normalized_embedding = F.normalize(embedding, p=2, dim=-1)

Downloads last month: 6

Inference Providers NEW

This model isn't deployed by any Inference Provider. 🙋 Ask for provider support