damerajee/clean_hin_vqa
Viewer • Updated • 50k • 737 • 1
How to use BhashaAI/ViLaH with Transformers:
# Option 1: use a pipeline as a high-level helper.
from transformers import pipeline
pipe = pipeline("visual-question-answering", model="BhashaAI/ViLaH")

# Option 2 (alternative to the pipeline): load the processor and model directly.
from transformers import AutoProcessor, AutoModelForImageTextToText
processor = AutoProcessor.from_pretrained("BhashaAI/ViLaH")
model = AutoModelForImageTextToText.from_pretrained("BhashaAI/ViLaH")

ViLaH (Vision Language Hindi) is a 3-billion-parameter model fine-tuned from the base model google/paligemma-3b-pt-224 to handle input images and bilingual (Hindi and English) text sequences for both input and output.
The model was fine-tuned on only one dataset: damerajee/clean_hin_vqa.
!pip install peft trl datasets accelerate bitsandbytes
!pip install transformers --upgrade
# Inference example (float16 weights): load ViLaH on the current CUDA device
# and answer one question from the damerajee/clean_hin_vqa training split.
#
# NOTE: the peft / Trainer / tokenizer imports are not used by this snippet;
# they are kept from the original notebook cell.
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    PaliGemmaForConditionalGeneration,
    Trainer,
    TrainingArguments,
)

# Upper bound on the number of newly generated tokens.
MAX_LENGTH = 200

# Pick a single example (image + question) to run through the model.
dataset = load_dataset("damerajee/clean_hin_vqa", split="train")
test_example = dataset[10000]
test_image = test_example["image"]
text = test_example["question"]

# Place the whole model on the currently selected CUDA device.
device_index = torch.cuda.current_device()
print("device_index:", device_index)

base_model = PaliGemmaForConditionalGeneration.from_pretrained(
    "BhashaAI/ViLaH",
    device_map={"": device_index},
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
processor = AutoProcessor.from_pretrained("BhashaAI/ViLaH")

# Move the inputs to the device the model actually lives on instead of a
# hard-coded "cuda", so the two can never disagree.
inputs = processor(text=text, images=test_image, return_tensors="pt").to(base_model.device)
for k, v in inputs.items():
    print(k, v.shape)

# Autoregressively generate.
# This samples (do_sample=True, temperature=0.7) with a repetition penalty, so
# the output varies between runs; it is NOT greedy decoding. For other decoding
# strategies see https://huggingface.co/blog/how-to-generate
generated_ids = base_model.generate(
    **inputs,
    max_new_tokens=MAX_LENGTH,
    temperature=0.7,
    repetition_penalty=2.0,
    do_sample=True,
)

# generate() returns the prompt (image tokens + text prompt) followed by the
# new tokens. Strip the prompt using the real length of the encoded input
# rather than re-deriving it from token counts, which depends on how the
# tokenizer and processor add special tokens.
num_prompt_tokens = inputs["input_ids"].shape[-1]

# Turn the newly generated token IDs back into a string.
generated_text = processor.batch_decode(
    generated_ids[:, num_prompt_tokens:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(generated_text)
# Inference example (4-bit quantized weights via bitsandbytes): load ViLaH on
# the current CUDA device and answer one question from the
# damerajee/clean_hin_vqa training split.
#
# NOTE: the peft / Trainer / tokenizer imports are not used by this snippet;
# they are kept from the original notebook cell.
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    PaliGemmaForConditionalGeneration,
    Trainer,
    TrainingArguments,
)

# Upper bound on the number of newly generated tokens.
MAX_LENGTH = 200

# Pick a single example (image + question) to run through the model.
dataset = load_dataset("damerajee/clean_hin_vqa", split="train")
test_example = dataset[10000]
test_image = test_example["image"]
text = test_example["question"]

# Place the whole model on the currently selected CUDA device.
device_index = torch.cuda.current_device()
print("device_index:", device_index)

# Load the weights in 4-bit precision; torch_dtype below is still passed
# through to from_pretrained exactly as in the float16 example.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
base_model = PaliGemmaForConditionalGeneration.from_pretrained(
    "BhashaAI/ViLaH",
    device_map={"": device_index},
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
processor = AutoProcessor.from_pretrained("BhashaAI/ViLaH")

# Move the inputs to the device the model actually lives on instead of a
# hard-coded "cuda", so the two can never disagree.
inputs = processor(text=text, images=test_image, return_tensors="pt").to(base_model.device)
for k, v in inputs.items():
    print(k, v.shape)

# Autoregressively generate.
# This samples (do_sample=True, temperature=0.7) with a repetition penalty, so
# the output varies between runs; it is NOT greedy decoding. For other decoding
# strategies see https://huggingface.co/blog/how-to-generate
generated_ids = base_model.generate(
    **inputs,
    max_new_tokens=MAX_LENGTH,
    temperature=0.7,
    repetition_penalty=2.0,
    do_sample=True,
)

# generate() returns the prompt (image tokens + text prompt) followed by the
# new tokens. Strip the prompt using the real length of the encoded input
# rather than re-deriving it from token counts, which depends on how the
# tokenizer and processor add special tokens.
num_prompt_tokens = inputs["input_ids"].shape[-1]

# Turn the newly generated token IDs back into a string.
generated_text = processor.batch_decode(
    generated_ids[:, num_prompt_tokens:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
print(generated_text)