# Install required libraries (the `!` shell escapes only work in a Colab/IPython notebook cell)
!pip install transformers
!pip install torch torchvision
!pip install pillow

from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
from google.colab import files

# Name of the pretrained BLIP captioning checkpoint shared by processor and model
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Pick the GPU when one is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Load the preprocessing pipeline and the captioning model, placing the
# model on the selected device right away
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(device)

def generate_caption(image_path):
    """Generate a text caption for a single image.

    Uses the module-level ``processor``, ``model`` and ``device`` objects.

    Args:
        image_path: Location of the image file to caption; passed straight
            to ``PIL.Image.open``.

    Returns:
        The decoded caption string with special tokens stripped.
    """
    # Open inside a context manager so the underlying file handle is released
    # deterministically; convert() yields an in-memory RGB image that stays
    # usable after the source file has been closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    # Preprocess the image into tensors and move them to the model's device
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Inference only: no gradients are needed for generation
    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Only one image was supplied, so decode the first generated sequence
    return processor.decode(outputs[0], skip_special_tokens=True)

# Upload an image through the Colab file picker
uploaded = files.upload()

if not uploaded:
    # The picker was cancelled or returned no files; indexing the first key
    # would otherwise fail with an opaque IndexError.
    print("No image was uploaded; nothing to caption.")
else:
    # Caption the first uploaded file (any additional uploads are ignored)
    image_path = next(iter(uploaded))
    caption = generate_caption(image_path)

    print(f"Generated caption: {caption}")
