In [28]:
# Import necessary modules
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
from google.colab import files
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sentence_transformers import SentenceTransformer, util
import torch
from PIL import Image
from torchvision import transforms
import requests

# Load models
# 1. GPT-2 for text generation
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Step 1: Upload images to Colab
uploaded_files = files.upload() # Allows you to upload files directly from your computer

# Dataset: Example image-text pairs with local file paths
data = [
 {"text": "A dog playing in the park", "image_path": list(uploaded_files.keys())[0]}, # Replace with your image name
 {"text": "A beautiful sunset over the mountains", "image_path": list(uploaded_files.keys())[1]} # Replace with your image name
]

# Step 2: Preprocess the data
text_embeddings = []
image_embeddings = []
for item in data:
 # Encode text
 inputs_text = clip_processor(text=[item["text"]], return_tensors="pt", padding=True)
 text_emb = clip_model.get_text_features(**inputs_text)
 text_embeddings.append(text_emb)

 # Encode image
 image = Image.open(item["image_path"])
 inputs_image = clip_processor(images=image, return_tensors="pt")
 image_emb = clip_model.get_image_features(**inputs_image)
 image_embeddings.append(image_emb)

# Step 3: Query the system
def multimodal_query(query, modality="text"):
 if modality == "text":
 inputs_query = clip_processor(text=[query], return_tensors="pt", padding=True)
 query_embedding = clip_model.get_text_features(**inputs_query)
 similarities = [torch.cosine_similarity(query_embedding, img_emb, dim=1).item() for img_emb in image_embeddings]
 elif modality == "image":
 image = Image.open(query)
 inputs_query = clip_processor(images=image, return_tensors="pt")
 query_embedding = clip_model.get_image_features(**inputs_query)
 similarities = [torch.cosine_similarity(query_embedding, text_emb, dim=1).item() for text_emb in text_embeddings]

 # Find the most similar item
 most_similar_idx = torch.argmax(torch.tensor(similarities)).item()
 return data[most_similar_idx]

# Step 4: Test the system
query_text = "A dog in a park"
response = multimodal_query(query_text, modality="text")
print("Retrieved Image Path:", response["image_path"])

# Generate a response using GPT-2 based on the retrieved text
input_ids = tokenizer.encode(response["text"], return_tensors="pt")
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
print("Generated Response:", tokenizer.decode(output[0], skip_special_tokens=True))


Saving dog.PNG to dog (1).PNG
Saving mount.PNG to mount (1).PNG


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Retrieved Image Path: dog (1).PNG
Generated Response: A dog playing in the park.

"I'm not sure if it's a dog or a cat," said the woman, who asked not to be identified. "I'm not sure if it's a dog or a cat."




CLIPProcessor: Prepares input data (text and images) for the CLIPModel.
CLIPModel: A model from Hugging Face that generates embeddings for text and images, enabling comparison between the two modalities.
PIL.Image: A library for image handling.
torch: A deep learning library for processing tensors and performing operations like cosine similarity.
files.upload(): Allows users to upload files directly into Google Colab.
clip_model: Loads the pretrained CLIP model (openai/clip-vit-base-patch32).
clip_processor: Loads the processor to preprocess input text and images for the model.
Creates a dataset with image-text pairs:
text: A description of the image.
image_path: File path to the corresponding uploaded image.
clip_processor: Converts the text into a format suitable for the CLIP model.
return_tensors="pt": Returns a PyTorch tensor.
padding=True: Ensures consistent input size.
clip_model.get_text_features: Generates a text embedding.
text_embeddings.append: Stores the embedding in the text_embeddings list.
Image.open: Loads the image from the given file path.
clip_processor: Prepares the image for the CLIP model.
clip_model.get_image_features: Generates an image embedding.
image_embeddings.append: Stores the embedding in the image_embeddings list.

Accepts:
query: Input (text or image) to be searched.
modality: Specifies whether the query is "text" or "image".

clip_processor: Prepares the text query.
clip_model.get_text_features: Generates the text embedding for the query.
torch.cosine_similarity: Calculates cosine similarity between the query embedding and each image embedding.

Image.open: Loads the image query.
clip_processor: Prepares the image query.
clip_model.get_image_features: Generates the image embedding for the query.
torch.cosine_similarity: Calculates cosine similarity between the query embedding and each text embedding.
torch.argmax: Finds the index of the highest similarity score.
data[most_similar_idx]: Returns the most similar image-text pair from the dataset.
query_text: Input query in natural language.
multimodal_query: Retrieves the most similar image based on the query.
response["image_path"]: Prints the path of the retrieved image.

tokenizer.encode: Converts the retrieved text description into GPT-2 input tokens.
model.generate: Generates a continuation of the retrieved text.
tokenizer.decode: Converts the output tokens back into human-readable text.