{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "source": [ "# Import necessary modules\n", "from transformers import CLIPProcessor, CLIPModel\n", "from PIL import Image\n", "import torch\n", "from google.colab import files\n", "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n", "from sentence_transformers import SentenceTransformer, util\n", "import torch\n", "from PIL import Image\n", "from torchvision import transforms\n", "import requests\n", "\n", "# Load models\n", "# 1. GPT-2 for text generation\n", "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n", "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n", "model.eval()\n", "\n", "# Load CLIP model and processor\n", "clip_model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n", "clip_processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n", "\n", "# Step 1: Upload images to Colab\n", "uploaded_files = files.upload() # Allows you to upload files directly from your computer\n", "\n", "# Dataset: Example image-text pairs with local file paths\n", "data = [\n", " {\"text\": \"A dog playing in the park\", \"image_path\": list(uploaded_files.keys())[0]}, # Replace with your image name\n", " {\"text\": \"A beautiful sunset over the mountains\", \"image_path\": list(uploaded_files.keys())[1]} # Replace with your image name\n", "]\n", "\n", "# Step 2: Preprocess the data\n", "text_embeddings = []\n", "image_embeddings = []\n", "for item in data:\n", " # Encode text\n", " inputs_text = clip_processor(text=[item[\"text\"]], return_tensors=\"pt\", padding=True)\n", " text_emb = clip_model.get_text_features(**inputs_text)\n", " text_embeddings.append(text_emb)\n", "\n", " # Encode image\n", " image = Image.open(item[\"image_path\"])\n", " inputs_image = clip_processor(images=image, return_tensors=\"pt\")\n", " image_emb = clip_model.get_image_features(**inputs_image)\n", " image_embeddings.append(image_emb)\n", "\n", "# Step 3: Query the system\n", "def multimodal_query(query, modality=\"text\"):\n", " if modality == \"text\":\n", " inputs_query = clip_processor(text=[query], return_tensors=\"pt\", padding=True)\n", " query_embedding = clip_model.get_text_features(**inputs_query)\n", " similarities = [torch.cosine_similarity(query_embedding, img_emb, dim=1).item() for img_emb in image_embeddings]\n", " elif modality == \"image\":\n", " image = Image.open(query)\n", " inputs_query = clip_processor(images=image, return_tensors=\"pt\")\n", " query_embedding = clip_model.get_image_features(**inputs_query)\n", " similarities = [torch.cosine_similarity(query_embedding, text_emb, dim=1).item() for text_emb in text_embeddings]\n", "\n", " # Find the most similar item\n", " most_similar_idx = torch.argmax(torch.tensor(similarities)).item()\n", " return data[most_similar_idx]\n", "\n", "# Step 4: Test the system\n", "query_text = \"A dog in a park\"\n", "response = multimodal_query(query_text, modality=\"text\")\n", "print(\"Retrieved Image Path:\", response[\"image_path\"])\n", "\n", "# Generate a response using GPT-2 based on the retrieved text\n", "input_ids = tokenizer.encode(response[\"text\"], return_tensors=\"pt\")\n", "output = model.generate(input_ids, max_length=50, num_return_sequences=1)\n", "print(\"Generated Response:\", tokenizer.decode(output[0], skip_special_tokens=True))\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 284 }, "id": "oSppUmtB6zLH", "outputId": "368ac067-5a78-479b-c19d-af80cb206643" }, "execution_count": 28, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving dog.PNG to dog (1).PNG\n", "Saving mount.PNG to mount (1).PNG\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n", "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Retrieved Image Path: dog (1).PNG\n", "Generated Response: A dog playing in the park.\n", "\n", "\"I'm not sure if it's a dog or a cat,\" said the woman, who asked not to be identified. \"I'm not sure if it's a dog or a cat.\"\n", "\n", "\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "BnIzHfVR7Ffg" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "CLIPProcessor: Prepares input data (text and images) for the CLIPModel.\n", "CLIPModel: A model from Hugging Face that generates embeddings for text and images, enabling comparison between the two modalities.\n", "PIL.Image: A library for image handling.\n", "torch: A deep learning library for processing tensors and performing operations like cosine similarity.\n", "files.upload(): Allows users to upload files directly into Google Colab.\n", "clip_model: Loads the pretrained CLIP model (openai/clip-vit-base-patch32).\n", "clip_processor: Loads the processor to preprocess input text and images for the model.\n", "Creates a dataset with image-text pairs:\n", "text: A description of the image.\n", "image_path: File path to the corresponding uploaded image.\n", "clip_processor: Converts the text into a format suitable for the CLIP model.\n", "return_tensors=\"pt\": Returns a PyTorch tensor.\n", "padding=True: Ensures consistent input size.\n", "clip_model.get_text_features: Generates a text embedding.\n", "text_embeddings.append: Stores the embedding in the text_embeddings list.\n", "Image.open: Loads the image from the given file path.\n", "clip_processor: Prepares the image for the CLIP model.\n", "clip_model.get_image_features: Generates an image embedding.\n", "image_embeddings.append: Stores the embedding in the image_embeddings list.\n", "\n", "Accepts:\n", "query: Input (text or image) to be searched.\n", "modality: Specifies whether the query is \"text\" or \"image\".\n", "\n", "clip_processor: Prepares the text query.\n", "clip_model.get_text_features: Generates the text embedding for the query.\n", "torch.cosine_similarity: Calculates cosine similarity between the query embedding and each image embedding.\n", "\n", "Image.open: Loads the image query.\n", "clip_processor: Prepares the image query.\n", "clip_model.get_image_features: Generates the image embedding for the query.\n", "torch.cosine_similarity: Calculates cosine similarity between the query embedding and each text embedding.\n", "torch.argmax: Finds the index of the highest similarity score.\n", "data[most_similar_idx]: Returns the most similar image-text pair from the dataset.\n", "query_text: Input query in natural language.\n", "multimodal_query: Retrieves the most similar image based on the query.\n", "response[\"image_path\"]: Prints the path of the retrieved image.\n", "\n", "tokenizer.encode: Converts the retrieved text description into GPT-2 input tokens.\n", "model.generate: Generates a continuation of the retrieved text.\n", "tokenizer.decode: Converts the output tokens back into human-readable text." ], "metadata": { "id": "lXONP6KM_z1G" } } ] }