{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# Import necessary modules\n",
        "from transformers import CLIPProcessor, CLIPModel\n",
        "from PIL import Image\n",
        "import torch\n",
        "from google.colab import files\n",
        "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
        "from sentence_transformers import SentenceTransformer, util\n",
        "import torch\n",
        "from PIL import Image\n",
        "from torchvision import transforms\n",
        "import requests\n",
        "\n",
        "# Load models\n",
        "# 1. GPT-2 for text generation\n",
        "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
        "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
        "model.eval()\n",
        "\n",
        "# Load CLIP model and processor\n",
        "clip_model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
        "clip_processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
        "\n",
        "# Step 1: Upload images to Colab\n",
        "uploaded_files = files.upload()  # Allows you to upload files directly from your computer\n",
        "\n",
        "# Dataset: Example image-text pairs with local file paths\n",
        "data = [\n",
        "    {\"text\": \"A dog playing in the park\", \"image_path\": list(uploaded_files.keys())[0]},  # Replace with your image name\n",
        "    {\"text\": \"A beautiful sunset over the mountains\", \"image_path\": list(uploaded_files.keys())[1]}  # Replace with your image name\n",
        "]\n",
        "\n",
        "# Step 2: Preprocess the data\n",
        "text_embeddings = []\n",
        "image_embeddings = []\n",
        "for item in data:\n",
        "    # Encode text\n",
        "    inputs_text = clip_processor(text=[item[\"text\"]], return_tensors=\"pt\", padding=True)\n",
        "    text_emb = clip_model.get_text_features(**inputs_text)\n",
        "    text_embeddings.append(text_emb)\n",
        "\n",
        "    # Encode image\n",
        "    image = Image.open(item[\"image_path\"])\n",
        "    inputs_image = clip_processor(images=image, return_tensors=\"pt\")\n",
        "    image_emb = clip_model.get_image_features(**inputs_image)\n",
        "    image_embeddings.append(image_emb)\n",
        "\n",
        "# Step 3: Query the system\n",
        "def multimodal_query(query, modality=\"text\"):\n",
        "    if modality == \"text\":\n",
        "        inputs_query = clip_processor(text=[query], return_tensors=\"pt\", padding=True)\n",
        "        query_embedding = clip_model.get_text_features(**inputs_query)\n",
        "        similarities = [torch.cosine_similarity(query_embedding, img_emb, dim=1).item() for img_emb in image_embeddings]\n",
        "    elif modality == \"image\":\n",
        "        image = Image.open(query)\n",
        "        inputs_query = clip_processor(images=image, return_tensors=\"pt\")\n",
        "        query_embedding = clip_model.get_image_features(**inputs_query)\n",
        "        similarities = [torch.cosine_similarity(query_embedding, text_emb, dim=1).item() for text_emb in text_embeddings]\n",
        "\n",
        "    # Find the most similar item\n",
        "    most_similar_idx = torch.argmax(torch.tensor(similarities)).item()\n",
        "    return data[most_similar_idx]\n",
        "\n",
        "# Step 4: Test the system\n",
        "query_text = \"A dog in a park\"\n",
        "response = multimodal_query(query_text, modality=\"text\")\n",
        "print(\"Retrieved Image Path:\", response[\"image_path\"])\n",
        "\n",
        "# Generate a response using GPT-2 based on the retrieved text\n",
        "input_ids = tokenizer.encode(response[\"text\"], return_tensors=\"pt\")\n",
        "output = model.generate(input_ids, max_length=50, num_return_sequences=1)\n",
        "print(\"Generated Response:\", tokenizer.decode(output[0], skip_special_tokens=True))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 284
        },
        "id": "oSppUmtB6zLH",
        "outputId": "368ac067-5a78-479b-c19d-af80cb206643"
      },
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "     <input type=\"file\" id=\"files-bfbcd63b-5911-46fd-8d9e-70247f191a22\" name=\"files[]\" multiple disabled\n",
              "        style=\"border:none\" />\n",
              "     <output id=\"result-bfbcd63b-5911-46fd-8d9e-70247f191a22\">\n",
              "      Upload widget is only available when the cell has been executed in the\n",
              "      current browser session. Please rerun this cell to enable.\n",
              "      </output>\n",
              "      <script>// Copyright 2017 Google LLC\n",
              "//\n",
              "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
              "// you may not use this file except in compliance with the License.\n",
              "// You may obtain a copy of the License at\n",
              "//\n",
              "//      http://www.apache.org/licenses/LICENSE-2.0\n",
              "//\n",
              "// Unless required by applicable law or agreed to in writing, software\n",
              "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
              "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
              "// See the License for the specific language governing permissions and\n",
              "// limitations under the License.\n",
              "\n",
              "/**\n",
              " * @fileoverview Helpers for google.colab Python module.\n",
              " */\n",
              "(function(scope) {\n",
              "function span(text, styleAttributes = {}) {\n",
              "  const element = document.createElement('span');\n",
              "  element.textContent = text;\n",
              "  for (const key of Object.keys(styleAttributes)) {\n",
              "    element.style[key] = styleAttributes[key];\n",
              "  }\n",
              "  return element;\n",
              "}\n",
              "\n",
              "// Max number of bytes which will be uploaded at a time.\n",
              "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
              "\n",
              "function _uploadFiles(inputId, outputId) {\n",
              "  const steps = uploadFilesStep(inputId, outputId);\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  // Cache steps on the outputElement to make it available for the next call\n",
              "  // to uploadFilesContinue from Python.\n",
              "  outputElement.steps = steps;\n",
              "\n",
              "  return _uploadFilesContinue(outputId);\n",
              "}\n",
              "\n",
              "// This is roughly an async generator (not supported in the browser yet),\n",
              "// where there are multiple asynchronous steps and the Python side is going\n",
              "// to poll for completion of each step.\n",
              "// This uses a Promise to block the python side on completion of each step,\n",
              "// then passes the result of the previous step as the input to the next step.\n",
              "function _uploadFilesContinue(outputId) {\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  const steps = outputElement.steps;\n",
              "\n",
              "  const next = steps.next(outputElement.lastPromiseValue);\n",
              "  return Promise.resolve(next.value.promise).then((value) => {\n",
              "    // Cache the last promise value to make it available to the next\n",
              "    // step of the generator.\n",
              "    outputElement.lastPromiseValue = value;\n",
              "    return next.value.response;\n",
              "  });\n",
              "}\n",
              "\n",
              "/**\n",
              " * Generator function which is called between each async step of the upload\n",
              " * process.\n",
              " * @param {string} inputId Element ID of the input file picker element.\n",
              " * @param {string} outputId Element ID of the output display.\n",
              " * @return {!Iterable<!Object>} Iterable of next steps.\n",
              " */\n",
              "function* uploadFilesStep(inputId, outputId) {\n",
              "  const inputElement = document.getElementById(inputId);\n",
              "  inputElement.disabled = false;\n",
              "\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  outputElement.innerHTML = '';\n",
              "\n",
              "  const pickedPromise = new Promise((resolve) => {\n",
              "    inputElement.addEventListener('change', (e) => {\n",
              "      resolve(e.target.files);\n",
              "    });\n",
              "  });\n",
              "\n",
              "  const cancel = document.createElement('button');\n",
              "  inputElement.parentElement.appendChild(cancel);\n",
              "  cancel.textContent = 'Cancel upload';\n",
              "  const cancelPromise = new Promise((resolve) => {\n",
              "    cancel.onclick = () => {\n",
              "      resolve(null);\n",
              "    };\n",
              "  });\n",
              "\n",
              "  // Wait for the user to pick the files.\n",
              "  const files = yield {\n",
              "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
              "    response: {\n",
              "      action: 'starting',\n",
              "    }\n",
              "  };\n",
              "\n",
              "  cancel.remove();\n",
              "\n",
              "  // Disable the input element since further picks are not allowed.\n",
              "  inputElement.disabled = true;\n",
              "\n",
              "  if (!files) {\n",
              "    return {\n",
              "      response: {\n",
              "        action: 'complete',\n",
              "      }\n",
              "    };\n",
              "  }\n",
              "\n",
              "  for (const file of files) {\n",
              "    const li = document.createElement('li');\n",
              "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
              "    li.append(span(\n",
              "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
              "        `last modified: ${\n",
              "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
              "                                    'n/a'} - `));\n",
              "    const percent = span('0% done');\n",
              "    li.appendChild(percent);\n",
              "\n",
              "    outputElement.appendChild(li);\n",
              "\n",
              "    const fileDataPromise = new Promise((resolve) => {\n",
              "      const reader = new FileReader();\n",
              "      reader.onload = (e) => {\n",
              "        resolve(e.target.result);\n",
              "      };\n",
              "      reader.readAsArrayBuffer(file);\n",
              "    });\n",
              "    // Wait for the data to be ready.\n",
              "    let fileData = yield {\n",
              "      promise: fileDataPromise,\n",
              "      response: {\n",
              "        action: 'continue',\n",
              "      }\n",
              "    };\n",
              "\n",
              "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
              "    let position = 0;\n",
              "    do {\n",
              "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
              "      const chunk = new Uint8Array(fileData, position, length);\n",
              "      position += length;\n",
              "\n",
              "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
              "      yield {\n",
              "        response: {\n",
              "          action: 'append',\n",
              "          file: file.name,\n",
              "          data: base64,\n",
              "        },\n",
              "      };\n",
              "\n",
              "      let percentDone = fileData.byteLength === 0 ?\n",
              "          100 :\n",
              "          Math.round((position / fileData.byteLength) * 100);\n",
              "      percent.textContent = `${percentDone}% done`;\n",
              "\n",
              "    } while (position < fileData.byteLength);\n",
              "  }\n",
              "\n",
              "  // All done.\n",
              "  yield {\n",
              "    response: {\n",
              "      action: 'complete',\n",
              "    }\n",
              "  };\n",
              "}\n",
              "\n",
              "scope.google = scope.google || {};\n",
              "scope.google.colab = scope.google.colab || {};\n",
              "scope.google.colab._files = {\n",
              "  _uploadFiles,\n",
              "  _uploadFilesContinue,\n",
              "};\n",
              "})(self);\n",
              "</script> "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saving dog.PNG to dog (1).PNG\n",
            "Saving mount.PNG to mount (1).PNG\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
            "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
            "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Retrieved Image Path: dog (1).PNG\n",
            "Generated Response: A dog playing in the park.\n",
            "\n",
            "\"I'm not sure if it's a dog or a cat,\" said the woman, who asked not to be identified. \"I'm not sure if it's a dog or a cat.\"\n",
            "\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "BnIzHfVR7Ffg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "CLIPProcessor: Prepares input data (text and images) for the CLIPModel.\n",
        "CLIPModel: A model from Hugging Face that generates embeddings for text and images, enabling comparison between the two modalities.\n",
        "PIL.Image: A library for image handling.\n",
        "torch: A deep learning library for processing tensors and performing operations like cosine similarity.\n",
        "files.upload(): Allows users to upload files directly into Google Colab.\n",
        "clip_model: Loads the pretrained CLIP model (openai/clip-vit-base-patch32).\n",
        "clip_processor: Loads the processor to preprocess input text and images for the model.\n",
        "Creates a dataset with image-text pairs:\n",
        "text: A description of the image.\n",
        "image_path: File path to the corresponding uploaded image.\n",
        "clip_processor: Converts the text into a format suitable for the CLIP model.\n",
        "return_tensors=\"pt\": Returns a PyTorch tensor.\n",
        "padding=True: Ensures consistent input size.\n",
        "clip_model.get_text_features: Generates a text embedding.\n",
        "text_embeddings.append: Stores the embedding in the text_embeddings list.\n",
        "Image.open: Loads the image from the given file path.\n",
        "clip_processor: Prepares the image for the CLIP model.\n",
        "clip_model.get_image_features: Generates an image embedding.\n",
        "image_embeddings.append: Stores the embedding in the image_embeddings list.\n",
        "\n",
        "Accepts:\n",
        "query: Input (text or image) to be searched.\n",
        "modality: Specifies whether the query is \"text\" or \"image\".\n",
        "\n",
        "clip_processor: Prepares the text query.\n",
        "clip_model.get_text_features: Generates the text embedding for the query.\n",
        "torch.cosine_similarity: Calculates cosine similarity between the query embedding and each image embedding.\n",
        "\n",
        "Image.open: Loads the image query.\n",
        "clip_processor: Prepares the image query.\n",
        "clip_model.get_image_features: Generates the image embedding for the query.\n",
        "torch.cosine_similarity: Calculates cosine similarity between the query embedding and each text embedding.\n",
        "torch.argmax: Finds the index of the highest similarity score.\n",
        "data[most_similar_idx]: Returns the most similar image-text pair from the dataset.\n",
        "query_text: Input query in natural language.\n",
        "multimodal_query: Retrieves the most similar image based on the query.\n",
        "response[\"image_path\"]: Prints the path of the retrieved image.\n",
        "\n",
        "tokenizer.encode: Converts the retrieved text description into GPT-2 input tokens.\n",
        "model.generate: Generates a continuation of the retrieved text.\n",
        "tokenizer.decode: Converts the output tokens back into human-readable text."
      ],
      "metadata": {
        "id": "lXONP6KM_z1G"
      }
    }
  ]
}