{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9a35c83a-1f5d-4e90-9756-020cfa9e7bb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cc08016d-f3f3-4404-9df4-751bc3a0ffd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dir = \"C:/Users/IT/Desktop/new1/new/NLP Course/SCS/7th/aclImdb/train\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a384f085-6b3a-4b14-9587-a94e823333f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "#load data\n",
    "def load_imdb_data(data_dir):\n",
    "    texts = []\n",
    "    lables = []\n",
    "    for lable_type in ['neg','pos']:\n",
    "        dir_name = os.path.join(data_dir, lable_type)\n",
    "        for fname in os.listdir(dir_name):\n",
    "            if fname.endswith('.txt'):\n",
    "                with open(os.path.join(dir_name, fname) , encoding = 'utf-8') as f:\n",
    "                    texts.append(f.read())\n",
    "                lables.append(0 if lable_type == 'neg' else 1)\n",
    "    return texts, lables\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0e22e588-69b7-4664-8968-60ec1e5682b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_text, train_lables = load_imdb_data(train_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f07cd818-7d5f-413d-961b-dd3a0b0a48d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_dir = \"C:/Users/IT/Desktop/new1/new/NLP Course/SCS/7th/aclImdb/test\"\n",
    "test_text, test_lables = load_imdb_data(test_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "62bbcb36-f27e-4cd3-a96d-3301fefcc425",
   "metadata": {},
   "outputs": [],
   "source": [
    "#convert lables to numpy arrays\n",
    "train_lables = np.array(train_lables)\n",
    "test_lables = np.array(test_lables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "12400486-a107-4744-a25c-b361071bcca7",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Tokenize and padding\n",
    "vocab_size = 10000\n",
    "max_length = 120\n",
    "trunc_type = 'post'\n",
    "oov_tok =\"<OOV>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8f8f0ec3-ed6a-4a7b-b28c-6cab021e7405",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer(num_words= vocab_size , oov_token= oov_tok)\n",
    "tokenizer.fit_on_texts(train_text)\n",
    "word_index = tokenizer.word_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a1cd8313-f20a-4e48-8cac-686198b69c0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_sequences = tokenizer.texts_to_sequences(train_text)\n",
    "train_padded = pad_sequences(train_sequences, maxlen= max_length, truncating= trunc_type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6589dc83-f541-42fc-8c9d-996c0e0c1566",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_sequences = tokenizer.texts_to_sequences(test_text)\n",
    "test_padded = pad_sequences(test_sequences, maxlen= max_length)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "442fcc01-84d2-400e-a7e8-37eb37120058",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\IT\\anaconda3\\Lib\\site-packages\\keras\\src\\layers\\core\\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "model = Sequential([\n",
    "    Embedding(10000 , 64 , input_length = max_length),\n",
    "    Bidirectional(LSTM(64 , return_sequences= True)),\n",
    "    Bidirectional(LSTM(32)),\n",
    "    Dense(24 ,activation='relu'),\n",
    "    Dense(1 , activation= 'sigmoid')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "28309aa2-1e05-447a-97ac-1a9a42e0d607",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile( optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "ab1fa4c2-b6c6-4f3e-ab3f-7d8085c0b3cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n",
      "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m84s\u001b[0m 107ms/step - accuracy: 0.9400 - loss: 0.1678 - val_accuracy: 0.8268 - val_loss: 0.4572\n",
      "Epoch 2/5\n",
      "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m119s\u001b[0m 152ms/step - accuracy: 0.9687 - loss: 0.0975 - val_accuracy: 0.8302 - val_loss: 0.5228\n",
      "Epoch 3/5\n",
      "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m149s\u001b[0m 191ms/step - accuracy: 0.9822 - loss: 0.0547 - val_accuracy: 0.7991 - val_loss: 0.8286\n",
      "Epoch 4/5\n",
      "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m153s\u001b[0m 196ms/step - accuracy: 0.9829 - loss: 0.0523 - val_accuracy: 0.8151 - val_loss: 0.7855\n",
      "Epoch 5/5\n",
      "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m141s\u001b[0m 180ms/step - accuracy: 0.9911 - loss: 0.0280 - val_accuracy: 0.8152 - val_loss: 0.9029\n"
     ]
    }
   ],
   "source": [
    "history =model.fit(train_padded, train_lables, epochs = 5 , validation_data = (test_padded, test_lables), verbose = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0207da33-57f2-4ee4-9602-f797767d687e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m53s\u001b[0m 67ms/step - accuracy: 0.8553 - loss: 0.6779\n",
      "Loss:0.9028642773628235, Accuracy:0.8151999711990356\n"
     ]
    }
   ],
   "source": [
    "loss , accuracy = model.evaluate(test_padded, test_lables)\n",
    "print(f'Loss:{loss}, Accuracy:{accuracy}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "8f9a99ad-6466-4257-82e7-5c4c5af9eb3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#try the model on a random data\n",
    "new_reviews = [\n",
    "    \"The film was fantastic! I really enjoyed the plot and the acting was superb\",\n",
    "    \"the film was so bad i do not recommanded to anyone\",\n",
    "    \"That movie was bad because it is so long and the actor does not good\",\n",
    "    \"That film was amazing but I don't love the cast\",\n",
    "    \"best film i think you should watched it\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "fd8a6d93-0907-4513-acdc-15b41e82b08d",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_sequences = tokenizer.texts_to_sequences(new_reviews)\n",
    "new_padded_sequences = pad_sequences(new_sequences, maxlen= max_length, truncating= trunc_type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "397acdb7-2531-4587-b859-ee4e7021bdd2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2s/step\n"
     ]
    }
   ],
   "source": [
    "predictions = model.predict(new_padded_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "dbd55568-f76f-4f2b-842f-b0c2d494363b",
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted_lables= (predictions > 0.5).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "707e9fe2-86fa-49c1-b757-695ef9255620",
   "metadata": {},
   "outputs": [],
   "source": [
    "lable_map = {0 : 'Negative' , 1 : \"Positive\"}\n",
    "predicted_sentiments = [lable_map[lable[0]] for lable in predicted_lables]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "a8b058e4-883e-4cfa-b0f9-7c032a67a7fa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Review: \"The film was fantastic! I really enjoyed the plot and the acting was superb\" - Sentiment : Positive\n",
      " Review: \"the film was so bad i do not recommanded to anyone\" - Sentiment : Negative\n",
      " Review: \"That movie was bad because it is so long and the actor does not good\" - Sentiment : Negative\n",
      " Review: \"That film was amazing but I don't love the cast\" - Sentiment : Positive\n",
      " Review: \"best film i think you should watched it\" - Sentiment : Positive\n"
     ]
    }
   ],
   "source": [
    "for review, sentiment in zip(new_reviews,predicted_sentiments):\n",
    "    print(f' Review: \"{review}\" - Sentiment : {sentiment}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4273d5e-18c9-4a39-92e5-dea2bcd65495",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}