{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7716dc7d-1410-4318-a9f4-eb5625a0d1c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import numpy as np\n",
    "import re\n",
    "import string\n",
    "import tensorflow as tf\n",
    "from nltk.corpus import twitter_samples, stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "97033f51-2d34-4264-9874-fa819c0663ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download required NLTK resources\n",
    "#nltk.download('twitter_samples')\n",
    "#nltk.download('stopwords') and , or with because\n",
    "#nltk.download('punkt') \n",
    "#nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "81dd457d-429c-4489-9866-f1d7485618fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load dataset\n",
    "positive_tweets = twitter_samples.strings('positive_tweets.json')\n",
    "negative_tweets = twitter_samples.strings('negative_tweets.json')\n",
    "tweets = positive_tweets + negative_tweets # \"\" \"\" , \"            \"\n",
    "labels = [1]*len(positive_tweets) + [0]*len(negative_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "66783c82-ec3d-440c-982d-b0ae66af2731",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text cleaning function\n",
    "def clean_text(text):\n",
    "    text = text.lower()\n",
    "    text = re.sub(r\"http\\S+|www\\S+|https\\S+\", '', text)  # remove URLs\n",
    "    text = re.sub(r\"@\\w+|#\", '', text)  # remove mentions and hashtags\n",
    "    text = re.sub(r\"[^\\w\\s]\", '', text)  # remove punctuation\n",
    "    text = re.sub(r\"\\d+\", '', text)  # remove digits\n",
    "    text = text.strip() # []\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2dd60a52-5a75-4fe3-80e1-1cf275157a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing\n",
    "stop_words = set(stopwords.words('english'))\n",
    "lemmatizer = WordNetLemmatizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5d75c4e4-8c6e-4258-8cba-35084af55643",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(text):\n",
    "    text = clean_text(text)\n",
    "    tokens = nltk.word_tokenize(text)\n",
    "    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]\n",
    "    return \" \".join(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1bd871e9-28bd-4689-90d4-548fa6d4cd24",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "processed_tweets = [preprocess(tweet) for tweet in tweets]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "38c934f0-3e8d-4502-9e4d-d189829c0b6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenization\n",
    "tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')\n",
    "tokenizer.fit_on_texts(processed_tweets)\n",
    "sequences = tokenizer.texts_to_sequences(processed_tweets)\n",
    "padded_sequences = pad_sequences(sequences, maxlen=50, padding='post', truncating='post')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "15dbf0b9-8d6e-46cd-8573-9cf08d43c783",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train/test split\n",
    "X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9bf18c51-2867-424b-8420-93f43a0dec1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.layers import Bidirectional\n",
    "\n",
    "# LSTM Model\n",
    "model = Sequential([\n",
    "    Embedding(input_dim=10000, output_dim=128, input_length=50),\n",
    "    Bidirectional(LSTM(64, return_sequences=False)),\n",
    "    Dropout(0.5),\n",
    "    Dense(64, activation='relu'),\n",
    "    Dropout(0.5),\n",
    "    Dense(1, activation='sigmoid')\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d9812029-4384-48e9-8f10-84322bedc5a3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"sequential_1\"</span>\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1mModel: \"sequential_1\"\u001b[0m\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n",
       "┃<span style=\"font-weight: bold\"> Layer (type)                         </span>┃<span style=\"font-weight: bold\"> Output Shape                </span>┃<span style=\"font-weight: bold\">         Param # </span>┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n",
       "│ embedding_1 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Embedding</span>)              │ ?                           │     <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ bidirectional (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Bidirectional</span>)        │ ?                           │     <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dropout_2 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dropout</span>)                  │ ?                           │               <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dense_2 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)                      │ ?                           │     <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dropout_3 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dropout</span>)                  │ ?                           │               <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dense_3 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)                      │ ?                           │     <span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (unbuilt) │\n",
       "└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘\n",
       "</pre>\n"
      ],
      "text/plain": [
       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n",
       "┃\u001b[1m \u001b[0m\u001b[1mLayer (type)                        \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape               \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m        Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n",
       "│ embedding_1 (\u001b[38;5;33mEmbedding\u001b[0m)              │ ?                           │     \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ bidirectional (\u001b[38;5;33mBidirectional\u001b[0m)        │ ?                           │     \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dropout_2 (\u001b[38;5;33mDropout\u001b[0m)                  │ ?                           │               \u001b[38;5;34m0\u001b[0m │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dense_2 (\u001b[38;5;33mDense\u001b[0m)                      │ ?                           │     \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dropout_3 (\u001b[38;5;33mDropout\u001b[0m)                  │ ?                           │               \u001b[38;5;34m0\u001b[0m │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dense_3 (\u001b[38;5;33mDense\u001b[0m)                      │ ?                           │     \u001b[38;5;34m0\u001b[0m (unbuilt) │\n",
       "└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">0</span> (0.00 B)\n",
       "</pre>\n"
      ],
      "text/plain": [
       "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from tensorflow.keras.optimizers import Adam\n",
    "model.compile(\n",
    "    loss='binary_crossentropy',\n",
    "    optimizer=Adam(learning_rate=0.0005),  # 🔧 slower learning\n",
    "    metrics=['accuracy']\n",
    ")\n",
    "\n",
    "model.summary()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "0ba59eb9-37d3-4dc2-a2da-4436f770db88",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 21ms/step - accuracy: 0.5347 - loss: 0.6821 - val_accuracy: 0.7425 - val_loss: 0.4950\n",
      "Epoch 2/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - accuracy: 0.8012 - loss: 0.4275 - val_accuracy: 0.7812 - val_loss: 0.4486\n",
      "Epoch 3/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - accuracy: 0.8964 - loss: 0.2804 - val_accuracy: 0.7738 - val_loss: 0.4933\n",
      "Epoch 4/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - accuracy: 0.9268 - loss: 0.1946 - val_accuracy: 0.7688 - val_loss: 0.5735\n",
      "Epoch 5/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 18ms/step - accuracy: 0.9415 - loss: 0.1656 - val_accuracy: 0.7462 - val_loss: 0.6487\n"
     ]
    }
   ],
   "source": [
    "from tensorflow.keras.callbacks import EarlyStopping\n",
    "\n",
    "early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n",
    "\n",
    "history = model.fit(\n",
    "    np.array(X_train), np.array(y_train),\n",
    "    epochs=20,\n",
    "    batch_size=32,\n",
    "    validation_split=0.1,\n",
    "    callbacks=[early_stop]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "591e4b22-b13b-49e7-87c7-4da01b58ec5f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m63/63\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.7384 - loss: 0.4997\n",
      "Test Accuracy: 75.90%\n"
     ]
    }
   ],
   "source": [
    "# Evaluation\n",
    "loss, accuracy = model.evaluate(np.array(X_test), np.array(y_test))\n",
    "print(f\"Test Accuracy: {accuracy * 100:.2f}%\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "e33b5325-b727-4a88-8d84-9e32e68c2cc9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model saved successfully!\n"
     ]
    }
   ],
   "source": [
    "# Save the model\n",
    "model.save(\"twitter_lstm_sentiment_model.h5\")\n",
    "print(\"Model saved successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "34a281de-e921-401a-8367-094474fcaf3c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokenizer saved!\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "# Save tokenizer\n",
    "with open(\"tokenizer.pkl\", \"wb\") as f:\n",
    "    pickle.dump(tokenizer, f)\n",
    "\n",
    "print(\"Tokenizer saved!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d977ac13-9ab8-4e78-bdd7-284a294932b8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}