{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7716dc7d-1410-4318-a9f4-eb5625a0d1c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import numpy as np\n",
    "import re\n",
    "import string\n",
    "import tensorflow as tf\n",
    "from nltk.corpus import twitter_samples, stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Bidirectional, Dense, Dropout, Embedding, Input, LSTM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "97033f51-2d34-4264-9874-fa819c0663ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make sure the NLTK resources used below are installed, so the notebook\n",
    "# also runs from a fresh environment. nltk.download() leaves packages that\n",
    "# are already up to date untouched.\n",
    "# NOTE(review): 'punkt_tab' is what newer NLTK releases load for\n",
    "# word_tokenize - confirm against the installed NLTK version.\n",
    "for resource in ('twitter_samples', 'stopwords', 'punkt', 'punkt_tab', 'wordnet'):\n",
    "    nltk.download(resource, quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "81dd457d-429c-4489-9866-f1d7485618fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load dataset: positive tweets are labelled 1, negative tweets 0\n",
    "positive_tweets = twitter_samples.strings('positive_tweets.json')\n",
    "negative_tweets = twitter_samples.strings('negative_tweets.json')\n",
    "tweets = positive_tweets + negative_tweets\n",
    "labels = [1] * len(positive_tweets) + [0] * len(negative_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "66783c82-ec3d-440c-982d-b0ae66af2731",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text cleaning function\n",
    "# Patterns are compiled once because clean_text() runs on every tweet.\n",
    "URL_RE = re.compile(r\"http\\S+|www\\S+\")  # 'http\\S+' also matches https links\n",
    "MENTION_OR_HASH_RE = re.compile(r\"@\\w+|#\")  # whole @mentions, but only the '#' of a hashtag\n",
    "NON_WORD_RE = re.compile(r\"[^\\w\\s]\")  # punctuation and other symbols\n",
    "DIGITS_RE = re.compile(r\"\\d+\")\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Lower-case a tweet and strip URLs, @mentions, '#', punctuation and digits.\n",
    "\n",
    "    Hashtag words are kept; only the '#' character is removed. Leading and\n",
    "    trailing whitespace is trimmed, inner whitespace is left as it is.\n",
    "    \"\"\"\n",
    "    text = text.lower()\n",
    "    # Order matters: URLs and mentions must go before punctuation is stripped\n",
    "    for pattern in (URL_RE, MENTION_OR_HASH_RE, NON_WORD_RE, DIGITS_RE):\n",
    "        text = pattern.sub('', text)\n",
    "    return text.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2dd60a52-5a75-4fe3-80e1-1cf275157a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing resources shared by preprocess()\n",
    "stop_words = set(stopwords.words('english'))\n",
    "lemmatizer = WordNetLemmatizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5d75c4e4-8c6e-4258-8cba-35084af55643",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(text):\n",
    "    \"\"\"Turn a raw tweet into a space-separated string of lemmatized tokens.\n",
    "\n",
    "    The tweet is cleaned with clean_text() and tokenized; stop words and\n",
    "    tokens of one or two characters are dropped before lemmatization.\n",
    "    \"\"\"\n",
    "    tokens = nltk.word_tokenize(clean_text(text))\n",
    "    kept = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]\n",
    "    return \" \".join(kept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1bd871e9-28bd-4689-90d4-548fa6d4cd24",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "processed_tweets = [preprocess(tweet) for tweet in tweets]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "38c934f0-3e8d-4502-9e4d-d189829c0b6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenization\n",
    "VOCAB_SIZE = 10000  # tokenizer num_words; reused as the Embedding input_dim\n",
    "MAX_LEN = 50  # every tweet is padded/truncated to this many tokens\n",
    "\n",
    "# Use a visible placeholder for out-of-vocabulary words instead of an empty string\n",
    "tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')\n",
    "tokenizer.fit_on_texts(processed_tweets)\n",
    "sequences = tokenizer.texts_to_sequences(processed_tweets)\n",
    "padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "15dbf0b9-8d6e-46cd-8573-9cf08d43c783",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train/test split\n",
    "SEED = 42  # shared by the data split and the model's random initialisation\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    padded_sequences, labels, test_size=0.2, random_state=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9bf18c51-2867-424b-8420-93f43a0dec1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed Python, NumPy and TensorFlow so every run starts from the same random state\n",
    "tf.keras.utils.set_random_seed(SEED)\n",
    "\n",
    "# LSTM Model\n",
    "# The explicit Input gives the model a known input shape, so it is built\n",
    "# immediately and model.summary() can report output shapes and parameter counts.\n",
    "model = Sequential([\n",
    "    Input(shape=(MAX_LEN,)),\n",
    "    Embedding(input_dim=VOCAB_SIZE, output_dim=128),\n",
    "    Bidirectional(LSTM(64, return_sequences=False)),\n",
    "    Dropout(0.5),\n",
    "    Dense(64, activation='relu'),\n",
    "    Dropout(0.5),\n",
    "    Dense(1, activation='sigmoid')\n",
    "])"
   ]
  },
  {
"cell_type": "code", "execution_count": 21, "id": "d9812029-4384-48e9-8f10-84322bedc5a3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Model: \"sequential_1\"\n",
       "
\n" ], "text/plain": [ "\u001b[1mModel: \"sequential_1\"\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n",
       "┃ Layer (type)                          Output Shape                         Param # ┃\n",
       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n",
       "│ embedding_1 (Embedding)              │ ?                           │     0 (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ bidirectional (Bidirectional)        │ ?                           │     0 (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dropout_2 (Dropout)                  │ ?                           │               0 │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dense_2 (Dense)                      │ ?                           │     0 (unbuilt) │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dropout_3 (Dropout)                  │ ?                           │               0 │\n",
       "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n",
       "│ dense_3 (Dense)                      │ ?                           │     0 (unbuilt) │\n",
       "└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩\n", "│ embedding_1 (\u001b[38;5;33mEmbedding\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n", "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n", "│ bidirectional (\u001b[38;5;33mBidirectional\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n", "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n", "│ dropout_2 (\u001b[38;5;33mDropout\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m │\n", "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n", "│ dense_2 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n", "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n", "│ dropout_3 (\u001b[38;5;33mDropout\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m │\n", "├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤\n", "│ dense_3 (\u001b[38;5;33mDense\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n", "└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Total params: 0 (0.00 B)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Trainable params: 0 (0.00 B)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
 Non-trainable params: 0 (0.00 B)\n",
       "
\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" } ],
   "source": [
    "from tensorflow.keras.optimizers import Adam\n",
    "\n",
    "# Binary classifier set-up; the learning rate is chosen for slower learning\n",
    "optimizer = Adam(learning_rate=0.0005)\n",
    "model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "model.summary()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "0ba59eb9-37d3-4dc2-a2da-4436f770db88",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 21ms/step - accuracy: 0.5347 - loss: 0.6821 - val_accuracy: 0.7425 - val_loss: 0.4950\n",
      "Epoch 2/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - accuracy: 0.8012 - loss: 0.4275 - val_accuracy: 0.7812 - val_loss: 0.4486\n",
      "Epoch 3/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - accuracy: 0.8964 - loss: 0.2804 - val_accuracy: 0.7738 - val_loss: 0.4933\n",
      "Epoch 4/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 19ms/step - accuracy: 0.9268 - loss: 0.1946 - val_accuracy: 0.7688 - val_loss: 0.5735\n",
      "Epoch 5/20\n",
      "\u001b[1m225/225\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 18ms/step - accuracy: 0.9415 - loss: 0.1656 - val_accuracy: 0.7462 - val_loss: 0.6487\n"
     ]
    }
   ],
   "source": [
    "from tensorflow.keras.callbacks import EarlyStopping\n",
    "\n",
    "# Stop when val_loss has not improved for 3 epochs and restore the best weights\n",
    "early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)\n",
    "\n",
    "# Keras expects arrays; y_train in particular is still a plain Python list\n",
    "train_features = np.array(X_train)\n",
    "train_targets = np.array(y_train)\n",
    "\n",
    "history = model.fit(\n",
    "    train_features,\n",
    "    train_targets,\n",
    "    validation_split=0.1,\n",
    "    batch_size=32,\n",
    "    epochs=20,\n",
    "    callbacks=[early_stop],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "591e4b22-b13b-49e7-87c7-4da01b58ec5f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m63/63\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 7ms/step - accuracy: 0.7384 - loss: 0.4997\n",
      "Test Accuracy: 75.90%\n"
     ]
    }
   ],
   "source": [
    "# Evaluation on the held-out test split\n",
    "test_features, test_targets = np.array(X_test), np.array(y_test)\n",
    "loss, accuracy = model.evaluate(test_features, test_targets)\n",
    "print(f\"Test Accuracy: {accuracy * 100:.2f}%\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "e33b5325-b727-4a88-8d84-9e32e68c2cc9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model saved successfully!\n"
     ]
    }
   ],
   "source": [
    "# Persist the trained model as an HDF5 file\n",
    "model_path = \"twitter_lstm_sentiment_model.h5\"\n",
    "model.save(model_path)\n",
    "print(\"Model saved successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "34a281de-e921-401a-8367-094474fcaf3c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokenizer saved!\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "# Persist the fitted tokenizer with pickle\n",
    "with open(\"tokenizer.pkl\", \"wb\") as tokenizer_file:\n",
    "    pickle.dump(tokenizer, tokenizer_file)\n",
    "\n",
    "print(\"Tokenizer saved!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d977ac13-9ab8-4e78-bdd7-284a294932b8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
"name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }