{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# IMDB sentiment classification with a bidirectional LSTM\n",
    "\n",
    "Trains a small Embedding + stacked BiLSTM classifier on the `aclImdb` review folders and tries it on a few hand-written reviews.\n",
    "\n",
    "**Reviewer notes**\n",
    "\n",
    "- Outputs were cleared because the code changed; use *Restart Kernel -> Run All* to regenerate them.\n",
    "- The previously saved run validated on the test set and logged `val_accuracy` between 0.80 and 0.83 while `val_loss` rose from 0.46 to 0.90 over 5 epochs (overfitting). Its first logged epoch already showed 0.94 training accuracy, so that log likely came from re-running the fit cell on an already-trained model.\n",
    "- Validation now uses a split held out from the training data; the test set is only used for the final evaluation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a35c83a-1f5d-4e90-9756-020cfa9e7bb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc08016d-f3f3-4404-9df4-751bc3a0ffd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths and reproducibility settings.\n",
    "# Set the IMDB_DATA_DIR environment variable to point at an extracted aclImdb\n",
    "# folder; the fallback is the path this notebook was originally written against.\n",
    "DATA_DIR = Path(os.environ.get(\n",
    "    \"IMDB_DATA_DIR\",\n",
    "    \"C:/Users/IT/Desktop/new1/new/NLP Course/SCS/7th/aclImdb\",\n",
    "))\n",
    "train_dir = DATA_DIR / \"train\"\n",
    "test_dir = DATA_DIR / \"test\"\n",
    "\n",
    "SEED = 42\n",
    "val_fraction = 0.2  # share of the training reviews held out for validation\n",
    "\n",
    "# Seeds Python's random module, NumPy and TensorFlow in one call.\n",
    "tf.keras.utils.set_random_seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-data",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a384f085-6b3a-4b14-9587-a94e823333f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_imdb_data(data_dir):\n",
    "    \"\"\"Read the reviews stored under ``data_dir/neg`` and ``data_dir/pos``.\n",
    "\n",
    "    Only ``*.txt`` files are read (as UTF-8). File names are sorted so the\n",
    "    returned order does not depend on the OS directory listing order.\n",
    "\n",
    "    Returns:\n",
    "        (texts, labels): two parallel lists; a label is 0 for a review from\n",
    "        ``neg`` and 1 for a review from ``pos``.\n",
    "    \"\"\"\n",
    "    texts = []\n",
    "    labels = []\n",
    "    for label_type in ['neg', 'pos']:\n",
    "        dir_name = os.path.join(data_dir, label_type)\n",
    "        for fname in sorted(os.listdir(dir_name)):\n",
    "            if fname.endswith('.txt'):\n",
    "                with open(os.path.join(dir_name, fname), encoding='utf-8') as f:\n",
    "                    texts.append(f.read())\n",
    "                labels.append(0 if label_type == 'neg' else 1)\n",
    "    return texts, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e22e588-69b7-4664-8968-60ec1e5682b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_text, train_labels = load_imdb_data(train_dir)\n",
    "assert train_text, f\"no .txt reviews found under {train_dir}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f07cd818-7d5f-413d-961b-dd3a0b0a48d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_text, test_labels = load_imdb_data(test_dir)\n",
    "assert test_text, f\"no .txt reviews found under {test_dir}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62bbcb36-f27e-4cd3-a96d-3301fefcc425",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the label lists to NumPy arrays before handing them to Keras.\n",
    "train_labels = np.array(train_labels)\n",
    "test_labels = np.array(test_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "split-train-val",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out part of the training reviews for validation so that the test set\n",
    "# is used exactly once, for the final evaluation.\n",
    "fit_text, val_text, fit_labels, val_labels = train_test_split(\n",
    "    train_text,\n",
    "    train_labels,\n",
    "    test_size=val_fraction,\n",
    "    stratify=train_labels,\n",
    "    random_state=SEED,\n",
    ")\n",
    "assert len(fit_text) + len(val_text) == len(train_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-tokenize",
   "metadata": {},
   "source": [
    "## Tokenize and pad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12400486-a107-4744-a25c-b361071bcca7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenization / padding / model-size hyperparameters.\n",
    "vocab_size = 10000    # tokenizer num_words and Embedding input size\n",
    "max_length = 120      # every review is padded / truncated to this many tokens\n",
    "trunc_type = 'post'   # drop tokens from the end of reviews that are too long\n",
    "# NOTE(review): the saved notebook had an empty string here, presumably a\n",
    "# \"<OOV>\" literal lost during export -- confirm.\n",
    "oov_tok = \"<OOV>\"\n",
    "embedding_dim = 64\n",
    "epochs = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f8f0ec3-ed6a-4a7b-b28c-6cab021e7405",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the vocabulary on the training portion only.\n",
    "tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)\n",
    "tokenizer.fit_on_texts(fit_text)\n",
    "word_index = tokenizer.word_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1cd8313-f20a-4e48-8cac-686198b69c0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def encode_texts(texts):\n",
    "    \"\"\"Turn raw strings into padded token-id sequences of length ``max_length``.\n",
    "\n",
    "    Every split goes through this one function so that training, validation,\n",
    "    test and ad-hoc inputs are all truncated the same way (``trunc_type``).\n",
    "    Previously the test set was padded without ``truncating=``, so it fell\n",
    "    back to the pad_sequences default and was cut differently from training.\n",
    "    \"\"\"\n",
    "    sequences = tokenizer.texts_to_sequences(texts)\n",
    "    return pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6589dc83-f541-42fc-8c9d-996c0e0c1566",
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_padded = encode_texts(fit_text)\n",
    "val_padded = encode_texts(val_text)\n",
    "test_padded = encode_texts(test_text)\n",
    "assert fit_padded.shape[1] == val_padded.shape[1] == test_padded.shape[1] == max_length"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-model",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "442fcc01-84d2-400e-a7e8-37eb37120058",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The explicit Input layer replaces the deprecated `input_length` argument of\n",
    "# Embedding, and the embedding size is tied to `vocab_size` instead of a\n",
    "# second hard-coded 10000.\n",
    "model = Sequential([\n",
    "    tf.keras.Input(shape=(max_length,)),\n",
    "    Embedding(vocab_size, embedding_dim),\n",
    "    Bidirectional(LSTM(64, return_sequences=True)),\n",
    "    Bidirectional(LSTM(32)),\n",
    "    Dense(24, activation='relu'),\n",
    "    Dense(1, activation='sigmoid'),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28309aa2-1e05-447a-97ac-1a9a42e0d607",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab1fa4c2-b6c6-4f3e-ab3f-7d8085c0b3cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop once validation loss stops improving and keep the best weights; the\n",
    "# earlier saved run showed validation loss climbing after the first epoch.\n",
    "early_stopping = tf.keras.callbacks.EarlyStopping(\n",
    "    monitor='val_loss', patience=2, restore_best_weights=True\n",
    ")\n",
    "history = model.fit(\n",
    "    fit_padded,\n",
    "    fit_labels,\n",
    "    epochs=epochs,\n",
    "    validation_data=(val_padded, val_labels),\n",
    "    callbacks=[early_stopping],\n",
    "    verbose=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-evaluate",
   "metadata": {},
   "source": [
    "## Evaluate on the held-out test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0207da33-57f2-4ee4-9602-f797767d687e",
   "metadata": {},
   "outputs": [],
   "source": [
    "loss, accuracy = model.evaluate(test_padded, test_labels)\n",
    "print(f'Loss:{loss}, Accuracy:{accuracy}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-predict",
   "metadata": {},
   "source": [
    "## Try the model on hand-written reviews"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f9a99ad-6466-4257-82e7-5c4c5af9eb3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_reviews = [\n",
    "    \"The film was fantastic! I really enjoyed the plot and the acting was superb\",\n",
    "    \"the film was so bad i do not recommanded to anyone\",\n",
    "    \"That movie was bad because it is so long and the actor does not good\",\n",
    "    \"That film was amazing but I don't love the cast\",\n",
    "    \"best film i think you should watched it\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd8a6d93-0907-4513-acdc-15b41e82b08d",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_padded_sequences = encode_texts(new_reviews)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "397acdb7-2531-4587-b859-ee4e7021bdd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = model.predict(new_padded_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbd55568-f76f-4f2b-842f-b0c2d494363b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The sigmoid output is thresholded at 0.5 to get a 0/1 label per review.\n",
    "predicted_labels = (predictions > 0.5).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "707e9fe2-86fa-49c1-b757-695ef9255620",
   "metadata": {},
   "outputs": [],
   "source": [
    "label_map = {0: 'Negative', 1: \"Positive\"}\n",
    "# `predicted_labels` has one row per review with a single column.\n",
    "predicted_sentiments = [label_map[label[0]] for label in predicted_labels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8b058e4-883e-4cfa-b0f9-7c032a67a7fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "for review, sentiment in zip(new_reviews, predicted_sentiments):\n",
    "    print(f' Review: \"{review}\" - Sentiment : {sentiment}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}