In [1]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from sklearn.model_selection import train_test_split

In [2]:
train_dir = "C:/Users/IT/Desktop/new1/new/NLP Course/SCS/7th/aclImdb/train"


In [5]:
#load data
def load_imdb_data(data_dir):
 texts = []
 lables = []
 for lable_type in ['neg','pos']:
 dir_name = os.path.join(data_dir, lable_type)
 for fname in os.listdir(dir_name):
 if fname.endswith('.txt'):
 with open(os.path.join(dir_name, fname) , encoding = 'utf-8') as f:
 texts.append(f.read())
 lables.append(0 if lable_type == 'neg' else 1)
 return texts, lables
 

In [6]:
train_text, train_lables = load_imdb_data(train_dir)

In [7]:
test_dir = "C:/Users/IT/Desktop/new1/new/NLP Course/SCS/7th/aclImdb/test"
test_text, test_lables = load_imdb_data(test_dir)

In [8]:
#convert lables to numpy arrays
train_lables = np.array(train_lables)
test_lables = np.array(test_lables)

In [9]:
#Tokenize and padding
vocab_size = 10000
max_length = 120
trunc_type = 'post'
oov_tok =""

In [10]:
tokenizer = Tokenizer(num_words= vocab_size , oov_token= oov_tok)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index

In [11]:
train_sequences = tokenizer.texts_to_sequences(train_text)
train_padded = pad_sequences(train_sequences, maxlen= max_length, truncating= trunc_type)

In [12]:
test_sequences = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(test_sequences, maxlen= max_length)

In [13]:
model = Sequential([
 Embedding(10000 , 64 , input_length = max_length),
 Bidirectional(LSTM(64 , return_sequences= True)),
 Bidirectional(LSTM(32)),
 Dense(24 ,activation='relu'),
 Dense(1 , activation= 'sigmoid')])



In [14]:
model.compile( optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [16]:
history =model.fit(train_padded, train_lables, epochs = 5 , validation_data = (test_padded, test_lables), verbose = 1)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 107ms/step - accuracy: 0.9400 - loss: 0.1678 - val_accuracy: 0.8268 - val_loss: 0.4572
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 152ms/step - accuracy: 0.9687 - loss: 0.0975 - val_accuracy: 0.8302 - val_loss: 0.5228
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 191ms/step - accuracy: 0.9822 - loss: 0.0547 - val_accuracy: 0.7991 - val_loss: 0.8286
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 196ms/step - accuracy: 0.9829 - loss: 0.0523 - val_accuracy: 0.8151 - val_loss: 0.7855
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 180ms/step - accuracy: 0.9911 - loss: 0.0280 - val_accuracy: 0.8152 - val_loss: 0.9029


In [17]:
loss , accuracy = model.evaluate(test_padded, test_lables)
print(f'Loss:{loss}, Accuracy:{accuracy}')

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 67ms/step - accuracy: 0.8553 - loss: 0.6779
Loss:0.9028642773628235, Accuracy:0.8151999711990356


In [18]:
#try the model on a random data
new_reviews = [
 "The film was fantastic! I really enjoyed the plot and the acting was superb",
 "the film was so bad i do not recommanded to anyone",
 "That movie was bad because it is so long and the actor does not good",
 "That film was amazing but I don't love the cast",
 "best film i think you should watched it"
]

In [23]:
new_sequences = tokenizer.texts_to_sequences(new_reviews)
new_padded_sequences = pad_sequences(new_sequences, maxlen= max_length, truncating= trunc_type)

In [24]:
predictions = model.predict(new_padded_sequences)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [25]:
predicted_lables= (predictions > 0.5).astype(int)

In [26]:
lable_map = {0 : 'Negative' , 1 : "Positive"}
predicted_sentiments = [lable_map[lable[0]] for lable in predicted_lables]

In [27]:
for review, sentiment in zip(new_reviews,predicted_sentiments):
 print(f' Review: "{review}" - Sentiment : {sentiment}')

 Review: "The film was fantastic! I really enjoyed the plot and the acting was superb" - Sentiment : Positive
 Review: "the film was so bad i do not recommanded to anyone" - Sentiment : Negative
 Review: "That movie was bad because it is so long and the actor does not good" - Sentiment : Negative
 Review: "That film was amazing but I don't love the cast" - Sentiment : Positive
 Review: "best film i think you should watched it" - Sentiment : Positive
