import nltk
import numpy as np
import pickle
import re
import tensorflow as tf
from nltk.corpus import twitter_samples, stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Download required NLTK resources
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('wordnet')

# Load dataset
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
tweets = positive_tweets + negative_tweets
labels = np.array([1] * len(positive_tweets) + [0] * len(negative_tweets))
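
# twitter_samples ships 5,000 positive and 5,000 negative tweets,
# so the classes are balanced (10,000 examples in total).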

# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", '', text)  # remove URLs (http, https, www)
    text = re.sub(r"@\w+|#", '', text)  # remove @mentions; strip '#' but keep the hashtag text
    text = re.sub(r"[^\w\s]", '', text)  # remove punctuation
    text = re.sub(r"\d+", '', text)  # remove digits
    text = text.strip()
    return text

# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = clean_text(text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

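# Example: preprocess("@user I LOVE this! :)") -> "love"
# (the mention, punctuation, stopwords, and 1-2 character tokens are dropped)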
processed_tweets = [preprocess(tweet) for tweet in tweets]

# Tokenization
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(processed_tweets)
sequences = tokenizer.texts_to_sequences(processed_tweets)
padded_sequences = pad_sequences(sequences, maxlen=50, padding='post', truncating='post')
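
# padded_sequences has shape (num_tweets, 50): shorter tweets are
# zero-padded at the end, longer ones truncated.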

# Train/test split
# Stratifying keeps the 50/50 class balance identical in both splits
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42, stratify=labels
)

# LSTM Model
model = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=50),  # note: input_length was removed in Keras 3; drop it there
    LSTM(64, return_sequences=False),  # keep only the final hidden state
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # probability that the tweet is positive
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
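
# The Embedding layer dominates the parameter count (10,000 x 64 = 640k weights).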

# Training
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
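
# Accuracy alone can hide per-class differences; a classification report
# is a quick sanity check (a sketch using scikit-learn, already a dependency here).
from sklearn.metrics import classification_report
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred, target_names=["negative", "positive"]))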

# Save the model
model.save("twitter_lstm_sentiment_model.h5")  # legacy HDF5 format; newer Keras prefers a .keras path
print("Model saved successfully!")

# Save the tokenizer; inference must reuse the exact same word index
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Tokenizer saved!")


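# --- Inference script: reload the saved model and tokenizer ---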
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load model
model = load_model("twitter_lstm_sentiment_model.h5")

# Load tokenizer
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Load NLTK resources (if needed)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize on newer NLTK releases
nltk.download('wordnet')

# Define preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", '', text)  # remove URLs
    text = re.sub(r"@\w+|#", '', text)  # remove @mentions; strip '#' but keep the hashtag text
    text = re.sub(r"[^\w\s]", '', text)
    text = re.sub(r"\d+", '', text)
    text = text.strip()
    return text

def preprocess(text):
    text = clean_text(text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

def predict_sentiment(text):
    processed = preprocess(text)
    seq = tokenizer.texts_to_sequences([processed])
    padded = pad_sequences(seq, maxlen=50, padding='post', truncating='post')
    pred = model.predict(padded, verbose=0)[0][0]  # probability of the positive class
    sentiment = "Positive 😊" if pred > 0.5 else "Negative 😞"
    print(f"\nText: {text}\n→ Prediction: {sentiment} (Confidence: {pred:.2f})")

# Test with your own sentence
predict_sentiment("I live in a rainy city and I don't love that.")
predict_sentiment("I'm so grateful for this opportunity and happy for my friends.")
predict_sentiment("I'm tired and everything seems pointless.")
