{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "5319a8a7-253d-4111-a581-f42c11353796", "metadata": {}, "outputs": [], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "7a3f1e1f-1503-42fc-90b9-aae9834667a5", "metadata": {}, "outputs": [], "source": [ "# Load dataset\n", "data_path = \"C:/Users/IT/Desktop/Advanced NLP/responses.txt\"\n", "with open(data_path, \"r\", encoding=\"utf-8\") as f:\n", " lines = f.readlines()\n", "dataset = {\"train\": [{\"text\": line.strip()} for line in lines]}" ] }, { "cell_type": "code", "execution_count": null, "id": "36fbe2d2-e30c-4b1b-8ff7-5c31899e7e6f", "metadata": {}, "outputs": [], "source": [ "# Tokenizer and model\n", "model_path = \"C:/Users/IT/Desktop/Advanced NLP/gpt2\"\n", "tokenizer = GPT2Tokenizer.from_pretrained(model_path)\n", "tokenizer.pad_token = tokenizer.eos_token # Set PAD token\n", "model = GPT2LMHeadModel.from_pretrained(model_path)" ] }, { "cell_type": "code", "execution_count": null, "id": "32d39516-b533-4598-bf77-c354fb4d5a25", "metadata": {}, "outputs": [], "source": [ "# Tokenize data\n", "def tokenize_function(examples):\n", " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True, max_length=512)" ] }, { "cell_type": "code", "execution_count": null, "id": "a809a8f6-cde7-461c-93bf-314d0b90875d", "metadata": {}, "outputs": [], "source": [ "tokenized_data = list(map(tokenize_function, dataset[\"train\"]))" ] }, { "cell_type": "code", "execution_count": null, "id": "e1c4f9f1-017d-4818-af39-2e0180d965b5", "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorForLanguageModeling" ] }, { "cell_type": "code", "execution_count": null, "id": "b1806dac-41f6-4150-8771-ccfa19ea8133", "metadata": {}, "outputs": [], "source": [ "# Define a data collator to dynamically create labels for the input data\n", "data_collator = DataCollatorForLanguageModeling(\n", " tokenizer=tokenizer, # Use the same tokenizer\n", " mlm=False, # Set to False since we are doing causal language modeling, not masked LM\n", ")\n", "\n", "# Update Trainer to include the data collator\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_data,\n", " data_collator=data_collator, # Add the data collator\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "c4e7cfd1-39b4-43d5-adf6-8b851a125ac7", "metadata": {}, "outputs": [], "source": [ "def tokenize_data(data):\n", " \"\"\"Tokenize and prepare the dataset with labels.\"\"\"\n", " tokenized_data = []\n", " for item in data:\n", " tokenized = tokenizer(item[\"text\"], truncation=True, padding=\"max_length\", max_length=512)\n", " tokenized[\"labels\"] = tokenized[\"input_ids\"].copy() # Set labels as a copy of input_ids\n", " tokenized_data.append(tokenized)\n", " return tokenized_data\n" ] }, { "cell_type": "code", "execution_count": null, "id": "2b6c398a-fcfa-4500-a178-2b1bd25c5518", "metadata": {}, "outputs": [], "source": [ "# Training arguments\n", "training_args = TrainingArguments(\n", " output_dir=\"./fine_tuned_gpt2\",\n", " evaluation_strategy=\"no\",\n", " logging_dir=\"./logs\",\n", " num_train_epochs=2,\n", " per_device_train_batch_size=2,\n", " save_steps=10,\n", " save_total_limit=2,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "7b6952ce-f381-4f53-b4db-2e06dd688640", "metadata": {}, "outputs": [], "source": [ "trainer = 
Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=tokenized_data,\n", " eval_dataset='no', # If you have an evaluation dataset\n", " data_collator=data_collator, # Ensure data_collator is included\n", ")\n", "\n", "# Train the model\n", "trainer.train()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "587fa2e7-13fc-41b6-ab44-bf6dcf3d828f", "metadata": {}, "outputs": [], "source": [ "model.save_pretrained(\"./fine_tuned_gpt2\")\n", "tokenizer.save_pretrained(\"./fine_tuned_gpt2\")" ] }, { "cell_type": "code", "execution_count": null, "id": "c2f8079c-a23a-4f4e-9576-9bb2d810ccde", "metadata": {}, "outputs": [], "source": [ "for item in tokenized_data:\n", " if len(item['input_ids']) == 0:\n", " print(\"Found empty input sequence:\", item)" ] }, { "cell_type": "code", "execution_count": null, "id": "47d57104-f127-4458-a4f9-4e6331cdf7dd", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }