# pip install torch lightning matplotlib pandas torchmetrics watermark transformers datasets -U
import os
import os.path as op
import time

from datasets import load_dataset
from lightning import Fabric
import torch
from torch.utils.data import DataLoader
import torchmetrics
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from watermark import watermark

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset
def tokenize_text(batch):
    return tokenizer(batch['text'], truncation=True, padding=True, max_length=1024)
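# Note: tokenize_text relies on the module-level `tokenizer` created in the
# __main__ block below. truncation=True caps each review at max_length=1024
# tokens, and padding=True pads the examples in a batch to a common length so
# they can later be stacked into tensors. The dict it returns adds
# 'input_ids' and 'attention_mask' columns when passed to datasets.map().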
def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):

    for epoch in range(num_epochs):
        train_acc = torchmetrics.Accuracy(
            task='multiclass', num_classes=2).to(fabric.device)

        for batch_idx, batch in enumerate(train_loader):
            model.train()

            ### FORWARD AND BACK PROP
            outputs = model(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['label']
            )

            fabric.backward(outputs['loss'])

            ### UPDATE MODEL PARAMETERS
            optimizer.step()
            optimizer.zero_grad()

            ### LOGGING
            if not batch_idx % 300:
                print(f'Epoch: {epoch+1:04d}/{num_epochs:04d}'
                      f' | Batch {batch_idx:04d}/{len(train_loader):04d}'
                      f' | Loss: {outputs["loss"]:.4f}')

            model.eval()
            with torch.no_grad():
                predicted_labels = torch.argmax(outputs['logits'], 1)
                train_acc.update(predicted_labels, batch['label'])

        ### MORE LOGGING
        model.eval()
        with torch.no_grad():
            val_acc = torchmetrics.Accuracy(
                task='multiclass', num_classes=2).to(fabric.device)
            for batch in val_loader:
                outputs = model(
                    batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['label']
                )
                predicted_labels = torch.argmax(outputs['logits'], 1)
                val_acc.update(predicted_labels, batch['label'])

        print(f'Epoch: {epoch+1:04d}/{num_epochs:04d}'
              f' | Train acc.: {train_acc.compute()*100:.2f}%'
              f' | Val acc.: {val_acc.compute()*100:.2f}%')
        train_acc.reset(), val_acc.reset()
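# The torchmetrics.Accuracy objects accumulate statistics across update() calls:
# train_acc.compute() at the end of the epoch therefore reflects every training
# batch seen so far (with predictions taken from the same forward pass used for
# the loss), val_acc.compute() aggregates over the full validation loader, and
# reset() clears both metrics before the next epoch.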
if __name__ == '__main__':
    print(watermark(packages='torch,lightning,transformers', python=True))
    print('Torch CUDA available?', torch.cuda.is_available())
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    torch.manual_seed(123)
    # torch.use_deterministic_algorithms(True)
    ##########################
    ### 1 Loading the Dataset
    ##########################

    download_dataset()
    df = load_dataset_into_to_dataframe()
    if not (op.exists('train.csv') and op.exists('val.csv') and op.exists('test.csv')):
        partition_dataset(df)

    imdb_dataset = load_dataset(
        'csv',
        data_files={
            'train': 'train.csv',
            'validation': 'val.csv',
            'test': 'test.csv',
        },
    )
    #########################################
    ### 2 Tokenization and Numericalization
    #########################################

    tokenizer = AutoTokenizer.from_pretrained('bigscience/bloom-560m', max_length=1024)
    print('Tokenizer input max length:', tokenizer.model_max_length, flush=True)
    print('Tokenizer vocabulary size:', tokenizer.vocab_size, flush=True)

    print('Tokenizing ...', flush=True)
    imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
    del imdb_dataset
    imdb_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
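    # Optional sanity check (not part of the original script): after
    # set_format('torch', ...), indexing a split returns PyTorch tensors for
    # exactly the selected columns, which is what the DataLoaders below expect.
    sample = imdb_tokenized['train'][0]
    print('Sample columns:', list(sample.keys()), flush=True)
    print('input_ids shape:', sample['input_ids'].shape, flush=True)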
    #########################################
    ### 3 Set Up DataLoaders
    #########################################

    train_dataset = IMDBDataset(imdb_tokenized, partition_key='train')
    val_dataset = IMDBDataset(imdb_tokenized, partition_key='validation')
    test_dataset = IMDBDataset(imdb_tokenized, partition_key='test')

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=1,
        shuffle=True,
        num_workers=4,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=1,
        num_workers=4,
        drop_last=True,
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=1,
        num_workers=2,
        drop_last=True,
    )
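    # IMDBDataset (imported from local_dataset_utilities) presumably wraps a
    # single partition of the tokenized DatasetDict behind the standard
    # __len__/__getitem__ interface that torch.utils.data.DataLoader expects.
    # With batch_size=1 every batch is already complete, so drop_last=True has
    # no practical effect here; it only matters for larger batch sizes.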
    #########################################
    ### 4 Initializing the Model
    #########################################

    fabric = Fabric(accelerator='cuda', devices=1, precision='16-mixed')
    fabric.launch()

    model = AutoModelForSequenceClassification.from_pretrained(
        'bigscience/bloom-560m', num_labels=2)

    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    model, optimizer = fabric.setup(model, optimizer)
    train_loader, val_loader, test_loader = fabric.setup_dataloaders(
        train_loader, val_loader, test_loader)
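    # fabric.setup() moves the model to the GPU and wraps model and optimizer
    # for mixed-precision training: with precision='16-mixed', forward passes
    # run in float16 under autocast while the master weights stay in float32,
    # and fabric.backward() applies the matching gradient scaling.
    # fabric.setup_dataloaders() wraps the loaders so each batch is moved to
    # fabric.device automatically, which is why the training loop never calls
    # .to(device) on the batches.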
    #########################################
    ### 5 Finetuning
    #########################################

    start = time.time()
    train(
        num_epochs=1,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        fabric=fabric,
    )

    end = time.time()
    elapsed = end - start
    print(f'Time elapsed {elapsed/60:.2f} min')

    with torch.no_grad():
        model.eval()
        test_acc = torchmetrics.Accuracy(
            task='multiclass', num_classes=2).to(fabric.device)
        for batch in test_loader:
            outputs = model(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['label']
            )
            predicted_labels = torch.argmax(outputs['logits'], 1)
            test_acc.update(predicted_labels, batch['label'])

    print(f'Test accuracy {test_acc.compute()*100:.2f}%')

The script works through loading the dataset (1), tokenization and numericalization (2), and setting up the data loaders (3) before finetuning; running it produces output along these lines:
...
torch       : 2.0.0
lightning   : 2.0.0
transformers: 4.27.2

Torch CUDA available? True
...
Epoch: 0001/0001 | Batch 23700/35000 | Loss: 0.0168
Epoch: 0001/0001 | Batch 24000/35000 | Loss: 0.0006
Epoch: 0001/0001 | Batch 24300/35000 | Loss: 0.0152
Epoch: 0001/0001 | Batch 24600/35000 | Loss: 0.0003
Epoch: 0001/0001 | Batch 24900/35000 | Loss: 0.0623
Epoch: 0001/0001 | Batch 25200/35000 | Loss: 0.0010
Epoch: 0001/0001 | Batch 25500/35000 | Loss: 0.0001
Epoch: 0001/0001 | Batch 25800/35000 | Loss: 0.0047
Epoch: 0001/0001 | Batch 26100/35000 | Loss: 0.0004
Epoch: 0001/0001 | Batch 26400/35000 | Loss: 0.1016
Epoch: 0001/0001 | Batch 26700/35000 | Loss: 0.0021
Epoch: 0001/0001 | Batch 27000/35000 | Loss: 0.0015
Epoch: 0001/0001 | Batch 27300/35000 | Loss: 0.0008
Epoch: 0001/0001 | Batch 27600/35000 | Loss: 0.0060
Epoch: 0001/0001 | Batch 27900/35000 | Loss: 0.0001
Epoch: 0001/0001 | Batch 28200/35000 | Loss: 0.0426
Epoch: 0001/0001 | Batch 28500/35000 | Loss: 0.0012
Epoch: 0001/0001 | Batch 28800/35000 | Loss: 0.0025
Epoch: 0001/0001 | Batch 29100/35000 | Loss: 0.0025
Epoch: 0001/0001 | Batch 29400/35000 | Loss: 0.0000
Epoch: 0001/0001 | Batch 29700/35000 | Loss: 0.0495
Epoch: 0001/0001 | Batch 30000/35000 | Loss: 0.0164
Epoch: 0001/0001 | Batch 30300/35000 | Loss: 0.0067
Epoch: 0001/0001 | Batch 30600/35000 | Loss: 0.0037
Epoch: 0001/0001 | Batch 30900/35000 | Loss: 0.0005
Epoch: 0001/0001 | Batch 31200/35000 | Loss: 0.0013
Epoch: 0001/0001 | Batch 31500/35000 | Loss: 0.0112
Epoch: 0001/0001 | Batch 31800/35000 | Loss: 0.0053
Epoch: 0001/0001 | Batch 32100/35000 | Loss: 0.0012
Epoch: 0001/0001 | Batch 32400/35000 | Loss: 0.1365
Epoch: 0001/0001 | Batch 32700/35000 | Loss: 0.0210
Epoch: 0001/0001 | Batch 33000/35000 | Loss: 0.0374
Epoch: 0001/0001 | Batch 33300/35000 | Loss: 0.0007
Epoch: 0001/0001 | Batch 33600/35000 | Loss: 0.0341
Epoch: 0001/0001 | Batch 33900/35000 | Loss: 0.0259
Epoch: 0001/0001 | Batch 34200/35000 | Loss: 0.0005
Epoch: 0001/0001 | Batch 34500/35000 | Loss: 0.4792
Epoch: 0001/0001 | Batch 34800/35000 | Loss: 0.0003
Epoch: 0001/0001 | Train acc.: 78.67% | Val acc.: 87.28%
Time elapsed 51.37 min
Test accuracy 87.37%