From 44097da7288042e988bcb89f1c6cc817a8e1eec9 Mon Sep 17 00:00:00 2001
From: zhang
Date: Sat, 4 Jun 2022 08:48:54 +0800
Subject: 0604

---
 learn_torch/seq/sentiment_analysis.py | 193 ++++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 learn_torch/seq/sentiment_analysis.py

diff --git a/learn_torch/seq/sentiment_analysis.py b/learn_torch/seq/sentiment_analysis.py
new file mode 100644
index 0000000..8971e31
--- /dev/null
+++ b/learn_torch/seq/sentiment_analysis.py
@@ -0,0 +1,193 @@
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import torch.nn as nn
import time

SEED = 1234

# fix the random seeds so runs are reproducible
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# tokenize reviews with spaCy; labels are floats so they work with BCEWithLogitsLoss
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm')
LABEL = data.LabelField(dtype=torch.float)


train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

print(vars(train_data.examples[0]))

# carve a validation set out of the training data
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

MAX_VOCAB_SIZE = 25_000

# build vocabularies from the training split only
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

print(TEXT.vocab.freqs.most_common(20))

print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# BucketIterator batches examples of similar length to minimise padding
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)


class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text = [sent len, batch size]

        embedded = self.embedding(text)

        # embedded = [sent len, batch size, emb dim]

        output, hidden = self.rnn(embedded)

        # output = [sent len, batch size, hid dim]
        # hidden = [1, batch size, hid dim]

        # the last time step of output equals the final hidden state
        assert torch.equal(output[-1, :, :], hidden.squeeze(0))

        return self.fc(hidden.squeeze(0))


INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')


import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)


def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e.
    if you get 8/10 right, this returns 0.8, NOT 8
    """

    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    # no gradients needed during evaluation
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)

            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 5

best_valid_loss = float('inf')

# train for N_EPOCHS, keeping the checkpoint with the lowest validation loss
for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')


# reload the best checkpoint and report test-set performance
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
\ No newline at end of file
--
cgit v1.2.3
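
The committed script stops after reporting test metrics. Below is a minimal inference sketch, not part of the commit above: it assumes the trained model, the TEXT field, and the spaCy en_core_web_sm pipeline are still available in the same session, and predict_sentiment is a hypothetical helper name. The returned probability corresponds to the label with index 1 in LABEL.vocab.stoi (printed earlier), so which end of [0, 1] means "positive" depends on that mapping.

import spacy

nlp = spacy.load('en_core_web_sm')  # assumed installed, same model used for tokenization above


def predict_sentiment(model, sentence):
    # hypothetical helper, not part of the commit
    model.eval()
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]      # tokenize the raw string with spaCy
    indexed = [TEXT.vocab.stoi[t] for t in tokens]               # map tokens to vocab indices (<unk> for OOV)
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)   # shape [sent len, 1] = [sent len, batch size]
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))                # logit -> probability of label index 1
    return prediction.item()


print(predict_sentiment(model, "This film is absolutely wonderful"))
print(predict_sentiment(model, "This film is terrible"))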