From 44097da7288042e988bcb89f1c6cc817a8e1eec9 Mon Sep 17 00:00:00 2001
From: zhang
Date: Sat, 4 Jun 2022 08:48:54 +0800
Subject: 0604

---
 learn_torch/seq/basic_rnn.py          |  56 +++++++++
 learn_torch/seq/sentiment_analysis.py | 200 ++++++++++++++++++++++++++++++++++
 learn_torch/seq/test_rnn.py           |  10 ++
 3 files changed, 266 insertions(+)
 create mode 100644 learn_torch/seq/basic_rnn.py
 create mode 100644 learn_torch/seq/sentiment_analysis.py
 create mode 100644 learn_torch/seq/test_rnn.py

(limited to 'learn_torch')

diff --git a/learn_torch/seq/basic_rnn.py b/learn_torch/seq/basic_rnn.py
new file mode 100644
index 0000000..a2db2af
--- /dev/null
+++ b/learn_torch/seq/basic_rnn.py
@@ -0,0 +1,56 @@
+import torch
+from torch import nn
+
+
+def t1():
+    # Two-layer RNN: 10 input features, hidden size 20.
+    rnn = nn.RNN(10, 20, 2)
+    # inputs: (seq_len, batch, input_size)
+    inputs = torch.randn(5, 3, 10)
+    # h0: (num_layers, batch, hidden_size)
+    h0 = torch.randn(2, 3, 20)
+    output, hn = rnn(inputs, h0)
+    print(output.shape, hn.shape)  # (5, 3, 20) and (2, 3, 20)
+
+
+def t2():
+    data = torch.arange(1, 21, dtype=torch.float)
+    print("Data: ", data.shape, "\n\n", data)
+
+    # Number of features used as input (number of columns).
+    INPUT_SIZE = 1
+    # Number of previous time steps taken into account.
+    SEQ_LENGTH = 5
+    # Number of features in the hidden state, i.e. the feature size of
+    # the RNN's output at every time step.
+    HIDDEN_SIZE = 2
+    # Number of stacked RNN layers.
+    NUM_LAYERS = 1
+    # We have a total of 20 values in our input. We divide the input
+    # into 4 batches, where each batch holds one sequence of length 5.
+    BATCH_SIZE = 4
+
+    # Initialize the RNN.
+    rnn = nn.RNN(input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE,
+                 num_layers=NUM_LAYERS, batch_first=True)
+    # input shape: (batch, seq_len, input_size)
+    inputs = data.view(BATCH_SIZE, SEQ_LENGTH, INPUT_SIZE)
+    # out shape = (batch, seq_len, num_directions * hidden_size)
+    # h_n shape = (num_layers * num_directions, batch, hidden_size)
+    out, h_n = rnn(inputs)  # h0 defaults to zeros when omitted
+    print(out.shape, h_n.shape)  # (4, 5, 2) and (1, 4, 2)
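+
+
+def t3():
+    # Added for illustration (not in the original commit): a minimal
+    # sketch showing that, for a single-layer unidirectional RNN, the
+    # final hidden state h_n matches the output at the last time step.
+    rnn = nn.RNN(input_size=1, hidden_size=2, num_layers=1, batch_first=True)
+    inputs = torch.randn(4, 5, 1)  # (batch, seq_len, input_size)
+    out, h_n = rnn(inputs)
+    assert torch.allclose(out[:, -1, :], h_n.squeeze(0))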
+
+
+if __name__ == '__main__':
+    t1()
+    t2()
diff --git a/learn_torch/seq/sentiment_analysis.py b/learn_torch/seq/sentiment_analysis.py
new file mode 100644
index 0000000..8971e31
--- /dev/null
+++ b/learn_torch/seq/sentiment_analysis.py
@@ -0,0 +1,200 @@
+import torch
+from torchtext.legacy import data
+from torchtext.legacy import datasets
+import random
+import torch.nn as nn
+import time
+
+SEED = 1234
+
+torch.manual_seed(SEED)
+torch.backends.cudnn.deterministic = True
+
+TEXT = data.Field(tokenize='spacy',
+                  tokenizer_language='en_core_web_sm')
+LABEL = data.LabelField(dtype=torch.float)
+
+
+train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
+
+print(f'Number of training examples: {len(train_data)}')
+print(f'Number of testing examples: {len(test_data)}')
+
+print(vars(train_data.examples[0]))
+
+train_data, valid_data = train_data.split(random_state=random.seed(SEED))
+
+print(f'Number of training examples: {len(train_data)}')
+print(f'Number of validation examples: {len(valid_data)}')
+print(f'Number of testing examples: {len(test_data)}')
+
+MAX_VOCAB_SIZE = 25_000
+
+TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
+LABEL.build_vocab(train_data)
+
+print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
+print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
+
+print(TEXT.vocab.freqs.most_common(20))
+
+print(TEXT.vocab.itos[:10])
+print(LABEL.vocab.stoi)
+
+BATCH_SIZE = 64
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
+    (train_data, valid_data, test_data),
+    batch_size=BATCH_SIZE,
+    device=device)
+
+
+class RNN(nn.Module):
+    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
+        super().__init__()
+
+        self.embedding = nn.Embedding(input_dim, embedding_dim)
+
+        self.rnn = nn.RNN(embedding_dim, hidden_dim)
+
+        self.fc = nn.Linear(hidden_dim, output_dim)
+
+    def forward(self, text):
+        # text = [sent len, batch size]
+
+        embedded = self.embedding(text)
+
+        # embedded = [sent len, batch size, emb dim]
+
+        output, hidden = self.rnn(embedded)
+
+        # output = [sent len, batch size, hid dim]
+        # hidden = [1, batch size, hid dim]
+
+        assert torch.equal(output[-1, :, :], hidden.squeeze(0))
+
+        return self.fc(hidden.squeeze(0))
+
+INPUT_DIM = len(TEXT.vocab)
+EMBEDDING_DIM = 100
+HIDDEN_DIM = 256
+OUTPUT_DIM = 1
+
+model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
+
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+print(f'The model has {count_parameters(model):,} trainable parameters')
+
+
+import torch.optim as optim
+
+optimizer = optim.SGD(model.parameters(), lr=1e-3)
+criterion = nn.BCEWithLogitsLoss()
+
+model = model.to(device)
+criterion = criterion.to(device)
+
+def binary_accuracy(preds, y):
+    """
+    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
+    """
+
+    # round predictions to the closest integer
+    rounded_preds = torch.round(torch.sigmoid(preds))
+    correct = (rounded_preds == y).float()  # convert into float for division
+    acc = correct.sum() / len(correct)
+    return acc
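+
+
+# A quick sanity check of binary_accuracy with made-up numbers (added
+# for illustration; not in the original script): sigmoid of [0.7, -0.3]
+# is roughly [0.67, 0.43], which rounds to [1., 0.], so against labels
+# [1., 1.] the batch accuracy is 0.5.
+print(binary_accuracy(torch.tensor([0.7, -0.3]), torch.tensor([1., 1.])))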
+
+
+def train(model, iterator, optimizer, criterion):
+    epoch_loss = 0
+    epoch_acc = 0
+
+    model.train()
+
+    for batch in iterator:
+        optimizer.zero_grad()
+
+        predictions = model(batch.text).squeeze(1)
+
+        loss = criterion(predictions, batch.label)
+
+        acc = binary_accuracy(predictions, batch.label)
+
+        loss.backward()
+
+        optimizer.step()
+
+        epoch_loss += loss.item()
+        epoch_acc += acc.item()
+
+    return epoch_loss / len(iterator), epoch_acc / len(iterator)
+
+
+def evaluate(model, iterator, criterion):
+    epoch_loss = 0
+    epoch_acc = 0
+
+    model.eval()
+
+    with torch.no_grad():
+        for batch in iterator:
+            predictions = model(batch.text).squeeze(1)
+
+            loss = criterion(predictions, batch.label)
+
+            acc = binary_accuracy(predictions, batch.label)
+
+            epoch_loss += loss.item()
+            epoch_acc += acc.item()
+
+    return epoch_loss / len(iterator), epoch_acc / len(iterator)
+
+
+def epoch_time(start_time, end_time):
+    elapsed_time = end_time - start_time
+    elapsed_mins = int(elapsed_time / 60)
+    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
+    return elapsed_mins, elapsed_secs
+
+
+N_EPOCHS = 5
+
+best_valid_loss = float('inf')
+
+for epoch in range(N_EPOCHS):
+
+    start_time = time.time()
+
+    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
+    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
+
+    end_time = time.time()
+
+    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
+
+    if valid_loss < best_valid_loss:
+        best_valid_loss = valid_loss
+        torch.save(model.state_dict(), 'tut1-model.pt')
+
+    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
+    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
+    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
+
+
+model.load_state_dict(torch.load('tut1-model.pt'))
+
+test_loss, test_acc = evaluate(model, test_iterator, criterion)
+
+print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
\ No newline at end of file
diff --git a/learn_torch/seq/test_rnn.py b/learn_torch/seq/test_rnn.py
new file mode 100644
index 0000000..5a7baf2
--- /dev/null
+++ b/learn_torch/seq/test_rnn.py
@@ -0,0 +1,10 @@
+import torch
+from torch import nn
+
+if __name__ == '__main__':
+    rnn = nn.RNN(10, 20, 2)
+    inputs = torch.randn(5, 3, 10)
+    # h0 must be (num_layers, batch, hidden_size) = (2, 3, 20); the
+    # original (2, 3, 10) would raise a size-mismatch RuntimeError.
+    h0 = torch.randn(2, 3, 20)
+    output, hn = rnn(inputs, h0)
--
cgit v1.2.3