Diffstat (limited to 'learn_torch/seq/sentiment_analysis.py')
-rw-r--r--  learn_torch/seq/sentiment_analysis.py  193
1 file changed, 193 insertions(+), 0 deletions(-)
diff --git a/learn_torch/seq/sentiment_analysis.py b/learn_torch/seq/sentiment_analysis.py
new file mode 100644
index 0000000..8971e31
--- /dev/null
+++ b/learn_torch/seq/sentiment_analysis.py
@@ -0,0 +1,193 @@
+import torch
+from torchtext.legacy import data
+from torchtext.legacy import datasets
+import random
+import torch.nn as nn
+import time
+
+SEED = 1234
+
+torch.manual_seed(SEED)
+torch.backends.cudnn.deterministic = True
+
+TEXT = data.Field(tokenize='spacy',
+                  tokenizer_language='en_core_web_sm')
+LABEL = data.LabelField(dtype=torch.float)
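+
+# Quick illustrative check (sample sentence is made up): Field.preprocess runs the
+# configured spaCy tokenizer, so this should print the sentence split into tokens.
+# Assumes the en_core_web_sm model is installed (python -m spacy download en_core_web_sm).
+print(TEXT.preprocess('This film was surprisingly good!'))
+# e.g. ['This', 'film', 'was', 'surprisingly', 'good', '!']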
+
+
+train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
+
+print(f'Number of training examples: {len(train_data)}')
+print(f'Number of testing examples: {len(test_data)}')
+
+print(vars(train_data.examples[0]))
+
+train_data, valid_data = train_data.split(random_state = random.seed(SEED))
+
+print(f'Number of training examples: {len(train_data)}')
+print(f'Number of validation examples: {len(valid_data)}')
+print(f'Number of testing examples: {len(test_data)}')
+
+MAX_VOCAB_SIZE = 25_000
+
+TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
+LABEL.build_vocab(train_data)
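+# Note: the TEXT vocab ends up with MAX_VOCAB_SIZE + 2 entries, because torchtext
+# adds the special <unk> and <pad> tokens on top of the 25,000 most frequent words.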
+
+print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
+print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
+
+print(TEXT.vocab.freqs.most_common(20))
+
+print(TEXT.vocab.itos[:10])
+print(LABEL.vocab.stoi)
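+
+# Illustrative lookup with a made-up token list: stoi is a defaultdict, so any word
+# outside the vocabulary falls back to the <unk> index (0).
+example_tokens = ['the', 'movie', 'was', 'surprisinglywatchable']
+print([TEXT.vocab.stoi[t] for t in example_tokens])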
+
+BATCH_SIZE = 64
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
+    (train_data, valid_data, test_data),
+    batch_size = BATCH_SIZE,
+    device = device)
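+
+# Quick peek at one batch to confirm the [sent len, batch size] layout that the
+# model's forward() below expects; the first dimension varies from batch to batch.
+sample_batch = next(iter(train_iterator))
+print(sample_batch.text.shape)  # e.g. torch.Size([<longest sent in batch>, 64])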
+
+
+class RNN(nn.Module):
+    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
+        super().__init__()
+
+        self.embedding = nn.Embedding(input_dim, embedding_dim)
+
+        self.rnn = nn.RNN(embedding_dim, hidden_dim)
+
+        self.fc = nn.Linear(hidden_dim, output_dim)
+
+    def forward(self, text):
+        # text = [sent len, batch size]
+
+        embedded = self.embedding(text)
+
+        # embedded = [sent len, batch size, emb dim]
+
+        output, hidden = self.rnn(embedded)
+
+        # output = [sent len, batch size, hid dim]
+        # hidden = [1, batch size, hid dim]
+
+        assert torch.equal(output[-1, :, :], hidden.squeeze(0))
+
+        return self.fc(hidden.squeeze(0))
+
+INPUT_DIM = len(TEXT.vocab)
+EMBEDDING_DIM = 100
+HIDDEN_DIM = 256
+OUTPUT_DIM = 1
+
+model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
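+
+# Shape check with a made-up batch of random token indices: forward() should return
+# one logit per example, i.e. a [batch size, 1] tensor.
+dummy_text = torch.randint(0, INPUT_DIM, (50, BATCH_SIZE))  # [sent len, batch size]
+print(model(dummy_text).shape)  # expected: torch.Size([64, 1])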
+
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+print(f'The model has {count_parameters(model):,} trainable parameters')
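+
+# Rough breakdown of that count, assuming the vocab is capped at 25,002 tokens
+# (MAX_VOCAB_SIZE plus <unk> and <pad>):
+#   embedding: 25,002 * 100                   = 2,500,200
+#   RNN:       100*256 + 256*256 + 256 + 256  =    91,648
+#   linear:    256 * 1 + 1                    =       257
+#   total                                     = 2,592,105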
+
+
+import torch.optim as optim
+
+optimizer = optim.SGD(model.parameters(), lr=1e-3)
+criterion = nn.BCEWithLogitsLoss()
+
+model = model.to(device)
+criterion = criterion.to(device)
+
+def binary_accuracy(preds, y):
+    """
+    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
+    """
+
+    # round predictions to the closest integer
+    rounded_preds = torch.round(torch.sigmoid(preds))
+    correct = (rounded_preds == y).float()  # convert into float for division
+    acc = correct.sum() / len(correct)
+    return acc
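+
+# Illustrative call with made-up logits and labels: the first three predictions
+# round to the correct label and the last one does not, so this prints tensor(0.7500).
+print(binary_accuracy(torch.tensor([2.0, -1.0, 0.5, -0.3]),
+                      torch.tensor([1.0, 0.0, 1.0, 1.0])))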
+
+
+def train(model, iterator, optimizer, criterion):
+    epoch_loss = 0
+    epoch_acc = 0
+
+    model.train()
+
+    for batch in iterator:
+        optimizer.zero_grad()
+
+        predictions = model(batch.text).squeeze(1)
+
+        loss = criterion(predictions, batch.label)
+
+        acc = binary_accuracy(predictions, batch.label)
+
+        loss.backward()
+
+        optimizer.step()
+
+        epoch_loss += loss.item()
+        epoch_acc += acc.item()
+
+    return epoch_loss / len(iterator), epoch_acc / len(iterator)
+
+
+def evaluate(model, iterator, criterion):
+    epoch_loss = 0
+    epoch_acc = 0
+
+    model.eval()
+
+    with torch.no_grad():
+        for batch in iterator:
+            predictions = model(batch.text).squeeze(1)
+
+            loss = criterion(predictions, batch.label)
+
+            acc = binary_accuracy(predictions, batch.label)
+
+            epoch_loss += loss.item()
+            epoch_acc += acc.item()
+
+    return epoch_loss / len(iterator), epoch_acc / len(iterator)
+
+
+def epoch_time(start_time, end_time):
+    elapsed_time = end_time - start_time
+    elapsed_mins = int(elapsed_time / 60)
+    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
+    return elapsed_mins, elapsed_secs
+
+
+N_EPOCHS = 5
+
+best_valid_loss = float('inf')
+
+for epoch in range(N_EPOCHS):
+
+    start_time = time.time()
+
+    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
+    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
+
+    end_time = time.time()
+
+    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
+
+    if valid_loss < best_valid_loss:
+        best_valid_loss = valid_loss
+        torch.save(model.state_dict(), 'tut1-model.pt')
+
+    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
+    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
+    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
+
+
+model.load_state_dict(torch.load('tut1-model.pt'))
+
+test_loss, test_acc = evaluate(model, test_iterator, criterion)
+
+print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
\ No newline at end of file