import torch
import re
from transformers import BertTokenizer
import pandas as pd

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
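
# Illustrative sanity check (not part of the original listing): a tensor
# moved with `.to(device)` should report the device selected above.
print(torch.ones(1).to(device).device)  # e.g. cuda:0 or cpu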

def text_preprocessing(text):
    """
    - Remove entity mentions (e.g. '@united')
    - Correct errors (e.g. '&amp;' to '&')
    @param text (str): a string to be processed.
    @return text (str): the processed string.
    """
    # Remove '@name' mentions
    text = re.sub(r'(@.*?)[\s]', ' ', text)
    # Replace the HTML entity '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)
    # Collapse repeated whitespace and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
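
# A minimal demo of `text_preprocessing` (the tweet below is a made-up
# example, not taken from the original dataset):
print(text_preprocessing("@united my flight was delayed &amp; nobody replied  "))
# -> "my flight was delayed & nobody replied"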

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

# Specify `MAX_LEN`
MAX_LEN = 64

def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param data (np.array): Array of texts to be processed.
    @return input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return attention_masks (torch.Tensor): Tensor of indices specifying which
        tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #   (1) Tokenize the sentence
        #   (2) Add the `[CLS]` and `[SEP]` tokens to the start and end
        #   (3) Truncate/pad the sentence to the max length
        #   (4) Map tokens to their IDs
        #   (5) Create the attention mask
        #   (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Max length to truncate/pad
            padding='max_length',           # Pad sentence to max length
            truncation=True,                # Truncate longer sentences
            # return_tensors='pt',          # Return PyTorch tensors
            return_attention_mask=True      # Return attention mask
        )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks
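
# Example usage on two made-up sentences (hypothetical data, not from the
# original dataset); both returned tensors have shape (num_sentences, MAX_LEN).
sample_texts = ["@united thanks for the quick reply!",
                "Flight delayed again &amp; still no update..."]
sample_ids, sample_masks = preprocessing_for_bert(sample_texts)
print(sample_ids.shape)    # torch.Size([2, 64])
print(sample_masks.shape)  # torch.Size([2, 64])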