In parts 1, 2, and 3 of this study of sentiment analysis of IMDb movie reviews, I used classic supervised learning methods. In this part, I will use transfer learning with BERT.
Unlike the models I trained myself in parts 1–3, transfer learning means taking an existing pre-trained model and adapting it to a specific task. The model this time, BERT, is a state-of-the-art deep learning model developed by Google researchers for a wide range of Natural Language Understanding tasks. By fine-tuning BERT on this dataset of IMDb movie reviews, I do not need to train my own model from scratch; instead, I adapt the transferred model to the specific task of predicting the sentiment of movie reviews. To put it figuratively, training a supervised learning model is like building a car yourself; fine-tuning a transfer learning model is like car tuning: buying a car and modifying it.
After reading all the data, I will use PyTorch, a package for deep learning. CUDA support in PyTorch allows the use of a GPU for parallel computing (although I do not have a GPU on my local computer, so I will still use the CPU). To put it figuratively, when you deliver something, using a CPU is like driving one fast car to move a small amount of cargo, while using a GPU is like driving multiple pick-up trucks simultaneously to move a large amount of cargo. Because deep learning consists mostly of large matrix multiplications, a GPU speeds up the process considerably.
import torch
# read all movie reviews and their sentiment labels
labels = []
reviews = []
with open('imdb_labelled.txt', encoding='utf-8') as file:
    for line in file:
        # get sentence and label
        sentence, label = line.strip('\n').split('\t')
        reviews.append(sentence)
        labels.append(int(label))
print(f'total number of samples: {len(labels)}')
# use cuda to run this program on a GPU if one is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
total number of samples: 1000
No GPU available, using the CPU instead.
In part 1, I used TF-IDF to convert each sentence into a vector and the sentences into a matrix. In this part, I will directly use the BERT tokenizer to convert sentences into encodings, consisting of input IDs and attention masks. Input IDs are the token IDs BERT uses, converted from the words in a sentence. The attention mask marks which tokens the model should pay attention to (real tokens, marked 1) and which it should ignore (padding, marked 0). Inside the model itself, the attention mechanism then decides how much attention each token deserves.
For example, take the sentence "There are two birds." If "There" gets the most attention, we know the birds are there rather than here. If "two" gets the most attention, we know there are more than one bird and fewer than three. If "birds" does, we know that the beings there are birds, not something else.
This attention mechanism makes BERT effective for sentiment analysis, because it learns that the words and phrases carrying emotion, such as "like" and "don't like", need the most attention in movie reviews. Also, BERT reads sentences bidirectionally, both left-to-right and right-to-left, so it also takes the position of a word in a sentence into account.
When tokenizing the sentences, the maximum length is set to 128. Sentences with more than 126 tokens (the other 2 tokens are [CLS] and [SEP], marking the start and the end of a sentence) will be truncated. Sentences with fewer than 126 tokens will have zero padding at the end.
from transformers import BertTokenizer
# load bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# setting the maximum length of encodings to 128, so that encodings will not be too long
encodings = tokenizer(reviews, truncation=True, padding='max_length', return_tensors='pt',
                      max_length=128)
input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']
Let us take a look at the first review and see how it is encoded by the BERT tokenizer.
print(f'The original sentence: {reviews[0]}')
print(f'Input ID: {input_ids[0]}')
print(f'Attention mask: {attention_masks[0]}')
The original sentence: A very, very, very slow-moving, aimless movie about a distressed, drifting young man.
Input ID: tensor([ 101, 1037, 2200, 1010, 2200, 1010, 2200, 4030, 1011, 3048, 1010, 6614, 3238, 3185, 2055, 1037, 24305, 1010, 15013, 2402, 2158, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Attention mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
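To see the [CLS] and [SEP] tokens and the zero padding described above, the input IDs can be converted back into readable tokens. This is just a quick inspection sketch that reuses the tokenizer and input_ids objects defined above:
# convert the first review's input IDs back into readable tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
# the encoding starts with [CLS], closes the sentence with [SEP], then pads with [PAD]
print(tokens[:30])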
After the sentences are encoded, the input IDs, attention masks, and labels are stored as tensors in a TensorDataset and split into a training and a validation set. Then DataLoader wraps an iterable around each dataset and supports automatic batching, sampling, shuffling, and multiprocess data loading.
Batching means that the dataloader sends one batch of encodings at a time into the model for mini-batch stochastic gradient descent during training. In this study, I set the batch size to 16. With shuffle=True, the dataloader reshuffles all the data in the training set at the start of every epoch before forming new batches.
from torch.utils.data import TensorDataset, random_split, DataLoader
# splitting into training and validation datasets
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(labels))
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
# load datasets into dataloaders
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
eval_dataloader = DataLoader(val_dataset, batch_size=16)
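As a quick sanity check (not part of the original pipeline), we can draw a single batch from the training dataloader and confirm that it contains 16 encodings of length 128 plus 16 labels:
# draw one batch from the training dataloader and inspect its shapes
b_input_ids, b_attention_masks, b_labels = next(iter(train_dataloader))
print(b_input_ids.shape)         # expected: torch.Size([16, 128])
print(b_attention_masks.shape)   # expected: torch.Size([16, 128])
print(b_labels.shape)            # expected: torch.Size([16])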
The number of epochs is how many times the whole training set is passed through the model during training. I will set the number of epochs to 3; the usually recommended number for fine-tuning BERT is 2–4. I will use AdamW, a variant of the Adam optimizer with decoupled weight decay, as the optimizer for gradient descent.
from transformers import BertForSequenceClassification, AdamW, get_scheduler
from tqdm.auto import tqdm
# setting up the model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",         # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=2,                # The number of output labels--2 for binary classification.
    output_attentions=False,     # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)
# move the model to the chosen device (cpu or gpu)
model.to(device)
# setting the epochs
num_epochs = 3
# settings for gradient descent
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
model.train()
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # send batches to device (cpu or gpu)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels,
                        return_dict=True)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
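This warning is expected here: the pre-trained checkpoint does not include a classification head, so the classifier weights of BertForSequenceClassification are newly initialized, and they are exactly the parameters that fine-tuning on the movie reviews trains. After training, I evaluate the fine-tuned model on the validation set.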
from sklearn.metrics import accuracy_score
y_pred = []
y_true = []
model.eval()
for batch in eval_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    # disable gradient calculation during evaluation
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels,
                        return_dict=True)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    y_pred.extend(predictions.tolist())
    y_true.extend(b_labels.tolist())
print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
Accuracy: 0.905
We can see that the accuracy is above 90%, higher than the classic supervised learning models in parts 1–3, showing that fine-tuned BERT is a more powerful model for sentiment analysis.
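As a final illustration, here is a minimal sketch of how the fine-tuned model could be used to predict the sentiment of a new review. The review text below is made up for demonstration, and I assume, as in the labelled data file, that label 1 means positive and 0 means negative:
# predict the sentiment of a new (made-up) review with the fine-tuned model
new_review = "I was pleasantly surprised; the acting and the pacing were excellent."
encoding = tokenizer(new_review, truncation=True, padding='max_length',
                     max_length=128, return_tensors='pt')
model.eval()
with torch.no_grad():
    output = model(encoding['input_ids'].to(device),
                   attention_mask=encoding['attention_mask'].to(device),
                   return_dict=True)
prediction = torch.argmax(output.logits, dim=-1).item()
print('positive' if prediction == 1 else 'negative')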