Language Representations
Glossary
Word embeddings: a way of representing text as numeric vectors produced by a language model trained on a large corpus of texts.
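As a toy illustration of the idea (the vocabulary, vectors, and text below are invented for the example, not taken from any real model), an embedding table maps each word to a fixed-length vector, and a short text can be represented by averaging the vectors of its words:

import numpy as np

# toy embedding table: each word maps to a 3-dimensional vector
# (in practice the vectors come from a model trained on a large corpus)
word_vectors = {
    'movie': np.array([0.2, 0.7, 0.1]),
    'great': np.array([0.9, 0.1, 0.3]),
    'boring': np.array([0.1, 0.2, 0.8]),
}

def text_to_vector(text):
    # represent a text as the average of the vectors of its known words
    vectors = [word_vectors[word] for word in text.lower().split() if word in word_vectors]
    return np.mean(vectors, axis=0)

print(text_to_vector('Great movie'))  # one fixed-length vector for the whole text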
Practice
# classification by using embeddings
# df - dataset with 'text' and 'target' columns
# classifier - classification model trained on the embeddings

import math
import numpy as np
import pandas as pd

import torch
import transformers

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

# check for other models at https://huggingface.co/transformers/pretrained_models.html
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.BertModel.from_pretrained('bert-base-uncased')

def BERT_text_to_embeddings(texts, max_length=512, batch_size=100, force_device=None, disable_progress_bar=False):

    ids_list = []
    attention_mask_list = []

    # text to padded IDs of tokens along with their attention masks

    for input_text in tqdm(texts, disable=disable_progress_bar):
        ids = tokenizer.encode(input_text.lower(), add_special_tokens=True, truncation=True, max_length=max_length)
        padded = np.array(ids + [0]*(max_length - len(ids)))
        attention_mask = np.where(padded != 0, 1, 0)
        ids_list.append(padded)
        attention_mask_list.append(attention_mask)

    if force_device is not None:
        device = torch.device(force_device)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    if not disable_progress_bar:
        print(f'Using the {device} device.')

    # getting embeddings in batches

    embeddings = []

    for i in tqdm(range(math.ceil(len(ids_list)/batch_size)), disable=disable_progress_bar):

        ids_batch = torch.LongTensor(ids_list[batch_size*i:batch_size*(i+1)]).to(device)
        attention_mask_batch = torch.LongTensor(attention_mask_list[batch_size*i:batch_size*(i+1)]).to(device)

        with torch.no_grad():
            model.eval()
            batch_embeddings = model(input_ids=ids_batch, attention_mask=attention_mask_batch)
        # take the embedding of the [CLS] token as the representation of the whole text
        embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())

    return np.concatenate(embeddings)

# Attention!
# Running BERT for thousands of texts on the CPU may take a long time, several hours.
# Try to find a machine with a GPU: it will run BERT in minutes instead of hours.
features = BERT_text_to_embeddings(df['text'])
target = df['target']

train_features, test_features, train_target, test_target = train_test_split(
    features, target, train_size=.8)

# `model` is the BERT feature extractor, so train a separate classifier on the embeddings
classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_features, train_target)

print(classifier.score(test_features, test_target))
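For a quick sanity check of the classifier that does not depend on a single split, the cross_val_score import from the snippet above can be applied directly to the embeddings. This is a minimal sketch that assumes features and target have already been computed as shown:

# optional: 5-fold cross-validation of a logistic regression on the BERT embeddings
# assumes `features` and `target` were produced by the code above
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), features, target, cv=5)
print('Mean accuracy over 5 folds:', cv_scores.mean())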