Knowledge Base

Language Representations

Glossary

Word embeddings: a way of representing text as numeric vectors, produced by a language model that was trained on a large corpus of texts.

Practice

1# classification by using embeddings
2# df - dataset
3# model - classification model
4
5import math
6import numpy as np
7import pandas as pd
8
9import torch
10import transformers
11
12from sklearn.linear_model import LogisticRegression
13from sklearn.model_selection import cross_val_score
14from sklearn.model_selection import train_test_split
15
16from tqdm.auto import tqdm
17
# check for other models at https://huggingface.co/transformers/pretrained_models.html
PRETRAINED_MODEL_NAME = 'bert-base-uncased'

# The tokenizer and the encoder are loaded from the same checkpoint name,
# so the token IDs produced by one match the vocabulary of the other.
tokenizer = transformers.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
model = transformers.BertModel.from_pretrained(PRETRAINED_MODEL_NAME)
21
def BERT_text_to_embeddings(texts, max_length=512, batch_size=100, force_device=None, disable_progress_bar=False):
    """Convert texts into embeddings using the module-level ``tokenizer`` and ``model``.

    Every text is lower-cased, tokenized (with special tokens), truncated to
    ``max_length`` token IDs and right-padded to exactly ``max_length``. The
    padded IDs are then fed to the model in batches, and the last-layer hidden
    state of the first token (``[CLS]`` for BERT) is kept as the embedding.

    Args:
        texts: iterable of strings, e.g. a pandas Series of documents.
        max_length: number of token IDs every text is truncated/padded to.
        batch_size: number of texts sent through the model per forward pass.
        force_device: device name passed to ``torch.device`` (e.g. ``'cpu'``);
            when ``None``, CUDA is used if available, otherwise the CPU.
        disable_progress_bar: if true, hides both progress bars and the
            message that reports the selected device.

    Returns:
        A 2-D numpy array with one embedding row per input text. For empty
        input the array has zero rows.
    """
    # Pad with the tokenizer's own padding ID; fall back to 0 (BERT's [PAD])
    # for tokenizers that do not define a padding token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    ids_list = []
    attention_mask_list = []

    # text to padded IDs of tokens along with their attention masks
    for input_text in tqdm(texts, disable=disable_progress_bar):
        ids = tokenizer.encode(input_text.lower(), add_special_tokens=True, truncation=True, max_length=max_length)
        n_padding = max_length - len(ids)
        ids_list.append(ids + [pad_id] * n_padding)
        # The mask comes from the sequence length rather than from `id != 0`,
        # so a real token that happens to have ID 0 is never masked out.
        attention_mask_list.append([1] * len(ids) + [0] * n_padding)

    if force_device is not None:
        device = torch.device(force_device)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    model.eval()  # inference mode: disables dropout
    if not disable_progress_bar:
        print(f'Using the {device} device.')

    # One contiguous int64 array per input; building tensors from a Python
    # list of numpy arrays is a slow path in torch.
    ids_array = np.array(ids_list, dtype=np.int64)
    attention_mask_array = np.array(attention_mask_list, dtype=np.int64)

    # getting embeddings in batches
    embeddings = []

    with torch.no_grad():
        for start in tqdm(range(0, len(ids_list), batch_size), disable=disable_progress_bar):
            stop = start + batch_size
            ids_batch = torch.from_numpy(ids_array[start:stop]).to(device)
            attention_mask_batch = torch.from_numpy(attention_mask_array[start:stop]).to(device)

            batch_embeddings = model(input_ids=ids_batch, attention_mask=attention_mask_batch)
            # output [0] is the last hidden state; position 0 is the first token
            embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())

    if not embeddings:
        # np.concatenate raises on an empty list; return a correctly shaped empty result
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    return np.concatenate(embeddings)
60
# Attention!
# Running BERT on thousands of texts can take several hours on a CPU.
# Try to find a machine with a GPU: it will run BERT in minutes instead of hours.
features = BERT_text_to_embeddings(df['text'])
target = df['target']

train_features, test_features, train_target, test_target = train_test_split(
    features, target, train_size=.8)

# `model` is bound to the BERT encoder above, which has no fit()/score(),
# so the classifier gets its own name. Any scikit-learn classifier works here.
classifier = LogisticRegression()
classifier.fit(train_features, train_target)

# mean accuracy on the held-out 20% of the data
print(classifier.score(test_features, test_target))
Send Feedback
close
  • Bug
  • Improvement
  • Feature
Send Feedback
,