Language Representations
Glossary
Word embeddings: a way of representing text as numeric vectors produced by a language model trained on a large corpus of texts.
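As a toy illustration of the idea (the vocabulary, vectors, and text below are invented for the example, not taken from any real model), an embedding table maps each word to a fixed-length vector, and a short text can be represented by averaging the vectors of its words:

import numpy as np

# toy embedding table: each word maps to a 3-dimensional vector
# (in practice the vectors come from a model trained on a large corpus)
word_vectors = {
    'movie': np.array([0.2, 0.7, 0.1]),
    'great': np.array([0.9, 0.1, 0.3]),
    'boring': np.array([0.1, 0.2, 0.8]),
}

def text_to_vector(text):
    # represent a text as the average of the vectors of its known words
    vectors = [word_vectors[word] for word in text.lower().split() if word in word_vectors]
    return np.mean(vectors, axis=0)

print(text_to_vector('Great movie'))  # one fixed-length vector for the whole text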
Practice
# classification by using embeddings
# df - dataset with 'text' and 'target' columns
# classifier - classification model trained on the embeddings

import math
import numpy as np
import pandas as pd

import torch
import transformers

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

# check for other models at https://huggingface.co/transformers/pretrained_models.html
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.BertModel.from_pretrained('bert-base-uncased')

def BERT_text_to_embeddings(texts, max_length=512, batch_size=100, force_device=None, disable_progress_bar=False):

    ids_list = []
    attention_mask_list = []

    # text to padded IDs of tokens along with their attention masks

    for input_text in tqdm(texts, disable=disable_progress_bar):
        ids = tokenizer.encode(input_text.lower(), add_special_tokens=True, truncation=True, max_length=max_length)
        padded = np.array(ids + [0]*(max_length - len(ids)))
        attention_mask = np.where(padded != 0, 1, 0)
        ids_list.append(padded)
        attention_mask_list.append(attention_mask)

    if force_device is not None:
        device = torch.device(force_device)
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    if not disable_progress_bar:
        print(f'Using the {device} device.')

    # getting embeddings in batches

    embeddings = []

    for i in tqdm(range(math.ceil(len(ids_list)/batch_size)), disable=disable_progress_bar):

        ids_batch = torch.LongTensor(ids_list[batch_size*i:batch_size*(i+1)]).to(device)
        attention_mask_batch = torch.LongTensor(attention_mask_list[batch_size*i:batch_size*(i+1)]).to(device)

        with torch.no_grad():
            model.eval()
            batch_embeddings = model(input_ids=ids_batch, attention_mask=attention_mask_batch)
        # take the embedding of the [CLS] token as the representation of the whole text
        embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())

    return np.concatenate(embeddings)

# Attention!
# Running BERT for thousands of texts on the CPU may take a long time, several hours.
# Try to find a machine with a GPU: it will run BERT in minutes instead of hours.
features = BERT_text_to_embeddings(df['text'])
target = df['target']

train_features, test_features, train_target, test_target = train_test_split(
    features, target, train_size=.8)

# `model` is the BERT feature extractor, so train a separate classifier on the embeddings
classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_features, train_target)

print(classifier.score(test_features, test_target))
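For a quick sanity check of the classifier that does not depend on a single split, the cross_val_score import from the snippet above can be applied directly to the embeddings. This is a minimal sketch that assumes features and target have already been computed as shown:

# optional: 5-fold cross-validation of a logistic regression on the BERT embeddings
# assumes `features` and `target` were produced by the code above
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), features, target, cv=5)
print('Mean accuracy over 5 folds:', cv_scores.mean())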