In this blog, I demonstrate how to use XLNet to build a simple emotion classifier. The dataset used in this task contains four emotion classes (0-Anger, 1-Fear, 2-Joy, and 3-Sadness), and I fine-tune a pretrained XLNet model to classify them. Before going into the details, please check the XLNet basics (covered in the following video tutorials):
Details of the dataset:
- The dataset used in this code is available HERE. Download the README (for the dataset).
- Download the XLNet base model from HERE (or visit https://github.com/zihangdai/xlnet ).
- For Keras-XLNet, visit https://pypi.org/project/keras-xlnet/ or use the link.
- EmoInt-2017 Dataset main page: https://competitions.codalab.org/competitions/16380#learn_the_details-datasets
- EmoBank Dataset main page: https://github.com/JULIELab/EmoBank
- Please follow the copyright information available with the datasets and give full credit to the corresponding owners.
This code is provided to support hands-on practice after learning the XLNet basics.
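Note on the input format: the training file is read with pandas in the Read_Lines_Classes_Weights function below, which expects the sentence text in column 1, the emotion class (0-3) in column 2, and a label weight in column 3 (column 0 acts as an index). Purely illustrative rows (not taken from the real dataset) would look like:

id,text,label,weight
0,"I am furious about the delay",0,1.0
1,"That noise in the dark scared me",1,1.0
2,"What a wonderful surprise!",2,0.9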
import pandas as pd
import numpy as np
# Please install NLTK ('pip install nltk') and download the 'punkt' and 'stopwords' resources
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string
import csv
# please install keras-xlnet 'pip install keras-xlnet'
from keras_xlnet.backend import keras
from keras_bert.layers import Extract
from keras_xlnet import Tokenizer, load_trained_model_from_checkpoint, ATTENTION_TYPE_BI
from collections import namedtuple
import os
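# Note (environment assumption): keras-xlnet and keras-bert were built against
# the TF 1.x generation of Keras; if the imports above fail with API errors,
# pinning matching older versions of tensorflow and keras may help.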
Source_File = "Emotion_Dataset2.csv" #Path of the source training file for 'Emotion Classification'
# -------------XLNet----------
# Path of the unzipped "XLNet-Large, Cased: 24-layer, 1024-hidden, 16-heads" model downloaded from https://github.com/zihangdai/xlnet. This experiment can also be repeated with the XLNet base model.
xlnet_folder_path = '../model/XLNET/xlnet_cased_L-24_H-1024_A-16/'
xlnet_pretrained_path = namedtuple('xlnet_pretrained_path',['config', 'model', 'vocab'])
config_path = os.path.join(xlnet_folder_path,'xlnet_config.json')
model_path = os.path.join(xlnet_folder_path,'xlnet_model.ckpt')
vocab_path = os.path.join(xlnet_folder_path,'spiece.model')
paths = xlnet_pretrained_path(config_path,model_path,vocab_path)
tokenizer = Tokenizer(paths.vocab)
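# tokenizer.encode(text) returns a list of SentencePiece token ids built from
# the spiece.model vocabulary loaded above.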
BATCH_SIZE = 16
SEQ_LEN = 30  # every input is padded/truncated to 30 tokens
EPOCH = 21  # you can also test the code with just 1-2 epochs
trained_model_path = "xlnet_sentiment.h5"  # default path to store the final trained model
# Read data; code source - keras-xlnet github page
class DataSequence(keras.utils.Sequence):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        # number of batches per epoch (ceiling division)
        return (len(self.y) + BATCH_SIZE - 1) // BATCH_SIZE

    def __getitem__(self, index):
        # slice out one batch of inputs and labels
        s = slice(index * BATCH_SIZE, (index + 1) * BATCH_SIZE)
        return [item[s] for item in self.x], self.y[s]
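# A keras.utils.Sequence lets Keras pull batches on demand: Keras calls
# __len__ for the number of batches per epoch and __getitem__ with each batch
# index, so seq[0] returns ([tokens, segments, lengths], labels) for the
# first BATCH_SIZE examples.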
# Main Emotion Classifier class
class Simple_Emotion_Classifier:
    # generate input sequences for XLNet
    # * code source - keras-xlnet github page, modified for the current application requirements *
    def generate_sequence(self, Lines, Train_Labels):
        tokens, classes = [], []
        for i in range(len(Lines)):
            text_a, clas_labels = str(Lines[i]), Train_Labels[i]
            # single-sentence task: only text_a is encoded (the original
            # keras-xlnet example also encoded a paired text_b)
            encoded_a = tokenizer.encode(text_a)[:SEQ_LEN - 3]  # truncate so SEP, SEP, and CLS still fit
            encoded = encoded_a + [tokenizer.SYM_SEP] + [tokenizer.SYM_SEP]
            encoded = [tokenizer.SYM_PAD] * (SEQ_LEN - 1 - len(encoded)) + encoded + [tokenizer.SYM_CLS]
            tokens.append(encoded)
            classes.append(clas_labels)
        tokens, classes = np.array(tokens), np.array(classes)
        segments = np.zeros_like(tokens)
        segments[:, -1] = 1
        lengths = np.zeros_like(tokens[:, :1])
        return DataSequence([tokens, segments, lengths], classes)
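    # Each encoded row has the layout (illustrative):
    #   [PAD, ..., PAD, tok_1, ..., tok_n, SEP, SEP, CLS]
    # i.e. left-padded to SEQ_LEN with CLS in the final position, which is
    # why segments[:, -1] = 1 and why the classifier extracts index -1 below.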
    def return_sequence_for_csv_file(self, CSV_train_file):
        Lines, Train_Labels, Train_Label_Weight = self.Read_Lines_Classes_Weights(CSV_train_file)
        data_seq = self.generate_sequence(Lines, Train_Labels)
        return data_seq
    # Function to train the complete model
    def Train_Model(self, CSV_train_file):
        # Load pretrained model * code source - keras-xlnet github page *
        model = load_trained_model_from_checkpoint(
            config_path=paths.config,
            checkpoint_path=paths.model,
            batch_size=BATCH_SIZE,
            memory_len=0,
            target_len=SEQ_LEN,
            in_train_phase=False,
            attention_type=ATTENTION_TYPE_BI,
        )
        # Build classification model * code source - keras-xlnet github page *
        last = Extract(index=-1, name='Extract')(model.output)
        dense = keras.layers.Dense(units=768, activation='tanh', name='Dense')(last)
        dropout = keras.layers.Dropout(rate=0.1, name='Dropout')(dense)
        output = keras.layers.Dense(units=4, activation='softmax', name='Softmax')(dropout)
        model = keras.models.Model(inputs=model.inputs, outputs=output)
        model.summary()
        # Fit model
        train_seq = self.return_sequence_for_csv_file(CSV_train_file)
        model.compile(
            optimizer=keras.optimizers.Adam(lr=3e-5),
            loss='sparse_categorical_crossentropy',
            metrics=['sparse_categorical_accuracy'],
        )
        model.fit_generator(
            generator=train_seq,
            # validation_data=dev_matched_seq,
            epochs=EPOCH,
            # monitor 'loss' because no validation data is passed above
            callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=2)],
        )
        model.save_weights(trained_model_path)
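    # Note: in newer Keras versions fit_generator/predict_generator are
    # deprecated and model.fit/model.predict accept a keras.utils.Sequence
    # directly; the *_generator calls are kept here to match the keras-xlnet
    # examples this code is based on.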
    # Function to read and preprocess the source file
    def Read_Lines_Classes_Weights(self, SourceFile):
        Lines = []  # cleaned sentences (punctuation and stopwords removed)
        Train_Labels = []
        Train_Label_Weight = []
        data = pd.read_csv(SourceFile)
        for line in data.values:
            # line[1] = text, line[2] = class label, line[3] = label weight
            line1 = str(line[1]).strip()
            if len(line1) >= 1:
                Train_Labels.append(line[2])  # add train label
                lab_weight1 = float(line[3])
                Train_Label_Weight.append(lab_weight1)  # add train label weight
                wds_list = line1.split()
                # proceed only if the line contains at least one word
                if len(wds_list) > 0:
                    txt1 = ' '.join(wds_list)
                    # split into words
                    tokens = word_tokenize(txt1)
                    # prepare regex for char filtering
                    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
                    # remove punctuation from each word
                    tokens = [re_punc.sub('', w) for w in tokens]
                    # remove remaining tokens that are not alphabetic
                    tokens = [word for word in tokens if word.isalpha()]
                    # remove stopwords
                    stop_words = set(stopwords.words('english'))
                    tokens = [w for w in tokens if w not in stop_words]
                    # filter out empty tokens
                    tokens = [word for word in tokens if len(word) > 0]
                    # stemming of words (optional)
                    # porter = PorterStemmer()
                    # stemmed = [porter.stem(word) for word in tokens]
                    txt2 = ' '.join(tokens)
                    if len(txt2.strip()) == 0:
                        # placeholder text for lines left empty after cleaning
                        txt2 = "hello world"
                    Lines.append(txt2)
                else:
                    Lines.append("hello world")
        return Lines, Train_Labels, Train_Label_Weight
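    # For illustration: with the steps above, a line like
    # "She was so happy today!" becomes "She happy today" (punctuation and
    # stopwords removed). Note the text is not lower-cased here, unlike in
    # Emotion_Classifier_Test below.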
    # Function to load the trained model
    def load_xlnet_model(self, saved_model_path):
        # Load pretrained model * code source - keras-xlnet github page *
        model = load_trained_model_from_checkpoint(
            config_path=paths.config,
            checkpoint_path=paths.model,
            batch_size=BATCH_SIZE,
            memory_len=0,
            target_len=SEQ_LEN,
            in_train_phase=False,
            attention_type=ATTENTION_TYPE_BI,
        )
        # Rebuild the same classification head used in Train_Model
        last = Extract(index=-1, name='Extract')(model.output)
        dense = keras.layers.Dense(units=768, activation='tanh', name='Dense')(last)
        dropout = keras.layers.Dropout(rate=0.1, name='Dropout')(dense)
        output = keras.layers.Dense(units=4, activation='softmax', name='Softmax')(dropout)
        model = keras.models.Model(inputs=model.inputs, outputs=output)
        model.load_weights(saved_model_path)
        return model
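    # Design note: Train_Model and load_xlnet_model intentionally build an
    # identical graph, since load_weights requires the architecture to match
    # the saved weights layer-for-layer; a possible refactor would move this
    # shared construction into a single helper method called from both.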
    # Function to test the emotion classifier on a single sentence
    def Emotion_Classifier_Test(self, Input_Sentence, model1):
        cln_test_lines = []
        line1 = str(Input_Sentence).strip().lower()
        tokens = word_tokenize(line1)
        # prepare regex for char filtering
        re_punc = re.compile('[%s]' % re.escape(string.punctuation))
        # remove punctuation from each word
        tokens = [re_punc.sub('', w) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # filter out empty tokens
        tokens = [word for word in tokens if len(word) > 0]
        # stemming of words (optional)
        # porter = PorterStemmer()
        # stemmed = [porter.stem(word) for word in tokens]
        txt2 = ' '.join(tokens)
        if len(txt2) > 0:
            cln_test_lines.append(txt2)
        else:
            # placeholder text for sentences left empty after cleaning
            cln_test_lines.append("Very Good")
        # dummy label 0; only the prediction matters here
        Labels = [0]
        test_seq = self.generate_sequence(cln_test_lines, Labels)
        pred = model1.predict_generator(test_seq)[0]
        print("prediction => ", pred)
        index1 = np.argmax(pred)
        print("class index => ", index1)
if __name__ == "__main__":
    print("Calling Simple Emotion Classifier Training and Test")
    S_EMO_CLS = Simple_Emotion_Classifier()
    # Train the model
    S_EMO_CLS.Train_Model(Source_File)
    # Load the trained model
    model1 = S_EMO_CLS.load_xlnet_model(trained_model_path)
    print("model loaded")
    input_text1 = " Jimmy Carr makes me want to cry and cry *shiver*"
    # test the trained model
    S_EMO_CLS.Emotion_Classifier_Test(input_text1, model1)
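Once training has finished and the weights have been saved to xlnet_sentiment.h5, you can comment out the Train_Model call and go straight to inference on later runs; a minimal sketch (assuming the classes above are already defined, with a made-up test sentence):

S_EMO_CLS = Simple_Emotion_Classifier()
model1 = S_EMO_CLS.load_xlnet_model(trained_model_path)
S_EMO_CLS.Emotion_Classifier_Test("I am absolutely delighted with the results!", model1)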