# DIT NLP lesson 2024

## Using character-level representations for classification




In [None]:
# Importing the dependencies
import glob
import numpy as np
import os
import tarfile

from keras.models import Sequential
from keras.layers import Dense, Dropout,  Flatten, LSTM

from random import shuffle
from urllib import request

In [None]:
# Where the corpus archive is downloaded from and the local file name it is saved under.
# To work from a copy that is already on disk, point CORPUS_PATH at it instead.
# In either case the path should end in aclImdb/train, e.g.:
# CORPUS_PATH = "/Users/albarron/corpora/misc/stanford_movie_review/aclImdb/train"
PATH_TO_CORPUS = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
CORPUS_FILE_NAME = "aclImdb_v1.tar.gz"

# Directory handed to pre_process_data(); relative to the current working
# directory, which is where the archive is extracted.
CORPUS_PATH = "aclImdb/train"

def download_file(url_to_file, path_to_file):
    """
    Download a file unless a local copy already exists.

    The data is first written to a temporary '<path_to_file>.part' file and
    only renamed to its final name once the download has finished. This way
    an interrupted or failed download never leaves a truncated file behind
    that a later call would mistake for a complete local copy.

    Args:
        url_to_file   URL to retrieve
        path_to_file  local path where the file is stored
    Return:
        None
    """
    if os.path.isfile(path_to_file):
        print("A local copy of the file exists already:", path_to_file, "\nDoing nothing")
        return

    partial_path = str(path_to_file) + ".part"
    try:
        request.urlretrieve(url_to_file, partial_path)
        os.replace(partial_path, path_to_file)
    finally:
        # After a successful rename there is nothing left to remove; after a
        # failure (including a keyboard interrupt) drop the incomplete data.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Downloading and untarring the corpus
# The download is skipped when CORPUS_FILE_NAME already exists locally;
# the extraction below runs every time this cell is executed.

download_file(PATH_TO_CORPUS, CORPUS_FILE_NAME)
# Extract into the current working directory.
# NOTE(review): extractall() is called without a `filter` argument; on Python
# versions that support extraction filters, consider filter="data" - TODO confirm
# the minimum Python version targeted by this lesson.
with tarfile.open(CORPUS_FILE_NAME) as f:
  f.extractall(path=".")

In [None]:
# Loading the data

def pre_process_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them
    together.

    Args:
        filepath  directory containing a 'pos' and a 'neg' sub-directory,
                  each holding one '*.txt' file per example
    Return:
        list of (label, text) tuples in random order; the label is 1 for
        files under 'pos' and 0 for files under 'neg'
    """
    # (label, sub-directory) pairs; positives are read first, then negatives
    labelled_dirs = ((1, 'pos'), (0, 'neg'))
    dataset = []
    for label, dirname in labelled_dirs:
        pattern = os.path.join(filepath, dirname, '*.txt')
        for filename in glob.glob(pattern):
            # Decode explicitly as UTF-8 (the reviews are expected to be
            # UTF-8 text). Without it open() falls back to the platform's
            # locale encoding, so the same files could be decoded
            # differently, or fail to decode, on another machine.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))
    shuffle(dataset)
    return dataset

def collect_expected(dataset):
    """Return the label (first element) of every instance, in order"""
    labels = []
    for instance in dataset:
        labels.append(instance[0])
    return labels

In [None]:
# Loading instances and expected classes (as usual)
# dataset: shuffled list of (label, text) tuples read from CORPUS_PATH
dataset = pre_process_data(CORPUS_PATH)
# expected: the labels only, in the same order as dataset
expected = collect_expected(dataset)

In [None]:
def avg_len(data):
    """
    Computes the average length of the texts in the data

    Args:
        data  sequence of samples whose second element is the text; the
              length of a text is len(text), i.e. characters for a string
    Return:
        float  mean length over all the samples, 0.0 if data is empty
    """
    n_samples = len(data)
    if n_samples == 0:
        # Nothing to average; avoid dividing by zero
        return 0.0
    return sum(len(sample[1]) for sample in data) / n_samples
# Average length, in characters, of the loaded reviews
avg_len(dataset)

The average length of a review, measured in words, is 202.44 (**Homework:** don't believe me; go and find out for yourself).

That is, at the character level we would unroll the network about **6.5x** as many steps as at the word level!

In [None]:
def clean_data(data):
    """
    Lower-case each text and turn it into a list of one-character tokens.

    Any character outside the VALID set (letters a-z, digits, a few
    punctuation marks and the space) is replaced by the token 'UNK'.
    Only the text (second element) of each sample is used; labels are
    dropped.
    """
    VALID = 'abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; '
    # Membership is tested against a plain string: simple, but not the
    # most efficient way of doing it
    return [
        [symbol if symbol in VALID else 'UNK' for symbol in entry[1].lower()]
        for entry in data
    ]

# listified_data = clean_data(dataset)

**Homework**: make the process of determining whether a character is VALID more efficient

In [None]:
def char_pad_trunc(data, maxlen):
    """
    Bring every sample to exactly maxlen tokens.

    Longer samples are cut after maxlen tokens, shorter ones are filled up
    at the end with 'PAD' tokens; samples that already have the right
    length are kept as they are.
    """
    same_length = []
    for tokens in data:
        missing = maxlen - len(tokens)
        if missing < 0:
            fitted = tokens[:maxlen]
        elif missing > 0:
            fitted = tokens + ['PAD'] * missing
        else:
            fitted = tokens
        same_length.append(fitted)
    return same_length

In [None]:
# Producing the one-hot encodings (no embeddings here)
def create_dicts(data):
    """
    Build the token <-> index lookup tables (modified from Keras LSTM example)

    The vocabulary is sorted before the indices are assigned. Iterating a
    set of strings directly yields an order that can change from one
    interpreter run to the next, which would make a saved model unusable
    with a mapping rebuilt in another session.

    Args:
        data  list of lists of tokens (all tokens must be mutually
              comparable, e.g. all strings)
    Return:
        (char_indices, indices_char): dictionary of {token: index} and its
        inverse {index: token}
    """
    chars = set()
    for sample in data:
        chars.update(sample)
    vocabulary = sorted(chars)
    char_indices = {c: i for i, c in enumerate(vocabulary)}
    indices_char = dict(enumerate(vocabulary))
    return char_indices, indices_char

In [None]:
def onehot_encode(dataset, char_indices, maxlen, dtype=np.float64):
    """
    One hot encode the tokens

    Args:
        dataset  list of lists of tokens; no sample may be longer than
                 maxlen and every token must be a key of char_indices
        char_indices  dictionary of {key=character, value=index to use encoding vector}
        maxlen  int  length of each sample
        dtype  element type of the returned array. The default (float64)
               is what np.zeros produces when no dtype is given; a smaller
               type such as np.float32 or np.uint8 reduces the memory
               needed for large datasets
    Return:
        np array of shape (samples, tokens, encoding length)
    """
    vocab_size = len(char_indices)
    X = np.zeros((len(dataset), maxlen, vocab_size), dtype=dtype)
    for i, sentence in enumerate(dataset):
        for t, char in enumerate(sentence):
            # Switch on the single position that identifies this token
            X[i, t, char_indices[char]] = 1
    return X

In [None]:
# Load and preprocess the data
# The first 2 steps were run earlier
# dataset = pre_process_data(CORPUS_PATH)
# expected = collect_expected(dataset)
# Lower-cased lists of character tokens, with 'UNK' for characters that are not valid
listified_data = clean_data(dataset)

# Every review is truncated or padded (with 'PAD') to exactly maxlen tokens
maxlen = 1500
common_length_data = char_pad_trunc(listified_data, maxlen)

# The vocabulary is built after padding, so that 'PAD' gets an index as well
char_indices, indices_char = create_dicts(common_length_data)
# Array of shape (number of reviews, maxlen, vocabulary size)
encoded_data = onehot_encode(common_length_data, char_indices, maxlen)

In [None]:
# Split the data
# First 80% of the instances for training, remaining 20% held out.
# The dataset was shuffled when it was loaded, so a plain slice is enough.
split_point = int(len(encoded_data)*.8)

x_train = encoded_data[:split_point]
y_train = np.array(expected[:split_point])
x_test = encoded_data[split_point:]
y_test = np.array(expected[split_point:])
# Pay attention: in the book they forgot to turn y_[train|test] into numpy arrays

In [None]:
# A quick look at the first training instance:
# one row per character position, one column per vocabulary entry
x_train[0]

In [None]:
# Shape of the resulting array: (training instances, maxlen, vocabulary size)
x_train.shape

In [None]:
# How many training instances do we have? (first dimension of x_train)
len(x_train)

**Q: What is the size of the vocabulary?**

In [None]:
# Building the network

# No Embedding layer here: the input is already one-hot encoded
# Number of units in the LSTM layer
num_neurons = 40

print('Build model...')
model = Sequential()

# One input step per character position, one feature per vocabulary entry.
# return_sequences=True keeps the LSTM output of every time step,
# not only that of the last one.
model.add(LSTM(
    num_neurons,
    return_sequences=True,
    input_shape=(maxlen, len(char_indices.keys())))
    )

# Randomly drop 20% of the LSTM outputs during training
model.add(Dropout(.2))
# Concatenate the outputs of all the time steps into a single vector
model.add(Flatten())
# A single sigmoid unit: binary classification (positive vs negative)
model.add(Dense(1, activation='sigmoid'))
# Optimizer: rmsprop; loss: binary cross-entropy; accuracy is reported too
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

[rmsprop](https://keras.io/api/optimizers/rmsprop/)

In [None]:
# Training the network
batch_size = 32
epochs = 10
# The held-out 20% (x_test, y_test) is passed as validation data, so the
# validation metrics reported after each epoch are computed on it
model.fit(x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test)
    )
# This would take between 5 and 10 minutes per epoch, depending on the hardware!

In [None]:
# Saving the model
# The architecture is stored as JSON and the weights in a separate file.
# NOTE(review): char_indices is not saved here; it would be needed to encode
# new text in the same way when the model is loaded again.
model_structure = model.to_json()
with open("char_lstm_model3.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("char_lstm.weights.h5")

Back to the slides