##  DIT NLP lesson 2024

# Long Short-Term Memory Networks

In [None]:
# Importing the dependencies
import glob
import numpy as np
import os
import tarfile

from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize import TreebankWordTokenizer
from random import shuffle
from urllib import request


In [None]:
# Remote locations of the two resources this notebook downloads
PATH_TO_CORPUS = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
PATH_TO_GOOGLENEWS_VECTORS = "https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz?dl=1"

# Local file names under which the downloads are stored
CORPUS_FILE_NAME = "aclImdb_v1.tar.gz"
GOOGLE_VECTORS = "GoogleNews-vectors-negative300.bin.gz"

# Directory whose 'pos' and 'neg' sub-folders are read by pre_process_data
CORPUS_PATH = "aclImdb/train"

def download_file(url_to_file, path_to_file):
  """
  Download a file unless a local copy already exists.

  The data is first written to ``<path_to_file>.part`` and only renamed to
  its final name once the transfer has finished. An interrupted or failed
  download therefore never leaves a truncated file behind that a later run
  would mistake for a complete local copy.

  Parameters
  ----------
  url_to_file : str
      URL to fetch.
  path_to_file : str or os.PathLike
      Local destination path.

  Raises
  ------
  Whatever ``urllib.request.urlretrieve`` raises when the transfer fails
  (e.g. ``urllib.error.URLError``); the partial file is removed first.
  """
  if os.path.isfile(path_to_file):
    print("A local copy of the file exists already:", path_to_file, "\nDoing nothing")
    return

  partial_path = os.fspath(path_to_file) + ".part"
  try:
    request.urlretrieve(url_to_file, partial_path)
    # Atomic rename: the final name only ever refers to a complete file
    os.replace(partial_path, path_to_file)
  finally:
    # Still present only if the download or the rename did not complete
    if os.path.exists(partial_path):
      os.remove(partial_path)

In [None]:
# Downloading the embeddings

download_file(PATH_TO_GOOGLENEWS_VECTORS, GOOGLE_VECTORS)

# Downloading and untarring the corpus

download_file(PATH_TO_CORPUS, CORPUS_FILE_NAME)
with tarfile.open(CORPUS_FILE_NAME) as corpus_archive:
  # The archive comes from the network, so extract it with the "data" filter
  # where the running Python provides it: it refuses members that would be
  # written outside the destination directory. Older interpreters without
  # extraction filters fall back to the plain call.
  if hasattr(tarfile, "data_filter"):
    corpus_archive.extractall(path=".", filter="data")
  else:
    corpus_archive.extractall(path=".")

In [None]:
# I am using the same preprocessing functions again

def pre_process_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them
    together.

    Parameters
    ----------
    filepath : str
        Directory containing a 'pos' and a 'neg' sub-directory, each
        holding one review per '*.txt' file.

    Returns
    -------
    list of (int, str)
        One (label, text) pair per file, with label 1 for files under
        'pos' and 0 for files under 'neg', in random order.
    """
    dataset = []
    for label, subdir in ((1, 'pos'), (0, 'neg')):
        pattern = os.path.join(filepath, subdir, '*.txt')
        # glob order depends on the file system; sorting makes the list
        # fed to shuffle() identical across machines
        for filename in sorted(glob.glob(pattern)):
            # Read as UTF-8 explicitly: the platform default encoding
            # (e.g. cp1252 on Windows) would mis-decode or fail on
            # non-ASCII characters in the reviews
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))
    shuffle(dataset)
    return dataset

# Tokenizing and vectorizing all the instances
def tokenize_and_vectorize(dataset):
    """
    Turn every sample's text into the list of its word vectors.

    Each sample is expected to carry its text at index 1. The text is split
    with the Treebank tokenizer and every token is looked up in the
    notebook-level ``word_vectors``; tokens without an entry are skipped.

    Returns one list of vectors per sample, in the order of ``dataset``.
    """
    tokenizer = TreebankWordTokenizer()

    def embed(text):
        # Keep only the tokens that have an embedding
        vectors = []
        for word in tokenizer.tokenize(text):
            try:
                vectors.append(word_vectors[word])
            except KeyError:
                continue
        return vectors

    return [embed(sample[1]) for sample in dataset]

# Not necessary in general; we apply it for comparison against
# previous sessions
def pad_trunc(data, maxlen, embedding_dims=None):
    """
    For a given dataset pad with zero vectors or truncate to maxlen

    Parameters
    ----------
    data : sequence of sequences of vectors
        One sequence of word vectors per sample.
    maxlen : int
        Number of vectors every returned sample must have.
    embedding_dims : int, optional
        Length of the zero vectors used for padding. When omitted it is
        taken from the first vector of the first non-empty sample.

    Returns
    -------
    list of list
        A new list with one new list per sample, each of length maxlen.
        The input samples are never modified.

    Raises
    ------
    ValueError
        If padding is required but the vector length can neither be
        inferred (every sample is empty) nor was passed explicitly.
    """
    if embedding_dims is None:
        # The first sample may be empty (e.g. all its tokens were out of
        # vocabulary), so look for the first sample that has a vector
        embedding_dims = next(
            (len(sample[0]) for sample in data if len(sample) > 0), None)
    zero_vector = None if embedding_dims is None else [0.0] * embedding_dims

    new_data = []
    for sample in data:
        # Slicing + list() copies, so padding never leaks into the caller's data
        temp = list(sample[:maxlen])
        additional_elems = maxlen - len(temp)
        if additional_elems > 0:
            if zero_vector is None:
                raise ValueError(
                    "Cannot infer the vector length for padding: every "
                    "sample is empty. Pass embedding_dims explicitly.")
            temp.extend([zero_vector] * additional_elems)
        new_data.append(temp)

    return new_data

def collect_expected(dataset):
    """Return the label (element 0) of every sample, in dataset order."""
    labels = []
    for sample in dataset:
        labels.append(sample[0])
    return labels

In [None]:
# Loading the embeddings from the downloaded word2vec binary file.
# `limit` caps how many vectors are read from the file; presumably chosen
# to keep memory usage manageable (NOTE(review): confirm the intent).
word_vectors = KeyedVectors.load_word2vec_format(
    GOOGLE_VECTORS,
    binary=True,
    limit=400000,
)

In [None]:
# Data preparation: load the labelled reviews, embed them and collect the labels
dataset = pre_process_data(CORPUS_PATH)
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

# pre_process_data already shuffled the samples, so cutting at a fixed
# position gives an 80/20 train/test split
split_point = int(len(vectorized_data) * .8)

x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

In [None]:
# Peek at a few (label, text) pairs only; displaying the whole corpus would
# flood the cell output and bloat the saved notebook
dataset[:3]

In [None]:
# Network parameters

maxlen = 400          # vectors per sample after padding/truncating
embedding_dims = 300  # length of each word vector fed to the network
num_neurons = 50      # units in the LSTM layer
batch_size = 32       # batch size passed to model.fit
epochs = 2            # training epochs passed to model.fit

In [None]:
# Padding and truncating

x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

# Build the (samples, maxlen, embedding_dims) tensors explicitly as float32.
# The padding rows are plain Python floats, which would otherwise make NumPy
# promote the whole array to float64 and double its memory footprint; the
# zeros are exactly representable, so nothing is lost.
x_train = np.asarray(x_train, dtype=np.float32).reshape(
    len(x_train), maxlen, embedding_dims)
y_train = np.array(y_train)

x_test = np.asarray(x_test, dtype=np.float32).reshape(
    len(x_test), maxlen, embedding_dims)
y_test = np.array(y_test)

In [None]:
# Importing the dependencies
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Input, LSTM

In [None]:
# Sequence model: every sample is a (maxlen, embedding_dims) matrix of word vectors
model = Sequential()
model.add(Input(shape=(maxlen, embedding_dims)))

# return_sequences=True keeps the LSTM output of every time step, not just
# the last one, so the layers added afterwards see the whole sequence
model.add(LSTM(num_neurons, return_sequences=True))

# This is what we had before
# model.add(Bidirectional(SimpleRNN(
#     num_neurons,
#     return_sequences=True),
#     )
#  )

In [None]:
# Classification head: dropout, then flatten the per-step LSTM outputs into
# one vector, then a single sigmoid unit for the binary decision
model.add(Dropout(rate=0.2))
model.add(Flatten())
model.add(Dense(units=1, activation="sigmoid"))

In [None]:
# Compiling the network
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output
model.summary()

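That's **70,200** parameters in the LSTM layer (against **17,550** for the simple RNN): four times as many, because the LSTM learns a separate set of weights for each of its three gates plus the candidate cell state.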

Back to the slides

In [None]:
# Training the network; the held-out split is evaluated after every epoch

model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

In [None]:
# Saving the network for future use: the architecture goes to a JSON file,
# the learned weights to a separate HDF5 file
model_structure = model.to_json()
with open("lstm_model1.json", mode="w") as json_file:
    json_file.write(model_structure)

model.save_weights("lstm.weights.h5")

In [None]:
# Predicting the sentiment of a single, unseen text
sample_1 = """I hate that the dismal weather had me down for so long, when
will it break! Ugh, when does happiness return? The sun is blinding and
the puffy clouds are too thin. I can't wait for the weekend."""

# tokenize_and_vectorize only reads the text of each (label, text) pair,
# so the label given here is a placeholder
vec_list = tokenize_and_vectorize([(1, sample_1)])

# Same padding and shape as the training data: (1, maxlen, embedding_dims)
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

# Sigmoid output: values near 1 correspond to the positive label (1),
# values near 0 to the negative label (0)
model.predict(test_vec)
