Implementing LSTM in numpy from scratch

The purpose of this notebook is to illustrate how to build an LSTM from scratch in numpy.

import numpy as np
np.__version__
'1.21.6'
import tensorflow as tf
tf.__version__
'2.7.0'
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM as KLSTM
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Orthogonal, GlorotUniform, Zeros


# Compare the version numerically: comparing the raw strings is lexicographic,
# so e.g. "10.0.0" > "2.1" would evaluate to False.
_tf_major_minor = tuple(int(part) for part in tf.__version__.split(".")[:2])
assert _tf_major_minor >= (2, 1), "results are not reproducible with Tensorflow below 2"

experiment setup

num_inputs = 3  # number of input features per time step
lstm_units = 32  # size of the hidden state and of the cell state
lookback_steps = 5  # also known as time_steps or sequence length
num_samples = 10  # number of input sequences, i.e. the first dimension of xx

In order to make the results comparable between TensorFlow and numpy, we use the same weights for the TensorFlow implementation and the numpy implementation.

# Input kernel: applied to the input features. The weights of the four gates
# are stored side by side along the last axis, hence lstm_units*4 columns.
k_init = GlorotUniform(seed=313)
k_vals = k_init(shape=(num_inputs, lstm_units*4))

# Recurrent kernel: applied to the previous hidden state, same gate layout.
rec_init = Orthogonal(seed=313)
rec_vals = rec_init(shape=(lstm_units, lstm_units*4))

# Bias of the four gates, initialised to zeros.
b_init = Zeros()
b_vals = b_init(lstm_units*4)

# Collected in the order kernel, recurrent kernel, bias; this list is handed
# to `set_weights` of the Keras layer and the same values are used by the
# numpy implementation.
weights = [k_vals, rec_vals, b_vals]

Keras version of LSTM

# check the results of forward pass of original LSTM of Keras

# One sample has shape (lookback_steps, num_inputs); the batch axis is implicit.
inp = Input(shape=(lookback_steps, num_inputs))
lstm_lyr = KLSTM(lstm_units)
out = lstm_lyr(inp)

# Overwrite the layer's own initial weights with the shared ones. This is done
# after the layer has been called on `inp`, i.e. once its variables exist.
lstm_lyr.set_weights(weights)

model = Model(inputs=inp, outputs=out)

# Random input batch; no seed is set here, so the values differ between runs.
xx = np.random.random((num_samples, lookback_steps, num_inputs))

# Reference output of the Keras LSTM, compared with the numpy version below.
lstm_out_tf = model.predict(x=xx)

numpy version of LSTM

class LSTMNP(object):
    """Vanilla LSTM layer in pure numpy (forward pass only).

    The kernel, the recurrent kernel and the bias each hold the parameters of
    the four gates side by side along their last axis, in the order
    input gate, forget gate, candidate cell state, output gate.

    Parameters
    ----------
    units : int
        Size of the hidden state and of the cell state.
    return_sequences : bool
        If True, the hidden state of every step is returned; otherwise only
        the hidden state after the last step.
    return_states : bool
        If True, the cell states of all steps are returned in addition to
        the output.
    time_major : bool
        If True, inputs (and returned sequences) are laid out as
        (lookback_steps, batch_size, ...); otherwise as
        (batch_size, lookback_steps, ...).
    kernel, rec_kernel, bias : array-like, optional
        Weights of shape (num_inputs, 4 * units), (units, 4 * units) and
        (4 * units,). Objects exposing a ``numpy()`` method are converted
        with it. When omitted, the module-level ``k_vals``, ``rec_vals``
        and ``b_vals`` are used.
    """

    def __init__(
            self,
            units:int,
            return_sequences:bool = False,
            return_states:bool = False,
            time_major:bool = False,
            kernel=None,
            rec_kernel=None,
            bias=None
    ):
        self.units = units
        self.return_sequences = return_sequences
        self.return_states = return_states
        self.time_major = time_major

        # The module-level weights are only looked up when no explicit value
        # is given, so the class also works without them being defined.
        self.kernel = self._to_numpy(k_vals if kernel is None else kernel)
        self.rec_kernel = self._to_numpy(
            rec_vals if rec_kernel is None else rec_kernel)
        self.bias = self._to_numpy(b_vals if bias is None else bias)

    @staticmethod
    def _to_numpy(value):
        """Return `value` as a numpy array, using its ``numpy()`` method if present."""
        if hasattr(value, "numpy"):
            return value.numpy()
        return np.asarray(value)

    def __call__(self, inputs, initial_state=None):
        """Run the LSTM over all steps of `inputs`.

        Parameters
        ----------
        inputs : array of shape (batch_size, lookback_steps, num_inputs), or
            (lookback_steps, batch_size, num_inputs) if ``time_major`` is set.
        initial_state : sequence of two arrays, optional
            Initial hidden state and cell state, each (batch_size, units).
            Both default to zeros.

        Returns
        -------
        The hidden state after the last step, shape (batch_size, units), or
        the hidden states of all steps if ``return_sequences`` is set. If
        ``return_states`` is set, a tuple ``(output, c_states)`` is returned
        where ``c_states`` holds the cell state of every step (not only the
        final one), in the same layout as the returned sequences.
        """
        if not self.time_major:
            # the loop below always iterates over the leading (time) axis
            inputs = np.moveaxis(inputs, [0, 1], [1, 0])

        # inputs have shape (lookback_steps, batch_size, num_inputs)
        lookback_steps, bs, _ = inputs.shape

        if initial_state is None:
            h_state = np.zeros((bs, self.units))
            c_state = np.zeros((bs, self.units))
        else:
            assert len(initial_state) == 2
            h_state, c_state = initial_state

        h_states = []
        c_states = []

        for step in range(lookback_steps):

            h_state, c_state = self.cell(inputs[step], h_state, c_state)

            h_states.append(h_state)
            c_states.append(c_state)

        # stacked along a new leading time axis
        h_states = np.stack(h_states)
        c_states = np.stack(c_states)

        if not self.time_major:
            # back to (batch_size, lookback_steps, units)
            h_states = np.moveaxis(h_states, [0, 1], [1, 0])
            c_states = np.moveaxis(c_states, [0, 1], [1, 0])

        if self.return_sequences:
            o = h_states
        else:
            # `h_state` is the hidden state after the last step. Taking it
            # directly is correct for both layouts, whereas indexing
            # `h_states[:, -1]` is only valid when the time axis is axis 1.
            o = h_state

        if self.return_states:
            return o, c_states
        return o

    def cell(self, xt, ht, ct):
        """Apply one LSTM step.

        Parameters
        ----------
        xt : input at the current step, shape (batch_size, num_inputs)
        ht : previous hidden state, shape (batch_size, units)
        ct : previous cell state, shape (batch_size, units)

        Returns
        -------
        Tuple of the new hidden state and the new cell state.
        """
        u = self.units

        # input gate: first block of columns
        k_i = self.kernel[:, :u]
        rk_i = self.rec_kernel[:, :u]
        b_i = self.bias[:u]
        i_t = self.sigmoid(np.dot(xt, k_i) + np.dot(ht, rk_i) + b_i)

        # forget gate: second block of columns
        k_f = self.kernel[:, u:u * 2]
        rk_f = self.rec_kernel[:, u:u * 2]
        b_f = self.bias[u:u * 2]
        f_t = self.sigmoid(np.dot(xt, k_f) + np.dot(ht, rk_f) + b_f)

        # candidate cell state: third block of columns
        k_c = self.kernel[:, u * 2:u * 3]
        rk_c = self.rec_kernel[:, u * 2:u * 3]
        b_c = self.bias[u * 2:u * 3]
        c_candidate = self.tanh(np.dot(xt, k_c) + np.dot(ht, rk_c) + b_c)

        # new cell state: keep part of the old state, add part of the candidate
        ct = f_t * ct + i_t * c_candidate

        # output gate: last block of columns (uses the previous hidden state)
        k_o = self.kernel[:, u * 3:]
        rk_o = self.rec_kernel[:, u * 3:]
        b_o = self.bias[u * 3:]
        o_t = self.sigmoid(np.dot(xt, k_o) + np.dot(ht, rk_o) + b_o)

        # new hidden state
        ht = o_t * self.tanh(ct)

        return ht, ct

    @staticmethod
    def tanh(x):
        """Element-wise hyperbolic tangent."""
        return np.tanh(x)

    @staticmethod
    def sigmoid(x):
        """Element-wise logistic function 1 / (1 + exp(-x))."""
        return 1. / (1 + np.exp(-x))
# Default arguments: batch-major input, only the last hidden state is returned.
nplstm = LSTMNP(lstm_units)
lstm_out_np = nplstm(xx)

We can verify that the results of the numpy implementation and of the TensorFlow implementation are the same, up to floating-point tolerance.

# np.allclose compares element-wise within a small tolerance, not bit-for-bit
print(np.allclose(lstm_out_tf, lstm_out_np))
True

Total running time of the script: (0 minutes 0.318 seconds)

Gallery generated by Sphinx-Gallery