Understanding input/output of LSTM

The purpose of this notebook is to determine the input and output shapes of LSTM in keras/tensorflow. It also shows how the output changes when we use different options such as the return_sequences and return_state arguments of LSTM/RNN layers in tensorflow/keras.

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MaxPooling1D, Flatten, Conv1D
from tensorflow.keras.layers import Input, LSTM, Reshape, TimeDistributed

# to suppress scientific notation while printing arrays
np.set_printoptions(suppress=True)

def reset_graph(seed=313):
    """Reset TensorFlow's default graph and re-seed NumPy and TensorFlow.

    Args:
        seed: value handed to both ``np.random.seed`` and TensorFlow's
            ``set_random_seed``.
    """
    # NumPy's generator is independent of TensorFlow, so seed it first.
    np.random.seed(seed)

    # Same TensorFlow calls as before, in the same order: reset, then seed.
    tf_v1 = tf.compat.v1
    tf_v1.reset_default_graph()
    tf_v1.set_random_seed(seed)

tf.__version__
'2.7.0'
seq_len = 9
in_features = 3
batch_size = 2
units = 5

# define input data
# Draw the data from a dedicated, seeded generator. This code runs before the
# first reset_graph() call, so NumPy's global generator has not been seeded
# yet and np.random.normal would produce different data on every run.
data_rng = np.random.default_rng(313)
data = data_rng.normal(0, 1, size=(batch_size, seq_len, in_features))
print('input shape is', data.shape)
input shape is (2, 9, 3)
reset_graph()

Input to LSTM

# The input to LSTM is 3D where each dimension is expected to have the following meaning
# (batch_size, sequence_length, num_inputs)
# the batch_size determines the number of samples, sequence_length determines the length
# of historical/temporal data used by LSTM and num_inputs is the number of input features

# define model
# the shape given to Input leaves out the batch dimension, which is why
# model.inputs reports it as None
inputs1 = Input(shape=(seq_len, in_features))
lstm1 = LSTM(units)(inputs1)
model = Model(inputs=inputs1, outputs=lstm1)
model.inputs
[<KerasTensor: shape=(None, 9, 3) dtype=float32 (created by layer 'input_2')>]

Output from LSTM

# In Keras, the output from LSTM is 2D by default and each dimension has the following meaning
# (batch_size, units)
# the units here represents the number of units/neurons of the LSTM layer.

# check output
output = model.predict(data)
print('output shape is ', output.shape)
print(output)
output shape is  (2, 5)
[[-0.04311746 -0.04708175  0.11244525  0.09445497  0.08160033]
 [ 0.22174549  0.23136306 -0.1471001   0.04506844 -0.0963508 ]]

Return Sequence

If we use return_sequences=True, we get the hidden state, which is also the output, at every time step instead of just the single final output.

reset_graph()

print('input shape is', data.shape)

# define model
inputs1 = Input(shape=(seq_len, in_features))
# return_sequences=True makes the layer emit its output at every time step,
# so the result keeps the sequence axis
lstm1 = LSTM(units, return_sequences=True)(inputs1)
model = Model(inputs=inputs1, outputs=lstm1)

# check output
output = model.predict(data)
print('output shape is ', output.shape)
print(output)
input shape is (2, 9, 3)
output shape is  (2, 9, 5)
[[[ 0.23949696  0.23758332  0.0201166  -0.07562752  0.14458913]
  [ 0.20123877  0.19533847  0.04180209 -0.12905313  0.20505369]
  [ 0.06623977  0.09107485  0.02961113 -0.06149743  0.07921001]
  [ 0.103291    0.14202026 -0.10353918 -0.13593747 -0.01541394]
  [ 0.11871371  0.11363701  0.01490535 -0.01338429  0.09110813]
  [ 0.18314067  0.17522626  0.04663869 -0.05388878  0.18176244]
  [ 0.31485227  0.24940978  0.0693886  -0.03106552  0.25046384]
  [ 0.17771643  0.09009738  0.16493434  0.06166327  0.21880664]
  [-0.04311746 -0.04708175  0.11244525  0.09445497  0.08160033]]

 [[ 0.0236822   0.057854    0.05342087 -0.10365748  0.14504817]
  [-0.03983979  0.04184275  0.13498983  0.14183497  0.11871135]
  [-0.08096419  0.02722256  0.16430669  0.19353093  0.18122804]
  [-0.10457274 -0.09090691  0.05876469  0.26642254 -0.02051181]
  [ 0.07231079  0.07811436  0.06489968  0.07280337  0.08751098]
  [-0.02732764  0.00174761  0.04222624 -0.02587408  0.02410888]
  [ 0.02454332  0.01909897 -0.09221498 -0.07524213 -0.09897806]
  [ 0.22740148  0.31498346 -0.19642149 -0.16686526 -0.2563934 ]
  [ 0.22174549  0.23136306 -0.1471001   0.04506844 -0.0963508 ]]]

Return States

If we use return_state=True, the layer returns the final hidden state/output plus the final cell state as well.

reset_graph()

# define model
inputs1 = Input(shape=(seq_len, in_features))
# with return_state=True the layer call returns three tensors:
# the output, the final hidden state and the final cell state
lstm1, state_h, state_c = LSTM(units, return_state=True)(inputs1)
model = Model(inputs=inputs1, outputs=[lstm1, state_h, state_c])

# check output
# _h is the layer output and h is the returned hidden state
_h, h, c = model.predict(data)
print('_h: shape {} values \n {}\n'.format(_h.shape, _h))
print('h: shape {} values \n {}\n'.format(h.shape, h))
print('c: shape {} values \n {}'.format(c.shape, c))
_h: shape (2, 5) values
 [[-0.04311746 -0.04708175  0.11244525  0.09445497  0.08160033]
 [ 0.22174549  0.23136306 -0.1471001   0.04506844 -0.0963508 ]]

h: shape (2, 5) values
 [[-0.04311746 -0.04708175  0.11244525  0.09445497  0.08160033]
 [ 0.22174549  0.23136306 -0.1471001   0.04506844 -0.0963508 ]]

c: shape (2, 5) values
 [[-0.0884207  -0.10446949  0.1710459   0.17895043  0.24443825]
 [ 0.3913621   0.40256596 -0.38461903  0.08493438 -0.22778362]]

Using both at the same time

We can use both return_sequences and return_state at the same time as well.

reset_graph()

# define model
inputs1 = Input(shape=(seq_len, in_features))
# both options together: the first returned tensor is the output at every
# time step, the other two are the final hidden state and final cell state
lstm1, state_h, state_c = LSTM(units, return_state=True, return_sequences=True)(inputs1)
model = Model(inputs=inputs1, outputs=[lstm1, state_h, state_c])

# check output
# _h is 3D (it keeps the sequence axis) while h and c stay 2D
_h, h, c = model.predict(data)
print('_h: shape {} values \n {}\n'.format(_h.shape, _h))
print('h: shape {} values \n {}\n'.format(h.shape, h))
print('c: shape {} values \n {}'.format(c.shape, c))
_h: shape (2, 9, 5) values
 [[[ 0.23949696  0.23758332  0.0201166  -0.07562752  0.14458913]
  [ 0.20123877  0.19533847  0.04180209 -0.12905313  0.20505369]
  [ 0.06623977  0.09107485  0.02961113 -0.06149743  0.07921001]
  [ 0.103291    0.14202026 -0.10353918 -0.13593747 -0.01541394]
  [ 0.11871371  0.11363701  0.01490535 -0.01338429  0.09110813]
  [ 0.18314067  0.17522626  0.04663869 -0.05388878  0.18176244]
  [ 0.31485227  0.24940978  0.0693886  -0.03106552  0.25046384]
  [ 0.17771643  0.09009738  0.16493434  0.06166327  0.21880664]
  [-0.04311746 -0.04708175  0.11244525  0.09445497  0.08160033]]

 [[ 0.0236822   0.057854    0.05342087 -0.10365748  0.14504817]
  [-0.03983979  0.04184275  0.13498983  0.14183497  0.11871135]
  [-0.08096419  0.02722256  0.16430669  0.19353093  0.18122804]
  [-0.10457274 -0.09090691  0.05876469  0.26642254 -0.02051181]
  [ 0.07231079  0.07811436  0.06489968  0.07280337  0.08751098]
  [-0.02732764  0.00174761  0.04222624 -0.02587408  0.02410888]
  [ 0.02454332  0.01909897 -0.09221498 -0.07524213 -0.09897806]
  [ 0.22740148  0.31498346 -0.19642149 -0.16686526 -0.2563934 ]
  [ 0.22174549  0.23136306 -0.1471001   0.04506844 -0.0963508 ]]]

h: shape (2, 5) values
 [[-0.04311746 -0.04708175  0.11244525  0.09445497  0.08160033]
 [ 0.22174549  0.23136306 -0.1471001   0.04506844 -0.0963508 ]]

c: shape (2, 5) values
 [[-0.0884207  -0.10446949  0.1710459   0.17895043  0.24443825]
 [ 0.3913621   0.40256596 -0.38461903  0.08493438 -0.22778362]]

time major

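By time_major we mean that the first dimension of the input represents time and the second dimension represents the batch. With time_major=True, Keras interprets the 3D input to the LSTM as (sequence_length, batch_size, num_inputs) instead of the default (batch_size, sequence_length, num_inputs). Note that the example below feeds an array of shape (2, 3, 9), i.e. (batch_size, num_inputs, sequence_length); the layer therefore reads it as 2 time steps, a batch of 3 samples and 9 input features, which is why the output has shape (3, 5) rather than (2, 5).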

reset_graph()

# define model
# NOTE(review): per the Keras documentation, time_major=True means the inputs
# are laid out as (timesteps, batch, features). The shape below was written
# for a (batch, features, timesteps) layout -- confirm the intent.
inputs1 = Input(shape=(in_features, seq_len))
lstm1 = LSTM(units, time_major=True)(inputs1)
model = Model(inputs=inputs1, outputs=[lstm1])
model.inputs
[<KerasTensor: shape=(None, 3, 9) dtype=float32 (created by layer 'input_6')>]
# swap the last two axes of the numpy array so that it matches the model input:
# (batch_size, seq_len, in_features) -> (batch_size, in_features, seq_len)
# NOTE(review): this does not move time to the first axis, which is the layout
# time_major=True expects according to the Keras documentation -- confirm the intent
# check output
time_major_data = np.moveaxis(data, [1,2], [2,1])
time_major_data.shape
(2, 3, 9)
h = model.predict(time_major_data)
print('h: shape {} values \n {}\n'.format(h.shape, h))
h: shape (3, 5) values
 [[ 0.0856159   0.06631077 -0.43855685  0.1004677  -0.40924817]
 [ 0.02948599  0.02146549  0.01565967 -0.10389965  0.27761555]
 [ 0.09459803  0.14054263  0.1562092  -0.11277693 -0.12558709]]

CNN -> LSTM

We can combine LSTM with any other layer. The only requirement is that the output from that layer should match the input requirement of LSTM, i.e. the output from the layer that we want to add before LSTM should be 3D with shape (batch_size, sequence_length, num_inputs).

reset_graph()

# define model
inputs = Input(shape=(seq_len, in_features))
# with padding="same" the convolution keeps all 9 time steps;
# filters=2 sets the size of the last axis
cnn = Conv1D(filters=2, kernel_size=2, padding="same")(inputs)
# pooling shortens the sequence from 9 to 5 steps
max_pool = MaxPooling1D(padding="same")(cnn)
max_pool
<KerasTensor: shape=(None, 5, 2) dtype=float32 (created by layer 'max_pooling1d')>

As the shape of the max_pool tensor matches the input requirement of LSTM, we can combine it with LSTM.

# the LSTM reads the (None, 5, 2) tensor as 5 time steps with 2 features each
h = LSTM(units)(max_pool)
model = Model(inputs=inputs, outputs=h)
model.summary()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 input_7 (InputLayer)        [(None, 9, 3)]            0

 conv1d (Conv1D)             (None, 9, 2)              14

 max_pooling1d (MaxPooling1D  (None, 5, 2)             0
 )

 lstm (LSTM)                 (None, 5)                 160

=================================================================
Total params: 174
Trainable params: 174
Non-trainable params: 0
_________________________________________________________________

However, this is not how a CNN is usually combined with an LSTM when the CNN is placed before it. The purpose is usually to break the sequence into small sub-sequences and then apply the same CNN to each of those sub-sequences. We can achieve this as follows.

sub_sequences = 3

reset_graph()
# define model
inputs = Input(shape=(seq_len, in_features))
# length of each sub-sequence: 9 // 3 = 3 steps
time_steps = seq_len // sub_sequences
# split the sequence axis: (seq_len, in_features) -> (sub_sequences, time_steps, in_features)
reshape = Reshape(target_shape=(sub_sequences, time_steps, in_features))(inputs)
# TimeDistributed applies the wrapped layer to each sub-sequence
cnn = TimeDistributed(Conv1D(filters=2, kernel_size=2, padding="same"))(reshape)
max_pool = TimeDistributed(MaxPooling1D(padding="same"))(cnn)
# flatten every sub-sequence so that the overall tensor is 3D again
flatten = TimeDistributed(Flatten())(max_pool)
flatten
<KerasTensor: shape=(None, 3, 4) dtype=float32 (created by layer 'time_distributed_2')>

The shape of the flatten tensor again matches the input requirements of LSTM, so we can again attach an LSTM after it.

# the LSTM reads the (None, 3, 4) tensor as 3 time steps (one per sub-sequence)
# with 4 features each
h = LSTM(units)(flatten)
model = Model(inputs=inputs, outputs=h)
model.summary()
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 input_8 (InputLayer)        [(None, 9, 3)]            0

 reshape (Reshape)           (None, 3, 3, 3)           0

 time_distributed (TimeDistr  (None, 3, 3, 2)          14
 ibuted)

 time_distributed_1 (TimeDis  (None, 3, 2, 2)          0
 tributed)

 time_distributed_2 (TimeDis  (None, 3, 4)             0
 tributed)

 lstm (LSTM)                 (None, 5)                 200

=================================================================
Total params: 214
Trainable params: 214
Non-trainable params: 0
_________________________________________________________________

LSTM -> 1D CNN

We can put a 1D CNN after the LSTM to further extract features from the LSTM output.

reset_graph()

print('input shape is', data.shape)

# define model
inputs = Input(shape=(seq_len, in_features))
# keep a handle on the layer object itself so its weights can be inspected
# after the model has been built
lstm_layer = LSTM(units, return_sequences=True)
# return_sequences=True keeps the time axis, giving Conv1D a 3D input
lstm_outputs = lstm_layer(inputs)
print('lstm output: ', lstm_outputs.shape)

# no input_shape argument: the layer is called directly on a tensor, so its
# input shape is taken from lstm_outputs and the argument would be redundant
conv1 = Conv1D(filters=64, kernel_size=2, activation='relu')(lstm_outputs)
print('conv output: ', conv1.shape)

max1d1 = MaxPooling1D(pool_size=2)(conv1)
print('max pool output: ', max1d1.shape)

flat1 = Flatten()(max1d1)
print('flatten output: ', flat1.shape)

model = Model(inputs=inputs, outputs=flat1)

# check output
output = model.predict(data)
print('output shape: ', output.shape)
input shape is (2, 9, 3)
lstm output:  (None, 9, 5)
conv output:  (None, 8, 64)
max pool output:  (None, 4, 64)
flatten output:  (None, 256)
output shape:  (2, 256)

The output from LSTM/RNN looks roughly as below. $$ h_t = \tanh(b + W h_{t-1} + U X_t) $$

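Weights of our input against every neuron in LSTM. The second dimension is 4 × units = 20 because an LSTM keeps a separate set of weights for each of its four internal transformations (input gate, forget gate, cell candidate and output gate).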

print('kernel U: ', lstm_layer.get_weights()[0].shape)
kernel U:  (3, 20)

Weights of our hidden state, a.k.a. the output of LSTM at the previous time step (t-1), against every neuron in LSTM

print('recurrent kernel, W: ', lstm_layer.get_weights()[1].shape)
recurrent kernel, W:  (5, 20)
print('bias: ', lstm_layer.get_weights()[2].shape)
bias:  (20,)

This post is inspired by Jason Brownlee’s [page](https://machinelearningmastery.com/return-sequences-and-return-states-for-lstms-in-keras/)

Total running time of the script: (0 minutes 2.115 seconds)

Gallery generated by Sphinx-Gallery