part 1- Building your Recurrent Neural Network - Step by Step

这是这一周编程作业的第一部分，让我们从无到有构建了 RNN 和 LSTM 的前向和反向传播函数，但是作业中给的公式出现很多问题，下面都是更正后的公式，还有一些符号的提法比较模糊，下面也做了说明。

首先引入需要的包。

1 2	import numpy as np from rnn_utils import *

RNN

单个 RNN 前向传播

def rnn_cell_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN-cell as described in Figure (2)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, parameters)
    """
    
    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    # compute next activation state using the formula given above
    a_next = np.tanh(np.dot(Wax,xt) + np.dot(Waa,a_prev) + ba)
    # compute output of the current cell using the formula given above
    yt_pred = softmax(np.dot(Wya,a_prev) + by)   
 
    # store values you need for backward propagation in cache
    cache = (a_next, a_prev, xt, parameters)
    
    return a_next, yt_pred, cache

总的 RNN 前向传播

def rnn_forward(x, a0, parameters):
    """
    Implement the forward propagation of the recurrent neural network described in Figure (3).

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba --  Bias numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of caches, x)
    """
    
    # Initialize "caches" which will contain the list of all caches
    caches = []
    
    # Retrieve dimensions from shapes of x and parameters["Wya"]
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape
    
    # initialize "a" and "y" with zeros
    a = np.zeros((n_a,m,T_x)) # 初始化一个存放所有激活值的容器
    y_pred = np.zeros((n_y,m,T_x)) # 初始化一个存放所有y预测值的容器
    
    # Initialize a_next 
    a_next = a0 # 将 a_next 初始化为 a<0> 
    
    # loop over all time-steps
    for t in range(T_x):
        # Update next hidden state, compute the prediction, get the cache (≈1 line)
        a_next, yt_pred, cache = rnn_cell_forward(x[:,:,t], a_next, parameters)
        # Save the value of the new "next" hidden state in a (≈1 line)
        a[:,:,t] = a_next
        # Save the value of the prediction in y (≈1 line)
        y_pred[:,:,t] = yt_pred
        # Append "cache" to "caches" (≈1 line)
        caches.append(cache)

    # store values needed for backward propagation in cache
    caches = (caches, x)
    
    return a, y_pred, caches

单个 RNN 单元反向传播

def rnn_cell_backward(da_next, cache):
    """
    Implements the backward pass for the RNN-cell (single time-step).

    Arguments:
    da_next -- Gradient of loss with respect to next hidden state
    cache -- python dictionary containing useful values (output of rnn_cell_forward())

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradients of input data, of shape (n_x, m)
                        da_prev -- Gradients of previous hidden state, of shape (n_a, m)
                        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                        dba -- Gradients of bias vector, of shape (n_a, 1)
    """
    
    # Retrieve values from cache
    (a_next, a_prev, xt, parameters) = cache
    
    # Retrieve values from parameters
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    # compute the gradient of tanh with respect to a_next 
    dtanh = da_next * (1 - a_next**2) # 指的是代价函数对 (Wax·x^<t>+Waa·a^<t-1>+ba) 的导数

    # compute the gradient of the loss with respect to Wax
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)

    # compute the gradient with respect to Waa 
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # compute the gradient with respect to b 
    dba = np.sum(dtanh, axis = 1, keepdims = True)

    # Store the gradients in a python dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}
    
    return gradients

总的 RNN 反向传播

def rnn_backward(da, caches):
    """
    Implement the backward pass for a RNN over an entire sequence of input data.

    Arguments:
    da -- Upstream gradients of all hidden states, of shape (n_a, m, T_x)
    caches -- tuple containing information from the forward pass (rnn_forward)

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradient w.r.t. the input data, numpy-array of shape (n_x, m, T_x)
                        da0 -- Gradient w.r.t the initial hidden state, numpy-array of shape (n_a, m)
                        dWax -- Gradient w.r.t the input's weight matrix, numpy-array of shape (n_a, n_x)
                        dWaa -- Gradient w.r.t the hidden state's weight matrix, numpy-arrayof shape (n_a, n_a)
                        dba -- Gradient w.r.t the bias, of shape (n_a, 1)
    """

    # Retrieve values from the first cache (t=1) of caches 
    (caches, x) = caches
    (a1, a0, x1, parameters) = caches[0]

    # Retrieve dimensions from da's and x1's shapes 
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # initialize the gradients with the right sizes 
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))

    # Loop through all the time steps
    for t in reversed(range(T_x)):
        # Compute gradients at time step t. Choose wisely the "da_next" and the "cache" to use in the backward propagation step. (≈1 line)
        # 这里注意：a^<t> 在这里分两条路影响代价函数 J，所以 da = da[:, :, t] + da_prevt
        gradients = rnn_cell_backward(da[:, :, t] + da_prevt, caches[t])
        # Retrieve derivatives from gradients (≈ 1 line)
        dxt, da_prevt, dWaxt, dWaat, dbat = gradients["dxt"], gradients["da_prev"], gradients["dWax"], gradients["dWaa"], gradients["dba"]
        # Increment global derivatives w.r.t parameters by adding their derivative at time-step t (≈4 lines)
        dx[:, :, t] = dxt
        # 注意：前向传播中所有 t 的 Wax、Waa、ba 都是相等的，但是每个 t 时间的 dWaxt、dWaa、dba 都是不相等的，
        #      它们都影响了代价函数 J，所以需要把每一个时间 t 求到的 dWaxt、dWaat、dbat 累加起来
        dWax += dWaxt
        dWaa += dWaat
        dba += dbat

    # Set da0 to the gradient of a which has been backpropagated through all time-steps (≈1 line) 
    da0 = da_prevt
  
    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa,"dba": dba}

    return gradients

LSTM

单个 LSTM 单元的前向传播

def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Implement a single forward step of the LSTM-cell as described in Figure (4)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    c_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
                        Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        bi -- Bias of the update gate, numpy array of shape (n_a, 1)
                        Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
                        bc --  Bias of the first "tanh", numpy array of shape (n_a, 1)
                        Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        bo --  Bias of the output gate, numpy array of shape (n_a, 1)
                        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
                        
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    c_next -- next memory state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, c_next, a_prev, c_prev, xt, parameters)
    
    Note: ft/it/ot stand for the forget/update/output gates, cct stands for the candidate value (c tilde),
          c stands for the memory value
    """

    # Retrieve parameters from "parameters"
    Wf = parameters["Wf"]
    bf = parameters["bf"]
    Wi = parameters["Wi"]
    bi = parameters["bi"]
    Wc = parameters["Wc"]
    bc = parameters["bc"]
    Wo = parameters["Wo"]
    bo = parameters["bo"]
    Wy = parameters["Wy"]
    by = parameters["by"]
    
    # Retrieve dimensions from shapes of xt and Wy
    n_x, m = xt.shape
    n_y, n_a = Wy.shape

    # Concatenate a_prev and xt 将 a_prev 和 xt 拼在一起
    concat = np.zeros((n_a+n_x, m))
    concat[: n_a, :] = a_prev
    concat[n_a :, :] = xt

    # Compute values for ft, it, cct, c_next, ot, a_next using the formulas given figure (4)
    ft = sigmoid(np.dot(Wf, concat) + bf)
    it = sigmoid(np.dot(Wi, concat) + bi)
    cct = np.tanh(np.dot(Wc, concat) + bc)
    c_next = ft * c_prev + it * cct
    ot = sigmoid(np.dot(Wo, concat) + bo)
    a_next = ot * np.tanh(c_next)
    
    # Compute prediction of the LSTM cell
    yt_pred = softmax(np.dot(Wy,a_next) + by)

    # store values needed for backward propagation in cache
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache

总的 LSTM 前向传播

def lstm_forward(x, a0, parameters):
    """
    Implement the forward propagation of the recurrent neural network using an LSTM-cell described in Figure (3).

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
                        Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        bi -- Bias of the update gate, numpy array of shape (n_a, 1)
                        Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
                        bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
                        Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        bo -- Bias of the output gate, numpy array of shape (n_a, 1)
                        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
                        
    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of all the caches, x)
    """

    # Initialize "caches", which will track the list of all the caches
    caches = []

    # Retrieve dimensions from shapes of x and parameters['Wy'] 
    n_x, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape
    
    # initialize "a", "c" and "y" with zeros 
    a = np.zeros((n_a,m,T_x)) # 存放 a 的容器
    c = np.zeros((n_a,m,T_x)) # 存放 c 的容器
    y = np.zeros((n_y,m,T_x)) # 存放 y 的容器
    
    # Initialize a_next and c_next
    a_next = a0
    c_next = np.zeros((n_a,m))
    
    # loop over all time-steps
    for t in range(T_x):
        # Update next hidden state, next memory state, compute the prediction, get the cache 
        a_next, c_next, yt, cache = lstm_cell_forward(x[:,:,t], a_next, c_next, parameters)
        # Save the value of the new "next" hidden state in a 
        a[:,:,t] = a_next
        # Save the value of the prediction in y 
        y[:,:,t] = yt
        # Save the value of the next cell state 
        c[:,:,t]  = c_next
        # Append the cache into caches
        caches.append(cache)
 
    # store values needed for backward propagation in cache
    caches = (caches, x)

    return a, y, c, caches

单个 LSTM 单元的反向传播

由于这部分给的公式发生了错误，所以结合论坛里的讨论自己进行了公式的推导和更正，可以在这篇笔记里找到。

$\begin{split}原公式中的“d \Gamma_o^{\langle t \rangle}” &= dZ_o^{<t>}=\frac{\partial J}{\partial Z_o^{<t>}}=\frac{\partial J}{\partial(W_o[a^{<t-1>},x^{<t>}]+b_o)} \\ &= da_{next}*\tanh(c_{next}) * \Gamma_o^{\langle t \rangle}*\left(1-\Gamma_o^{\langle t \rangle}\right) \end{split}\tag{7}$ $\begin{split}原公式中的“d\widetilde{c}^{\langle t \rangle}”&=dZ_c^{<t>}=\frac{\partial J}{\partial Z_c^{<t>}}=\frac{\partial J}{\partial(W_c[a^{<t-1>},x^{<t>}]+b_c)} \\ &= \left(dc_{next}+ \Gamma_o^{\langle t \rangle}* (1-\tanh(c_{next})^2) * da_{next} \right) \Gamma_u^{\langle t \rangle} * \left(1-\left(\widetilde c^{\langle t \rangle}\right)^2\right) \end{split}\tag{8}$ $\begin{split}原公式中的“d\Gamma_u^{\langle t \rangle}”&=dZ_u^{<t>}=\frac{\partial J}{\partial Z_u^{<t>}}=\frac{\partial J}{\partial(W_u[a^{<t-1>},x^{<t>}]+b_u)} \\ &= \left(dc_{next}+ \Gamma_o^{\langle t \rangle}* (1-\tanh(c_{next})^2) * da_{next}\right)*\Gamma_u^{\langle t \rangle}* \widetilde{c}^{\langle t \rangle} *\left(1-\Gamma_u^{\langle t \rangle}\right)\end{split}\tag{9}$ $\begin{split}原公式中的“d\Gamma_f^{\langle t \rangle}”&=dZ_f^{<t>}=\frac{\partial J}{\partial Z_f^{<t>}}=\frac{\partial J}{\partial(W_f[a^{<t-1>},x^{<t>}]+b_f)} \\ &= \left(dc_{next}+ \Gamma_o^{\langle t \rangle} * (1-\tanh(c_{next})^2) * da_{next}\right)*\Gamma_f^{\langle t \rangle}* c_{prev}*\left(1-\Gamma_f^{\langle t \rangle}\right)\end{split}\tag{10}$ $dW_f = d\Gamma_f^{\langle t \rangle} \begin{bmatrix} a_{prev} \\ x_t\end{bmatrix}^T \tag{11}$ $dW_u = d\Gamma_u^{\langle t \rangle} \begin{bmatrix} a_{prev} \\ x_t\end{bmatrix}^T \tag{12}$ $dW_c = d\widetilde c^{\langle t \rangle} \begin{bmatrix} a_{prev} \\ x_t\end{bmatrix}^T \tag{13}$ $dW_o = d\Gamma_o^{\langle t \rangle} \begin{bmatrix} a_{prev} \\ x_t\end{bmatrix}^T \tag{14}$ $db_f=np.sum(d\Gamma_f^{\langle t \rangle},axis=1,keepdims=True)\tag{15}$ $db_u=np.sum(d\Gamma_u^{\langle t \rangle},axis=1,keepdims=True)\tag{16}$ $db_c=np.sum(d\Gamma_c^{\langle t \rangle},axis=1,keepdims=True)\tag{17}$ $db_o=np.sum(d\Gamma_o^{\langle t \rangle},axis=1,keepdims=True)\tag{18}$ $da_{prev} = W_f^T[:,:n_a] d\Gamma_f^{\langle t \rangle} + W_u^T[:,:n_a] d\Gamma_u^{\langle t \rangle}+ W_c^T[:,:n_a] d\widetilde c^{\langle t \rangle} + W_o^T[:,:n_a] d\Gamma_o^{\langle t \rangle} \tag{19}$ $dc_{prev} = dc_{next}*\Gamma_f^{\langle t \rangle} + \Gamma_o^{\langle t \rangle} * (1- \tanh(c_{next})^2)*\Gamma_f^{\langle t \rangle}*da_{next} \tag{20}$ $dx^{\langle t \rangle} = W_f^T[:,n_a:] d\Gamma_f^{\langle t \rangle} + W_u^T[:,n_a:] d\Gamma_u^{\langle t \rangle}+ W_c^T[:,n_a:] d\widetilde c^{\langle t \rangle} + W_o^T[:,n_a:] d\Gamma_o^{\langle t \rangle}\tag{21}$

def lstm_cell_backward(da_next, dc_next, cache):
    """
    Implement the backward pass for the LSTM-cell (single time-step).

    Arguments:
    da_next -- Gradients of next hidden state, of shape (n_a, m)
    dc_next -- Gradients of next cell state, of shape (n_a, m)
    cache -- cache storing information from the forward pass

    Returns:
    gradients -- python dictionary containing:
                        dxt -- Gradient of input data at time-step t, of shape (n_x, m)
                        da_prev -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
                        dc_prev -- Gradient w.r.t. the previous memory state, of shape (n_a, m, T_x)
                        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
                        dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
                        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
                        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
                        dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """

    # Retrieve information from "cache"
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

    # Retrieve dimensions from xt's and a_next's shape 
    n_x, m = xt.shape
    n_a, m = a_next.shape
    
    # Compute gates related derivatives, you can find their values can be found by looking carefully at equations (7) to (10) 
    dot = da_next * np.tanh(c_next)*ot*(1-ot) # 也就是 dJ / d(W_o[a^{<t-1>},x^{<t>}]+b_o),下同
    dcct = (dc_next + da_next*ot*(1-np.tanh(c_next)**2))*it*(1-cct**2)
    dit = (dc_next + da_next*ot*(1-np.tanh(c_next)**2))*cct*it*(1-it) # it 就是更新门 updategate
    dft = (dc_next + da_next*ot*(1-np.tanh(c_next)**2))*c_prev*ft*(1-ft)

    # Compute parameters related derivatives. Use equations (11)-(14) 
    dWf = np.dot(dft, np.vstack((a_prev,xt)).T)
    dWi = np.dot(dit, np.vstack((a_prev,xt)).T)
    dWc = np.dot(dcct,np.vstack((a_prev,xt)).T)
    dWo = np.dot(dot,np.vstack((a_prev,xt)).T)
    dbf = np.sum(dft, axis = 1, keepdims = True)
    dbi = np.sum(dit, axis = 1, keepdims = True)
    dbc = np.sum(dcct, axis = 1, keepdims = True)
    dbo = np.sum(dot, axis = 1, keepdims = True)

    # Compute derivatives w.r.t previous hidden state, previous memory state and input. Use equations (15)-(17). 
    da_prev = np.dot(parameters['Wf'][:,:n_a].T,dft) + np.dot(parameters['Wi'][:,:n_a].T,dit) + np.dot(parameters['Wo'][:,:n_a].T,dot) + np.dot(parameters['Wc'][:,:n_a].T,dcct) 
    dc_prev = da_next*ot*(1-(np.tanh(c_next))**2)
    dxt = np.dot(parameters['Wf'][:,n_a:].T,dft) + np.dot(parameters['Wi'][:,n_a:].T,dit) + np.dot(parameters['Wo'][:,n_a:].T,dot) + np.dot(parameters['Wc'][:,n_a:].T,dcct)
 
    # Save gradients in dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf,"dbf": dbf, "dWi": dWi,"dbi": dbi,
                "dWc": dWc,"dbc": dbc, "dWo": dWo,"dbo": dbo}

    return gradients

总的 LSTM 反向传播

def lstm_backward(da, caches):
    
    """
    Implement the backward pass for the RNN with LSTM-cell (over a whole sequence).

    Arguments:
    da -- Gradients w.r.t the hidden states, numpy-array of shape (n_a, m, T_x)
    dc -- Gradients w.r.t the memory states, numpy-array of shape (n_a, m, T_x)
    caches -- cache storing information from the forward pass (lstm_forward)

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradient of inputs, of shape (n_x, m, T_x)
                        da0 -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
                        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
                        dWo -- Gradient w.r.t. the weight matrix of the save gate, numpy array of shape (n_a, n_a + n_x)
                        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
                        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
                        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
                        dbo -- Gradient w.r.t. biases of the save gate, of shape (n_a, 1)
    """

    # Retrieve values from the first cache (t=1) of caches.
    (caches, x) = caches
    (a1, c1, a0, c0, f1, i1, cc1, o1, x1, parameters) = caches[0]

    # Retrieve dimensions from da's and x1's shapes 
    n_a, m, T_x = da.shape
    n_x, m = x1.shape
    
    # initialize the gradients with the right sizes
    dx = np.zeros((n_x, m, T_x))
    da0 = np.zeros((n_a,m))
    da_prevt = np.zeros((n_a,m))
    dc_prevt = np.zeros((n_a,m))
    dWf = np.zeros((n_a, n_a+n_x)) # 每个时间点 t 的 Wf 都是相等的
    dWi = np.zeros((n_a,n_a+n_x))
    dWc = np.zeros((n_a,n_a+n_x))
    dWo = np.zeros((n_a,n_a+n_x))
    dbf = np.zeros((n_a,1))
    dbi = np.zeros((n_a,1))
    dbc = np.zeros((n_a,1))
    dbo = np.zeros((n_a,1))
    
    # loop back over the whole sequence
    for t in reversed(range(T_x)):
        # Compute all gradients using lstm_cell_backward
        gradients = lstm_cell_backward(da[:,:,t], dc_prevt, caches[t])
        # Store or add the gradient to the parameters' previous step's gradient
        dx[:,:,t] = gradients['dxt']
        # 注意：前向传播中所有 t 的 Wf、Wi、Wo、Wc、ba、bi、bo、bc 都是相等的，
        #      但是每个 t 时间的 dWf、dWi、dWo、dWc、dba、dbi、dbo、dbc 都是不相等的，
        #      它们都影响了代价函数 J，
        #      所以需要把每一个时间 t 求到的 dWf、dWi、dWo、dWc、dba、dbi、dbo、dbc 累加起来
        dWf += gradients['dWf']
        dWi += gradients['dWi']
        dWc += gradients['dWc']
        dWo += gradients['dWo']
        dbf += gradients['dbf']
        dbi += gradients['dbi']
        dbc += gradients['dbc']
        dbo += gradients['dbo']
    # Set the first activation's gradient to the backpropagated gradient da_prev.
    da0 = gradients['da_prev']
 
    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWf": dWf,"dbf": dbf, "dWi": dWi,"dbi": dbi,
                "dWc": dWc,"dbc": dbc, "dWo": dWo,"dbo": dbo}
    
    return gradients

Part 2-Character level language model - Dinosaurus land

创造一个字符级的语言模型，通过一堆恐龙名字进行训练，用来生成新的恐龙名字。

1
2
3

import numpy as np
from utils import *
import random

问题描述

数据集加载和预处理

data = open('dinos.txt', 'r').read()
data= data.lower()
chars = list(set(data))	# 去除重复元素
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

output：There are 19909 total characters and 27 unique characters in your data.

由于是字符级别的，所以一共有 26 个英文字母加上结束符 \n = 27 个字符，将这些字符一一对应存入字典中。

1
2
3

char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) } # sorted 进行排序
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)

output：{0: ‘\n’, 1: ‘a’, 2: ‘b’, 3: ‘c’, 4: ‘d’, 5: ‘e’, 6: ‘f’, 7: ‘g’, 8: ‘h’, 9: ‘i’, 10: ‘j’, 11: ‘k’, 12: ‘l’, 13: ‘m’, 14: ‘n’, 15: ‘o’, 16: ‘p’, 17: ‘q’, 18: ‘r’, 19: ‘s’, 20: ‘t’, 21: ‘u’, 22: ‘v’, 23: ‘w’, 24: ‘x’, 25: ‘y’, 26: ‘z’}

模型概况

初始化参数
优化循环
- 前向传播计算 loss
- 反向传播计算 loss 对每个参数的梯度
- 进行梯度剪枝 gradient clipping，防止梯度爆炸
- 更新参数
返回学习参数

建立模型组块

两个需要建立的：Gradient clipping 和采样 sampling

梯度剪枝 gradient clipping

反向传播得到梯度后，在更新参数之前，进行一步梯度剪枝的操作，用以确保不会发生梯度爆炸，也就是把梯度限制在某个区间内，比如 [-10,10]，大于 10 的梯度取 10，小于 -10 的梯度取 -10.

代码如下：

def clip(gradients, maxValue):
    '''
    Clips the gradients' values between minimum and maximum.
    
    Arguments:
    gradients -- a dictionary containing the gradients "dWaa", "dWax", "dWya", "db", "dby"
    maxValue -- everything above this number is set to this number, and everything less than -maxValue is set to -maxValue
    
    Returns: 
    gradients -- a dictionary with the clipped gradients.
    '''
    
    dWaa, dWax, dWya, db, dby = gradients['dWaa'], gradients['dWax'], gradients['dWya'], gradients['db'], gradients['dby']

    # clip to mitigate exploding gradients, loop over [dWax, dWaa, dWya, db, dby]. (≈2 lines)
    for gradient in [dWax, dWaa, dWya, db, dby]:
        np.minimum(gradient,maxValue,out=gradient)
        np.maximum(gradient,-maxValue,out=gradient)# 注意！！gradient 在这里只是一个临时变量，我们需要把得到的值 out 输出给它

    gradients = {"dWaa": dWaa, "dWax": dWax, "dWya": dWya, "db": db, "dby": dby}
    
    return gradients

采样 sampling

模型训练好之后，如何生成文字呢？如下图所示：

先输入一个假的输入 $x^{<1>} = 0，a^{<0>} = 0$
按照公式计算 $\hat y^{<1>}$ 和 $a^{<1>}$
进行采样：实际上 $\hat y^{<1>}$ 是 $x^{<2>}$ 的预测值，包含了取不同值的概率，我们根据不同的概率进行随机的取值，例如取索引值 3 的概率为 0.16，那么 $x^{<2>}$ 就有 16% 的概率被赋值为 3
- 为什么不直接取 $\hat y^{<1>}$ 中概率值最大的赋值给 $x^{<2>}$？论坛看到一个解释：
  
  Sampling produces variety. Say you have a list of items with probability of occurrence “car” (30%) “airplane” (70%). If you simply choose the argmax of the probability from the list then you always choose airplane which gets boring. If you alternatively “sample” from the list then 70% of the time you get airplane and 30% car. This is really important in longer sentences; if you choose the argmax you end up with the same sentence all the time (exception when multiple words have exactly same probability 50:50), but sampling gives you an interesting mix of sentences from the same list.
将采样得到的 $x^{}$ 索引值进行 one-hot 编码赋值给 x 以输入下一个循环。重复第一步直到 y 输出一个结束符 “\n”

代码如下：

def sample(parameters, char_to_ix):
    """
    Sample a sequence of characters according to a sequence of probability distributions output of the RNN

    Arguments:
    parameters -- python dictionary containing the parameters Waa, Wax, Wya, by, and b. 
    char_to_ix -- python dictionary mapping each character to an index.
 
    Returns:
    indices -- a list of length n containing the indices of the sampled characters.
    """
    
    # Retrieve parameters and relevant shapes from "parameters" dictionary
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]

    # Step 1: Create the one-hot vector x for the first character (initializing the sequence generation).
    x = np.zeros((vocab_size,1))
    # Step 1': Initialize a_prev as zeros 
    a_prev = np.zeros((n_a,1))
    
    # Create an empty list of indices, this is the list which will contain the list of indices of the characters to generate (≈1 line)
    indices = []	# 存放每一步生成的索引值
    
    # Idx is a flag to detect a newline character, we initialize it to -1
    idx = -1 
    
    # Loop over time-steps t. At each time-step, sample a character from a probability distribution and append 
    # its index to "indices". We'll stop if we reach 50 characters (which should be very unlikely with a well 
    # trained model), which helps debugging and prevents entering an infinite loop. 
    counter = 0
    newline_character = char_to_ix['\n']
    
    while (idx != newline_character and counter != 50):
        
        # Step 2: Forward propagate x using the equations (1), (2) and (3)
        a = np.tanh(np.dot(Wax,x)+np.dot(Waa,a_prev)+b)
        z = np.dot(Wya,a)+by
        y = softmax(z)
        
        # Step 3: Sample the index of a character within the vocabulary from the probability distribution y
        idx = np.random.choice(vocab_size, p=y.ravel())	# 按照概率进行随机取值

        # Append the index to "indices"
        indices.append(idx)
        
        # Step 4: Overwrite the input character as the one corresponding to the sampled index.
        x = np.zeros((vocab_size,1))
        x[idx] = 1	# one-hot 编码
        
        # Update "a_prev" to be "a"
        a_prev = a
        
        counter +=1
    if (counter == 50): # 限制生成的单词字长不超过 50，防止无限循环
        indices.append(char_to_ix['\n'])
    
    return indices

建立语言模型

梯度下降

一些辅助函数：

def smooth(loss, cur_loss):
    return loss * 0.999 + cur_loss * 0.001

def print_sample(sample_ix, ix_to_char):
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    txt = txt[0].upper() + txt[1:]  # capitalize first character 
    print ('%s' % (txt, ), end='')

def get_initial_loss(vocab_size, seq_length):
    return -np.log(1.0/vocab_size)*seq_length

def initialize_parameters(n_a, n_x, n_y):
    """
    Initialize parameters with small random values
    
    Returns:
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        b --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    """
    np.random.seed(1)
    Wax = np.random.randn(n_a, n_x)*0.01 # input to hidden
    Waa = np.random.randn(n_a, n_a)*0.01 # hidden to hidden
    Wya = np.random.randn(n_y, n_a)*0.01 # hidden to output
    b = np.zeros((n_a, 1)) # hidden bias
    by = np.zeros((n_y, 1)) # output bias
    
    parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "b": b,"by": by}
    
    return parameters

def rnn_step_forward(parameters, a_prev, x):
    
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    a_next = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + b) # hidden state
    p_t = softmax(np.dot(Wya, a_next) + by) # unnormalized log probabilities for next chars # probabilities for next chars 
    
    return a_next, p_t

def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy
    da = np.dot(parameters['Wya'].T, dy) + gradients['da_next'] # backprop into h
    daraw = (1 - a * a) * da # backprop through tanh nonlinearity
    gradients['db'] += daraw
    gradients['dWax'] += np.dot(daraw, x.T)
    gradients['dWaa'] += np.dot(daraw, a_prev.T)
    gradients['da_next'] = np.dot(parameters['Waa'].T, daraw)
    return gradients

def update_parameters(parameters, gradients, lr):

    parameters['Wax'] += -lr * gradients['dWax']
    parameters['Waa'] += -lr * gradients['dWaa']
    parameters['Wya'] += -lr * gradients['dWya']
    parameters['b']  += -lr * gradients['db']
    parameters['by']  += -lr * gradients['dby']
    return parameters

def rnn_forward(X, Y, a0, parameters, vocab_size = 27):
    
    # Initialize x, a and y_hat as empty dictionaries
    x, a, y_hat = {}, {}, {}
    
    a[-1] = np.copy(a0)
    
    # initialize your loss to 0
    loss = 0
    
    for t in range(len(X)):
        
        # Set x[t] to be the one-hot vector representation of the t'th character in X.
        # if X[t] == None, we just have x[t]=0. This is used to set the input for the first timestep to the zero vector. 
        x[t] = np.zeros((vocab_size,1)) 
        if (X[t] != None):
            x[t][X[t]] = 1
        
        # Run one step forward of the RNN
        a[t], y_hat[t] = rnn_step_forward(parameters, a[t-1], x[t])
        
        # Update the loss by substracting the cross-entropy term of this time-step from it.
        loss -= np.log(y_hat[t][Y[t],0])
        
    cache = (y_hat, a, x)
        
    return loss, cache

def rnn_backward(X, Y, parameters, cache):
    # Initialize gradients as an empty dictionary
    gradients = {}
    
    # Retrieve from cache and parameters
    (y_hat, a, x) = cache
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    
    # each one should be initialized to zeros of the same dimension as its corresponding parameter
    gradients['dWax'], gradients['dWaa'], gradients['dWya'] = np.zeros_like(Wax), np.zeros_like(Waa), np.zeros_like(Wya)
    gradients['db'], gradients['dby'] = np.zeros_like(b), np.zeros_like(by)
    gradients['da_next'] = np.zeros_like(a[0])

    # Backpropagate through time
    for t in reversed(range(len(X))):
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t-1])
    
    return gradients, a

建立模型：

def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
    """
    Execute one step of the optimization to train the model.
    
    Arguments:
    X -- list of integers, where each integer is a number that maps to a character in the vocabulary.
    Y -- list of integers, exactly the same as X but shifted one index to the left.
    a_prev -- previous hidden state.
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        b --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    learning_rate -- learning rate for the model.
    
    Returns:
    loss -- value of the loss function (cross-entropy)
    gradients -- python dictionary containing:
                        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                        dWya -- Gradients of hidden-to-output weights, of shape (n_y, n_a)
                        db -- Gradients of bias vector, of shape (n_a, 1)
                        dby -- Gradients of output bias vector, of shape (n_y, 1)
    a[len(X)-1] -- the last hidden state, of shape (n_a, 1)
    """
    # Forward propagate through time (≈1 line)
    loss, cache = rnn_forward(X, Y, a_prev, parameters)
    
    # Backpropagate through time (≈1 line)
    gradients, a = rnn_backward(X, Y, parameters, cache)
    
    # Clip your gradients between -5 (min) and 5 (max) (≈1 line)
    gradients = clip(gradients, 5)
    
    # Update parameters (≈1 line)
    parameters = update_parameters(parameters, gradients, learning_rate)

    return loss, gradients, a[len(X)-1]

训练模型

def model(data, ix_to_char, char_to_ix, num_iterations = 35000, n_a = 50, dino_names = 7, vocab_size = 27):
    """
    Trains the model and generates dinosaur names. 
    
    Arguments:
    data -- text corpus
    ix_to_char -- dictionary that maps the index to a character
    char_to_ix -- dictionary that maps a character to an index
    num_iterations -- number of iterations to train the model for
    n_a -- number of units of the RNN cell
    dino_names -- number of dinosaur names you want to sample at each iteration. 
    vocab_size -- number of unique characters found in the text, size of the vocabulary
    
    Returns:
    parameters -- learned parameters
    """
    
    # Retrieve n_x and n_y from vocab_size
    n_x, n_y = vocab_size, vocab_size
    
    # Initialize parameters
    parameters = initialize_parameters(n_a, n_x, n_y)
    
    # Initialize loss (this is required because we want to smooth our loss, don't worry about it)
    loss = get_initial_loss(vocab_size, dino_names)
    
    # Build list of all dinosaur names (training examples).
    with open("dinos.txt") as f:
        examples = f.readlines()
    examples = [x.lower().strip() for x in examples]
    
    # Shuffle list of all dinosaur names
    np.random.shuffle(examples)	# 将样本洗牌
    
    # Initialize the hidden state of your LSTM
    a_prev = np.zeros((n_a, 1))
    
    # Optimization loop
    for j in range(num_iterations):
       
        # 假设 n 个恐龙名字样本
        # 样本1（循环1）-> 样本2（循环2）->……-> 样本n（循环n）->样本1（循环n+1）-> ……->样本n（循环2n）->……
        # Use the hint above to define one training example (X,Y) (≈ 2 lines)
        index = j % len(examples)	# 目前的循环次数取余样本数
        X = [None] + [char_to_ix[ch] for ch in examples[index]] # [None] 指第一个X<0>,它生成hatY<1>，即X<n>生成hatY<n+1>
        Y = X[1:] + [char_to_ix["\n"]] # Y 的真实值，与 X 相同，只是没有第一个Y<0>，并且最后加上结束符，比X向右移动了一位
        
        # Perform one optimization step: Forward-prop -> Backward-prop -> Clip -> Update parameters
        # Choose a learning rate of 0.01
        curr_loss, gradients, a_prev = optimize(X, Y, a_prev, parameters, learning_rate = 0.01)
  
        # Use a latency trick to keep the loss smooth. It happens here to accelerate the training.
        loss = smooth(loss, curr_loss)

        # Every 2000 Iteration, generate "n" characters thanks to sample() to check if the model is learning properly
        if j % 2000 == 0:
            
            print('Iteration: %d, Loss: %f' % (j, loss) + '\n')
            
            # The number of dinosaur names to print
            for name in range(dino_names):
                
                # Sample indices and print them
                sampled_indices = sample(parameters, char_to_ix, seed)
                print_sample(sampled_indices, ix_to_char)
      
            print('\n')
        
    return parameters

1	parameters = model(data, ix_to_char, char_to_ix)

……

Iteration: 34000, Loss: 22.396744

Mavptokekus
Ilabaisaurus
Itosaurus
Macaesaurus
Yrosaurus
Eiaeosaurus
Trodon

Part 3-Improvise a Jazz Solo with an LSTM Network

这次作业是在 Keras 使用 LSTM 进行爵士音乐的生成。

参考 Ji-Sung Kim, 2016, deepjazz.

from __future__ import print_function
import IPython
import sys
from music21 import *
import numpy as np
from grammar import *
from qa import *
from preprocess import * 
from music_utils import *
from data_utils import *
from keras.models import load_model, Model
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from keras.initializers import glorot_uniform
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras import backend as K

问题描述

加载数据集

先不管具体的音乐理论和细节，直接加载需要的数据集：

X, Y, n_values, indices_values = load_music_utils()
print('shape of X:', X.shape)
print('number of training examples:', X.shape[0])
print('Tx (length of sequence):', X.shape[1])
print('total # of unique values:', n_values)
print('Shape of Y:', Y.shape)

shape of X: (60, 30, 78)
number of training examples: 60
Tx (length of sequence): 30
total # of unique values: 78
Shape of Y: (30, 60, 78)

X：(m, T_x, 78) 其中 m 是样本数，T_x 是音乐片段的时间，78 是 one-hot 向量的长度
Y：(T_y, m, 78) 为了便于 feed 给 LSTM 层，将其维度变一下
n_value：把某个时间的音乐变成 one-hot 向量的长度
indices_values：78 个索引值的字典

模型概况

每个训练用的序列样本长度可以是不同的，但是这里为了方便向量化，将所有用以训练的样本统一长度为 30 s.

建立模型

首先确定模型的超参数，我们必须先确定隐藏层的单元数。

看上面这幅图，所谓隐藏层单元数，就是 $W[a^{},x^{}]$ 的输出的维度，用 n_a 表示，这里设为 64.

n_a = 64

如果我们建立的序列模型是在测试时也给出所有序列，例如输入一个序列，预测一个标签，那么可以简单地使用 Keras 的内置函数进行模型构建，但是对于生成模型来说，我们每次只能知道序列前一个的值，而不知道整个序列，无法直接全部输入模型进行预测，只能一个时间点生成一次，然后让 $x^{\langle t\rangle} = y^{\langle t-1 \rangle}$ 继续生成下一个，而内置的 LSTM 模型的逻辑是，一次将所有时间的序列值丢进去然后得到一个输出。

所以在下面的函数中使用 for 循环调用 LSTM 层 T_x 次，由于每个时间点的 LSTM 层的参数都是一样的，所以我们定义一个全局的 层对象，每个 for 循环都调用这个对象，实现参数共享。

1
2
3

reshapor = Reshape((1, 78))                        # Used in Step 2.B of djmodel(), below
LSTM_cell = LSTM(n_a, return_state = True)# return_state 返回 c         # Used in Step 2.C
densor = Dense(n_values, activation='softmax')     # Used in Step 2.D

建立模型的步骤是：

建立空列表 outputs 储存每个时间点的输出 $\hat y$
for t in 1,…, T_x：
- 从输入选择第 t 个时间点的序列值
  - 使用 x = Lambda(lambda x: X[:,t,:])(X)，输出维度是 (?, 78)
- 调用 reshapor 把 (?,78)->(?,1,78)，便于输入 LSTM 层
- 调用 LSTM_cell 得到输出的 a 和 c
  - a, _, c = LSTM_cell(input_x, initial_state=[previous hidden state, previous cell state])
- 将隐藏层的激活值 a 通过 softmax 进行输出
- 把输出存进 outputs

def djmodel(Tx, n_a, n_values):
    """
    Implement the model
    
    Arguments:
    Tx -- length of the sequence in a corpus
    n_a -- the number of activations used in our model
    n_values -- number of unique values in the music data 
    
    Returns:
    model -- a keras model with the 
    """
    
    # Define the input of your model with a shape 
    X = Input(shape=(Tx, n_values)) # Keras 中 shape 不需要显式地指出样本数 m，第一个位置默认是样本数，打印 X 的形状为 (?, 30, 78)
    
    # Define s0, initial hidden state for the decoder LSTM
    a0 = Input(shape=(n_a,), name='a0')#shape 其实是(?,n_a,)
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0

    # Step 1: Create empty list to append the outputs while you iterate (≈1 line)
    outputs = []
    
    # Step 2: Loop
    for t in range(Tx):
        
        # Step 2.A: select the "t"th time step vector from X. 
        x = Lambda(lambda x: X[:,t,:])(X)# Lambda 层是对 X 整个进行运算，得到一个结果，所以直接返回 X[:,t,:]
        # Step 2.B: Use reshapor to reshape x to be (1, n_values) (≈1 line)
        x = reshapor(x) # (?,78)->(?,1,78)
        # Step 2.C: Perform one step of the LSTM_cell
        a, _, c = LSTM_cell(x, initial_state=[a, c])
        # Step 2.D: Apply densor to the hidden state output of LSTM_Cell
        out = densor(a)
        # Step 2.E: add the output to "outputs"
        outputs.append(out)
        
    # Step 3: Create model instance
    model = Model(inputs=[X,a0,c0], outputs=outputs)

    return model

建立模型：

1	model = djmodel(Tx = 30 , n_a = 64, n_values = 78)

编译模型：

1
2
3

opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)#定义优化器

model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

训练模型：

m = 60
a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))

model.fit([X, a0, c0], list(Y), epochs=100)

输出：
......
Epoch 100/100
60/60 [==============================] - 0s - loss: 5.9232 - dense_1_loss_1: 3.7652 - dense_1_loss_2: 1.1579 - dense_1_loss_3: 0.3157 - dense_1_loss_4: 0.0825 - dense_1_loss_5: 0.0493 - dense_1_loss_6: 0.0368 - dense_1_loss_7: 0.0311 - dense_1_loss_8: 0.0285 - dense_1_loss_9: 0.0278 - dense_1_loss_10: 0.0222 - dense_1_loss_11: 0.0213 - dense_1_loss_12: 0.0205 - dense_1_loss_13: 0.0194 - dense_1_loss_14: 0.0199 - dense_1_loss_15: 0.0210 - dense_1_loss_16: 0.0206 - dense_1_loss_17: 0.0192 - dense_1_loss_18: 0.0213 - dense_1_loss_19: 0.0212 - dense_1_loss_20: 0.0219 - dense_1_loss_21: 0.0220 - dense_1_loss_22: 0.0202 - dense_1_loss_23: 0.0217 - dense_1_loss_24: 0.0211 - dense_1_loss_25: 0.0227 - dense_1_loss_26: 0.0202 - dense_1_loss_27: 0.0225 - dense_1_loss_28: 0.0245 - dense_1_loss_29: 0.0252 - dense_1_loss_30: 0.0000e+00 - dense_1_acc_1: 0.0500 - dense_1_acc_2: 0.6833 - dense_1_acc_3: 0.9167 - dense_1_acc_4: 1.0000 - dense_1_acc_5: 1.0000 - dense_1_acc_6: 1.0000 - dense_1_acc_7: 1.0000 - dense_1_acc_8: 1.0000 - dense_1_acc_9: 1.0000 - dense_1_acc_10: 1.0000 - dense_1_acc_11: 1.0000 - dense_1_acc_12: 1.0000 - dense_1_acc_13: 1.0000 - dense_1_acc_14: 1.0000 - dense_1_acc_15: 1.0000 - dense_1_acc_16: 1.0000 - dense_1_acc_17: 1.0000 - dense_1_acc_18: 1.0000 - dense_1_acc_19: 1.0000 - dense_1_acc_20: 1.0000 - dense_1_acc_21: 1.0000 - dense_1_acc_22: 1.0000 - dense_1_acc_23: 1.0000 - dense_1_acc_24: 1.0000 - dense_1_acc_25: 1.0000 - dense_1_acc_26: 1.0000 - dense_1_acc_27: 1.0000 - dense_1_acc_28: 1.0000 - dense_1_acc_29: 1.0000 - dense_1_acc_30: 0.0000e+00

生成音乐

到目前为止，我们模型的几个层 reshapor、LSTM_cell、densor 中的参数已经学习完毕，存在该对象中，现在我们使用这些参数训练好的层对象来构建一个新的生成模型。

与生成恐龙名字相同，我们仍需要 sampling 操作，因此增加一个将 softmax 的输出变成 one-hot 的层。

def music_inference_model(LSTM_cell, densor, n_values = 78, n_a = 64, Ty = 100):
    """
    Uses the trained "LSTM_cell" and "densor" from model() to generate a sequence of values.
    
    Arguments:
    LSTM_cell -- the trained "LSTM_cell" from model(), Keras layer object
    densor -- the trained "densor" from model(), Keras layer object
    n_values -- integer, umber of unique values
    n_a -- number of units in the LSTM_cell
    Ty -- integer, number of time steps to generate
    
    Returns:
    inference_model -- Keras model instance
    """
    
    # Define the input of your model with a shape 
    x0 = Input(shape=(1, n_values))
    
    # Define s0, initial hidden state for the decoder LSTM
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0
    x = x0

    # Step 1: Create an empty list of "outputs" to later store your predicted values (≈1 line)
    outputs = []
    
    # Step 2: Loop over Ty and generate a value at every time step
    for t in range(Ty):
        
        # Step 2.A: Perform one step of LSTM_cell (≈1 line)
        a, _, c = LSTM_cell(x,initial_state=[a,c])
        
        # Step 2.B: Apply Dense layer to the hidden state output of the LSTM_cell (≈1 line)
        out = densor(a)

        # Step 2.C: Append the prediction "out" to "outputs". out.shape = (None, 78) (≈1 line)
        outputs.append(out)
        
        # Step 2.D: Select the next value according to "out", and set "x" to be the one-hot representation of the
        #           selected value, which will be passed as the input to LSTM_cell on the next step. We have provided 
        #           the line of code you need to do this. 
        x = Lambda(one_hot)(out)	# 下一次的输入等于上一次的输出变成 one-hot 向量
        
    # Step 3: Create model instance with the correct "inputs" and "outputs" (≈1 line)
    inference_model = Model(inputs=[x0,a0,c0], outputs = outputs)

    return inference_model

调用模型进行预测的函数：

def predict_and_sample(inference_model, x_initializer = x_initializer, a_initializer = a_initializer, 
                       c_initializer = c_initializer):
    """
    Predicts the next value of values using the inference model.
    
    Arguments:
    inference_model -- Keras model instance for inference time
    x_initializer -- numpy array of shape (1, 1, 78), one-hot vector initializing the values generation
    a_initializer -- numpy array of shape (1, n_a), initializing the hidden state of the LSTM_cell
    c_initializer -- numpy array of shape (1, n_a), initializing the cell state of the LSTM_cel
    
    Returns:
    results -- numpy-array of shape (Ty, 78), matrix of one-hot vectors representing the values generated
    indices -- numpy-array of shape (Ty, 1), matrix of indices representing the values generated
    """

    # Step 1: Use your inference model to predict an output sequence given x_initializer, a_initializer and c_initializer.
    pred = inference_model.predict([x_initializer, a_initializer, c_initializer])
    # Step 2: Convert "pred" into an np.array() of indices with the maximum probabilities
    indices = np.argmax(pred,axis=-1)	# 将 softmax 输出中概率值最大的定为输出索引值
    # Step 3: Convert indices to one-hot vectors, the shape of the results should be (1, )
    results = to_categorical(indices, num_classes=n_values)	# 将索引值再变成 one-hot 向量

    return results, indices

看看结果：

results, indices = predict_and_sample(inference_model, x_initializer, a_initializer, c_initializer)
print("np.argmax(results[12]) =", np.argmax(results[12]))
print("np.argmax(results[17]) =", np.argmax(results[17]))
print("list(indices[12:18]) =", list(indices[12:18]))

np.argmax(results[12]) = 70
np.argmax(results[17]) = 55
list(indices[12:18]) = [array([70]), array([71]), array([43]), array([2]), array([10]), array([55])]

生成音乐

得到的输出必须经过很好的后处理才能确保听起来好听，输出音乐的质量很大地取决于后处理的质量。

1	out_stream = generate_music(inference_model)