Coursera Andrew Ng Deep Learning Specialization programming assignments (Course 5, Week 1)

Part 1 - Building your Recurrent Neural Network - Step by Step

This is the first part of this week's programming assignments. It builds the forward and backward propagation of an RNN and an LSTM from scratch. Several of the formulas given in the assignment contain errors, so the formulas below are the corrected versions, and a few ambiguous pieces of notation are also explained.

First, import the required packages.

import numpy as np
from rnn_utils import *

RNN

Forward pass of a single RNN cell
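For reference, the (corrected) formulas implemented by the cell below are

$$a^{<t>} = \tanh(W_{ax} x^{<t>} + W_{aa} a^{<t-1>} + b_a), \qquad \hat{y}^{<t>} = \mathrm{softmax}(W_{ya}\, a^{<t>} + b_y)$$

Note in particular that the prediction uses $a^{<t>}$ (the newly computed hidden state), not $a^{<t-1>}$.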

def rnn_cell_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN-cell as described in Figure (2)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba -- Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, a_prev, xt, parameters)
    """

    # Retrieve parameters from "parameters"
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    # compute next activation state using the formula given above
    a_next = np.tanh(np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba)
    # compute output of the current cell using the formula given above (note: the prediction uses a_next, not a_prev)
    yt_pred = softmax(np.dot(Wya, a_next) + by)

    # store values you need for backward propagation in cache
    cache = (a_next, a_prev, xt, parameters)

    return a_next, yt_pred, cache
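A quick shape check, not part of the assignment, calling the cell on random data (softmax is assumed to be provided by rnn_utils):

np.random.seed(1)
xt = np.random.randn(3, 10)      # n_x = 3, m = 10
a_prev = np.random.randn(5, 10)  # n_a = 5
parameters = {"Waa": np.random.randn(5, 5), "Wax": np.random.randn(5, 3),
              "Wya": np.random.randn(2, 5), "ba": np.random.randn(5, 1),
              "by": np.random.randn(2, 1)}
a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print(a_next.shape, yt_pred.shape)  # (5, 10) (2, 10)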

Forward pass over the whole RNN

def rnn_forward(x, a0, parameters):
    """
    Implement the forward propagation of the recurrent neural network described in Figure (3).

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba -- Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of caches, x)
    """

    # Initialize "caches" which will contain the list of all caches
    caches = []

    # Retrieve dimensions from shapes of x and parameters["Wya"]
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    # initialize "a" and "y" with zeros
    a = np.zeros((n_a, m, T_x))       # container that will hold the hidden state of every time step
    y_pred = np.zeros((n_y, m, T_x))  # container that will hold the prediction of every time step

    # Initialize a_next
    a_next = a0  # initialize a_next to a<0>

    # loop over all time-steps
    for t in range(T_x):
        # Update next hidden state, compute the prediction, get the cache (≈1 line)
        a_next, yt_pred, cache = rnn_cell_forward(x[:, :, t], a_next, parameters)
        # Save the value of the new "next" hidden state in a (≈1 line)
        a[:, :, t] = a_next
        # Save the value of the prediction in y (≈1 line)
        y_pred[:, :, t] = yt_pred
        # Append "cache" to "caches" (≈1 line)
        caches.append(cache)

    # store values needed for backward propagation in cache
    caches = (caches, x)

    return a, y_pred, caches

Backward pass of a single RNN cell
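Writing $dz^{<t>}$ for the gradient of the cost with respect to $W_{ax} x^{<t>} + W_{aa} a^{<t-1>} + b_a$ (the variable dtanh in the code), the formulas implemented below are

$$dz^{<t>} = da_{next} \odot \big(1 - (a^{<t>})^2\big)$$

$$dx^{<t>} = W_{ax}^T\, dz^{<t>}, \quad dW_{ax} = dz^{<t>}\, (x^{<t>})^T, \quad da_{prev} = W_{aa}^T\, dz^{<t>}, \quad dW_{aa} = dz^{<t>}\, (a^{<t-1>})^T, \quad db_a = \textstyle\sum_{\text{batch}} dz^{<t>}$$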

def rnn_cell_backward(da_next, cache):
    """
    Implements the backward pass for the RNN-cell (single time-step).

    Arguments:
    da_next -- Gradient of loss with respect to next hidden state
    cache -- python dictionary containing useful values (output of rnn_cell_forward())

    Returns:
    gradients -- python dictionary containing:
                        dxt -- Gradients of input data, of shape (n_x, m)
                        da_prev -- Gradients of previous hidden state, of shape (n_a, m)
                        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                        dba -- Gradients of bias vector, of shape (n_a, 1)
    """

    # Retrieve values from cache
    (a_next, a_prev, xt, parameters) = cache

    # Retrieve values from parameters
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    # compute the gradient of tanh with respect to a_next
    dtanh = da_next * (1 - a_next**2)  # i.e. the gradient of the cost with respect to (Wax·x^<t> + Waa·a^<t-1> + ba); the same convention applies below

    # compute the gradients of the loss with respect to xt and Wax
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)

    # compute the gradients with respect to a_prev and Waa
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # compute the gradient with respect to ba
    dba = np.sum(dtanh, axis=1, keepdims=True)

    # Store the gradients in a python dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients

Backward pass over the whole RNN
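Two points, both reflected in the code below: the hidden-state gradient fed into each cell combines the two paths through which $a^{<t>}$ affects the cost, and the parameter gradients are summed over time because the same parameters are reused at every time step:

$$da^{<t>}_{\text{input}} = da[:,:,t] + da_{prev}, \qquad dW_{ax} = \sum_{t=1}^{T_x} dW_{ax}^{<t>}, \quad dW_{aa} = \sum_{t=1}^{T_x} dW_{aa}^{<t>}, \quad db_a = \sum_{t=1}^{T_x} db_a^{<t>}$$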

def rnn_backward(da, caches):
    """
    Implement the backward pass for a RNN over an entire sequence of input data.

    Arguments:
    da -- Upstream gradients of all hidden states, of shape (n_a, m, T_x)
    caches -- tuple containing information from the forward pass (rnn_forward)

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradient w.r.t. the input data, numpy-array of shape (n_x, m, T_x)
                        da0 -- Gradient w.r.t the initial hidden state, numpy-array of shape (n_a, m)
                        dWax -- Gradient w.r.t the input's weight matrix, numpy-array of shape (n_a, n_x)
                        dWaa -- Gradient w.r.t the hidden state's weight matrix, numpy-array of shape (n_a, n_a)
                        dba -- Gradient w.r.t the bias, of shape (n_a, 1)
    """

    # Retrieve values from the first cache (t=1) of caches
    (caches, x) = caches
    (a1, a0, x1, parameters) = caches[0]

    # Retrieve dimensions from da's and x1's shapes
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # initialize the gradients with the right sizes
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))

    # Loop through all the time steps
    for t in reversed(range(T_x)):
        # Compute gradients at time step t. Choose wisely the "da_next" and the "cache" to use in the backward propagation step. (≈1 line)
        # Note: a^<t> influences the cost J through two paths, so the hidden-state gradient fed in is da[:, :, t] + da_prevt
        gradients = rnn_cell_backward(da[:, :, t] + da_prevt, caches[t])
        # Retrieve derivatives from gradients (≈ 1 line)
        dxt, da_prevt, dWaxt, dWaat, dbat = gradients["dxt"], gradients["da_prev"], gradients["dWax"], gradients["dWaa"], gradients["dba"]
        # Increment global derivatives w.r.t parameters by adding their derivative at time-step t (≈4 lines)
        dx[:, :, t] = dxt
        # Note: in the forward pass, Wax, Waa and ba are shared across all time steps, but dWaxt, dWaat and dbat
        # differ from one time step to the next; every time step contributes to the cost J, so the per-step
        # gradients dWaxt, dWaat and dbat have to be accumulated
        dWax += dWaxt
        dWaa += dWaat
        dba += dbat

    # Set da0 to the gradient of a which has been backpropagated through all time-steps (≈1 line)
    da0 = da_prevt

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients

LSTM

Forward pass of a single LSTM cell
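The forward equations implemented by the cell below (with $[a^{<t-1>}, x^{<t>}]$ denoting the vertical concatenation of the previous hidden state and the current input):

$$\Gamma_f^{<t>} = \sigma(W_f[a^{<t-1>}, x^{<t>}] + b_f), \quad \Gamma_i^{<t>} = \sigma(W_i[a^{<t-1>}, x^{<t>}] + b_i), \quad \tilde c^{<t>} = \tanh(W_c[a^{<t-1>}, x^{<t>}] + b_c)$$

$$c^{<t>} = \Gamma_f^{<t>} \odot c^{<t-1>} + \Gamma_i^{<t>} \odot \tilde c^{<t>}, \quad \Gamma_o^{<t>} = \sigma(W_o[a^{<t-1>}, x^{<t>}] + b_o), \quad a^{<t>} = \Gamma_o^{<t>} \odot \tanh(c^{<t>}), \quad \hat y^{<t>} = \mathrm{softmax}(W_y a^{<t>} + b_y)$$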

def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Implement a single forward step of the LSTM-cell as described in Figure (4)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    c_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
                        Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        bi -- Bias of the update gate, numpy array of shape (n_a, 1)
                        Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
                        bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
                        Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        bo -- Bias of the output gate, numpy array of shape (n_a, 1)
                        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    a_next -- next hidden state, of shape (n_a, m)
    c_next -- next memory state, of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    Note: ft/it/ot stand for the forget/update/output gates, cct stands for the candidate value (c tilde),
    c stands for the memory value
    """

    # Retrieve parameters from "parameters"
    Wf = parameters["Wf"]
    bf = parameters["bf"]
    Wi = parameters["Wi"]
    bi = parameters["bi"]
    Wc = parameters["Wc"]
    bc = parameters["bc"]
    Wo = parameters["Wo"]
    bo = parameters["bo"]
    Wy = parameters["Wy"]
    by = parameters["by"]

    # Retrieve dimensions from shapes of xt and Wy
    n_x, m = xt.shape
    n_y, n_a = Wy.shape

    # Concatenate a_prev and xt (stack a_prev on top of xt)
    concat = np.zeros((n_a + n_x, m))
    concat[: n_a, :] = a_prev
    concat[n_a :, :] = xt

    # Compute values for ft, it, cct, c_next, ot, a_next using the formulas given figure (4)
    ft = sigmoid(np.dot(Wf, concat) + bf)
    it = sigmoid(np.dot(Wi, concat) + bi)
    cct = np.tanh(np.dot(Wc, concat) + bc)
    c_next = ft * c_prev + it * cct
    ot = sigmoid(np.dot(Wo, concat) + bo)
    a_next = ot * np.tanh(c_next)

    # Compute prediction of the LSTM cell
    yt_pred = softmax(np.dot(Wy, a_next) + by)

    # store values needed for backward propagation in cache
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache

Forward pass over the whole LSTM

def lstm_forward(x, a0, parameters):
    """
    Implement the forward propagation of the recurrent neural network using an LSTM-cell described in Figure (3).

    Arguments:
    x -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
                        Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        bi -- Bias of the update gate, numpy array of shape (n_a, 1)
                        Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
                        bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
                        Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        bo -- Bias of the output gate, numpy array of shape (n_a, 1)
                        Wy -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)

    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_x)
    y -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    caches -- tuple of values needed for the backward pass, contains (list of all the caches, x)
    """

    # Initialize "caches", which will track the list of all the caches
    caches = []

    # Retrieve dimensions from shapes of x and parameters['Wy']
    n_x, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape

    # initialize "a", "c" and "y" with zeros
    a = np.zeros((n_a, m, T_x))  # container for all hidden states
    c = np.zeros((n_a, m, T_x))  # container for all cell states
    y = np.zeros((n_y, m, T_x))  # container for all predictions

    # Initialize a_next and c_next
    a_next = a0
    c_next = np.zeros((n_a, m))

    # loop over all time-steps
    for t in range(T_x):
        # Update next hidden state, next memory state, compute the prediction, get the cache
        a_next, c_next, yt, cache = lstm_cell_forward(x[:, :, t], a_next, c_next, parameters)
        # Save the value of the new "next" hidden state in a
        a[:, :, t] = a_next
        # Save the value of the prediction in y
        y[:, :, t] = yt
        # Save the value of the next cell state
        c[:, :, t] = c_next
        # Append the cache into caches
        caches.append(cache)

    # store values needed for backward propagation in cache
    caches = (caches, x)

    return a, y, c, caches
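A quick shape check on random data, not part of the assignment (sigmoid and softmax are assumed to come from rnn_utils):

np.random.seed(1)
x = np.random.randn(3, 10, 7)  # n_x = 3, m = 10, T_x = 7
a0 = np.random.randn(5, 10)    # n_a = 5
parameters = {"Wf": np.random.randn(5, 5 + 3), "bf": np.random.randn(5, 1),
              "Wi": np.random.randn(5, 5 + 3), "bi": np.random.randn(5, 1),
              "Wc": np.random.randn(5, 5 + 3), "bc": np.random.randn(5, 1),
              "Wo": np.random.randn(5, 5 + 3), "bo": np.random.randn(5, 1),
              "Wy": np.random.randn(2, 5), "by": np.random.randn(2, 1)}
a, y, c, caches = lstm_forward(x, a0, parameters)
print(a.shape, y.shape, c.shape)  # (5, 10, 7) (2, 10, 7) (5, 10, 7)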

Backward pass of a single LSTM cell

Because the formulas given in this part of the assignment contain errors, I re-derived and corrected them based on the forum discussion; the derivation can be found in this note.
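Writing $\Gamma_o, \Gamma_i, \Gamma_f, \tilde c$ for the output/update/forget gates and the candidate value, and $d\gamma_o, d\tilde c, d\gamma_i, d\gamma_f$ for the gradients with respect to their pre-activations (the variables dot, dcct, dit, dft in the code), the corrected formulas used below are

$$d\gamma_o^{<t>} = da_{next} \odot \tanh(c^{<t>}) \odot \Gamma_o^{<t>} \odot (1-\Gamma_o^{<t>})$$

$$d\tilde c^{<t>} = \big(dc_{next} + da_{next} \odot \Gamma_o^{<t>} \odot (1-\tanh^2(c^{<t>}))\big) \odot \Gamma_i^{<t>} \odot (1-(\tilde c^{<t>})^2)$$

$$d\gamma_i^{<t>} = \big(dc_{next} + da_{next} \odot \Gamma_o^{<t>} \odot (1-\tanh^2(c^{<t>}))\big) \odot \tilde c^{<t>} \odot \Gamma_i^{<t>} \odot (1-\Gamma_i^{<t>})$$

$$d\gamma_f^{<t>} = \big(dc_{next} + da_{next} \odot \Gamma_o^{<t>} \odot (1-\tanh^2(c^{<t>}))\big) \odot c^{<t-1>} \odot \Gamma_f^{<t>} \odot (1-\Gamma_f^{<t>})$$

$$dc_{prev} = \big(dc_{next} + da_{next} \odot \Gamma_o^{<t>} \odot (1-\tanh^2(c^{<t>}))\big) \odot \Gamma_f^{<t>}$$

The parameter gradients are then $dW_\bullet = d\gamma_\bullet\, [a^{<t-1>}, x^{<t>}]^T$ and $db_\bullet = \sum_{\text{batch}} d\gamma_\bullet$, while $da_{prev}$ and $dx^{<t>}$ are obtained by multiplying each $d\gamma_\bullet$ by the corresponding slice of $W_\bullet^T$ and summing the four contributions.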

def lstm_cell_backward(da_next, dc_next, cache):
    """
    Implement the backward pass for the LSTM-cell (single time-step).

    Arguments:
    da_next -- Gradients of next hidden state, of shape (n_a, m)
    dc_next -- Gradients of next cell state, of shape (n_a, m)
    cache -- cache storing information from the forward pass

    Returns:
    gradients -- python dictionary containing:
                        dxt -- Gradient of input data at time-step t, of shape (n_x, m)
                        da_prev -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
                        dc_prev -- Gradient w.r.t. the previous memory state, of shape (n_a, m)
                        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
                        dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
                        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
                        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
                        dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """

    # Retrieve information from "cache"
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

    # Retrieve dimensions from xt's and a_next's shape
    n_x, m = xt.shape
    n_a, m = a_next.shape

    # Compute gates related derivatives; their values can be found by looking carefully at equations (7) to (10)
    dot = da_next * np.tanh(c_next) * ot * (1 - ot)  # i.e. dJ / d(W_o[a^<t-1>, x^<t>] + b_o); the same convention applies below
    dcct = (dc_next + da_next * ot * (1 - np.tanh(c_next)**2)) * it * (1 - cct**2)
    dit = (dc_next + da_next * ot * (1 - np.tanh(c_next)**2)) * cct * it * (1 - it)  # it is the update gate
    dft = (dc_next + da_next * ot * (1 - np.tanh(c_next)**2)) * c_prev * ft * (1 - ft)

    # Compute parameters related derivatives. Use equations (11)-(14)
    dWf = np.dot(dft, np.vstack((a_prev, xt)).T)
    dWi = np.dot(dit, np.vstack((a_prev, xt)).T)
    dWc = np.dot(dcct, np.vstack((a_prev, xt)).T)
    dWo = np.dot(dot, np.vstack((a_prev, xt)).T)
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)

    # Compute derivatives w.r.t previous hidden state, previous memory state and input. Use equations (15)-(17).
    da_prev = np.dot(parameters['Wf'][:, :n_a].T, dft) + np.dot(parameters['Wi'][:, :n_a].T, dit) + np.dot(parameters['Wo'][:, :n_a].T, dot) + np.dot(parameters['Wc'][:, :n_a].T, dcct)
    dc_prev = (dc_next + da_next * ot * (1 - np.tanh(c_next)**2)) * ft  # c_prev reaches the cost only through c_next, scaled by the forget gate
    dxt = np.dot(parameters['Wf'][:, n_a:].T, dft) + np.dot(parameters['Wi'][:, n_a:].T, dit) + np.dot(parameters['Wo'][:, n_a:].T, dot) + np.dot(parameters['Wc'][:, n_a:].T, dcct)

    # Save gradients in dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients

Backward pass over the whole LSTM

def lstm_backward(da, caches):
    """
    Implement the backward pass for the RNN with LSTM-cell (over a whole sequence).

    Arguments:
    da -- Gradients w.r.t the hidden states, numpy-array of shape (n_a, m, T_x)
    caches -- cache storing information from the forward pass (lstm_forward)

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradient of inputs, of shape (n_x, m, T_x)
                        da0 -- Gradient w.r.t. the initial hidden state, numpy array of shape (n_a, m)
                        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
                        dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
                        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
                        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
                        dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """

    # Retrieve values from the first cache (t=1) of caches.
    (caches, x) = caches
    (a1, c1, a0, c0, f1, i1, cc1, o1, x1, parameters) = caches[0]

    # Retrieve dimensions from da's and x1's shapes
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # initialize the gradients with the right sizes
    dx = np.zeros((n_x, m, T_x))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))
    dc_prevt = np.zeros((n_a, m))
    dWf = np.zeros((n_a, n_a + n_x))  # Wf is shared across all time steps
    dWi = np.zeros((n_a, n_a + n_x))
    dWc = np.zeros((n_a, n_a + n_x))
    dWo = np.zeros((n_a, n_a + n_x))
    dbf = np.zeros((n_a, 1))
    dbi = np.zeros((n_a, 1))
    dbc = np.zeros((n_a, 1))
    dbo = np.zeros((n_a, 1))

    # loop back over the whole sequence
    for t in reversed(range(T_x)):
        # Compute all gradients using lstm_cell_backward; as in rnn_backward, a^<t> influences the cost
        # through two paths, so the hidden-state gradient fed in is da[:, :, t] + da_prevt
        gradients = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, caches[t])
        # Propagate the hidden-state and cell-state gradients to the previous time step
        da_prevt = gradients['da_prev']
        dc_prevt = gradients['dc_prev']
        # Store or add the gradient to the parameters' previous step's gradient
        dx[:, :, t] = gradients['dxt']
        # Note: in the forward pass, Wf, Wi, Wo, Wc, bf, bi, bo, bc are shared across all time steps,
        # but dWf, dWi, dWo, dWc, dbf, dbi, dbo, dbc differ from one time step to the next; every time
        # step contributes to the cost J, so the per-step gradients have to be accumulated
        dWf += gradients['dWf']
        dWi += gradients['dWi']
        dWc += gradients['dWc']
        dWo += gradients['dWo']
        dbf += gradients['dbf']
        dbi += gradients['dbi']
        dbc += gradients['dbc']
        dbo += gradients['dbo']

    # Set the first activation's gradient to the backpropagated gradient da_prev.
    da0 = gradients['da_prev']

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients

Part 2 - Character level language model - Dinosaurus land

Build a character-level language model: train it on a corpus of dinosaur names and use it to generate new dinosaur names.

import numpy as np
from utils import *
import random

Problem statement

Loading and preprocessing the dataset

data = open('dinos.txt', 'r').read()
data = data.lower()
chars = list(set(data))  # remove duplicate characters
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

Output: There are 19909 total characters and 27 unique characters in your data.

Since the model is character level, the vocabulary consists of the 26 English letters plus the end-of-name character "\n", i.e. 27 characters in total. We store the character-index mappings in two dictionaries.

char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }  # sorted() orders the characters
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)

Output: {0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}

Model overview

  • Initialize the parameters
  • Optimization loop
    • Forward propagation to compute the loss
    • Backward propagation to compute the gradients of the loss with respect to every parameter
    • Gradient clipping to prevent exploding gradients
    • Update the parameters
  • Return the learned parameters

Building blocks of the model

Two building blocks need to be implemented: gradient clipping and sampling.

Gradient clipping

After backpropagation produces the gradients, and before the parameters are updated, we clip the gradients to make sure they do not explode: every gradient value is restricted to some interval such as [-10, 10], so values greater than 10 become 10 and values less than -10 become -10.

The code is as follows:

def clip(gradients, maxValue):
    '''
    Clips the gradients' values between minimum and maximum.

    Arguments:
    gradients -- a dictionary containing the gradients "dWaa", "dWax", "dWya", "db", "dby"
    maxValue -- everything above this number is set to this number, and everything less than -maxValue is set to -maxValue

    Returns:
    gradients -- a dictionary with the clipped gradients.
    '''

    dWaa, dWax, dWya, db, dby = gradients['dWaa'], gradients['dWax'], gradients['dWya'], gradients['db'], gradients['dby']

    # clip to mitigate exploding gradients, loop over [dWax, dWaa, dWya, db, dby]. (≈2 lines)
    for gradient in [dWax, dWaa, dWya, db, dby]:
        np.minimum(gradient, maxValue, out=gradient)
        np.maximum(gradient, -maxValue, out=gradient)  # note: gradient is only a loop variable here, so the result must be written back in place via out=

    gradients = {"dWaa": dWaa, "dWax": dWax, "dWya": dWya, "db": db, "dby": dby}

    return gradients
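A quick check, not part of the assignment, on artificially large gradients:

np.random.seed(3)
gradients = {"dWax": np.random.randn(5, 3) * 10, "dWaa": np.random.randn(5, 5) * 10,
             "dWya": np.random.randn(2, 5) * 10, "db": np.random.randn(5, 1) * 10,
             "dby": np.random.randn(2, 1) * 10}
gradients = clip(gradients, 10)
print(gradients["dWaa"].max(), gradients["dWaa"].min())  # both values now lie inside [-10, 10]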

Sampling

Once the model is trained, how do we generate text? As shown in the figure:

  • First feed in a dummy input $x^{<1>} = 0$, $a^{<0>} = 0$

  • Compute $\hat y^{<1>}$ and $a^{<1>}$ with the forward-pass formulas

  • Sample: $\hat y^{<1>}$ is really the prediction for $x^{<2>}$ and gives the probability of each possible value; we draw a value at random according to those probabilities, e.g. if index 3 has probability 0.16, then $x^{<2>}$ is assigned the value 3 with probability 16% (see the short NumPy sketch after this list)

    • Why not simply assign $x^{<2>}$ the value with the largest probability in $\hat y^{<1>}$? An explanation seen on the forum:

      Sampling produces variety. Say you have a list of items with probability of occurrence “car” (30%) “airplane” (70%). If you simply choose the argmax of the probability from the list then you always choose airplane which gets boring. If you alternatively “sample” from the list then 70% of the time you get airplane and 30% car. This is really important in longer sentences; if you choose the argmax you end up with the same sentence all the time (exception when multiple words have exactly same probability 50:50), but sampling gives you an interesting mix of sentences from the same list.

  • One-hot encode the sampled index $x^{<t>}$ and use it as the input to the next step; repeat until the output is the end-of-name character "\n"
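A tiny illustration of the sampling step (plain NumPy, not part of the assignment):

probs = np.array([0.1, 0.16, 0.04, 0.7])   # a toy distribution over 4 indices
idx = np.random.choice(range(4), p=probs)  # index 3 is drawn about 70% of the time, index 1 about 16% of the time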

The code is as follows:

def sample(parameters, char_to_ix):
    """
    Sample a sequence of characters according to a sequence of probability distributions output of the RNN

    Arguments:
    parameters -- python dictionary containing the parameters Waa, Wax, Wya, by, and b.
    char_to_ix -- python dictionary mapping each character to an index.

    Returns:
    indices -- a list of length n containing the indices of the sampled characters.
    """

    # Retrieve parameters and relevant shapes from "parameters" dictionary
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    vocab_size = by.shape[0]
    n_a = Waa.shape[1]

    # Step 1: Create the one-hot vector x for the first character (initializing the sequence generation).
    x = np.zeros((vocab_size, 1))
    # Step 1': Initialize a_prev as zeros
    a_prev = np.zeros((n_a, 1))

    # Create an empty list of indices, this is the list which will contain the list of indices of the characters to generate (≈1 line)
    indices = []  # holds the index generated at each step

    # Idx is a flag to detect a newline character, we initialize it to -1
    idx = -1

    # Loop over time-steps t. At each time-step, sample a character from a probability distribution and append
    # its index to "indices". We'll stop if we reach 50 characters (which should be very unlikely with a well
    # trained model), which helps debugging and prevents entering an infinite loop.
    counter = 0
    newline_character = char_to_ix['\n']

    while (idx != newline_character and counter != 50):

        # Step 2: Forward propagate x using the equations (1), (2) and (3)
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + b)
        z = np.dot(Wya, a) + by
        y = softmax(z)

        # Step 3: Sample the index of a character within the vocabulary from the probability distribution y
        idx = np.random.choice(vocab_size, p=y.ravel())  # draw an index at random according to the probabilities in y

        # Append the index to "indices"
        indices.append(idx)

        # Step 4: Overwrite the input character as the one corresponding to the sampled index.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1  # one-hot encoding

        # Update "a_prev" to be "a"
        a_prev = a

        counter += 1

    if (counter == 50):  # cap generated names at 50 characters to avoid an infinite loop
        indices.append(char_to_ix['\n'])

    return indices

Building the language model

Gradient descent

Some helper functions:

def smooth(loss, cur_loss):
    return loss * 0.999 + cur_loss * 0.001

def print_sample(sample_ix, ix_to_char):
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    txt = txt[0].upper() + txt[1:]  # capitalize first character
    print('%s' % (txt, ), end='')

def get_initial_loss(vocab_size, seq_length):
    return -np.log(1.0/vocab_size)*seq_length

def initialize_parameters(n_a, n_x, n_y):
    """
    Initialize parameters with small random values

    Returns:
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        b -- Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    """
    np.random.seed(1)
    Wax = np.random.randn(n_a, n_x)*0.01  # input to hidden
    Waa = np.random.randn(n_a, n_a)*0.01  # hidden to hidden
    Wya = np.random.randn(n_y, n_a)*0.01  # hidden to output
    b = np.zeros((n_a, 1))   # hidden bias
    by = np.zeros((n_y, 1))  # output bias

    parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "b": b, "by": by}

    return parameters

def rnn_step_forward(parameters, a_prev, x):

    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    a_next = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + b)  # hidden state
    p_t = softmax(np.dot(Wya, a_next) + by)  # probabilities for next chars

    return a_next, p_t

def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):

    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy
    da = np.dot(parameters['Wya'].T, dy) + gradients['da_next']  # backprop into h
    daraw = (1 - a * a) * da  # backprop through tanh nonlinearity
    gradients['db'] += daraw
    gradients['dWax'] += np.dot(daraw, x.T)
    gradients['dWaa'] += np.dot(daraw, a_prev.T)
    gradients['da_next'] = np.dot(parameters['Waa'].T, daraw)
    return gradients

def update_parameters(parameters, gradients, lr):

    parameters['Wax'] += -lr * gradients['dWax']
    parameters['Waa'] += -lr * gradients['dWaa']
    parameters['Wya'] += -lr * gradients['dWya']
    parameters['b'] += -lr * gradients['db']
    parameters['by'] += -lr * gradients['dby']
    return parameters

def rnn_forward(X, Y, a0, parameters, vocab_size = 27):

    # Initialize x, a and y_hat as empty dictionaries
    x, a, y_hat = {}, {}, {}

    a[-1] = np.copy(a0)

    # initialize your loss to 0
    loss = 0

    for t in range(len(X)):

        # Set x[t] to be the one-hot vector representation of the t'th character in X.
        # if X[t] == None, we just have x[t]=0. This is used to set the input for the first timestep to the zero vector.
        x[t] = np.zeros((vocab_size, 1))
        if (X[t] != None):
            x[t][X[t]] = 1

        # Run one step forward of the RNN
        a[t], y_hat[t] = rnn_step_forward(parameters, a[t-1], x[t])

        # Update the loss by subtracting the cross-entropy term of this time-step from it.
        loss -= np.log(y_hat[t][Y[t], 0])

    cache = (y_hat, a, x)

    return loss, cache

def rnn_backward(X, Y, parameters, cache):
    # Initialize gradients as an empty dictionary
    gradients = {}

    # Retrieve from cache and parameters
    (y_hat, a, x) = cache
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']

    # each one should be initialized to zeros of the same dimension as its corresponding parameter
    gradients['dWax'], gradients['dWaa'], gradients['dWya'] = np.zeros_like(Wax), np.zeros_like(Waa), np.zeros_like(Wya)
    gradients['db'], gradients['dby'] = np.zeros_like(b), np.zeros_like(by)
    gradients['da_next'] = np.zeros_like(a[0])

    # Backpropagate through time
    for t in reversed(range(len(X))):
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t-1])

    return gradients, a

Build the model:

def optimize(X, Y, a_prev, parameters, learning_rate = 0.01):
    """
    Execute one step of the optimization to train the model.

    Arguments:
    X -- list of integers, where each integer is a number that maps to a character in the vocabulary.
    Y -- list of integers, exactly the same as X but shifted one index to the left.
    a_prev -- previous hidden state.
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        b -- Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    learning_rate -- learning rate for the model.

    Returns:
    loss -- value of the loss function (cross-entropy)
    gradients -- python dictionary containing:
                        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                        dWya -- Gradients of hidden-to-output weights, of shape (n_y, n_a)
                        db -- Gradients of bias vector, of shape (n_a, 1)
                        dby -- Gradients of output bias vector, of shape (n_y, 1)
    a[len(X)-1] -- the last hidden state, of shape (n_a, 1)
    """
    # Forward propagate through time (≈1 line)
    loss, cache = rnn_forward(X, Y, a_prev, parameters)

    # Backpropagate through time (≈1 line)
    gradients, a = rnn_backward(X, Y, parameters, cache)

    # Clip your gradients between -5 (min) and 5 (max) (≈1 line)
    gradients = clip(gradients, 5)

    # Update parameters (≈1 line)
    parameters = update_parameters(parameters, gradients, learning_rate)

    return loss, gradients, a[len(X)-1]

Training the model

def model(data, ix_to_char, char_to_ix, num_iterations = 35000, n_a = 50, dino_names = 7, vocab_size = 27):
    """
    Trains the model and generates dinosaur names.

    Arguments:
    data -- text corpus
    ix_to_char -- dictionary that maps the index to a character
    char_to_ix -- dictionary that maps a character to an index
    num_iterations -- number of iterations to train the model for
    n_a -- number of units of the RNN cell
    dino_names -- number of dinosaur names you want to sample at each iteration.
    vocab_size -- number of unique characters found in the text, size of the vocabulary

    Returns:
    parameters -- learned parameters
    """

    # Retrieve n_x and n_y from vocab_size
    n_x, n_y = vocab_size, vocab_size

    # Initialize parameters
    parameters = initialize_parameters(n_a, n_x, n_y)

    # Initialize loss (this is required because we want to smooth our loss, don't worry about it)
    loss = get_initial_loss(vocab_size, dino_names)

    # Build list of all dinosaur names (training examples).
    with open("dinos.txt") as f:
        examples = f.readlines()
    examples = [x.lower().strip() for x in examples]

    # Shuffle list of all dinosaur names
    np.random.shuffle(examples)  # shuffle the training examples

    # Initialize the hidden state of your RNN
    a_prev = np.zeros((n_a, 1))

    # Optimization loop
    for j in range(num_iterations):

        # With n dinosaur-name examples, the iterations cycle through them:
        # example 1 (iteration 1) -> example 2 (iteration 2) -> ... -> example n (iteration n) -> example 1 (iteration n+1) -> ... -> example n (iteration 2n) -> ...
        # Use the hint above to define one training example (X,Y) (≈ 2 lines)
        index = j % len(examples)  # current iteration number modulo the number of examples
        X = [None] + [char_to_ix[ch] for ch in examples[index]]  # [None] is the first input X<0>, which generates hatY<1>; in general X<n> generates hatY<n+1>
        Y = X[1:] + [char_to_ix["\n"]]  # the labels: X shifted by one position, without the first X<0> and with the end-of-name character appended
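        # Worked example: if examples[index] == "aba", then with the mapping printed above (a -> 1, b -> 2, "\n" -> 0)
        # X = [None, 1, 2, 1] and Y = [1, 2, 1, 0]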

        # Perform one optimization step: Forward-prop -> Backward-prop -> Clip -> Update parameters
        # Choose a learning rate of 0.01
        curr_loss, gradients, a_prev = optimize(X, Y, a_prev, parameters, learning_rate = 0.01)

        # Use a latency trick to keep the loss smooth. It happens here to accelerate the training.
        loss = smooth(loss, curr_loss)

        # Every 2000 iterations, generate "n" names with sample() to check if the model is learning properly
        if j % 2000 == 0:

            print('Iteration: %d, Loss: %f' % (j, loss) + '\n')

            # The number of dinosaur names to print
            for name in range(dino_names):

                # Sample indices and print them
                sampled_indices = sample(parameters, char_to_ix)
                print_sample(sampled_indices, ix_to_char)

            print('\n')

    return parameters
parameters = model(data, ix_to_char, char_to_ix)

……

Iteration: 34000, Loss: 22.396744

Mavptokekus
Ilabaisaurus
Itosaurus
Macaesaurus
Yrosaurus
Eiaeosaurus
Trodon

Part 3 - Improvise a Jazz Solo with an LSTM Network

This assignment uses an LSTM in Keras to generate jazz music.

Reference: Ji-Sung Kim, 2016, deepjazz.

from __future__ import print_function
import IPython
import sys
from music21 import *
import numpy as np
from grammar import *
from qa import *
from preprocess import *
from music_utils import *
from data_utils import *
from keras.models import load_model, Model
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from keras.initializers import glorot_uniform
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras import backend as K

Problem statement

Loading the dataset

Setting the music theory and the details aside for now, just load the dataset:

X, Y, n_values, indices_values = load_music_utils()
print('shape of X:', X.shape)
print('number of training examples:', X.shape[0])
print('Tx (length of sequence):', X.shape[1])
print('total # of unique values:', n_values)
print('Shape of Y:', Y.shape)

shape of X: (60, 30, 78)
number of training examples: 60
Tx (length of sequence): 30
total # of unique values: 78
Shape of Y: (30, 60, 78)

  • X: (m, T_x, 78), where m is the number of training examples, T_x is the number of time steps in a music snippet, and 78 is the length of the one-hot vectors
  • Y: (T_y, m, 78), the same data with the dimensions reordered so it is easier to feed to the LSTM layers later
  • n_values: the length of the one-hot vector used to encode the musical value at a given time step
  • indices_values: a dictionary for the 78 index values

Model overview

Each training sequence could in principle have a different length, but to make vectorization convenient, all training examples are set to the same length of 30.

Building the model

First we set the model's hyperparameters; in particular, we have to decide on the number of hidden units.

As the figure above shows, the number of hidden units is the dimension of the output of $W[a^{<t-1>}, x^{<t>}]$, denoted n_a; here it is set to 64.

n_a = 64

If we were building a sequence model that is also given the whole input sequence at test time (for example, feeding in a sequence and predicting a label), we could simply build it with Keras's built-in layers. But for a generative model, at generation time we only know the sequence values that came before, not the whole sequence, so we cannot feed everything in at once; we have to generate one time step at a time and then set $x^{\langle t\rangle} = y^{\langle t-1 \rangle}$ to generate the next value, whereas the built-in LSTM layer's logic is to consume the values of all time steps at once and produce an output.

So in the function below we call the LSTM layer inside a for loop, T_x times. Because the LSTM parameters are identical at every time step, we define the layer objects once, globally, and call those same objects in every loop iteration, which shares the parameters.

reshapor = Reshape((1, 78))                     # Used in Step 2.B of djmodel(), below
LSTM_cell = LSTM(n_a, return_state = True)      # return_state also returns the cell state c; used in Step 2.C
densor = Dense(n_values, activation='softmax')  # Used in Step 2.D

The steps to build the model are:

  • Create an empty list outputs to store the output $\hat y$ of every time step

  • for t in 1, …, T_x:

    • Select the t-th time step from the input X
      • Use x = Lambda(lambda x: X[:,t,:])(X); the output shape is (?, 78)
    • Call reshapor to reshape (?, 78) -> (?, 1, 78) so it can be fed to the LSTM layer
    • Call LSTM_cell to get the outputs a and c
      • a, _, c = LSTM_cell(input_x, initial_state=[previous hidden state, previous cell state])
    • Pass the hidden state a through the softmax layer (densor) to get the output
    • Append the output to outputs
def djmodel(Tx, n_a, n_values):
    """
    Implement the model

    Arguments:
    Tx -- length of the sequence in a corpus
    n_a -- the number of activations used in our model
    n_values -- number of unique values in the music data

    Returns:
    model -- a Keras model instance
    """

    # Define the input of your model with a shape
    X = Input(shape=(Tx, n_values))  # in Keras the shape does not include the number of examples m; the first axis is the batch dimension, so X prints as (?, 30, 78)

    # Define s0, initial hidden state for the decoder LSTM
    a0 = Input(shape=(n_a,), name='a0')  # the actual shape is (?, n_a)
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0

    # Step 1: Create empty list to append the outputs while you iterate (≈1 line)
    outputs = []

    # Step 2: Loop
    for t in range(Tx):

        # Step 2.A: select the "t"th time step vector from X.
        x = Lambda(lambda x: X[:, t, :])(X)  # the Lambda layer operates on the whole tensor X and returns X[:, t, :]
        # Step 2.B: Use reshapor to reshape x to be (1, n_values) (≈1 line)
        x = reshapor(x)  # (?, 78) -> (?, 1, 78)
        # Step 2.C: Perform one step of the LSTM_cell
        a, _, c = LSTM_cell(x, initial_state=[a, c])
        # Step 2.D: Apply densor to the hidden state output of LSTM_Cell
        out = densor(a)
        # Step 2.E: add the output to "outputs"
        outputs.append(out)

    # Step 3: Create model instance
    model = Model(inputs=[X, a0, c0], outputs=outputs)

    return model

Create the model:

model = djmodel(Tx = 30 , n_a = 64, n_values = 78)

Compile the model:

opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)  # define the optimizer

model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

Train the model:

m = 60
a0 = np.zeros((m, n_a))
c0 = np.zeros((m, n_a))

model.fit([X, a0, c0], list(Y), epochs=100)
Output:
......
Epoch 100/100
60/60 [==============================] - 0s - loss: 5.9232 - dense_1_loss_1: 3.7652 - dense_1_loss_2: 1.1579 - dense_1_loss_3: 0.3157 - dense_1_loss_4: 0.0825 - dense_1_loss_5: 0.0493 - dense_1_loss_6: 0.0368 - dense_1_loss_7: 0.0311 - dense_1_loss_8: 0.0285 - dense_1_loss_9: 0.0278 - dense_1_loss_10: 0.0222 - dense_1_loss_11: 0.0213 - dense_1_loss_12: 0.0205 - dense_1_loss_13: 0.0194 - dense_1_loss_14: 0.0199 - dense_1_loss_15: 0.0210 - dense_1_loss_16: 0.0206 - dense_1_loss_17: 0.0192 - dense_1_loss_18: 0.0213 - dense_1_loss_19: 0.0212 - dense_1_loss_20: 0.0219 - dense_1_loss_21: 0.0220 - dense_1_loss_22: 0.0202 - dense_1_loss_23: 0.0217 - dense_1_loss_24: 0.0211 - dense_1_loss_25: 0.0227 - dense_1_loss_26: 0.0202 - dense_1_loss_27: 0.0225 - dense_1_loss_28: 0.0245 - dense_1_loss_29: 0.0252 - dense_1_loss_30: 0.0000e+00 - dense_1_acc_1: 0.0500 - dense_1_acc_2: 0.6833 - dense_1_acc_3: 0.9167 - dense_1_acc_4: 1.0000 - dense_1_acc_5: 1.0000 - dense_1_acc_6: 1.0000 - dense_1_acc_7: 1.0000 - dense_1_acc_8: 1.0000 - dense_1_acc_9: 1.0000 - dense_1_acc_10: 1.0000 - dense_1_acc_11: 1.0000 - dense_1_acc_12: 1.0000 - dense_1_acc_13: 1.0000 - dense_1_acc_14: 1.0000 - dense_1_acc_15: 1.0000 - dense_1_acc_16: 1.0000 - dense_1_acc_17: 1.0000 - dense_1_acc_18: 1.0000 - dense_1_acc_19: 1.0000 - dense_1_acc_20: 1.0000 - dense_1_acc_21: 1.0000 - dense_1_acc_22: 1.0000 - dense_1_acc_23: 1.0000 - dense_1_acc_24: 1.0000 - dense_1_acc_25: 1.0000 - dense_1_acc_26: 1.0000 - dense_1_acc_27: 1.0000 - dense_1_acc_28: 1.0000 - dense_1_acc_29: 1.0000 - dense_1_acc_30: 0.0000e+00

Generating music

By now the parameters of the layers reshapor, LSTM_cell and densor have been learned and are stored in those layer objects; we now use these trained layer objects to build a new model for generation (inference).

As with generating dinosaur names, we still need a sampling step, so we add a layer that turns the softmax output back into a one-hot vector.
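The one_hot helper comes from music_utils; a plausible sketch of what it does (my assumption, not necessarily the assignment's exact code) is to take the argmax of the softmax output, re-encode it as a one-hot vector of length 78, and add back the time dimension so it matches the LSTM's expected input shape:

from keras.layers import RepeatVector
from keras import backend as K

def one_hot(x):
    x = K.argmax(x)         # index of the most likely value, shape (m,)  (assumed behaviour)
    x = K.one_hot(x, 78)    # back to a one-hot vector, shape (m, 78)
    x = RepeatVector(1)(x)  # add the time dimension -> (m, 1, 78)
    return x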

def music_inference_model(LSTM_cell, densor, n_values = 78, n_a = 64, Ty = 100):
    """
    Uses the trained "LSTM_cell" and "densor" from model() to generate a sequence of values.

    Arguments:
    LSTM_cell -- the trained "LSTM_cell" from model(), Keras layer object
    densor -- the trained "densor" from model(), Keras layer object
    n_values -- integer, number of unique values
    n_a -- number of units in the LSTM_cell
    Ty -- integer, number of time steps to generate

    Returns:
    inference_model -- Keras model instance
    """

    # Define the input of your model with a shape
    x0 = Input(shape=(1, n_values))

    # Define s0, initial hidden state for the decoder LSTM
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0
    x = x0

    # Step 1: Create an empty list of "outputs" to later store your predicted values (≈1 line)
    outputs = []

    # Step 2: Loop over Ty and generate a value at every time step
    for t in range(Ty):

        # Step 2.A: Perform one step of LSTM_cell (≈1 line)
        a, _, c = LSTM_cell(x, initial_state=[a, c])

        # Step 2.B: Apply Dense layer to the hidden state output of the LSTM_cell (≈1 line)
        out = densor(a)

        # Step 2.C: Append the prediction "out" to "outputs". out.shape = (None, 78) (≈1 line)
        outputs.append(out)

        # Step 2.D: Select the next value according to "out", and set "x" to be the one-hot representation of the
        #           selected value, which will be passed as the input to LSTM_cell on the next step. We have provided
        #           the line of code you need to do this.
        x = Lambda(one_hot)(out)  # the next input is the previous output converted to a one-hot vector

    # Step 3: Create model instance with the correct "inputs" and "outputs" (≈1 line)
    inference_model = Model(inputs=[x0, a0, c0], outputs=outputs)

    return inference_model

A function that calls the inference model to make predictions:

def predict_and_sample(inference_model, x_initializer = x_initializer, a_initializer = a_initializer,
                       c_initializer = c_initializer):
    """
    Predicts the next value of values using the inference model.

    Arguments:
    inference_model -- Keras model instance for inference time
    x_initializer -- numpy array of shape (1, 1, 78), one-hot vector initializing the values generation
    a_initializer -- numpy array of shape (1, n_a), initializing the hidden state of the LSTM_cell
    c_initializer -- numpy array of shape (1, n_a), initializing the cell state of the LSTM_cell

    Returns:
    results -- numpy-array of shape (Ty, 78), matrix of one-hot vectors representing the values generated
    indices -- numpy-array of shape (Ty, 1), matrix of indices representing the values generated
    """

    # Step 1: Use your inference model to predict an output sequence given x_initializer, a_initializer and c_initializer.
    pred = inference_model.predict([x_initializer, a_initializer, c_initializer])
    # Step 2: Convert "pred" into an np.array() of indices with the maximum probabilities
    indices = np.argmax(pred, axis=-1)  # take the index with the largest probability in each softmax output
    # Step 3: Convert indices to one-hot vectors
    results = to_categorical(indices, num_classes=n_values)  # turn the indices back into one-hot vectors

    return results, indices

Let's look at the results:

results, indices = predict_and_sample(inference_model, x_initializer, a_initializer, c_initializer)
print("np.argmax(results[12]) =", np.argmax(results[12]))
print("np.argmax(results[17]) =", np.argmax(results[17]))
print("list(indices[12:18]) =", list(indices[12:18]))

np.argmax(results[12]) = 70
np.argmax(results[17]) = 55
list(indices[12:18]) = [array([70]), array([71]), array([43]), array([2]), array([10]), array([55])]

Generating the music

The raw output needs careful post-processing to sound good; the quality of the generated music depends heavily on the quality of this post-processing.

out_stream = generate_music(inference_model)