Coursera Deep Learning Specialization (Andrew Ng) programming assignment (Course 4, Week 4)

Neural Style Transfer

First, import the required packages:

import os
import sys
import scipy.io
import scipy.misc
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from PIL import Image
from nst_utils import *
import numpy as np
import tensorflow as tf

%matplotlib inline

We do transfer learning with a pretrained 19-layer VGG-19 network. Load the pretrained model:

model = load_vgg_model("pretrained-model/imagenet-vgg-verydeep-19.mat")
print(model)
  • The load_vgg_model() function is:

    class CONFIG:
        IMAGE_WIDTH = 400
        IMAGE_HEIGHT = 300
        COLOR_CHANNELS = 3
        NOISE_RATIO = 0.6
        MEANS = np.array([123.68, 116.779, 103.939]).reshape((1,1,1,3))
        VGG_MODEL = 'pretrained-model/imagenet-vgg-verydeep-19.mat' # Pick the VGG 19-layer model from the paper "Very Deep Convolutional Networks for Large-Scale Image Recognition".
        STYLE_IMAGE = 'images/stone_style.jpg' # Style image to use.
        CONTENT_IMAGE = 'images/content300.jpg' # Content image to use.
        OUTPUT_DIR = 'output/'

    def load_vgg_model(path):
        """
        Returns a model for the purpose of 'painting' the picture.
        Takes only the convolution layer weights and wraps them using the TensorFlow
        Conv2d, Relu and AveragePooling layers. VGG actually uses maxpool but
        the paper indicates that using AveragePooling yields better results.
        The last few fully connected layers are not used.
        Here is the detailed configuration of the VGG model:
            0 is conv1_1 (3, 3, 3, 64)
            1 is relu
            2 is conv1_2 (3, 3, 64, 64)
            3 is relu
            4 is maxpool
            5 is conv2_1 (3, 3, 64, 128)
            6 is relu
            7 is conv2_2 (3, 3, 128, 128)
            8 is relu
            9 is maxpool
            10 is conv3_1 (3, 3, 128, 256)
            11 is relu
            12 is conv3_2 (3, 3, 256, 256)
            13 is relu
            14 is conv3_3 (3, 3, 256, 256)
            15 is relu
            16 is conv3_4 (3, 3, 256, 256)
            17 is relu
            18 is maxpool
            19 is conv4_1 (3, 3, 256, 512)
            20 is relu
            21 is conv4_2 (3, 3, 512, 512)
            22 is relu
            23 is conv4_3 (3, 3, 512, 512)
            24 is relu
            25 is conv4_4 (3, 3, 512, 512)
            26 is relu
            27 is maxpool
            28 is conv5_1 (3, 3, 512, 512)
            29 is relu
            30 is conv5_2 (3, 3, 512, 512)
            31 is relu
            32 is conv5_3 (3, 3, 512, 512)
            33 is relu
            34 is conv5_4 (3, 3, 512, 512)
            35 is relu
            36 is maxpool
            37 is fullyconnected (7, 7, 512, 4096)
            38 is relu
            39 is fullyconnected (1, 1, 4096, 4096)
            40 is relu
            41 is fullyconnected (1, 1, 4096, 1000)
            42 is softmax
        """

        vgg = scipy.io.loadmat(path)

        vgg_layers = vgg['layers']

        def _weights(layer, expected_layer_name):
            """
            Return the weights and bias from the VGG model for a given layer.
            """
            wb = vgg_layers[0][layer][0][0][2]
            W = wb[0][0]
            b = wb[0][1]
            layer_name = vgg_layers[0][layer][0][0][0][0]
            assert layer_name == expected_layer_name
            return W, b

        def _relu(conv2d_layer):
            """
            Return the RELU function wrapped over a TensorFlow layer. Expects a
            Conv2d layer input.
            """
            return tf.nn.relu(conv2d_layer)

        def _conv2d(prev_layer, layer, layer_name):
            """
            Return the Conv2D layer using the weights, biases from the VGG
            model at 'layer'.
            """
            W, b = _weights(layer, layer_name)
            W = tf.constant(W)
            b = tf.constant(np.reshape(b, (b.size)))
            return tf.nn.conv2d(prev_layer, filter=W, strides=[1, 1, 1, 1], padding='SAME') + b

        def _conv2d_relu(prev_layer, layer, layer_name):
            """
            Return the Conv2D + RELU layer using the weights, biases from the VGG
            model at 'layer'.
            """
            return _relu(_conv2d(prev_layer, layer, layer_name))

        def _avgpool(prev_layer):
            """
            Return the AveragePooling layer.
            """
            return tf.nn.avg_pool(prev_layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

        # Constructs the graph model.
        graph = {}
        # The input is declared as a tf.Variable so that the optimizer knows to update its pixels.
        graph['input'] = tf.Variable(np.zeros((1, CONFIG.IMAGE_HEIGHT, CONFIG.IMAGE_WIDTH, CONFIG.COLOR_CHANNELS)), dtype = 'float32')
        graph['conv1_1'] = _conv2d_relu(graph['input'], 0, 'conv1_1')
        graph['conv1_2'] = _conv2d_relu(graph['conv1_1'], 2, 'conv1_2')
        graph['avgpool1'] = _avgpool(graph['conv1_2'])
        graph['conv2_1'] = _conv2d_relu(graph['avgpool1'], 5, 'conv2_1')
        graph['conv2_2'] = _conv2d_relu(graph['conv2_1'], 7, 'conv2_2')
        graph['avgpool2'] = _avgpool(graph['conv2_2'])
        graph['conv3_1'] = _conv2d_relu(graph['avgpool2'], 10, 'conv3_1')
        graph['conv3_2'] = _conv2d_relu(graph['conv3_1'], 12, 'conv3_2')
        graph['conv3_3'] = _conv2d_relu(graph['conv3_2'], 14, 'conv3_3')
        graph['conv3_4'] = _conv2d_relu(graph['conv3_3'], 16, 'conv3_4')
        graph['avgpool3'] = _avgpool(graph['conv3_4'])
        graph['conv4_1'] = _conv2d_relu(graph['avgpool3'], 19, 'conv4_1')
        graph['conv4_2'] = _conv2d_relu(graph['conv4_1'], 21, 'conv4_2')
        graph['conv4_3'] = _conv2d_relu(graph['conv4_2'], 23, 'conv4_3')
        graph['conv4_4'] = _conv2d_relu(graph['conv4_3'], 25, 'conv4_4')
        graph['avgpool4'] = _avgpool(graph['conv4_4'])
        graph['conv5_1'] = _conv2d_relu(graph['avgpool4'], 28, 'conv5_1')
        graph['conv5_2'] = _conv2d_relu(graph['conv5_1'], 30, 'conv5_2')
        graph['conv5_3'] = _conv2d_relu(graph['conv5_2'], 32, 'conv5_3')
        graph['conv5_4'] = _conv2d_relu(graph['conv5_3'], 34, 'conv5_4')
        graph['avgpool5'] = _avgpool(graph['conv5_4'])

        return graph

The model is stored in a Python dict whose keys are the layer names and whose values are the corresponding layer tensors.

We can assign a value to a layer, for example to the input:

model["input"].assign(image)

To evaluate the activations of a particular layer, we can run:

sess.run(model["conv4_2"])

Content cost function

The content cost measures the gap between a_C, the activations of one chosen layer when the content image C is forward-propagated, and a_G, the activations of the same layer when the generated image G is forward-propagated. In general, a layer in the middle of the network represents the content of an image best.
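In formula form (this is the "equation 1" that the docstring below refers to, reconstructed from the implementation):

$$J_{content}(C, G) = \frac{1}{4 \times n_H \times n_W \times n_C} \sum_{\text{all entries}} \left(a_C - a_G\right)^2$$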

def compute_content_cost(a_C, a_G):
    """
    Computes the content cost

    Arguments:
    a_C -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing content of the image C
    a_G -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing content of the image G

    Returns:
    J_content -- scalar that you compute using equation 1 above.
    """

    # Retrieve dimensions from a_G (≈1 line)
    m, n_H, n_W, n_C = a_G.get_shape().as_list()

    # Unroll a_C and a_G so the squared difference can be summed over all entries (≈2 lines)
    a_C_unrolled = tf.reshape(a_C, [n_H * n_W, n_C])
    a_G_unrolled = tf.reshape(a_G, [n_H * n_W, n_C])

    # compute the cost with tensorflow (≈1 line)
    J_content = (1 / (4 * n_W * n_H * n_C)) * tf.reduce_sum(tf.square(tf.subtract(a_C_unrolled, a_G_unrolled)))

    return J_content

Style cost function

Style matrix

The style matrix captures the style of an image. It is the Gram matrix of the activations of a given layer, i.e. the correlations between the channels of that layer. Concretely: first unroll the layer's activations into a 2-D matrix; the style matrix is then that matrix multiplied by its own transpose.

The diagonal of the style matrix is the correlation of each channel with itself, which measures how active that channel is.
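Concretely, if $A$ is the unrolled activation matrix of shape $(n_C, n_H \times n_W)$, the Gram (style) matrix computed below is:

$$G_A = A A^{\top}, \qquad (G_A)_{ij} = \sum_{k=1}^{n_H n_W} A_{ik} A_{jk}$$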

# Build the style (Gram) matrix
def gram_matrix(A):
    """
    Argument:
    A -- matrix of shape (n_C, n_H*n_W)

    Returns:
    GA -- Gram matrix of A, of shape (n_C, n_C)
    """

    GA = tf.matmul(A, A, transpose_b=True)

    return GA

Style cost function

For a given layer, the style cost is the "distance" between the style matrix of the generated image at that layer and the style matrix of the style image at that layer.
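Written out (the "equation (2)" referenced in the docstring below):

$$J_{style}^{[l]}(S, G) = \frac{1}{4\, n_C^2\, (n_H \times n_W)^2} \sum_{i=1}^{n_C} \sum_{j=1}^{n_C} \left(G^{(S)}_{ij} - G^{(G)}_{ij}\right)^2$$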

# Style cost for a single layer
def compute_layer_style_cost(a_S, a_G):
    """
    Arguments:
    a_S -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing style of the image S
    a_G -- tensor of dimension (1, n_H, n_W, n_C), hidden layer activations representing style of the image G

    Returns:
    J_style_layer -- tensor representing a scalar value, style cost defined above by equation (2)
    """

    # Retrieve dimensions from a_G
    m, n_H, n_W, n_C = a_G.get_shape().as_list()

    # Reshape the activations to have shape (n_C, n_H*n_W).
    # tf.reshape fills values starting from the last dimension of (n_H, n_W, n_C): it first takes the n_C
    # channel values of row 1, column 1, then row 1, column 2, and so on, row by row. So the activations
    # can only be reshaped directly into (n_H*n_W, n_C); a transpose then gives (n_C, n_H*n_W).
    a_S = tf.transpose(tf.reshape(a_S, [n_H*n_W, n_C]))
    a_G = tf.transpose(tf.reshape(a_G, [n_H*n_W, n_C]))

    # Computing gram_matrices for both images S and G
    GS = gram_matrix(a_S)
    GG = gram_matrix(a_G)

    # Computing the loss
    J_style_layer = (1 / (4 * (n_C ** 2) * (n_H*n_W) ** 2)) * tf.reduce_sum(tf.square(tf.subtract(GS, GG)))

    return J_style_layer

Finally, the overall style cost combines the per-layer style costs with different weights. The (layer, weight) pairs are stored in a list:

STYLE_LAYERS = [
('conv1_1', 0.2),
('conv2_1', 0.2),
('conv3_1', 0.2),
('conv4_1', 0.2),
('conv5_1', 0.2)]

Adding the weights, the overall style cost combines the per-layer costs.
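In formula form, with $\lambda^{[l]}$ the coefficients listed in STYLE_LAYERS:

$$J_{style}(S, G) = \sum_{l} \lambda^{[l]}\, J_{style}^{[l]}(S, G)$$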

def compute_style_cost(model, STYLE_LAYERS):
    """
    Computes the overall style cost from several chosen layers

    Arguments:
    model -- our tensorflow model
    STYLE_LAYERS -- A python list containing:
    - the names of the layers we would like to extract style from
    - a coefficient for each of them

    Returns:
    J_style -- tensor representing a scalar value, style cost defined above by equation (2)
    """

    # initialize the overall style cost
    J_style = 0

    for layer_name, coeff in STYLE_LAYERS:

        # Select the output tensor of the currently selected layer
        out = model[layer_name]

        # Set a_S to be the hidden layer activation from the layer we have selected, by running the session on out
        a_S = sess.run(out) # a_S is evaluated right away (the style image is currently assigned as the model input)

        # Set a_G to be the hidden layer activation from same layer. Here, a_G references model[layer_name]
        # and isn't evaluated yet. Later in the code, we'll assign the image G as the model input, so that
        # when we run the session, this will be the activations drawn from the appropriate layer, with G as input.
        a_G = out # a_G is left unevaluated, because G is only assigned to the model input later

        # Compute style_cost for the current layer
        J_style_layer = compute_layer_style_cost(a_S, a_G)

        # Add coeff * J_style_layer of this layer to overall style cost
        J_style += coeff * J_style_layer

    return J_style

Total cost function
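The total cost weighs the content cost against the style cost (this is the formula the docstring below refers to):

$$J(G) = \alpha\, J_{content}(C, G) + \beta\, J_{style}(S, G)$$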

def total_cost(J_content, J_style, alpha = 10, beta = 40):
    """
    Computes the total cost function

    Arguments:
    J_content -- content cost coded above
    J_style -- style cost coded above
    alpha -- hyperparameter weighting the importance of the content cost
    beta -- hyperparameter weighting the importance of the style cost

    Returns:
    J -- total cost as defined by the formula above.
    """

    J = alpha * J_content + beta * J_style

    return J

Training

The steps are:

  • Create a session
  • Load the content image C
  • Load the style image S
  • Randomly initialize the image G to be generated
  • Load the pretrained VGG model
  • Build the TensorFlow graph:
    • run the content image through the VGG model and compute the content cost
    • run the style image through the VGG model and compute the style cost
    • compute the total cost
    • define the optimizer and the learning rate
  • Initialize the graph and run it for a large number of iterations, updating the generated image G at every step

Create a session:

# Reset the default graph
tf.reset_default_graph()

# Start an interactive session: it installs itself as the default session, so variables and tensors
# can be evaluated without an explicit "with tf.Session() as sess:" block, which is more convenient
sess = tf.InteractiveSession()

Load the content image C, then reshape and normalize it:

content_image = scipy.misc.imread("images/1.jpg")
content_image = reshape_and_normalize_image(content_image)
  • The normalization function:

    def reshape_and_normalize_image(image):
        """
        Reshape and normalize the input image (content or style)
        """

        # Reshape image to match the expected input of VGG-19: (1,) + (300, 400, 3) = (1, 300, 400, 3)
        image = np.reshape(image, ((1,) + image.shape))

        # Subtract the mean to match the expected input of VGG-19
        image = image - CONFIG.MEANS

        return image

Load the style image S and normalize it:

style_image = scipy.misc.imread("images/2.jpg")
style_image = reshape_and_normalize_image(style_image)

Now initialize the generated image G with noise. Although each pixel is random noise, it is still correlated with the content image C: the noise and C are blended with a fixed weight, so that when the pixels of G are updated they match the content image more quickly.

generated_image = generate_noise_image(content_image)
imshow(generated_image[0])
  • The noise-image generation function:

    def generate_noise_image(content_image, noise_ratio = CONFIG.NOISE_RATIO):
        """
        Generates a noisy image by adding random noise to the content_image
        """

        # Generate a random noise_image
        noise_image = np.random.uniform(-20, 20, (1, CONFIG.IMAGE_HEIGHT, CONFIG.IMAGE_WIDTH, CONFIG.COLOR_CHANNELS)).astype('float32')

        # Set the input_image to be a weighted average of the content_image and a noise_image
        input_image = noise_image * noise_ratio + content_image * (1 - noise_ratio)

        return input_image

Load the pretrained model:

model = load_vgg_model("pretrained-model/imagenet-vgg-verydeep-19.mat")

Build the computation graph:

  1. Pick one layer and compute the content cost

    # Assign the content image to be the input of the VGG model.  
    sess.run(model['input'].assign(content_image))

    # Select the output tensor of layer conv4_2
    out = model['conv4_2']

    # Set a_C to be the hidden layer activation from the layer we have selected
    a_C = sess.run(out)

    # Set a_G to be the hidden layer activation from same layer. Here, a_G references model['conv4_2']
    # and isn't evaluated yet. Later in the code, we'll assign the image G as the model input, so that
    # when we run the session, this will be the activations drawn from the appropriate layer, with G as input.
    a_G = out

    # Compute the content cost
    J_content = compute_content_cost(a_C, a_G)
  2. Compute the style cost

    # Assign the input of the model to be the "style" image 
    sess.run(model['input'].assign(style_image))

    # Compute the style cost
    J_style = compute_style_cost(model, STYLE_LAYERS)
  3. Compute the total cost

    # Total cost
    J = total_cost(J_content, J_style, alpha = 10, beta = 80)
  4. Define the optimizer

    # define optimizer
    optimizer = tf.train.AdamOptimizer(2.0)

    # define train_step: the objective to minimize is the total cost J
    train_step = optimizer.minimize(J)

Now run the training:

def model_nn(sess, input_image, num_iterations = 200):

    # Initialize global variables
    tf.global_variables_initializer().run()

    # The model input is a tf.Variable; assign the initial (noisy) image G to it
    sess.run(model['input'].assign(input_image))

    for i in range(num_iterations):

        # Run the optimizer to update the pixels of the generated image G
        sess.run(train_step)

        # Retrieve the current generated image
        generated_image = sess.run(model['input'])

        # Print every 20 iterations.
        if i%20 == 0:
            Jt, Jc, Js = sess.run([J, J_content, J_style])
            print("Iteration " + str(i) + " :")
            print("total cost = " + str(Jt))
            print("content cost = " + str(Jc))
            print("style cost = " + str(Js))

            # save current generated image in the "/output" directory
            save_image("output/" + str(i) + ".png", generated_image)

    # save last generated image
    save_image('output/generated_image.jpg', generated_image)

    return generated_image
  • The image-saving function:

    def save_image(path, image):

        # Un-normalize the image so that it looks good (add the mean back)
        image = image + CONFIG.MEANS

        # Clip the pixel values to the range 0~255 and save the image
        image = np.clip(image[0], 0, 255).astype('uint8')
        scipy.misc.imsave(path, image)

Testing

model_nn(sess, generated_image)
result_image = scipy.misc.imread("output/generated_image.jpg")
imshow(result_image)

Conclusion

What you should remember:

  • Neural Style Transfer is an algorithm that, given a content image C and a style image S, can generate an artistic image
  • It uses representations (hidden layer activations) based on a pretrained ConvNet.
  • The content cost function is computed using one hidden layer’s activations.
  • The style cost function for one layer is computed using the Gram matrix of that layer’s activations. The overall style cost function is obtained using several hidden layers.
  • Optimizing the total cost function results in synthesizing new images.

Face Recognition

  • Use the triplet loss function
  • Use a pretrained model to encode face images
  • Use these encodings to implement face verification and face recognition

As usual, import the required packages first:

from keras.models import Sequential
from keras.layers import Conv2D, ZeroPadding2D, Activation, Input, concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import MaxPooling2D, AveragePooling2D
from keras.layers.merge import Concatenate
from keras.layers.core import Lambda, Flatten, Dense
from keras.initializers import glorot_uniform
from keras.engine.topology import Layer
from keras import backend as K
K.set_image_data_format('channels_first') # set the image data format to channels-first
import cv2
import os
import numpy as np
from numpy import genfromtxt
import pandas as pd
import tensorflow as tf
from fr_utils import *
from inception_blocks_v2 import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

np.set_printoptions(threshold=np.nan)

Triplet loss

Because we use a pretrained model, we do not need to optimize the triplet loss ourselves, but it is still worth knowing how to implement it.

  • A is the anchor image, P a positive example, N a negative example, and $\alpha$ is a margin
  • f() denotes the vector output by the model for an image, i.e. its encoding
  • $[ \ \ ]_+$ means taking the larger of the expression and 0
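Putting these pieces together, the triplet loss (formula (3) referenced in the docstring below, reconstructed here from the implementation) is:

$$\mathcal{J} = \sum_{i=1}^{m} \left[ \left\| f(A^{(i)}) - f(P^{(i)}) \right\|_2^2 - \left\| f(A^{(i)}) - f(N^{(i)}) \right\|_2^2 + \alpha \right]_+$$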

To use it in the compile step of a Keras model, we define the custom loss following the signature Keras expects for loss functions:

def triplet_loss(y_true, y_pred, alpha = 0.2):
    """
    Implementation of the triplet loss as defined by formula (3)

    Arguments:
    y_true -- true labels; not actually used here, but required by the Keras loss-function interface
    y_pred -- python list containing three objects:
            anchor -- the encodings for the anchor images, of shape (None, 128)
            positive -- the encodings for the positive images, of shape (None, 128)
            negative -- the encodings for the negative images, of shape (None, 128)

    Returns:
    loss -- real number, value of the loss
    """

    anchor, positive, negative = y_pred[0], y_pred[1], y_pred[2]

    # Step 1: Compute the (encoding) distance between the anchor and the positive, you will need to sum over axis=-1
    pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, positive)), axis = -1)
    # Step 2: Compute the (encoding) distance between the anchor and the negative, you will need to sum over axis=-1
    neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor, negative)), axis = -1)
    # Step 3: subtract the two previous distances and add alpha.
    basic_loss = pos_dist - neg_dist + alpha
    # Step 4: Take the maximum of basic_loss and 0.0. Sum over the training examples.
    loss = tf.reduce_sum(tf.maximum(basic_loss, 0))

    return loss

Loading the pretrained model

We use an Inception network that has already been trained by others to encode the face images.

  • Input: images of shape $(m, n_C, n_H, n_W) = (m, 3, 96, 96)$
  • Output: image encodings of shape $(m, 128)$
FRmodel = faceRecoModel(input_shape=(3, 96, 96))

Compile the model and load the weights:

FRmodel.compile(optimizer = 'adam', loss = triplet_loss, metrics = ['accuracy'])
load_weights_from_FaceNet(FRmodel)

For the details of these helper functions, see iwantooxxoox's GitHub.

Face recognition and verification

First we build a database that stores, for every person to be recognized or verified, the encoding of their photo produced by the network.

database = {}
database["danielle"] = img_to_encoding("images/danielle.png", FRmodel)
database["younes"] = img_to_encoding("images/younes.jpg", FRmodel)
database["tian"] = img_to_encoding("images/tian.jpg", FRmodel)
database["andrew"] = img_to_encoding("images/andrew.jpg", FRmodel)
database["kian"] = img_to_encoding("images/kian.jpg", FRmodel)
database["dan"] = img_to_encoding("images/dan.jpg", FRmodel)
database["sebastiano"] = img_to_encoding("images/sebastiano.jpg", FRmodel)
database["bertrand"] = img_to_encoding("images/bertrand.jpg", FRmodel)
database["kevin"] = img_to_encoding("images/kevin.jpg", FRmodel)
database["felix"] = img_to_encoding("images/felix.jpg", FRmodel)
database["benoit"] = img_to_encoding("images/benoit.jpg", FRmodel)
database["arnaud"] = img_to_encoding("images/arnaud.jpg", FRmodel)

The function that computes an encoding:

def img_to_encoding(image_path, model):
    img1 = cv2.imread(image_path, 1)                                 # read the image (BGR channel order)
    img = img1[...,::-1]                                             # convert BGR to RGB
    img = np.around(np.transpose(img, (2,0,1))/255.0, decimals=12)   # channels-first, scaled to [0, 1]
    x_train = np.array([img])                                        # add the batch dimension
    embedding = model.predict_on_batch(x_train)
    return embedding

Face verification

Face verification means supplying both a face photo and an ID at check time and confirming whether the photo really is that person: a 1:1 matching problem.

def verify(image_path, identity, database, model):
    """
    Function that verifies if the person on the "image_path" image is "identity".

    Arguments:
    image_path -- path to an image
    identity -- string, name of the person you'd like to verify the identity. Has to be a resident of the Happy house.
    database -- python dictionary mapping names of allowed people's names (strings) to their encodings (vectors).
    model -- your Inception model instance in Keras

    Returns:
    dist -- distance between the image_path and the image of "identity" in the database.
    door_open -- True, if the door should open. False otherwise.
    """

    # Step 1: Compute the encoding for the image. Use img_to_encoding() see example above.
    encoding = img_to_encoding(image_path, model)

    # Step 2: Compute distance with identity's image
    dist = np.linalg.norm(encoding - database[identity])

    # Step 3: Open the door if dist < 0.7, else don't open
    if dist < 0.7:
        print("It's " + str(identity) + ", welcome home!")
        door_open = True
    else:
        print("It's not " + str(identity) + ", please go away")
        door_open = False

    return dist, door_open
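A minimal usage sketch (the query image path here is just a placeholder, not a file shipped with the assignment):

dist, door_open = verify("images/visitor_photo.jpg", "younes", database, FRmodel)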

Face recognition

Face recognition no longer requires an ID: only a face photo is given. Its encoding is compared against all encodings in the database to find the smallest distance; if that distance is below a threshold, we conclude the person is the corresponding entry in the database.

def who_is_it(image_path, database, model):
    """
    Implements face recognition for the happy house by finding who is the person on the image_path image.

    Arguments:
    image_path -- path to an image
    database -- database containing image encodings along with the name of the person on the image
    model -- your Inception model instance in Keras

    Returns:
    min_dist -- the minimum distance between image_path encoding and the encodings from the database
    identity -- string, the name prediction for the person on image_path
    """

    ## Step 1: Compute the target "encoding" for the image. Use img_to_encoding() see example above.
    encoding = img_to_encoding(image_path, model)

    ## Step 2: Find the closest encoding (a linear scan for the minimum distance)

    # Initialize "min_dist" to a large value, say 100
    min_dist = 100

    # Loop over the database dictionary's names and encodings.
    for (name, db_enc) in database.items(): # use .items() to iterate over keys and values together

        # Compute L2 distance between the target "encoding" and the current "emb" from the database.
        dist = np.linalg.norm(encoding - db_enc)

        # If this distance is less than the min_dist, then set min_dist to dist, and identity to name.
        if dist < min_dist:
            min_dist = dist
            identity = name

    if min_dist > 0.7:
        print("Not in the database.")
    else:
        print ("it's " + str(identity) + ", the distance is " + str(min_dist))

    return min_dist, identity
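Similarly for recognition, again with a placeholder query image:

min_dist, identity = who_is_it("images/visitor_photo.jpg", database, FRmodel)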

Improving the algorithm

  • Put more images of each person (under different lighting conditions, taken on different days, etc.) into the database. Then, given a new image, compare the new face to multiple pictures of the person. This would increase accuracy.
  • Crop the images to just contain the face, and less of the “border” region around the face. This preprocessing removes some of the irrelevant pixels around the face, and also makes the algorithm more robust.

Conclusion

  • Face verification solves an easier 1:1 matching problem; face recognition addresses a harder 1:K matching problem.
  • The triplet loss is an effective loss function for training a neural network to learn an encoding of a face image.
  • The same encoding can be used for verification and recognition. Measuring distances between two images’ encodings allows you to determine whether they are pictures of the same person.