def read_glove_vecs(glove_file):
    """
    Loads word vectors from a whitespace-separated GloVe text file.

    Each non-empty line is read as a word followed by the components of its vector.

    Arguments:
    glove_file -- path of the GloVe text file to read

    Returns:
    words -- set of all the words found in the file
    word_to_vec_map -- dictionary mapping each word to its vector (numpy array of dtype float64)
    """
    words = set()
    word_to_vec_map = {}

    # Read explicitly as UTF-8 (the encoding the published GloVe files use) so that
    # non-ASCII tokens do not depend on the platform's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline) instead of failing on fields[0].
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec_map
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v.

    It is computed as dot(u, v) / (||u|| * ||v||), where ||.|| is the L2 norm.

    Arguments:
    u -- a word vector of shape (n,)
    v -- a word vector of shape (n,)

    Returns:
    cosine_similarity -- the cosine similarity between u and v (nan if either vector has zero norm)
    """
    # Dot product between u and v
    dot = np.dot(u, v)
    # L2 norm of u
    norm_u = np.sqrt(np.sum(np.square(u)))
    # L2 norm of v
    norm_v = np.sqrt(np.sum(np.square(v)))
    # Cosine of the angle between the two vectors
    cosine_similarity = dot / (norm_u * norm_v)

    return cosine_similarity
要实现 “a is to b as c is to __” 这种类比任务,我们可以从词向量词汇表里面找出这么一个词 d,使得 $e_b - e_a \approx e_d - e_c$。
def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """
    Performs the word analogy task: a is to b as c is to ____.

    Every vocabulary word other than the three input words is scored by its cosine
    similarity to the target vector e_b - e_a + e_c; the best-scoring word is returned.

    Arguments:
    word_a -- a word, string
    word_b -- a word, string
    word_c -- a word, string
    word_to_vec_map -- dictionary that maps words to their corresponding vectors.

    Returns:
    best_word -- the word whose vector has the highest cosine similarity with e_b - e_a + e_c,
                 or None if there is no candidate word

    Raises:
    KeyError -- if one of the (lower-cased) input words is not in word_to_vec_map
    """
    # Convert words to lower case
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()

    # Get the word embeddings e_a, e_b and e_c
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]

    # The target does not depend on the candidate word, so compute it once.
    target = e_b - e_a + e_c
    # The input words themselves must never be returned as the answer.
    excluded = {word_a, word_b, word_c}

    max_cosine_sim = -100  # Below any possible cosine similarity (which lies in [-1, 1])
    best_word = None       # Keeps track of the best candidate seen so far

    # Loop over the whole word vector set
    for w, e_w in word_to_vec_map.items():
        if w in excluded:
            continue

        cosine_sim = cosine_similarity(e_w, target)

        # Keep the candidate with the highest similarity seen so far
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w

    return best_word
# Approximate "gender" direction: the embedding of "woman" minus the embedding of "man".
g = word_to_vec_map['woman'] - word_to_vec_map['man']

print('List of names and their similarities with constructed vector:')

# A mix of girls' and boys' first names
name_list = [
    'john', 'marie', 'sophie', 'ronaldo', 'priya', 'rahul',
    'danielle', 'reza', 'katy', 'yasmin', 'bill',
]

# Print each name next to the cosine similarity between its vector and g.
for w in name_list:
    similarity = cosine_similarity(word_to_vec_map[w], g)
    print(w, similarity)
List of names and their similarities with constructed vector: john -0.23163356146 marie 0.315597935396 sophie 0.318687898594 ronaldo -0.312447968503 priya 0.17632041839 rahul -0.169154710392 danielle 0.243932992163 reza -0.079304296722 katy 0.283106865957 yasmin 0.233138577679 bill -0.0306830313755
print('Other words and their similarities:')

# Ordinary (non-name) words to compare against the direction g.
word_list = [
    'lipstick', 'guns', 'science', 'arts', 'literature', 'warrior',
    'doctor', 'tree', 'receptionist', 'technology', 'fashion', 'teacher',
    'engineer', 'pilot', 'computer', 'singer',
]

# Print each word next to the cosine similarity between its vector and g.
for w in word_list:
    similarity = cosine_similarity(word_to_vec_map[w], g)
    print(w, similarity)
Other words and their similarities: lipstick 0.276919162564 guns -0.18884855679 science -0.0608290654093 arts 0.00818931238588 literature 0.0647250443346 warrior -0.209201646411 doctor 0.118952894109 tree -0.0708939917548 receptionist 0.330779417506 technology -0.131937324476 fashion 0.0356389462577 teacher 0.179209234318 engineer -0.0803928049452 pilot 0.00107644989919 computer -0.103303588739 singer 0.185005181365
def neutralize(word, g, word_to_vec_map):
    """
    Removes the bias of "word" by projecting its vector onto the space orthogonal to the bias axis.

    After neutralizing, the returned vector has no component along g.

    Arguments:
    word -- string indicating the word to debias
    g -- numpy-array of the same shape as the word vectors, corresponding to the bias axis (such as gender)
    word_to_vec_map -- dictionary mapping words to their corresponding vectors.

    Returns:
    e_debiased -- neutralized word vector representation of the input "word"
                  (a new array; the entry in word_to_vec_map is not modified)
    """
    # Select the word vector representation of "word"
    e = word_to_vec_map[word]

    # Component of e along the bias axis: (e . g / ||g||^2) * g
    e_biascomponent = (np.dot(e, g) / np.linalg.norm(g) ** 2) * g

    # Subtracting the bias component leaves the projection orthogonal to g
    e_debiased = e - e_biascomponent

    return e_debiased
e = "receptionist"

# Similarity with the bias direction g before removing the component along g.
similarity_before = cosine_similarity(word_to_vec_map["receptionist"], g)
print("cosine similarity between " + e + " and g, before neutralizing: ", similarity_before)

# Similarity after neutralizing.
e_debiased = neutralize("receptionist", g, word_to_vec_map)
similarity_after = cosine_similarity(e_debiased, g)
print("cosine similarity between " + e + " and g, after neutralizing: ", similarity_after)
cosine similarity between receptionist and g, before neutralizing: 0.330779417506 cosine similarity between receptionist and g, after neutralizing: -3.26732746085e-17
平均化 Equalization algorithm for gender-specific words
def equalize(pair, bias_axis, word_to_vec_map):
    """
    Debias a pair of gender specific words with the equalization method.

    Both returned vectors share the same component orthogonal to the bias axis and
    get opposite-direction components along the bias axis.

    Arguments:
    pair -- pair of strings of gender specific words to debias, e.g. ("actress", "actor")
    bias_axis -- numpy-array of the same shape as the word vectors, corresponding to the bias axis, e.g. gender
    word_to_vec_map -- dictionary mapping words to their corresponding vectors

    Returns:
    e_1 -- equalized word vector corresponding to the first word
    e_2 -- equalized word vector corresponding to the second word

    Note: if a word's bias-axis projection equals that of the pair's mean, the
    normalization below divides by zero and yields nan components.
    """
    # Step 1: Select the word vector representation of both words
    w1, w2 = pair
    e_w1, e_w2 = word_to_vec_map[w1], word_to_vec_map[w2]

    # Step 2: Compute the mean of e_w1 and e_w2
    mu = (e_w1 + e_w2) / 2

    # Squared norm of the bias axis, shared by every projection below
    bias_norm_sq = np.linalg.norm(bias_axis) ** 2

    # Step 3: Project mu onto the bias axis and onto its orthogonal complement
    mu_B = (np.dot(mu, bias_axis) / bias_norm_sq) * bias_axis
    mu_orth = mu - mu_B

    # Step 4: Project each word vector onto the bias axis
    e_w1B = (np.dot(e_w1, bias_axis) / bias_norm_sq) * bias_axis
    e_w2B = (np.dot(e_w2, bias_axis) / bias_norm_sq) * bias_axis

    # Step 5: Rescale the bias components; abs() keeps the square root real
    # when ||mu_orth|| exceeds 1
    scale = np.sqrt(abs(1 - np.linalg.norm(mu_orth) ** 2))
    corrected_e_w1B = scale * ((e_w1B - mu_B) / np.linalg.norm(e_w1B - mu_B))
    corrected_e_w2B = scale * ((e_w2B - mu_B) / np.linalg.norm(e_w2B - mu_B))

    # Step 6: Debias by adding the shared orthogonal component back to each corrected projection
    e1 = corrected_e_w1B + mu_orth
    e2 = corrected_e_w2B + mu_orth

    return e1, e2
def sentence_to_avg(sentence, word_to_vec_map):
    """
    Converts a sentence (string) into a list of lower-case words, looks up the vector
    of each word and averages them into a single vector encoding the sentence.

    Arguments:
    sentence -- string, one training example from X
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation

    Returns:
    avg -- average vector encoding information about the sentence, numpy-array with the
           same shape as the word vectors; a zero vector if the sentence has no words

    Raises:
    KeyError -- if a word of the sentence is not in word_to_vec_map
    """
    # Step 1: Split the sentence into a list of lower case words
    words = sentence.lower().split()

    if not words:
        # Nothing to average: return zeros shaped like the embeddings rather than dividing by zero.
        # Falls back to 50 dimensions only if the map itself is empty.
        shape = next((np.shape(vec) for vec in word_to_vec_map.values()), (50,))
        return np.zeros(shape)

    # Initialize the accumulator with the same shape as the word vectors
    avg = np.zeros(np.shape(word_to_vec_map[words[0]]))

    # Step 2: Sum the word vectors, then divide by the number of words
    for w in words:
        avg += word_to_vec_map[w]
    avg = avg / len(words)

    return avg
def model(X, Y, word_to_vec_map, learning_rate=0.01, num_iterations=400):
    """
    Trains a softmax classifier on averaged word vectors with stochastic gradient descent.

    Arguments:
    X -- input data, numpy array of sentences as strings (X[i] is the i'th sentence)
    Y -- labels, numpy array of integer class indices in [0, n_y), one per sentence
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    learning_rate -- learning_rate for the stochastic gradient descent algorithm
    num_iterations -- number of passes (epochs) over the training set

    Returns:
    pred -- predictions returned by predict() for X using the final W and b
    W -- weight matrix of the softmax layer, of shape (n_y, n_h)
    b -- bias of the softmax layer, of shape (n_y,)
    """
    np.random.seed(1)

    m = Y.shape[0]   # number of training examples
    n_y = 5          # number of classes
    n_h = 50         # dimensions of the GloVe vectors

    # Initialize parameters using Xavier initialization
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))

    # Convert Y to Y_onehot with n_y classes
    Y_oh = convert_to_one_hot(Y, C=n_y)

    pred = None
    last_eval_epoch = None  # epoch at which `pred` was last refreshed

    # Optimization loop
    for t in range(num_iterations):      # one pass per epoch
        for i in range(m):               # parameters are updated after every single example (SGD)
            # Average the word vectors of the words from the i'th training example
            avg = sentence_to_avg(X[i], word_to_vec_map)

            # Forward propagate the avg through the softmax layer
            z = np.dot(W, avg) + b
            a = softmax(z)

            # Cross-entropy of the i'th example only: use its own one-hot row,
            # not the whole Y_oh matrix
            cost = -np.sum(Y_oh[i] * np.log(a))

            # Compute gradients
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y, 1), avg.reshape(1, n_h))
            db = dz

            # Update parameters with Stochastic Gradient Descent
            W = W - learning_rate * dW
            b = b - learning_rate * db

        if t % 100 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(X, Y, W, b, word_to_vec_map)
            last_eval_epoch = t

    # Make sure the returned predictions match the returned parameters: refresh them
    # unless they were already computed at the very last epoch.
    if last_eval_epoch != num_iterations - 1:
        pred = predict(X, Y, W, b, word_to_vec_map)

    return pred, W, b
# Train the averaging + softmax baseline on the training set and show its predictions.
trained = model(X_train, Y_train, word_to_vec_map)
pred, W, b = trained
print(pred)
Training set: Accuracy: 0.977272727273 Test set: Accuracy: 0.857142857143
# Hand-written (sentence, label) examples to probe the trained baseline.
# NOTE(review): "i hate you" carries the same label (0) as "i love you" — confirm this is intended.
my_examples = [
    ("i hate you", 0),
    ("i love you", 0),
    ("funny lol", 2),
    ("lets play with a ball", 1),
    ("food is ready", 4),
    ("not feeling happy", 3),
]
X_my_sentences = np.array([sentence for sentence, _ in my_examples])
Y_my_labels = np.array([[label] for _, label in my_examples])

pred = predict(X_my_sentences, Y_my_labels, W, b, word_to_vec_map)
print_predictions(X_my_sentences, pred)
Accuracy: 0.666666666667
i hate you 😞 i love you ❤️ funny lol 😄 lets play with a ball ⚾ food is ready 🍴 not feeling happy 😄
这个模型没有考虑词语的前后联系,所以会出现 not feeling happy 😄 这种错误,接下来用 LSTM 模型来实现这个任务。
版本 2 - 在 Keras 中使用 LSTM 模型
import numpy as np np.random.seed(0) from keras.models import Model from keras.layers import Dense, Input, Dropout, LSTM, Activation from keras.layers.embeddings import Embedding from keras.preprocessing import sequence from keras.initializers import glorot_uniform np.random.seed(1)
Keras 和 minibatch
数据集中所有的句子长度不是统一的,而在 Keras 中要实现小批量梯度下降,某个 minibatch 中所有句子的长度必须全部相同,这样才能输入 LSTM 层进行训练,为了解决这个问题,我们可以将句子进行填充,以最长的句子为基准,不足的部分用零向量进行填充,假设最长的句子有 20 个词,那么 “I love you” 这个句子的词向量为 $(e_{i}, e_{love}, e_{you}, \vec{0}, \vec{0}, \ldots, \vec{0})$.
在 Keras 中,嵌入矩阵是用类似于嵌入层的形式实现的,输入一个由索引值组成的句子,输出句子每个词的词向量。接下来我们会实现一个用预训练词向量初始化过的嵌入层,由于训练集较小,保持嵌入层参数固定不被训练。
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape is such that it can be given to `Embedding()`.

    Arguments:
    X -- array of sentences (strings); X[i] is the i'th sentence
    word_to_index -- a dictionary mapping each word to its index
    max_len -- maximum number of words in a sentence. Every sentence in X is assumed to be no longer than this.

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)

    Raises:
    KeyError -- if a word is not in word_to_index
    IndexError -- if a sentence has more than max_len words
    """
    m = X.shape[0]  # number of training examples

    # Start from zeros: positions past the end of a sentence keep index 0, i.e. the zero padding
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        # Lower-case the i'th sentence and split it into words
        sentence_words = X[i].lower().split()

        # Set the (i, j)th entry to the index of the j'th word
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index[w]

    return X_indices
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- non-trainable Keras Embedding layer whose weights are the GloVe vectors
    """
    # One extra row on top of the vocabulary size, as the Keras Embedding input dimension
    # (presumably because word indices start at 1, leaving row 0 for padding — TODO confirm)
    vocab_len = len(word_to_index) + 1
    # Dimensionality of the word vectors, taken from any entry rather than a hard-coded word
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Embedding matrix initialized with zeros; its shape must match embedding_layer.get_weights()
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Fill each row with the vector of the word that has that index
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Define the Keras embedding layer; trainable=False keeps the pre-trained vectors fixed
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # Build the layer so that its weights exist before they are overwritten below
    embedding_layer.build((None,))

    # Set the weights of the embedding layer to the pre-trained embedding matrix
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer
def Emojify_V2(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a model instance in Keras
    """
    # Input layer: a sentence given as a sequence of integer word indices
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Embedding layer initialized with the pre-trained word vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagate the indices through the embedding layer
    embeddings = embedding_layer(sentence_indices)

    # First LSTM layer with 128 units; return_sequences=True returns the output of every
    # time step, which the next LSTM layer takes as its input sequence
    X = LSTM(units=128, return_sequences=True)(embeddings)
    # Dropout with rate 0.5
    X = Dropout(0.5)(X)
    # Second LSTM layer with 128 units; by default only the last output is returned
    X = LSTM(units=128)(X)
    # Dropout with rate 0.5
    X = Dropout(0.5)(X)
    # Softmax classifier: a dense layer with 5 units ...
    X = Dense(5)(X)
    # ... followed by the softmax activation
    X = Activation('softmax')(X)

    # Create the model mapping sentence_indices to X.
    # `inputs=` is the keyword that pairs with `outputs=`; `input=` is the legacy spelling.
    model = Model(inputs=sentence_indices, outputs=X)

    return model
让我们看看模型的 summary,maxLen 为 10:
# Build the LSTM classifier for inputs of maxLen word indices and print its layer summary.
# Note: this rebinds the module-level name `model`.
input_shape = (maxLen,)
model = Emojify_V2(input_shape, word_to_vec_map, word_to_index)
model.summary()
32/56 [================>………….] - ETA: 0s Test accuracy = 0.821428562914
C = 5
# np.eye(C) is the C x C identity matrix; indexing it with the flattened labels selects
# one row per label, i.e. the one-hot encoding of every test label.
y_test_oh = np.eye(C)[Y_test.reshape(-1)]
X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)
pred = model.predict(X_test_indices)

# Print every test sentence whose predicted class differs from its label.
for i in range(len(X_test)):
    x = X_test_indices  # not used below
    num = np.argmax(pred[i])  # index of the most probable class
    if num == Y_test[i]:
        continue
    expected_emoji = label_to_emoji(Y_test[i])
    sentence = X_test[i]
    predicted_emoji = label_to_emoji(num).strip()
    print('Expected emoji:' + expected_emoji + ' prediction: ' + sentence + predicted_emoji)
Expected emoji:😄 prediction: she got me a nice present ❤️ Expected emoji:😞 prediction: work is hard 😄 Expected emoji:😞 prediction: This girl is messing with me ❤️ Expected emoji:😞 prediction: work is horrible 😄 Expected emoji:🍴 prediction: any suggestions for dinner 😄 Expected emoji:😄 prediction: you brighten my day ❤️ Expected emoji:😞 prediction: she is a bully 😄 Expected emoji:😞 prediction: My life is so boring ❤️ Expected emoji:😄 prediction: will you be my valentine ❤️ Expected emoji:😞 prediction: go away ⚾ Expected emoji:🍴 prediction: I did not have breakfast ❤️