0%

TensorFlow_seq2seq_attention_wrapper源码阅读

attention_wrapper

tensorflow/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py

def _luong_score(query, keys, scale):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def _luong_score(query, keys, scale):
    """Implements Luong-style (multiplicative) scoring function.

    This attention has two forms. The first is standard Luong attention,
    as described in:

    Minh-Thang Luong, Hieu Pham, Christopher D. Manning.
    "Effective Approaches to Attention-based Neural Machine Translation."
    EMNLP 2015. https://arxiv.org/abs/1508.04025

    The second is the scaled form inspired partly by the normalized form of
    Bahdanau attention.

    To enable the second form, call this function with `scale=True`.

    Args:
      query: Tensor, shape `[batch_size, num_units]` to compare to keys.
      keys: Processed memory, shape `[batch_size, max_time, num_units]`.
      scale: Whether to apply a scale to the score function.

    Returns:
      A `[batch_size, max_time]` tensor of unnormalized score values.

    Raises:
      ValueError: If `keys` and `query` depths do not match.
    """
    # The trailing (units) dimension of both inputs must agree, otherwise the
    # batched inner product below is not defined.
    depth = query.get_shape()[-1]
    key_units = keys.get_shape()[-1]
    if depth != key_units:
        raise ValueError(
            "Incompatible or unknown inner dimensions between query and keys. "
            "Query (%s) has units: %s. Keys (%s) have units: %s. "
            "Perhaps you need to set num_units to the keys' dimension (%s)?"
            % (query, depth, keys, key_units, key_units))
    # Captured before `query` is rebound so the scale variable shares its dtype.
    dtype = query.dtype

    # Reshape from [batch_size, depth] to [batch_size, 1, depth]
    # for matmul.
    query = array_ops.expand_dims(query, 1)

    # Inner product along the query units dimension.
    # matmul shapes: query is [batch_size, 1, depth] and
    #                keys is [batch_size, max_time, depth].
    # the inner product is asked to **transpose keys' inner shape** to get a
    # batched matmul on:
    #   [batch_size, 1, depth] . [batch_size, depth, max_time]
    # resulting in an output shape of:
    #   [batch_size, 1, max_time].
    # we then squeeze out the center singleton dimension.
    score = math_ops.matmul(query, keys, transpose_b=True)
    score = array_ops.squeeze(score, [1])

    if scale:
        # Scalar variable "attention_g" used in weight scaling; it is created
        # with `ones_initializer`, so the score is multiplied by this variable
        # rather than by a fixed constant.
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype,
            initializer=init_ops.ones_initializer, shape=())
        score = g * score
    return score

def _bahdanau_score(processed_query, keys, normalize):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def _bahdanau_score(processed_query, keys, normalize):
    """Implements Bahdanau-style (additive) scoring function.

    This attention has two forms. The first is Bahdanau attention,
    as described in:

    Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
    "Neural Machine Translation by Jointly Learning to Align and Translate."
    ICLR 2015. https://arxiv.org/abs/1409.0473

    The second is the normalized form. This form is inspired by the
    weight normalization article:

    Tim Salimans, Diederik P. Kingma.
    "Weight Normalization: A Simple Reparameterization to Accelerate
     Training of Deep Neural Networks."
    https://arxiv.org/abs/1602.07868

    To enable the second form, set `normalize=True`.

    Args:
      processed_query: Tensor, shape `[batch_size, num_units]` to compare to
        keys.
      keys: Processed memory, shape `[batch_size, max_time, num_units]`.
      normalize: Whether to normalize the score function.

    Returns:
      A `[batch_size, max_time]` tensor of unnormalized score values.
    """
    dtype = processed_query.dtype
    # Get the number of hidden units from the trailing dimension of keys.
    # Falls back to the dynamic shape when the static lookup yields a falsy
    # value.
    # NOTE(review): `num_units` is later used as a variable shape and inside
    # `math.sqrt`; confirm the dynamic-shape fallback is valid for those uses.
    num_units = tensor_shape.dimension_value(
        keys.shape[2]) or array_ops.shape(keys)[2]
    # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
    processed_query = array_ops.expand_dims(processed_query, 1)
    v = variable_scope.get_variable(
        "attention_v", [num_units], dtype=dtype)
    if normalize:
        # Scalar used in weight normalization, initialised to
        # sqrt(1 / num_units).
        g = variable_scope.get_variable(
            "attention_g", dtype=dtype,
            initializer=init_ops.constant_initializer(
                math.sqrt((1. / num_units))),
            shape=())
        # Bias added prior to the nonlinearity
        b = variable_scope.get_variable(
            "attention_b", [num_units], dtype=dtype,
            initializer=init_ops.zeros_initializer())
        # normed_v = g * v / ||v||
        normed_v = g * v * math_ops.rsqrt(
            math_ops.reduce_sum(math_ops.square(v)))
        # Reduce over the units axis (axis 2) to get one score per time step.
        return math_ops.reduce_sum(
            normed_v * math_ops.tanh(keys + processed_query + b), [2])
    else:
        # Unnormalized form: no bias term and `v` is used as-is.
        return math_ops.reduce_sum(
            v * math_ops.tanh(keys + processed_query), [2])
本站所有文章和源码均免费开放，如您喜欢，可以请我喝杯咖啡。