Self-Attention PyTorch Code

Implementation details:

1. Embedding layer

2. Positional encoding layer: adds position information

3. MultiHeadAttention layer: the encoder's self-attention (a minimal sketch of the underlying attention operation follows this list)

4. SublayerConnection layer: add & norm, implemented with LayerNorm

5. FeedForward layer: two fully connected layers

6. Masked MultiHeadAttention: the decoder's self-attention layer; a mask is added so that positions after the current one do not enter the computation

7. MultiHeadAttention layer: the encoder output serves as key and value, and the decoder self-attention output serves as query, similar to conventional attention

8. Generator layer: the final linear and softmax, converting the output to probabilities

9. At prediction time, greedy_decode is used, with the first prediction initialized to the start symbol
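Before the full listing, here is a minimal, self-contained sketch of the operation all of the attention layers above reduce to: scaled dot-product self-attention, with an optional "subsequent positions" mask of the kind the decoder uses (items 3 and 6). It is an illustration only; the function name, tensor shapes, and toy input are invented for this example and are not part of the code below.

import math
import torch
import torch.nn.functional as F

def toy_self_attention(x, causal=False):
    # x: (batch, seq_len, d_model); in self-attention, query = key = value = x
    d_k = x.size(-1)
    scores = torch.matmul(x, x.transpose(-2, -1)) / math.sqrt(d_k)  # (batch, seq_len, seq_len)
    if causal:
        # decoder-style mask: position i must not attend to positions > i
        seq_len = x.size(1)
        future = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
        scores = scores.masked_fill(future, -1e9)
    weights = F.softmax(scores, dim=-1)   # attention weights, each row sums to 1
    return torch.matmul(weights, x)

x = torch.randn(2, 5, 16)                         # 2 sequences, length 5, d_model 16
print(toy_self_attention(x, causal=True).shape)   # torch.Size([2, 5, 16])

The full implementation below adds the parts this sketch leaves out: learned linear projections for query/key/value, multiple heads, dropout, residual connections, and layer normalization.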

  1 #!/usr/bin/env python
  2 # coding: utf-8
  3
  4 import numpy as np
  5 import torch
  6 import torch.nn as nn
  7 import torch.nn.functional as F
  8 import math
  9 import copy
 10 import time
 11 from torch.autograd import Variable
 12 import matplotlib.pyplot as plt
 13 import seaborn
 14 seaborn.set_context(context="talk")
 15
 16
 17 class EncoderDecoder(nn.Module):
 18     """
 19     A standard Encoder-Decoder architecture. Base for this and many
 20     other models.
 21     """
 22
 23     def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
 24         super(EncoderDecoder, self).__init__()
 25         self.encoder = encoder
 26         self.decoder = decoder
 27         self.src_embed = src_embed
 28         self.tgt_embed = tgt_embed
 29         self.generator = generator
 30
 31     def forward(self, src, tgt, src_mask, tgt_mask):
 32         "Take in and process masked src and target sequences."
 33         memory = self.encode(src, src_mask)
 34         ret = self.decode(memory, src_mask, tgt, tgt_mask)
 35         return ret
 36
 37     def encode(self, src, src_mask):
 38         src_embedding = self.src_embed(src)
 39         ret = self.encoder(src_embedding, src_mask)
 40         return ret
 41
 42     def decode(self, memory, src_mask, tgt, tgt_mask):
 43         tgt_embedding = self.tgt_embed(tgt)
 44         ret = self.decoder(tgt_embedding, memory, src_mask, tgt_mask)
 45         return ret
 46
 47
 48 class Generator(nn.Module):
 49     "Define standard linear + softmax generation step."
 50
 51     def __init__(self, d_model, vocab):
 52         super(Generator, self).__init__()
 53         self.proj = nn.Linear(d_model, vocab)
 54
 55     def forward(self, x):
 56         return F.log_softmax(self.proj(x), dim=-1)
 57
 58
 59 # The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.
 60
 61 # ## Encoder and Decoder Stacks
 62 # ### Encoder
 63 # The encoder is composed of a stack of $N=6$ identical layers.
 64 def clones(module, N):
 65     "Produce N identical layers."
 66     return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
 67
 68
 69 class Encoder(nn.Module):
 70     "Core encoder is a stack of N layers"
 71
 72     def __init__(self, layer, N):
 73         super(Encoder, self).__init__()
 74         self.layers = clones(layer, N)
 75         self.norm = LayerNorm(layer.size)
 76
 77     def forward(self, x, mask):
 78         "Pass the input (and mask) through each layer in turn."
 79         for layer in self.layers:
 80             x = layer(x, mask)
 81         return self.norm(x)
 82
 83
 84 # Layer normalization [(cite)](https://arxiv.org/abs/1607.06450), applied over the last (feature) dimension.
 85 class LayerNorm(nn.Module):
 86     "Construct a layernorm module (See citation for details)."
 87     def __init__(self, features, eps=1e-6):
 88         super(LayerNorm, self).__init__()
 89         self.a_2 = nn.Parameter(torch.ones(features))
 90         self.b_2 = nn.Parameter(torch.zeros(features))
 91         self.eps = eps
 92
 93     def forward(self, x):
 94         mean = x.mean(-1, keepdim=True)
 95         std = x.std(-1, keepdim=True)
 96         return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
 97
 98
 99 # That is, the output of each sub-layer is $\mathrm{LayerNorm}(x + \mathrm{Sublayer}(x))$, where $\mathrm{Sublayer}(x)$ is the function implemented by the sub-layer itself.  We apply dropout [(cite)](http://jmlr.org/papers/v15/srivastava14a.html) to the output of each sub-layer, before it is added to the sub-layer input and normalized.
100 # To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension $d_{\text{model}}=512$.
101 class SublayerConnection(nn.Module):
102     """
103     A residual connection followed by a layer norm.
104     Note for code simplicity the norm is first as opposed to last.
105     """
106
107     def __init__(self, size, dropout):
108         super(SublayerConnection, self).__init__()
109         self.norm = LayerNorm(size)
110         self.dropout = nn.Dropout(dropout)
111
112     def forward(self, x, sublayer):
113         "Apply residual connection to any sublayer with the same size."
114         ret = x + self.dropout(sublayer(self.norm(x)))
115         return ret
116
117
118 # Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.
119 class EncoderLayer(nn.Module):
120     "Encoder is made up of self-attn and feed forward (defined below)"
121
122     def __init__(self, size, self_attn, feed_forward, dropout):
123         super(EncoderLayer, self).__init__()
124         self.self_attn = self_attn
125         self.feed_forward = feed_forward
126         self.sublayer = clones(SublayerConnection(size, dropout), 2)
127         self.size = size
128
129     def forward(self, x, mask):
130         "Follow Figure 1 (left) for connections."
131         x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
132         # torch.Size([30, 10, 512])
133         ret = self.sublayer[1](x, self.feed_forward)
134         return ret
135
136
137 # ### Decoder
138 # The decoder is also composed of a stack of $N=6$ identical layers.
139 class Decoder(nn.Module):
140     "Generic N layer decoder with masking."
141
142     def __init__(self, layer, N):
143         super(Decoder, self).__init__()
144         self.layers = clones(layer, N)
145         self.norm = LayerNorm(layer.size)
146
147     def forward(self, x, memory, src_mask, tgt_mask):
148         for layer in self.layers:
149             x = layer(x, memory, src_mask, tgt_mask)
150         return self.norm(x)
151
152
153 # In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack.  Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization.
154 class DecoderLayer(nn.Module):
155     "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
156
157     def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
158         super(DecoderLayer, self).__init__()
159         self.size = size
160         self.self_attn = self_attn
161         self.src_attn = src_attn
162         self.feed_forward = feed_forward
163         self.sublayer = clones(SublayerConnection(size, dropout), 3)
164
165     def forward(self, x, memory, src_mask, tgt_mask):
166         "Follow Figure 1 (right) for connections."
167         m = memory
168         x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
169         x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
170         return self.sublayer[2](x, self.feed_forward)
171
172
173 # ### Attention
174 # An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors.  The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.
175 # We call our particular attention "Scaled Dot-Product Attention".   The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$.  We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.
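# In symbols, the function below computes
# $$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
# where Q, K, V are the (projected) query, key and value tensors and $d_k$ is the key dimension.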
176 def attention(query, key, value, mask=None, dropout=None):
177     "Compute 'Scaled Dot Product Attention'"
178     # query,key,value:torch.Size([30, 8, 10, 64])
179     # decoder mask:torch.Size([30, 1, 9, 9])
180     d_k = query.size(-1)
181     key_ = key.transpose(-2, -1)  # torch.Size([30, 8, 64, 10])
182     # torch.Size([30, 8, 10, 10])
183     scores = torch.matmul(query, key_) / math.sqrt(d_k)
184     if mask is not None:
185         # decoder scores:torch.Size([30, 8, 9, 9]),
186         scores = scores.masked_fill(mask == 0, -1e9)
187     p_attn = F.softmax(scores, dim=-1)
188     if dropout is not None:
189         p_attn = dropout(p_attn)
190     return torch.matmul(p_attn, value), p_attn
191
192
193 class MultiHeadedAttention(nn.Module):
194     def __init__(self, h, d_model, dropout=0.1):
195         "Take in model size and number of heads."
196         super(MultiHeadedAttention, self).__init__()
197         assert d_model % h == 0
198         # We assume d_v always equals d_k
199         self.d_k = d_model // h  # 64=512//8
200         self.h = h
201         self.linears = clones(nn.Linear(d_model, d_model), 4)
202         self.attn = None
203         self.dropout = nn.Dropout(p=dropout)
204
205     def forward(self, query, key, value, mask=None):
206         # query,key,value:torch.Size([30, 10, 512])
207         if mask is not None:
208             # Same mask applied to all h heads.
209             mask = mask.unsqueeze(1)
210         nbatches = query.size(0)
211         # 1) Do all the linear projections in batch from d_model => h x d_k
212         query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
213                 for l, x in zip(self.linears, (query, key, value))]  # query,key,value:torch.Size([30, 8, 10, 64])
214         # 2) Apply attention on all the projected vectors in batch.
215         x, self.attn = attention(query, key, value, mask=mask,
216                                  dropout=self.dropout)
217         # 3) "Concat" using a view and apply a final linear.
218         x = x.transpose(1, 2).contiguous().view(
219             nbatches, -1, self.h * self.d_k)
220         ret = self.linears[-1](x)  # torch.Size([30, 10, 512])
221         return ret
222
223
224 # ### Applications of Attention in our Model
225 # The Transformer uses multi-head attention in three different ways:
226 # 1) In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder.   This allows every position in the decoder to attend over all positions in the input sequence.  This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [(cite)](https://arxiv.org/abs/1609.08144).
227 # 2) The encoder contains self-attention layers.  In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder.   Each position in the encoder can attend to all positions in the previous layer of the encoder.
228 # 3) Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position.  We need to prevent leftward information flow in the decoder to preserve the auto-regressive property.  We implement this inside of scaled dot-product attention by masking out (setting to $-\infty$) all values in the input of the softmax which correspond to illegal connections.
229 # ## Position-wise Feed-Forward Networks
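# Each position is transformed identically and independently:
# $$\mathrm{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2$$
# with $d_{\text{model}}=512$ and inner dimension $d_{ff}=2048$; the code below applies dropout after the ReLU.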
230 class PositionwiseFeedForward(nn.Module):
231     "Implements FFN equation."
232
233     def __init__(self, d_model, d_ff, dropout=0.1):
234         super(PositionwiseFeedForward, self).__init__()
235         self.w_1 = nn.Linear(d_model, d_ff)
236         self.w_2 = nn.Linear(d_ff, d_model)
237         self.dropout = nn.Dropout(dropout)
238
239     def forward(self, x):
240         return self.w_2(self.dropout(F.relu(self.w_1(x))))
241
242
243 # ## Embeddings and Softmax
244 # Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension $d_{\text{model}}$.  We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities.  In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [(cite)](https://arxiv.org/abs/1608.05859). In the embedding layers, we multiply those weights by $\sqrt{d_{\text{model}}}$.
245 class Embeddings(nn.Module):
246     def __init__(self, d_model, vocab):
247         super(Embeddings, self).__init__()
248         self.lut = nn.Embedding(vocab, d_model)  # Embedding(11, 512)
249         self.d_model = d_model
250
251     def forward(self, x):
252         return self.lut(x) * math.sqrt(self.d_model)
253
254
255 # ## Positional Encoding
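# Each dimension of the encoding is a sinusoid of a different wavelength:
# $$PE_{(pos, 2i)} = \sin(pos / 10000^{2i/d_{\text{model}}})$$
# $$PE_{(pos, 2i+1)} = \cos(pos / 10000^{2i/d_{\text{model}}})$$
# The div_term below is the $1/10000^{2i/d_{\text{model}}}$ factor computed in log space; the result is
# added to the scaled token embeddings and dropout is applied.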
256 class PositionalEncoding(nn.Module):
257     "Implement the PE function."
258
259     def __init__(self, d_model, dropout, max_len=5000):
260         super(PositionalEncoding, self).__init__()
261         self.dropout = nn.Dropout(p=dropout)
262
263         # Compute the positional encodings once in log space.
264         pe = torch.zeros(max_len, d_model)
265         position = torch.arange(0., max_len).unsqueeze(1)
266         div_term = torch.exp(torch.arange(0., d_model, 2)
267                              * -(math.log(10000.0) / d_model))
268
269         pe[:, 0::2] = torch.sin(position * div_term)
270         pe[:, 1::2] = torch.cos(position * div_term)
271         pe = pe.unsqueeze(0)
272         self.register_buffer('pe', pe)
273
274     def forward(self, x):
275         x = x + Variable(self.pe[:, :x.size(1)],
276                          requires_grad=False)
277         return self.dropout(x)
278
279
280 # We also experimented with using learned positional embeddings [(cite)](https://arxiv.org/pdf/1705.03122.pdf) instead, and found that the two versions produced nearly identical results.  We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.
281 # ## Full Model
282 def make_model(src_vocab, tgt_vocab, N=6,
283                d_model=512, d_ff=2048, h=8, dropout=0.1):
284     "Helper: Construct a model from hyperparameters."
285     c = copy.deepcopy
286     attn = MultiHeadedAttention(h, d_model)
287     ff = PositionwiseFeedForward(d_model, d_ff, dropout)
288     position = PositionalEncoding(d_model, dropout)
289     model = EncoderDecoder(
290         Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
291         Decoder(DecoderLayer(d_model, c(attn), c(attn),
292                              c(ff), dropout), N),
293         nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
294         nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
295         Generator(d_model, tgt_vocab))
296
297     # This was important from their code.
298     # Initialize parameters with Glorot / fan_avg.
299     for p in model.parameters():
300         if p.dim() > 1:
301             nn.init.xavier_uniform_(p)
302     return model
303
304 # We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions.  This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position $i$ can depend only on the known outputs at positions less than $i$.
305
306
307 def subsequent_mask(size):
308     "Mask out subsequent positions when decoding."
309     attn_shape = (1, size, size)
310     subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
311     return torch.from_numpy(subsequent_mask) == 0
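# For example, subsequent_mask(4) yields a (1, 4, 4) lower-triangular mask in which
# position i may attend only to positions <= i:
# [[[1, 0, 0, 0],
#   [1, 1, 0, 0],
#   [1, 1, 1, 0],
#   [1, 1, 1, 1]]]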
312
313 # # Training
314 # This section describes the training regime for our models.
315 # > We stop for a quick interlude to introduce some of the tools
316 # needed to train a standard encoder decoder model. First we define a batch object that holds the src and target sentences for training, as well as constructing the masks.
317 # ## Batches and Masking
318
319
320 class Batch:
321     "Object for holding a batch of data with mask during training."
322
323     def __init__(self, src, trg=None, pad=0):
324         self.src = src
325         self.src_mask = (src != pad).unsqueeze(-2)
326         if trg is not None:
327             self.trg = trg[:, :-1]
328             self.trg_y = trg[:, 1:]
329             self.trg_mask = self.make_std_mask(self.trg, pad)
330             self.ntokens = (self.trg_y != pad).data.sum()
331
332     @staticmethod
333     def make_std_mask(tgt, pad):
334         "Create a mask to hide padding and future words."
335         tgt_mask = (tgt != pad).unsqueeze(-2)
336         tgt_mask = tgt_mask & Variable(
337             subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
338         return tgt_mask
339
340 # Next we create a generic training and scoring function to keep track of loss. We pass in a generic loss compute function that also handles parameter updates.
341 def run_epoch(data_iter, model, loss_compute):
342     "Standard Training and Logging Function"
343     start = time.time()
344     total_tokens = 0
345     total_loss = 0
346     tokens = 0
347     for i, batch in enumerate(data_iter):
348         out = model.forward(batch.src, batch.trg,
349                             batch.src_mask, batch.trg_mask)#torch.Size([30, 10]),torch.Size([30, 9]),torch.Size([30, 1, 10]),torch.Size([30, 9, 9])
350
351
352         loss = loss_compute(out, batch.trg_y, batch.ntokens)
353         total_loss += loss
354         total_tokens += batch.ntokens
355         tokens += batch.ntokens
356         if i % 50 == 1:
357             elapsed = time.time() - start
358             print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
359                   (i, loss / batch.ntokens, tokens / elapsed))
360             start = time.time()
361             tokens = 0
362
363     return total_loss / total_tokens
364
365
366 # ## Optimizer
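# The schedule implemented by NoamOpt.rate() below is
# $$lrate = factor \cdot d_{\text{model}}^{-0.5} \cdot \min(step^{-0.5},\ step \cdot warmup^{-1.5})$$
# i.e. a linear warmup over `warmup` steps followed by inverse-square-root decay.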
367 class NoamOpt:
368     "Optim wrapper that implements rate."
369     def __init__(self, model_size, factor, warmup, optimizer):
370         self.optimizer = optimizer
371         self._step = 0
372         self.warmup = warmup
373         self.factor = factor
374         self.model_size = model_size
375         self._rate = 0
376
377     def step(self):
378         "Update parameters and rate"
379         self._step += 1
380         rate = self.rate()
381         for p in self.optimizer.param_groups:
382             p['lr'] = rate
383         self._rate = rate
384         self.optimizer.step()
385
386     def rate(self, step=None):
387         "Implement `lrate` above"
388         if step is None:
389             step = self._step
390         return self.factor * (self.model_size ** (-0.5) * min(step ** (-0.5), step * self.warmup ** (-1.5)))
391
392 def get_std_opt(model):
393     return NoamOpt(model.src_embed[0].d_model, 2, 4000,
394             torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
395 # Three settings of the lrate hyperparameters.
396 opts = [NoamOpt(512, 1, 4000, None),
397         NoamOpt(512, 1, 8000, None),
398         NoamOpt(256, 1, 4000, None)]
399
400 # ## Regularization
401 # ### Label Smoothing
402 # During training, we employed label smoothing . This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.
403 class LabelSmoothing(nn.Module):
404     "Implement label smoothing."
405     def __init__(self, size, padding_idx, smoothing=0.0):
406         super(LabelSmoothing, self).__init__()
407         self.criterion = nn.KLDivLoss(size_average=False)
408         self.padding_idx = padding_idx
409         self.confidence = 1.0 - smoothing
410         self.smoothing = smoothing
411         self.size = size
412         self.true_dist = None
413
414     def forward(self, x, target):
415         assert x.size(1) == self.size
416         true_dist = x.data.clone()
417         true_dist.fill_(self.smoothing / (self.size - 2))
418         true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
419         true_dist[:, self.padding_idx] = 0
420         mask = torch.nonzero(target.data == self.padding_idx)
421         if mask.dim() > 0:
422             true_dist.index_fill_(0, mask.squeeze(), 0.0)
423         self.true_dist = true_dist
424         return self.criterion(x, Variable(true_dist, requires_grad=False))
425
426
427 # > Here we can see an example of how the mass is distributed to the words based on confidence.
428 # crit = LabelSmoothing(5, 0, 0.4)
429 # predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
430 #                              [0, 0.2, 0.7, 0.1, 0],
431 #                              [0, 0.2, 0.7, 0.1, 0]])
432 # v = crit(Variable(predict.log()),
433 #          Variable(torch.LongTensor([2, 1, 0])))
434
435
436 # crit = LabelSmoothing(5, 0, 0.1)
437 # def loss(x):
438 #     d = x + 3 * 1
439 #     predict = torch.FloatTensor([[0, x / d, 1 / d, 1 / d, 1 / d],
440 #                                  ])
441 #     # print(predict)
442 #     return crit(Variable(predict.log()),
443 #                  Variable(torch.LongTensor([1]))).item()
444
445 # # A First  Example
446 # > We can begin by trying out a simple copy-task. Given a random set of input symbols from a small vocabulary, the goal is to generate back those same symbols.
447 # ## Synthetic Data
448 def data_gen(V, batch, nbatches):
449     "Generate random data for a src-tgt copy task."
450     for i in range(nbatches):
451         data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))#torch.Size([30, 10])
452         data[:, 0] = 1 #start
453         src = Variable(data, requires_grad=False)
454         tgt = Variable(data, requires_grad=False)
455         yield Batch(src, tgt, 0)
456 # data_gen(11,30,20)
457
458
459 # ## Loss Computation
460 class SimpleLossCompute:
461     "A simple loss compute and train function."
462     def __init__(self, generator, criterion, opt=None):
463         self.generator = generator
464         self.criterion = criterion
465         self.opt = opt
466
467     def __call__(self, x, y, norm):
468         x = self.generator(x)
469         loss = self.criterion(x.contiguous().view(-1, x.size(-1)),
470                               y.contiguous().view(-1)) / norm
471         loss.backward()
472         if self.opt is not None:
473             self.opt.step()
474             self.opt.optimizer.zero_grad()
475         return loss.item() * norm
476
477
478 # ## Greedy Decoding
479 # Train the simple copy task.
480 V = 11
481 criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
482 model = make_model(V, V, N=2)
483 model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
484         torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.98), eps=1e-9))
485
486 for epoch in range(5):
487     model.train()
488     run_epoch(data_gen(V, 30, 20), model,
489               SimpleLossCompute(model.generator, criterion, model_opt))
490     model.eval()
491     print(run_epoch(data_gen(V, 30, 5), model,
492                     SimpleLossCompute(model.generator, criterion, None)))
493
494
495 #This code predicts a translation using greedy decoding for simplicity.
496 def greedy_decode(model, src, src_mask, max_len, start_symbol):
497     memory = model.encode(src, src_mask)
498     ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)#fill start symbol
499     for i in range(max_len-1):
500         out = model.decode(memory, src_mask,
501                            Variable(ys),
502                            Variable(subsequent_mask(ys.size(1))
503                                     .type_as(src.data)))
504         prob = model.generator(out[:, -1])
505         _, next_word = torch.max(prob, dim = 1)
506         next_word = next_word.data[0]
507         ys = torch.cat([ys,
508                         torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
509     return ys
510
511 model.eval()
512 src = Variable(torch.LongTensor([[1,2,3,4,5,6,7,8,9,10]]) )
513 src_mask = Variable(torch.ones(1, 1, 10) )
514 print(greedy_decode(model, src, src_mask, max_len=10, start_symbol=1))
515
516
517 '''
518 # # A Real World Example
519 #
520 # > Now we consider a real-world example using the IWSLT German-English Translation task. This task is much smaller than the WMT task considered in the paper, but it illustrates the whole system. We also show how to use multi-gpu processing to make it really fast.
521
522 #!pip install torchtext spacy
523 #!python -m spacy download en
524 #!python -m spacy download de
525
526
527 # ## Training Data and Batching
528 global max_src_in_batch, max_tgt_in_batch
529 def batch_size_fn(new, count, sofar):
530     "Keep augmenting batch and calculate total number of tokens + padding."
531     global max_src_in_batch, max_tgt_in_batch
532     if count == 1:
533         max_src_in_batch = 0
534         max_tgt_in_batch = 0
535     max_src_in_batch = max(max_src_in_batch,  len(new.src))
536     max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
537     src_elements = count * max_src_in_batch
538     tgt_elements = count * max_tgt_in_batch
539
540     return max(src_elements, tgt_elements)
541
542 # ## Data Loading
543 # > We will load the dataset using torchtext and spacy for tokenization.
544
545 # For data loading.
546 from torchtext import data, datasets
547
548 if True:
549     import spacy
550     spacy_de = spacy.load('de')
551     spacy_en = spacy.load('en')
552
553     def tokenize_de(text):
554         return [tok.text for tok in spacy_de.tokenizer(text)]
555
556     def tokenize_en(text):
557         return [tok.text for tok in spacy_en.tokenizer(text)]
558
559     BOS_WORD = '<s>'
560     EOS_WORD = '</s>'
561     BLANK_WORD = "<blank>"
562     SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
563     TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD,
564                      eos_token = EOS_WORD, pad_token=BLANK_WORD)
565
566     MAX_LEN = 100
567     train, val, test = datasets.IWSLT.splits(
568         exts=('.de', '.en'), fields=(SRC, TGT),
569         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
570             len(vars(x)['trg']) <= MAX_LEN)
571     MIN_FREQ = 2
572     SRC.build_vocab(train.src, min_freq=MIN_FREQ)
573     TGT.build_vocab(train.trg, min_freq=MIN_FREQ)
574
575
576 # > Batching matters a ton for speed. We want to have very evenly divided batches, with absolutely minimal padding. To do this we have to hack a bit around the default torchtext batching. This code patches their default batching to make sure we search over enough sentences to find tight batches.
577 # ## Iterators
578
579 class MyIterator(data.Iterator):
580     def create_batches(self):
581         if self.train:
582             def pool(d, random_shuffler):
583                 for p in data.batch(d, self.batch_size * 100):
584                     p_batch = data.batch(
585                         sorted(p, key=self.sort_key),
586                         self.batch_size, self.batch_size_fn)
587                     for b in random_shuffler(list(p_batch)):
588                         yield b
589             self.batches = pool(self.data(), self.random_shuffler)
590
591         else:
592             self.batches = []
593             for b in data.batch(self.data(), self.batch_size,
594                                           self.batch_size_fn):
595                 self.batches.append(sorted(b, key=self.sort_key))
596
597 def rebatch(pad_idx, batch):
598     "Fix order in torchtext to match ours"
599     src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
600     return Batch(src, trg, pad_idx)
601
602
603 # ## Multi-GPU Training
604 # > Finally to really target fast training, we will use multi-gpu. This code implements multi-gpu word generation. It is not specific to transformer so I won't go into too much detail. The idea is to split up word generation at training time into chunks to be processed in parallel across many different gpus. We do this using pytorch parallel primitives:
605 #
606 # * replicate - split modules onto different gpus.
607 # * scatter - split batches onto different gpus
608 # * parallel_apply - apply module to batches on different gpus
609 # * gather - pull scattered data back onto one gpu.
610 # * nn.DataParallel - a special module wrapper that calls these all before evaluating.
611 #
612
613 # Skip if not interested in multigpu.
614 class MultiGPULossCompute:
615     "A multi-gpu loss compute and train function."
616     def __init__(self, generator, criterion, devices, opt=None, chunk_size=5):
617         # Send out to different gpus.
618         self.generator = generator
619         self.criterion = nn.parallel.replicate(criterion,
620                                                devices=devices)
621         self.opt = opt
622         self.devices = devices
623         self.chunk_size = chunk_size
624
625     def __call__(self, out, targets, normalize):
626         total = 0.0
627         generator = nn.parallel.replicate(self.generator,
628                                                 devices=self.devices)
629         out_scatter = nn.parallel.scatter(out,
630                                           target_gpus=self.devices)
631         out_grad = [[] for _ in out_scatter]
632         targets = nn.parallel.scatter(targets,
633                                       target_gpus=self.devices)
634
635         # Divide generating into chunks.
636         chunk_size = self.chunk_size
637         for i in range(0, out_scatter[0].size(1), chunk_size):
638             # Predict distributions
639             out_column = [[Variable(o[:, i:i+chunk_size].data,
640                                     requires_grad=self.opt is not None)]
641                            for o in out_scatter]
642             gen = nn.parallel.parallel_apply(generator, out_column)
643
644             # Compute loss.
645             y = [(g.contiguous().view(-1, g.size(-1)),
646                   t[:, i:i+chunk_size].contiguous().view(-1))
647                  for g, t in zip(gen, targets)]
648             loss = nn.parallel.parallel_apply(self.criterion, y)
649
650             # Sum and normalize loss
651             l = nn.parallel.gather(loss,
652                                    target_device=self.devices[0])
653             l = l.sum()[0] / normalize
654             total += l.data[0]
655
656             # Backprop loss to output of transformer
657             if self.opt is not None:
658                 l.backward()
659                 for j, l in enumerate(loss):
660                     out_grad[j].append(out_column[j][0].grad.data.clone())
661
662         # Backprop all loss through transformer.
663         if self.opt is not None:
664             out_grad = [Variable(torch.cat(og, dim=1)) for og in out_grad]
665             o1 = out
666             o2 = nn.parallel.gather(out_grad,
667                                     target_device=self.devices[0])
668             o1.backward(gradient=o2)
669             self.opt.step()
670             self.opt.optimizer.zero_grad()
671         return total * normalize
672
673
674 # > Now we create our model, criterion, optimizer, data iterators, and parallelization.
675 # GPUs to use
676 devices = [0, 1, 2, 3]
677 if True:
678     pad_idx = TGT.vocab.stoi["<blank>"]
679     model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
680     model.cuda()
681     criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
682     criterion.cuda()
683     BATCH_SIZE = 12000
684     train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0,
685                             repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
686                             batch_size_fn=batch_size_fn, train=True)
687     valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0,
688                             repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
689                             batch_size_fn=batch_size_fn, train=False)
690     model_par = nn.DataParallel(model, device_ids=devices)
691 None
692
693
694 # > Now we train the model. I will play with the warmup steps a bit, but everything else uses the default parameters.  On an AWS p3.8xlarge with 4 Tesla V100s, this runs at ~27,000 tokens per second with a batch size of 12,000
695 # ## Training the System
696 #!wget https://s3.amazonaws.com/opennmt-models/iwslt.pt
697
698 if False:
699     model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
700             torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
701     for epoch in range(10):
702         model_par.train()
703         run_epoch((rebatch(pad_idx, b) for b in train_iter),
704                   model_par,
705                   MultiGPULossCompute(model.generator, criterion,
706                                       devices=devices, opt=model_opt))
707         model_par.eval()
708         loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
709                           model_par,
710                           MultiGPULossCompute(model.generator, criterion,
711                           devices=devices, opt=None))
712         print(loss)
713 else:
714     model = torch.load("iwslt.pt")
715
716
717 # > Once trained we can decode the model to produce a set of translations. Here we simply translate the first sentence in the validation set. This dataset is pretty small so the translations with greedy search are reasonably accurate.
718
719 for i, batch in enumerate(valid_iter):
720     src = batch.src.transpose(0, 1)[:1]
721     src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
722     out = greedy_decode(model, src, src_mask,
723                         max_len=60, start_symbol=TGT.vocab.stoi["<s>"])
724     print("Translation:", end="\t")
725     for i in range(1, out.size(1)):
726         sym = TGT.vocab.itos[out[0, i]]
727         if sym == "</s>": break
728         print(sym, end =" ")
729     print()
730     print("Target:", end="\t")
731     for i in range(1, batch.trg.size(0)):
732         sym = TGT.vocab.itos[batch.trg.data[i, 0]]
733         if sym == "</s>": break
734         print(sym, end =" ")
735     print()
736     break
737
738
739 # # Additional Components: BPE, Search, Averaging
740
741 # > So this mostly covers the transformer model itself. There are four aspects that we didn't cover explicitly. We also have all these additional features implemented in [OpenNMT-py](https://github.com/opennmt/opennmt-py).
742 #
743 #
744
745 # > 1) BPE/ Word-piece: We can use a library to first preprocess the data into subword units. See Rico Sennrich's [subword-nmt](https://github.com/rsennrich/subword-nmt) implementation. These models will transform the training data to look like this:
746 # ▁Die ▁Protokoll datei ▁kann ▁ heimlich ▁per ▁E - Mail ▁oder ▁FTP ▁an ▁einen ▁bestimmte n ▁Empfänger ▁gesendet ▁werden .
747 # > 2) Shared Embeddings: When using BPE with shared vocabulary we can share the same weight vectors between the source / target / generator. See the [(cite)](https://arxiv.org/abs/1608.05859) for details. To add this to the model simply do this:
748
749 if False:
750     model.src_embed[0].lut.weight = model.tgt_embed[0].lut.weight
751     model.generator.proj.weight = model.tgt_embed[0].lut.weight
752
753
754 # > 3) Beam Search: This is a bit too complicated to cover here. See the [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate/Beam.py) for a pytorch implementation.
755 # > 4) Model Averaging: The paper averages the last k checkpoints to create an ensembling effect. We can do this after the fact if we have a bunch of models:
756
757 def average(model, models):
758     "Average the parameters of `models` into `model`."
759     for ps in zip(*[m.parameters() for m in [model] + models]):
760         ps[0].data.copy_(sum(p.data for p in ps[1:]) / len(ps[1:]))
761
762
763 # # Results
764 #
765 # On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)
766 # in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0
767 # BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is
768 # listed in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model
769 # surpasses all previously published models and ensembles, at a fraction of the training cost of any of
770 # the competitive models.
771 #
772 # On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0,
773 # outperforming all of the previously published single models, at less than 1/4 the training cost of the
774 # previous state-of-the-art model. The Transformer (big) model trained for English-to-French used
775 # dropout rate Pdrop = 0.1, instead of 0.3.
776 #
777 #
778
779
780 # > The code we have written here is a version of the base model. There are fully trained versions of this system available here [(Example Models)](http://opennmt.net/Models-py/).
781 # >
782 # > With the additional extensions in the last section, the OpenNMT-py replication gets to 26.9 on EN-DE WMT. Here I have loaded in those parameters to our reimplementation.
783
784 get_ipython().system('wget https://s3.amazonaws.com/opennmt-models/en-de-model.pt')
785
786 model, SRC, TGT = torch.load("en-de-model.pt")
787
788 model.eval()
789 sent = "▁The ▁log ▁file ▁can ▁be ▁sent ▁secret ly ▁with ▁email ▁or ▁FTP ▁to ▁a ▁specified ▁receiver".split()
790 src = torch.LongTensor([[SRC.stoi[w] for w in sent]])
791 src = Variable(src)
792 src_mask = (src != SRC.stoi["<blank>"]).unsqueeze(-2)
793 out = greedy_decode(model, src, src_mask,
794                     max_len=60, start_symbol=TGT.stoi["<s>"])
795 print("Translation:", end="\t")
796 trans = "<s> "
797 for i in range(1, out.size(1)):
798     sym = TGT.itos[out[0, i]]
799     if sym == "</s>": break
800     trans += sym + " "
801 print(trans)
802
803
804 # ## Attention Visualization
805 #
806 # > Even with a greedy decoder the translation looks pretty good. We can further visualize it to see what is happening at each layer of the attention
807
808 tgt_sent = trans.split()
809 def draw(data, x, y, ax):
810     seaborn.heatmap(data,
811                     xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
812                     cbar=False, ax=ax)
813
814 for layer in range(1, 6, 2):
815     fig, axs = plt.subplots(1,4, figsize=(20, 10))
816     print("Encoder Layer", layer+1)
817     for h in range(4):
818         draw(model.encoder.layers[layer].self_attn.attn[0, h].data,
819             sent, sent if h ==0 else [], ax=axs[h])
820     plt.show()
821
822 for layer in range(1, 6, 2):
823     fig, axs = plt.subplots(1,4, figsize=(20, 10))
824     print("Decoder Self Layer", layer+1)
825     for h in range(4):
826         draw(model.decoder.layers[layer].self_attn.attn[0, h].data[:len(tgt_sent), :len(tgt_sent)],
827             tgt_sent, tgt_sent if h ==0 else [], ax=axs[h])
828     plt.show()
829     print("Decoder Src Layer", layer+1)
830     fig, axs = plt.subplots(1,4, figsize=(20, 10))
831     for h in range(4):
832         draw(model.decoder.layers[layer].src_attn.attn[0, h].data[:len(tgt_sent), :len(sent)],
833             sent, tgt_sent if h ==0 else [], ax=axs[h])
834     plt.show()
835
836 ‘‘‘

Original source: https://www.cnblogs.com/buyizhiyou/p/11649817.html
