Source code for aitemplate.frontend.nn.attention

#  Copyright (c) Meta Platforms, Inc. and affiliates.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
"""
Frontend for attention module
"""

from aitemplate.compiler import ops
from aitemplate.compiler.ops import flash_attention
from aitemplate.compiler.ops.common.epilogue import FuncEnum
from aitemplate.frontend import Tensor
from aitemplate.frontend.nn.dropout import Dropout
from aitemplate.frontend.nn.linear import Linear
from aitemplate.frontend.nn.module import Module
from aitemplate.frontend.nn.parameter import Parameter
from aitemplate.testing import detect_target


class FlashAttention(Module):
    r"""FlashAttention provides an implementation of a fused multi-head attention module:

    .. math::
        \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d}}\right)V

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
    """

    def __init__(
        self,
        batch_size,
        max_seq_len,
        dropout=0,
        causal=False,
        dtype="float16",
    ):
        """Initialize the attention module and create a tensor for the cumulative sequence lengths."""
        super().__init__()
        self.cu_length = Parameter(shape=[batch_size + 1], dtype="int32")
        self.op = flash_attention(
            batch_size=batch_size,
            dropout=dropout,
            max_seq_len=max_seq_len,
            causal=causal,
        )

    def forward(self, *args):
        """Forward pass for calling the attention op."""
        assert len(args) == 1
        x = args[0]
        return self.op(x, self.cu_length.tensor())
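
# Illustrative usage sketch (not part of the original module). The packed-QKV
# input layout (batch_size * seq_len, 3, num_heads, head_dim) is an assumption
# based on the comments in MultiheadAttention below; the tensor name and sizes
# are hypothetical. cu_length is a module parameter that is bound to concrete
# cumulative-sequence-length data when the compiled model is run.
def _example_flash_attention(batch_size=2, seq_len=64, num_heads=8, head_dim=64):
    attn = FlashAttention(batch_size=batch_size, max_seq_len=seq_len, causal=False)
    qkv = Tensor(
        shape=[batch_size * seq_len, 3, num_heads, head_dim],
        dtype="float16",
        name="qkv",
        is_input=True,
    )
    return attn(qkv)
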
class MultiheadAttention(Module):
    r"""Multi-Head Attention.

    Allows the model to jointly attend to information from different
    representation subspaces as described in the paper:
    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Multi-Head Attention is defined as:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

    Args:
        dim: total dimension of the model
        batch_size: batch size
        seq_len: sequence length
        num_heads: number of parallel attention heads. Default: 8
        qkv_bias: whether to add bias to the QKV projection. Default: ``False``
        attn_drop: dropout probability on attention output weights. Default: ``0.0`` (no dropout)
        proj_drop: dropout probability on projection layers. Default: ``0.0`` (no dropout)
        has_residual: whether the forward pass takes an extra residual input that is added to the projection output. Default: ``True``
        causal: whether to apply a causal mask. Default: ``False``
        mask_seq: number of trailing sequence positions to mask. Default: ``0``
        use_mem_eff: whether to use the memory-efficient attention op instead of flash attention. Default: ``False``
        dtype: data type of the parameters. Default: ``"float16"``
    """

    USE_CUDA = None

    def __init__(
        self,
        dim,
        batch_size,
        seq_len,
        num_heads=8,
        qkv_bias=False,
        attn_drop=0.0,
        proj_drop=0.0,
        has_residual=True,
        causal=False,
        mask_seq=0,
        use_mem_eff=False,
        dtype="float16",
    ):
        super().__init__()
        assert (
            dim % num_heads == 0
        ), f"dim {dim} should be divisible by num_heads {num_heads}"
        if MultiheadAttention.USE_CUDA is None:
            MultiheadAttention.USE_CUDA = detect_target().name() == "cuda"
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5
        self.causal = causal
        self.has_residual = has_residual
        self.mask_seq = mask_seq
        self.use_mem_eff = use_mem_eff

        flash_head_dims = {8, 16, 32, 64, 128}
        # Simple heuristic, may need refinement.
        self.use_flash = (
            not (seq_len >= 512 and batch_size <= 2)
        ) and head_dim in flash_head_dims
        # For odd sequence lengths, try to use flash attention.
        if seq_len % 2 == 1:
            self.use_flash = True

        if use_mem_eff:
            self.op = ops.mem_eff_attention(
                causal=causal,
            )
            self.use_flash = False
        else:
            self.op = flash_attention(
                batch_size=batch_size,
                dropout=attn_drop,
                max_seq_len=seq_len,
                causal=causal,
            )
        # cu_length: the cumulative sequence lengths, used to index into hidden_states.
        self.cu_length = Parameter(shape=[batch_size + 1], dtype="int32")
        if self.mask_seq:
            self.output_mask = Parameter(
                shape=[mask_seq, num_heads, head_dim], dtype=dtype
            )

        if self.USE_CUDA:
            # On CUDA, flash_attention needs packed QKV as input,
            # then does split + permute inside flash_attn.
            # input: (B, S, H)
            # output: (B*S, 3, num_heads, head_dim)
            if self.use_flash:
                self.qkv = Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype)
            else:
                self.qkv = Linear(
                    dim,
                    dim * 3,
                    specialization="permute",
                    shape=(seq_len, 3, self.num_heads),
                    dtype=dtype,
                )
        else:
            # On ROCm, CK attention (bmm_softmax_bmm) takes three inputs (Q, K, V);
            # here we generate packed QKV for splitting.
            # input: (B, seqlen, dim) -> (B*seqlen, dim)
            # gemm: (B*seqlen, 3*dim)
            # reshape to: (B, seqlen, 3, num_heads, head_dim)
            # output: (3, B, num_heads, seqlen, head_dim)
            self.qkv = Linear(
                dim,
                dim * 3,
                specialization="permute",
                shape=(seq_len, 3, self.num_heads),
                layout="m2n3",
                dtype=dtype,
            )
        self.attn_drop = Dropout(attn_drop, dtype=dtype)
        self.proj = Linear(
            dim, dim, specialization="add" if has_residual else None, dtype=dtype
        )
        self.proj_drop = Dropout(proj_drop, dtype=dtype)

    def get_shape(self, x):
        shape = [it.value() for it in x._attrs["shape"]]
        return shape

    def qkv_proj(self, x):
        if self.USE_CUDA:
            if self.use_flash:
                batch, seq, hidden = self.get_shape(x)
                out = self.qkv(x)
                return ops.reshape()(
                    out, [int(batch * seq), 3, self.num_heads, hidden // self.num_heads]
                )
            else:
                batch, seq, hidden = self.get_shape(x)
                x = ops.reshape()(x, [-1, hidden])
                return self.qkv(x)
        else:
            return self.qkv(x)

    def attention(self, x):
        # Fused attention.
        # output: (B, seqlen, num_heads, head_dim)
        if self.USE_CUDA and self.use_flash:
            # input (x): (B*seqlen, 3, num_heads, head_dim)
            # output: (B, seqlen, num_heads, head_dim)
            return self.op(x, self.cu_length.tensor())
        elif self.USE_CUDA and self.use_mem_eff:
            (q, k, v) = ops.split()(x, 1, dim=0)
            _, b, num_heads, seqlen, d = self.get_shape(q)
            return self.op(
                ops.reshape()(q, [b, -1, seqlen, d]),
                ops.reshape()(k, [b, -1, seqlen, d]),
                ops.reshape()(v, [b, -1, seqlen, d]),
            )
        else:
            # input (q/k/v): (B*num_heads, seqlen, head_dim)
            # attn = (B, S, H) * (B, S, H) = (B, S, S)   # RCR
            # softmax on dim -1: (B, S, S)
            # attn @ v: (B, S, S) * (B, S, H) = (B, S, H)   # RRR
            # reshape: (B, num_heads, seqlen, head_dim)
            # permute: (B, seqlen, num_heads, head_dim)
            if self.USE_CUDA:
                scale = Tensor(
                    shape=[], dtype="float16", name="scale", value=self.scale
                )
                # [3, b, num_heads, seqlen, d]
                _, b, num_heads, seqlen, d = self.get_shape(x)
                # [3 * b * num_heads, seqlen, d]
                x = ops.reshape()(x, [-1, seqlen, d])
                (q, k, v) = ops.split()(x, b * num_heads, dim=0)
                qk = ops.bmm_rcr()(q, k)
                score = ops.elementwise(FuncEnum.MUL)(qk, scale)
                score = ops.softmax()(score, -1)
                out = ops.bmm_rrr_permute((num_heads,))(score, v)
            else:
                (q, k, v) = ops.split()(x, 1, dim=0)
                _, _, _, seqlen, d = self.get_shape(q)
                OP = ops.bmm_softmax_bmm_permute(
                    shape=(self.num_heads,),
                    scale=self.scale,
                    causal=self.causal,
                )
                out = OP(
                    ops.reshape()(q, [-1, seqlen, d]),
                    ops.reshape()(k, [-1, seqlen, d]),
                    ops.reshape()(v, [-1, seqlen, d]),
                )
            return out

    def forward(self, *args):
        """Forward pass for calling the MHA module."""
        assert len(args) >= 1
        x = args[0]
        batch, seq, hidden = self.get_shape(x)
        qkv = self.qkv_proj(x)
        if self.mask_seq:
            total = self.get_shape(qkv)[0]
            qkv = ops.dynamic_slice()(
                qkv,
                start_indices=[0, 0, 0, 0],
                end_indices=[total - self.mask_seq, None, None, None],
            )
        attn_output = self.attention(qkv)
        if self.mask_seq:
            attn_output = ops.concatenate()(
                [attn_output, self.output_mask.tensor()], dim=0
            )
        attn_output = ops.reshape()(attn_output, [batch * seq, -1])
        if self.has_residual:
            assert len(args) == 2
            x = self.proj(attn_output, args[1])
        else:
            x = self.proj(attn_output)
        x = self.proj_drop(x)
        x = ops.reshape()(x, [batch, seq, hidden])
        return x
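
# Illustrative usage sketch (not part of the original module): it wires a
# (batch, seq_len, dim) input into MultiheadAttention without a residual input.
# The tensor name and the concrete sizes below are hypothetical.
def _example_multihead_attention(batch_size=1, seq_len=128, dim=512, num_heads=8):
    attn = MultiheadAttention(
        dim=dim,
        batch_size=batch_size,
        seq_len=seq_len,
        num_heads=num_heads,
        has_residual=False,
    )
    hidden_states = Tensor(
        shape=[batch_size, seq_len, dim],
        dtype="float16",
        name="hidden_states",
        is_input=True,
    )
    # forward() takes a single input here; with has_residual=True it would take a
    # second tensor that is fused into the output projection.
    return attn(hidden_states)
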
class CrossAttention(Module):
    r"""Cross Multi-Head Attention.

    Allows the model to jointly attend to information from different
    representation subspaces as described in the paper:
    `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Multi-Head Attention is defined as:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O

    where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.

    Args:
        dim: total dimension of the model
        seq_len: sequence length of the query
        seq_len_kv: sequence length of the key and value
        num_heads: number of parallel attention heads
        qkv_bias: whether to add bias to the Q/K/V projections. Default: ``False``
        attn_drop: dropout probability on attention output weights. Default: ``0.0`` (no dropout)
        proj_drop: dropout probability on projection layers. Default: ``0.0`` (no dropout)
        has_residual: whether the forward pass takes an extra residual input that is added to the projection output. Default: ``True``
        causal: whether to apply a causal mask. Default: ``False``
        dtype: data type of the parameters. Default: ``"float16"``
    """

    def __init__(
        self,
        dim,
        seq_len,
        seq_len_kv,
        num_heads,
        qkv_bias=False,
        attn_drop=0.0,
        proj_drop=0.0,
        has_residual=True,
        causal=False,
        dtype="float16",
    ):
        super().__init__()
        assert (
            dim % num_heads == 0
        ), f"dim {dim} should be divisible by num_heads {num_heads}"
        self.num_heads = num_heads
        self.causal = causal
        self.has_residual = has_residual
        self.dim = dim
        self.op = ops.mem_eff_attention(causal=causal)

        self.proj_q = Linear(
            dim,
            dim,
            bias=qkv_bias,
            dtype=dtype,
        )
        self.proj_k = Linear(
            dim,
            dim,
            bias=qkv_bias,
            dtype=dtype,
        )
        self.proj_v = Linear(
            dim,
            dim,
            bias=qkv_bias,
            dtype=dtype,
        )
        self.attn_drop = Dropout(attn_drop, dtype=dtype)
        self.proj = Linear(
            dim, dim, specialization="add" if has_residual else None, dtype=dtype
        )
        self.proj_drop = Dropout(proj_drop, dtype=dtype)

    def attention(self, q, k, v):
        batch = q.shape()[0]
        head_dim = self.dim // self.num_heads

        query = self.proj_q(q)
        key = self.proj_k(k)
        value = self.proj_v(v)

        query = ops.permute()(
            ops.reshape()(query, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
        )
        key = ops.permute()(
            ops.reshape()(key, [batch, -1, self.num_heads, head_dim]), [0, 2, 1, 3]
        )
        value = ops.permute()(
            ops.reshape()(value, [batch, -1, self.num_heads, head_dim]),
            [0, 2, 1, 3],
        )
        return self.op(query, key, value)

    def forward(self, *args):
        """Forward pass for calling the cross-attention module."""
        assert len(args) >= 3
        x = args[0]
        batch = x.shape()[0]

        attn_output = self.attention(args[0], args[1], args[2])
        attn_output = ops.reshape()(attn_output, [batch, -1, self.dim])

        if self.has_residual:
            assert len(args) == 4
            x = self.proj(attn_output, args[3])
        else:
            x = self.proj(attn_output)

        x = self.proj_drop(x)
        x = ops.reshape()(x, [batch, -1, self.dim])
        return x
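
# Illustrative usage sketch (not part of the original module): it shows the
# (query, key, value) call convention without a residual input. The tensor
# names and concrete sizes are hypothetical.
def _example_cross_attention(
    batch_size=1, seq_len=64, seq_len_kv=77, dim=512, num_heads=8
):
    attn = CrossAttention(
        dim=dim,
        seq_len=seq_len,
        seq_len_kv=seq_len_kv,
        num_heads=num_heads,
        has_residual=False,
    )
    q = Tensor(
        shape=[batch_size, seq_len, dim], dtype="float16", name="q", is_input=True
    )
    k = Tensor(
        shape=[batch_size, seq_len_kv, dim], dtype="float16", name="k", is_input=True
    )
    v = Tensor(
        shape=[batch_size, seq_len_kv, dim], dtype="float16", name="v", is_input=True
    )
    # With has_residual=True, forward() would take a fourth residual tensor.
    return attn(q, k, v)
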
class ScaledDotProductAttention(Module):
    """Scaled dot-product attention backed by the memory-efficient attention op."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, q, k, v):
        attn = ops.mem_eff_attention(causal=False)(q, k, v)
        return attn
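
# Illustrative usage sketch (not part of the original module). The 4-D
# (batch, num_heads, seq_len, head_dim) layout mirrors what CrossAttention.attention
# above feeds into the same mem_eff_attention op; names and sizes are hypothetical.
def _example_scaled_dot_product_attention(
    batch_size=1, num_heads=8, seq_len=64, head_dim=64
):
    attn = ScaledDotProductAttention()
    q = Tensor(
        shape=[batch_size, num_heads, seq_len, head_dim],
        dtype="float16",
        name="q",
        is_input=True,
    )
    k = Tensor(
        shape=[batch_size, num_heads, seq_len, head_dim],
        dtype="float16",
        name="k",
        is_input=True,
    )
    v = Tensor(
        shape=[batch_size, num_heads, seq_len, head_dim],
        dtype="float16",
        name="v",
        is_input=True,
    )
    return attn(q, k, v)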