
#  Copyright (c) Meta Platforms, Inc. and affiliates.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
"""
This transformation splits a split op with a large number of outputs into
multiple split ops, which share the same input and carry the correct
output_masks.
"""

import logging

from typing import List

from aitemplate.compiler import ops
from aitemplate.compiler.base import Operator, Tensor

from aitemplate.compiler.transform import toposort, transform_utils

from aitemplate.utils import graph_utils


_LOGGER = logging.getLogger(__name__)

SPLIT_INPUT_META_SIZE = 16
SPLIT_OUTPUT_META_SIZE = 32
MAX_CUDA_PARAM_BYTES = 4096


def _split_kernel_single_input_output_param_size(op: Operator):
    """
    Return the param size (in bytes) of a split kernel with a single input
    and a single output. We need to adjust this if we change the split op's
    params. Note this is conservative: the caller multiplies this value by
    the number of outputs, so the shared input meta and the constant 24
    bytes are counted once per output.
    """
    outputs = op._attrs["outputs"]
    rank = outputs[0]._rank()
    size_of_input_meta = SPLIT_INPUT_META_SIZE * rank
    # There are 3 more params, each taking 8 bytes, so we add 24 more bytes
    total_params_size = SPLIT_OUTPUT_META_SIZE + size_of_input_meta + 24
    _LOGGER.debug(f'split op {op._attrs["name"]}: {total_params_size=}')
    return total_params_size
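
# Worked example (assuming rank-2 output tensors and the constants above):
#   per-output params = SPLIT_OUTPUT_META_SIZE + SPLIT_INPUT_META_SIZE * 2 + 24
#                     = 32 + 32 + 24 = 88 bytes
#   outputs per kernel = MAX_CUDA_PARAM_BYTES // 88 = 46
# so a split with 200 rank-2 outputs is broken into ceil(200 / 46) = 5 ops.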


def split_large_split_ops(sorted_graph: List[Tensor], _: str) -> List[Tensor]:
    """
    Our split CUDA kernel takes an output meta argument whose size is
    proportional to the number of outputs. In extreme cases, the total size
    of the params of a split kernel may exceed the limit imposed by the CUDA
    compiler. In such cases, we split the split op into separate ones.
    """
    modified = False
    sorted_ops = graph_utils.get_sorted_ops(sorted_graph)
    for op in sorted_ops:
        if not op._attrs["op"].startswith("split"):
            continue
        split_op = op
        split_params_size = _split_kernel_single_input_output_param_size(split_op)
        if split_params_size > MAX_CUDA_PARAM_BYTES:
            # even a single output would exceed the limit
            raise RuntimeError(
                f"cannot handle cases: {split_params_size=} > {MAX_CUDA_PARAM_BYTES=}"
            )
        if (
            split_params_size * len(split_op._attrs["outputs"])
            <= MAX_CUDA_PARAM_BYTES
        ):
            continue
        modified = True
        split_dim = split_op._attrs["split_dim"]
        split_sizes = split_op._attrs["split_sizes"]
        outputs = split_op._attrs["outputs"]
        num_outputs_per_split = MAX_CUDA_PARAM_BYTES // split_params_size
        # compute how many split ops we need to fit within MAX_CUDA_PARAM_BYTES
        num_split_ops = (
            len(outputs) + num_outputs_per_split - 1
        ) // num_outputs_per_split
        output_mapping = []
        for split_i in range(num_split_ops):
            start = split_i * num_outputs_per_split
            end = min(
                (split_i + 1) * num_outputs_per_split, len(split_op._attrs["outputs"])
            )
            remove_indices = list(range(start)) + list(
                range(end, len(split_op._attrs["outputs"]))
            )
            # each new split consumes the same input; mask out all outputs
            # except those in [start, end)
            new_split = ops.split()
            new_split(split_op._attrs["inputs"][0], split_sizes, split_dim)
            new_split.remove_output_at(remove_indices)
            new_outputs = new_split._attrs["outputs"]
            sorted_graph += list(new_outputs)
            output_mapping += list(zip(outputs[start:end], new_outputs))
        for old_output, new_output in output_mapping:
            transform_utils.replace_tensor(old_output, new_output)

    if not modified:
        return sorted_graph

    new_output_tensors = [
        tensor for tensor in sorted_graph if tensor._attrs["is_output"]
    ]
    sorted_graph = toposort.toposort(new_output_tensors)
    return sorted_graph
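
# Minimal usage sketch (hypothetical tensor names and shapes; the second
# argument, a work directory, is unused by this pass, hence `_`):
#
#     x = Tensor([1024, 2], name="x", is_input=True)
#     ys = ops.split()(x, 1, 0)  # 1024 rank-2 outputs: params exceed 4096 bytes
#     for i, y in enumerate(ys):
#         y._attrs["name"] = f"y_{i}"
#         y._attrs["is_output"] = True
#     graph = toposort.toposort(list(ys))
#     graph = split_large_split_ops(graph, "")
#     # the single split op has now been replaced by several masked splits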