Source code for aitemplate.compiler.transform.memory_planning

#  Copyright (c) Meta Platforms, Inc. and affiliates.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
"""
Graph pass for memory planning.
"""
import bisect
import logging
from collections import defaultdict
from dataclasses import dataclass
from typing import List

from aitemplate.compiler.base import Operator, Tensor
from aitemplate.utils.environ import multistream_max_mem_parallel_ops, multistream_mode
from aitemplate.utils.graph_utils import split_simple_multistream_parallel_ops

# pylint: disable=C0103

_LOGGER = logging.getLogger(__name__)


@dataclass
class TensorUsageRecord:
    """
    A dataclass that keeps a tensor usage record, where

    tensor: this tensor

    first_op_idx: the index of the first op that uses this tensor as its input
                  or output

    last_op_idx: the index of the last op that uses this tensor as its input or
                 output

    size: the size of this tensor
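
    Example (illustrative): a tensor first touched by op 2, last touched by
    op 5, and occupying 1024 bytes after 64-byte alignment would be recorded as
    TensorUsageRecord(tensor=t, first_op_idx=2, last_op_idx=5, size=1024).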
    """

    tensor: Tensor
    first_op_idx: int
    last_op_idx: int
    size: int

    def __iter__(self):
        return iter([self.tensor, self.first_op_idx, self.last_op_idx, self.size])


def _find_original_tensor(tensor: Tensor):
    """Find the original tensor of a tensor view recursively."""
    view = tensor._attrs["is_view_of"]
    if not view:
        return tensor
    return _find_original_tensor(view)
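

# Example (illustrative): if v2 is a view of v1 and v1 is a view of tensor t,
# _find_original_tensor(v2) follows the "is_view_of" chain and returns t.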


def _make_tensor_usage_records(sorted_ops: List[Operator]) -> List[TensorUsageRecord]:
    num_of_ops = len(sorted_ops)
    tensor_records = defaultdict(
        lambda: TensorUsageRecord(
            tensor=None, first_op_idx=num_of_ops, last_op_idx=-1, size=None
        )
    )
    for op_idx, op in enumerate(sorted_ops):
        for tensor in op._attrs["inputs"] + op._attrs["outputs"]:
            # Skip weights and inputs since we don't overwrite them.
            # Note that it might be OK to overwrite inputs, but let's be
            # conservative for now and not surprise users. We could always
            # make a flag to do that later if it's needed.
            if tensor._attrs["is_param"]:
                continue
            name = tensor._attrs["name"]
            this_tensor = tensor_records[name].tensor
            if this_tensor is None:
                tensor_records[name].tensor = tensor
            else:
                # make sure we didn't screw up anything
                assert (
                    tensor == this_tensor
                ), f"existing tensor: {this_tensor}, new tensor: {tensor}, op: {op}"

            first_op_idx = tensor_records[name].first_op_idx
            last_op_idx = tensor_records[name].last_op_idx
            tensor_records[name].first_op_idx = min(first_op_idx, op_idx)
            tensor_records[name].last_op_idx = max(last_op_idx, op_idx)
            # An output tensor's lifetime extends to the last op.
            if tensor._attrs["is_output"]:
                tensor_records[name].last_op_idx = num_of_ops - 1

            size = tensor_records[name].size
            tensor_size = tensor.size_bytes(alignment=64)
            if size is None:
                tensor_records[name].size = tensor_size
            else:
                # make sure we didn't screw up anything
                assert size == tensor_size

    # tensor views extend the lifetime of the original tensors
    tensor_views = []
    for name, tensor_record in tensor_records.items():
        this_tensor = tensor_record.tensor
        if this_tensor._attrs["is_view_of"]:
            orig_tensor = _find_original_tensor(this_tensor)
            # view of input
            if orig_tensor._attrs["is_param"]:
                continue
            orig_tensor_name = orig_tensor._attrs["name"]
            assert orig_tensor_name in tensor_records
            tensor_records[orig_tensor_name].last_op_idx = max(
                tensor_records[orig_tensor_name].last_op_idx, tensor_record.last_op_idx
            )
            tensor_views.append(name)

    # remove tensor views from tensor_records
    for name in tensor_views:
        del tensor_records[name]

    # sanity checks
    # make sure we have valid indices and sizes
    records = tensor_records.values()
    for tensor, first_op_idx, last_op_idx, size in records:
        assert tensor is not None
        assert 0 <= first_op_idx < num_of_ops
        assert 0 <= last_op_idx < num_of_ops
        assert first_op_idx <= last_op_idx
        assert size is not None

    return list(records)
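

# Illustrative sketch of the records built above (ops and tensors are
# hypothetical): for sorted ops [op0: x -> y, op1: y -> z, op2: y -> w], where x
# is a weight or graph input (skipped by the is_param check), z is marked as an
# output, and w is the final output, the resulting records are y: [0, 2],
# z: [1, 1] extended to [1, 2] because it is an output, and w: [2, 2].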


def assign_offsets_to_views_and_outputs(sorted_graph: List[Tensor]) -> None:
    """Propagate offsets determined by the memory planning algorithm to views.

    Parameters
    ----------
    sorted_graph : List[Tensor]
        The graph, modified in-place
    """
    for node in sorted_graph:
        if node._attrs["is_view_of"]:
            node._attrs["offset"] = node._attrs["is_view_of"]._attrs["offset"]
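
# Example (illustrative): if r is a view of t (e.g. produced by a reshape-style
# op), r inherits t's "offset", so both names alias the same region of the blob.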



@dataclass
class Workspace:
    shared_size: int
    unique_size: int

    def total_size(self) -> int:
        return self.shared_size + self.unique_size

def _compute_workspace(sorted_graph: List[Tensor]) -> Workspace:
    """
    Compute the workspace for the model, which can be used as scratch memory
    by ops. This pass examines two attributes on every function in the graph:

    - workspace: The amount of memory in bytes to be used as shared scratch
      memory. Here, "shared" means that other ops are allowed to write to
      this memory.

    - unique_workspace: The amount of memory in bytes to be used as exclusive
      scratch memory. If set, this pass will assign the op a
      "unique_workspace_offset". This can be used at codegen time to set a
      pointer to the region of exclusive scratch memory.

    The returned Workspace has two attributes:

    - shared_size: The total memory needed for all ops' shared scratch memory
      (i.e. the maximum of all workspace attributes)

    - unique_size: The total memory needed for all unique scratch memory
      (i.e. the sum of all unique_workspace attributes)

    During codegen, the workspace gets set up like this:
    [--unique 1--][--unique 2--]...[--unique N--][--shared--]
    """
    unique_workspace_size = 0
    max_workspace = 0
    for node in sorted_graph:
        for func in node._attrs["src_ops"]:
            if "workspace" in func._attrs:
                max_workspace = max(max_workspace, func._attrs["workspace"])
            if (
                "unique_workspace" in func._attrs
                and "unique_workspace_offset" not in func._attrs
            ):
                func._attrs["unique_workspace_offset"] = unique_workspace_size
                unique_workspace_size += func._attrs["unique_workspace"]
    return Workspace(max_workspace, unique_workspace_size)
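
# Worked example (illustrative only, not executed by the pass): if the graph has
# three ops whose "workspace" attrs are 100, 300 and 200 bytes, and two of them
# also request "unique_workspace" of 64 and 128 bytes, the result is
# Workspace(shared_size=300, unique_size=192); the two unique regions get
# offsets 0 and 64, and total_size() == 492, matching the codegen layout
# [--unique 64--][--unique 128--][--shared 300--].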

def _greedy_by_size_memory_planning(
    sorted_graph: List[Tensor], tensor_usage_records: List[TensorUsageRecord]
):
    """
    Based on the greedy-by-size algorithm for offset calculation described in
    the following paper:
    Yury Pisarchyk, Juhyun Lee, Efficient Memory Management for Deep Neural
    Net Inference, https://arxiv.org/abs/2001.03288
    """
    # sort tensor usage records in non-increasing order by their sizes
    sorted_tensor_usage_records = sorted(
        tensor_usage_records, key=lambda r: r.size, reverse=True
    )
    max_blob = 0
    # For tensors that have been assigned, we keep their tensor usage records
    # in increasing order by memory offsets
    sorted_assigned_records = []
    for tensor_record in sorted_tensor_usage_records:
        tensor, first_op_idx, last_op_idx, size = tensor_record
        prev_offset = 0
        best_offset = None
        smallest_gap = pow(2, 63) - 1
        # Iterate through tensors that have been allocated.
        # For those whose usage intervals intersect with that of the current
        # tensor, we try to find the smallest valid memory gap between two
        # such allocated tensors that is big enough to hold the current
        # tensor. If such a gap is found, we place the current tensor there.
        for a_record in sorted_assigned_records:
            a_tensor, a_first_op_idx, a_last_op_idx, a_size = a_record
            max_first_op_idx = max(first_op_idx, a_first_op_idx)
            min_last_op_idx = min(last_op_idx, a_last_op_idx)
            # current tensor overlaps with this assigned tensor
            if max_first_op_idx <= min_last_op_idx:
                a_offset = a_tensor._attrs["offset"]
                gap = a_offset - prev_offset
                if size <= gap < smallest_gap:
                    smallest_gap = gap
                    best_offset = prev_offset
                prev_offset = max(prev_offset, a_offset + a_size)
        # If we can't find a valid memory gap between two allocated tensors,
        # we place the current tensor right after the rightmost tensor whose
        # usage interval intersects with that of the current tensor.
        if best_offset is None:
            best_offset = prev_offset
        tensor._attrs["offset"] = best_offset
        max_blob = max(max_blob, best_offset + size)
        # bisect from Python <=3.9 doesn't have the key parameter
        sorted_offsets = [r.tensor._attrs["offset"] for r in sorted_assigned_records]
        in_pos = bisect.bisect_right(
            sorted_offsets, tensor_record.tensor._attrs["offset"]
        )
        sorted_assigned_records.insert(in_pos, tensor_record)

    # now we assign blobs for weights and inputs
    constant_offset = 0
    for node in sorted_graph:
        if (
            node._attrs["data"] is not None
            or node._attrs["constant_folding_output_idx"] is not None
        ):
            node._attrs["offset"] = constant_offset
            constant_offset += node.size_bytes(alignment=64)

    # assign offsets to tensor views
    # this step must happen after weights and inputs are assigned so that views
    # of inputs are properly handled
    assign_offsets_to_views_and_outputs(sorted_graph)

    workspace = _compute_workspace(sorted_graph)

    # make sure we've covered the entire graph
    return (max_blob, constant_offset, workspace)


def greedy_by_size_memory_planning(sorted_graph: List[Tensor]):  # noqa: C901
    """
    Based on the greedy-by-size algorithm for offset calculation described in
    the following paper:
    Yury Pisarchyk, Juhyun Lee, Efficient Memory Management for Deep Neural
    Net Inference, https://arxiv.org/abs/2001.03288
    """
    sorted_ops = []
    for node in sorted_graph:
        sorted_ops.extend(node.src_ops())
    tensor_usage_records = _make_tensor_usage_records(sorted_ops)
    return _greedy_by_size_memory_planning(sorted_graph, tensor_usage_records)
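
# Worked example of the greedy-by-size placement above (sizes and lifetimes are
# hypothetical): records A(size=100, ops [0, 2]), B(size=50, ops [1, 3]) and
# C(size=50, ops [3, 4]) are visited in the order A, B, C. A gets offset 0;
# B overlaps A in time, so it is placed after A at offset 100; C overlaps only
# B, and the 100-byte gap in front of B is large enough, so C reuses offset 0.
# The resulting max_blob is 150 bytes, versus 200 bytes with naive placement.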

def naive_memory_planning(sorted_graph: List[Tensor]):
    max_blob = 0
    offset = 0
    constant_offset = 0
    for node in sorted_graph:
        if (
            node._attrs["data"] is not None
            or node._attrs["constant_folding_output_idx"] is not None
        ):
            node._attrs["offset"] = constant_offset
            constant_offset += node.size_bytes(alignment=64)
        elif not node._attrs["is_view_of"]:
            node._attrs["offset"] = offset
            tensor_size = node.size_bytes(alignment=64)
            offset += tensor_size
            max_blob += tensor_size

    # workspace
    workspace = _compute_workspace(sorted_graph)

    assign_offsets_to_views_and_outputs(sorted_graph)
    return (max_blob, constant_offset, workspace)


def _make_tensor_usage_records_simple_multistream(
    par_ops_seq: List[List[Operator]],
) -> List[TensorUsageRecord]:
    """
    Generalized version of _make_tensor_usage_records() which assumes that
    several ops may be executed on every step.

    The simple multistream algorithm iteratively tracks sets of operators that
    can be run in parallel independently on each iteration. par_ops_seq
    contains the lists of operators that can be run in parallel on every
    algorithm iteration.

    Technically, the regular _make_tensor_usage_records() version is similar
    to the following one:

        def _make_tensor_usage_records(sorted_ops):
            par_ops_seq = [sorted_ops]
            return _make_tensor_usage_records_simple_multistream(par_ops_seq)

    This version is kept separate because the multistreaming feature is still
    somewhat experimental.
    """
    num_of_ops = len(par_ops_seq)
    tensor_records = defaultdict(
        lambda: TensorUsageRecord(
            tensor=None, first_op_idx=num_of_ops, last_op_idx=-1, size=None
        )
    )
    for op_idx, par_ops in enumerate(par_ops_seq):
        for op in par_ops:
            for tensor in op._attrs["inputs"] + op._attrs["outputs"]:
                # Skip weights and inputs since we don't overwrite them.
                # Note that it might be OK to overwrite inputs, but let's be
                # conservative for now and not surprise users. We could always
                # make a flag to do that later if it's needed.
                if tensor._attrs["is_param"]:
                    continue
                name = tensor._attrs["name"]
                this_tensor = tensor_records[name].tensor
                if this_tensor is None:
                    tensor_records[name].tensor = tensor
                else:
                    # make sure we didn't screw up anything
                    assert (
                        tensor == this_tensor
                    ), f"existing tensor: {this_tensor}, new tensor: {tensor}, op: {op}"

                first_op_idx = tensor_records[name].first_op_idx
                last_op_idx = tensor_records[name].last_op_idx
                tensor_records[name].first_op_idx = min(first_op_idx, op_idx)
                tensor_records[name].last_op_idx = max(last_op_idx, op_idx)
                # An output tensor's lifetime extends to the last op.
                if tensor._attrs["is_output"]:
                    tensor_records[name].last_op_idx = num_of_ops - 1

                size = tensor_records[name].size
                tensor_size = tensor.size_bytes(alignment=64)
                if size is None:
                    tensor_records[name].size = tensor_size
                else:
                    # make sure we didn't screw up anything
                    assert size == tensor_size

    # tensor views extend the lifetime of the original tensors
    tensor_views = []
    for name, tensor_record in tensor_records.items():
        this_tensor = tensor_record.tensor
        if this_tensor._attrs["is_view_of"]:
            orig_tensor = _find_original_tensor(this_tensor)
            # view of input
            if orig_tensor._attrs["is_param"]:
                continue
            orig_tensor_name = orig_tensor._attrs["name"]
            assert orig_tensor_name in tensor_records
            tensor_records[orig_tensor_name].last_op_idx = max(
                tensor_records[orig_tensor_name].last_op_idx, tensor_record.last_op_idx
            )
            tensor_views.append(name)

    # remove tensor views from tensor_records
    for name in tensor_views:
        del tensor_records[name]

    # sanity checks
    # make sure we have valid indices and sizes
    records = tensor_records.values()
    for tensor, first_op_idx, last_op_idx, size in records:
        assert tensor is not None
        assert 0 <= first_op_idx < num_of_ops
        assert 0 <= last_op_idx < num_of_ops
        assert first_op_idx <= last_op_idx
        assert size is not None

    return list(records)

def simple_multistream_memory_planning(sorted_graph: List[Tensor]):
    """
    A specialized planner for simple multi-stream execution. It may use more
    GPU memory than greedy_by_size_memory_planning, depending on the input
    graph, but still significantly less than naive_memory_planning.
    """
    from aitemplate.utils.graph_utils import track_graph_timings

    # track the sequence
    time_stats = track_graph_timings(sorted_graph, {})

    # sort all operators by parallel execution order
    ops_by_order = defaultdict(list)
    for op, tracking in time_stats.op_parallel_trackers.items():
        ops_by_order[tracking.execution_order].append(op)

    # convert Dict[int, List[Operator]] into List[List[Operator]]
    max_parallel_ops = multistream_max_mem_parallel_ops()
    par_ops_seq = split_simple_multistream_parallel_ops(ops_by_order, max_parallel_ops)

    tensor_usage_records = _make_tensor_usage_records_simple_multistream(par_ops_seq)
    return _greedy_by_size_memory_planning(sorted_graph, tensor_usage_records)
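
# Illustrative sketch (assuming split_simple_multistream_parallel_ops chunks
# each parallel level into groups of at most max_parallel_ops operators): with
# ops_by_order = {0: [a, b, c], 1: [d]} and max_parallel_ops = 2, par_ops_seq
# would become [[a, b], [c], [d]], so tensors whose usage intervals do not
# overlap at the group level may share memory.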

def proxy_memory_planning(sorted_graph: List[Tensor]):
    run_mode = multistream_mode()
    if run_mode == 0:
        # no multistream
        max_blob, constant_offset, workspace = greedy_by_size_memory_planning(
            sorted_graph
        )
    elif run_mode == 1:
        # simple multistream
        max_blob, constant_offset, workspace = simple_multistream_memory_planning(
            sorted_graph
        )
    else:
        # unsupported
        raise Exception(f"Unsupported multistream mode ({run_mode})")

    # print some statistics
    _LOGGER.info(
        f"Workspace shared_size={workspace.shared_size} unique_size={workspace.unique_size}"
    )
    _LOGGER.info(f"max_blob={max_blob} constant_offset={constant_offset}")

    # done
    return (max_blob, constant_offset, workspace)


# memory_planning = greedy_by_size_memory_planning
# memory_planning = naive_memory_planning
memory_planning = proxy_memory_planning
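
# Hypothetical call site (sketch): downstream compiler passes are expected to
# consume the selected planner as
#   max_blob, constant_offset, workspace = memory_planning(sorted_graph)
# where max_blob is the size of the intermediate blob, constant_offset the
# total size of the constant blob, and workspace the scratch-memory needs.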