# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Python bindings to the AIT runtime.
"""
import ctypes
import enum
import logging
import math
import struct
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, TypeVar, Union
import numpy as np
from aitemplate.compiler.dtype import dtype_str_to_enum
from aitemplate.utils.misc import is_linux, is_windows
from aitemplate.utils.torch_utils import torch_dtype_to_string, write_tensor_binary
# Controls how many runtimes will be used in ModelContainer by default.
# See the runtime README.md for more information on the Model/ModelContainer
# system and the num_runtimes parameter.
# This value is used as the default for the num_runtimes argument
# in both Model.__init__ and compile_model. Reassigning it at runtime has no
# effect, since Python default argument values are evaluated only once, at
# function definition time.
AIT_DEFAULT_NUM_RUNTIMES = 1
# Stand-in for torch.Tensor. Use a TypeVar for some APIs since we can't introduce
# a torch dependency.
TorchTensor = TypeVar("TorchTensor")
class AITemplateMemcpyKind(enum.Enum):
HostToDevice = 0
DeviceToHost = 1
DeviceToDevice = 2
class AITemplateAllocatorKind(enum.Enum):
DEFAULT = 0
TRACKING = 1
class AITData(NamedTuple):
"""
Input or output tensor for Model.run. The extra shape and dtype data are
required for safety checks inside the runtime.
"""
data_ptr: int
shape: List[int]
dtype: str
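# Illustrative sketch (not part of the runtime API): an AITData simply bundles a
# raw device pointer with its shape and dtype string. The pointer value below is
# hypothetical; in practice it comes from tensor.data_ptr() or allocate_gpu_memory().
#
#   x = AITData(data_ptr=0x7F3A40000000, shape=[2, 3], dtype="float16")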
class _AITemplateShape(ctypes.Structure):
_fields_ = [
("shape_data", ctypes.POINTER(ctypes.c_longlong)),
("size", ctypes.c_size_t),
]
class _CFormatAITData(ctypes.Structure):
_fields_ = [
("pointer", ctypes.c_void_p),
("shape", _AITemplateShape),
("dtype", ctypes.c_int),
]
def _dlclose(dll: ctypes.CDLL):
f_dlclose = None
if is_windows():
f_dlclose = ctypes.windll.kernel32.FreeLibrary
elif is_linux():
syms = ctypes.CDLL(None)
if not hasattr(syms, "dlclose"):
# Alpine Linux
syms = ctypes.CDLL("libc.so")
if hasattr(syms, "dlclose"):
f_dlclose = syms.dlclose
if f_dlclose is not None:
f_dlclose.argtypes = [ctypes.c_void_p]
f_dlclose(dll._handle)
else:
logging.warning(
"dll unloading function was not found, library may not be unloaded properly!"
)
def _check_tensors(
tensor_list: Union[Dict[str, TorchTensor], List[TorchTensor]],
is_error_fn: Callable[[TorchTensor], bool],
list_name: str,
condition_description: str,
):
"""
Helper for various input/output sanity checks.
"""
if isinstance(tensor_list, dict):
tensor_list = tensor_list.values()
for i, tensor in enumerate(tensor_list):
if is_error_fn(tensor):
raise ValueError(f"{list_name}[{i}] failed check: {condition_description}")
def _check_tensors_contiguous_and_on_gpu(
tensors: Union[Dict[str, TorchTensor], List[TorchTensor]], name: str
):
def is_bad_tensor(tensor: TorchTensor) -> bool:
return not tensor.is_contiguous() or not tensor.is_cuda
_check_tensors(tensors, is_bad_tensor, name, "contiguous and on GPU")
def _check_tensors_contiguous_and_on_host(
tensors: Union[Dict[str, TorchTensor], List[TorchTensor]], name: str
):
def is_bad_tensor(tensor: TorchTensor) -> bool:
return not tensor.is_contiguous() or tensor.is_cuda
_check_tensors(tensors, is_bad_tensor, name, "contiguous and on host")
def torch_to_ait_data(tensor: TorchTensor) -> AITData:
"""
Convert a torch Tensor to an AITData.
"""
return AITData(
tensor.data_ptr(), list(tensor.size()), torch_dtype_to_string(tensor.dtype)
)
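# Example (sketch, assumes PyTorch with CUDA is available; the tensor shape and
# dtype below are hypothetical):
#
#   import torch
#   t = torch.randn(4, 8, dtype=torch.float16, device="cuda")
#   ait_t = torch_to_ait_data(t)  # AITData(t.data_ptr(), [4, 8], "float16")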
def _convert_tensor_args(params: Union[List[TorchTensor], Dict[str, TorchTensor]]):
"""
Helper function for the WithTensors APIs.
"""
if isinstance(params, dict):
result = {name: torch_to_ait_data(x) for name, x in params.items()}
else:
result = [torch_to_ait_data(x) for x in params]
return result
def _reshape_tensor(tensor: TorchTensor, shape: List[int]) -> TorchTensor:
"""
Reinterpret a blob of contiguous memory as some shape. Used to convert
outputs in RunWithTensors.
"""
assert tensor.ndim == len(
shape
), f"Expected output tensor's ndim to match the length of Run()'s return value: {tensor.ndim=} != {len(shape)=}"
numel = math.prod(shape)
new_tensor = tensor.flatten()[:numel]
return new_tensor.reshape(shape)
class Model:
class _DLLWrapper:
def __init__(
self,
lib_path: str,
):
self.lib_path = lib_path
self.DLL = ctypes.cdll.LoadLibrary(lib_path)
self.is_open = True
def close(self):
if self.is_open:
_dlclose(self.DLL)
self.is_open = False
def __getattr__(self, name):
if not self.is_open:
raise RuntimeError(f"Cannot use closed AIT library: {self.lib_path}")
method = getattr(self.DLL, name)
def _wrapped_func(*args):
err = method(*args)
if err:
raise RuntimeError(f"Error in function: {method.__name__}")
return _wrapped_func
def __init__(
self,
lib_path: str,
num_runtimes: int = AIT_DEFAULT_NUM_RUNTIMES,
allocator_kind: Optional[AITemplateAllocatorKind] = None,
):
"""
Instantiates a wrapper around the C++ model_interface.
Parameters
----------
lib_path : str
The path to the compiled .so
num_runtimes : int, optional
How many runtimes should be stored in the internal pool. This
determines how many inferences can happen concurrently. By
default, set to 1. Must be positive.
allocator_kind : AITemplateAllocatorKind, optional
What type of allocator to use when allocating GPU memory.
"""
# Set of pointers allocated with numpy_to_ait_data.
# If the user forgets to free their data, we use this to
# avoid leaking memory.
self._allocated_ait_data = set()
if num_runtimes <= 0:
raise ValueError(f"num_runtimes must be positive, but got {num_runtimes}")
self.DLL = self._DLLWrapper(lib_path)
self.lib_path = lib_path
self.handle = ctypes.c_void_p()
self.allocator_handle = ctypes.c_void_p()
if allocator_kind is not None:
self.DLL.AITemplateAllocatorCreate(
ctypes.byref(self.allocator_handle),
ctypes.c_int(allocator_kind.value),
)
self.DLL.AITemplateModelContainerCreate(
ctypes.pointer(self.handle),
ctypes.c_size_t(num_runtimes),
self.allocator_handle,
)
# We use this dict to hold references to Torch tensors
# to avoid lifetime issues caused by user misuse.
self.torch_constant_tensors = {}
# The corresponding sorted_graph. Optional. For debugging purposes.
self.debug_sorted_graph = None
self._output_name_to_index = self._construct_output_name_to_index_map()
self._input_name_to_index = self._construct_input_name_to_index_map()
self._output_ndims = [
len(self.get_output_maximum_shape(i))
for i in range(len(self._output_name_to_index))
]
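# Example (sketch; the .so path is hypothetical and must point to a model built
# with compile_model). Model is a context manager, so the container and any
# memory it tracked are released on exit:
#
#   with Model("./tmp/my_model/test.so", num_runtimes=2) as module:
#       ...  # run inference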
def __enter__(self):
return self
def __exit__(self, *args):
self.close()
def __del__(self):
self.close()
def close(self):
# Copy to a list first, since free_gpu_memory removes entries from the set
for ptr in list(self._allocated_ait_data):
self.free_gpu_memory(ptr, sync=True)
# Check that the DLL attribute exists, since __init__ may have
# raised an exception before initializing it.
if hasattr(self, "DLL"):
if self.handle:
self.DLL.AITemplateModelContainerDelete(self.handle)
self.handle = ctypes.c_void_p()
if self.allocator_handle:
self.DLL.AITemplateAllocatorDelete(self.allocator_handle)
self.allocator_handle = ctypes.c_void_p()
self.DLL.close()
def __getstate__(self):
return {"lib_path": self.DLL.lib_path}
def __setstate__(self, d):
if "lib_path" not in d:
raise RuntimeError(f"Didn't find 'lib_path' property in {d}")
self.__init__(d["lib_path"])
def _convert_single_param_to_c_format(self, param: AITData) -> _CFormatAITData:
pointer, shape, dtype = param
c_pointer = ctypes.c_void_p(pointer)
c_shape_data = (ctypes.c_longlong * len(shape))()
for j, dim in enumerate(shape):
c_shape_data[j] = ctypes.c_longlong(dim)
c_shape = _AITemplateShape(c_shape_data, ctypes.c_size_t(len(shape)))
c_dtype = dtype_str_to_enum(dtype)
return _CFormatAITData(c_pointer, c_shape, c_dtype)
def _convert_params_to_c_format(self, params: List[AITData]):
c_params = (_CFormatAITData * len(params))()
for i, param in enumerate(params):
c_params[i] = self._convert_single_param_to_c_format(param)
return c_params
def _prepare_run(
self,
inputs,
outputs,
stream_ptr,
):
c_inputs = self._convert_params_to_c_format(inputs)
c_outputs = self._convert_params_to_c_format(outputs)
c_stream = (
ctypes.c_void_p() if stream_ptr is None else ctypes.c_void_p(stream_ptr)
)
num_outputs = len(self._output_ndims)
c_output_shapes_out = (ctypes.POINTER(ctypes.c_int64) * num_outputs)()
for i in range(num_outputs):
c_output_shapes_out[i] = ctypes.cast(
(ctypes.c_int64 * self._output_ndims[i])(),
ctypes.POINTER(ctypes.c_int64),
)
return (
c_inputs,
c_outputs,
c_stream,
c_output_shapes_out,
)
def _dict_to_ordered_list(self, params, is_inputs):
if is_inputs:
index_map = self._input_name_to_index
else:
index_map = self._output_name_to_index
if len(params) != len(index_map):
raise ValueError(
f"Did not get correct number of {'inputs' if is_inputs else 'outputs'} expected {len(index_map)}, got {len(params)}"
)
result = [None] * len(index_map)
for name, tensor in params.items():
if name not in index_map:
raise ValueError(
f"Got unexpected {'input' if is_inputs else 'output'}: {name}"
)
result[index_map[name]] = tensor
return result
def _write_tensors_for_standalone_testcase(
self,
tensor_dict: Dict[str, TorchTensor],
file_handle,
is_inputs: bool = True,
) -> None:
if is_inputs:
index_map = self._input_name_to_index
else:
index_map = self._output_name_to_index
result = [None] * len(index_map)
for name, tensor in tensor_dict.items():
if name not in index_map:
raise ValueError(
f"Got unexpected {'input' if is_inputs else 'output'}: {name}"
)
idx = index_map[name]
result[idx] = tensor
for tensor in result:
write_tensor_binary(tensor, file_handle)
def write_standalone_testcase_data(
self,
filename,
inputs: Dict[str, TorchTensor],
expected_outputs: List[TorchTensor],
atol=1e-2,
rtol=1e-2,
):
with open(filename, "wb") as file_handle:
file_handle.write(struct.pack("ff", atol, rtol))
self._write_tensors_for_standalone_testcase(
tensor_dict=inputs, file_handle=file_handle
)
for out in expected_outputs:
write_tensor_binary(out, file_handle)
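# Example (sketch; the file name and tensors are hypothetical). The file stores
# the atol/rtol tolerances followed by the inputs and expected outputs in
# binary form:
#
#   module.write_standalone_testcase_data(
#       "testcase.bin", inputs={"input0": x}, expected_outputs=[y_expected]
#   )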
def _make_ait_outputs(
self, outputs: List[AITData], c_output_shapes
) -> Dict[str, AITData]:
output_shapes = []
for i, c_shape in enumerate(c_output_shapes):
shape = []
for j in range(self._output_ndims[i]):
shape.append(c_shape[j])
output_shapes.append(shape)
return {
name: AITData(outputs[idx].data_ptr, output_shapes[idx], outputs[idx].dtype)
for name, idx in self._output_name_to_index.items()
}
def _run_impl(
self,
inputs: Union[Dict[str, AITData], List[AITData]],
outputs: Union[Dict[str, AITData], List[AITData]],
stream_ptr: Optional[int] = None,
sync: bool = True,
graph_mode: bool = False,
outputs_on_host: bool = False,
) -> Dict[str, AITData]:
if isinstance(inputs, dict):
inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
if isinstance(outputs, dict):
outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
(
c_inputs,
c_outputs,
c_stream,
c_output_shapes_out,
) = self._prepare_run(
inputs,
outputs,
stream_ptr,
)
if not outputs_on_host:
self.DLL.AITemplateModelContainerRun(
self.handle,
c_inputs,
ctypes.c_size_t(len(inputs)),
c_outputs,
ctypes.c_size_t(len(outputs)),
c_stream,
ctypes.c_bool(sync),
ctypes.c_bool(graph_mode),
c_output_shapes_out,
)
else:
self.DLL.AITemplateModelContainerRunWithOutputsOnHost(
self.handle,
c_inputs,
ctypes.c_size_t(len(inputs)),
c_outputs,
ctypes.c_size_t(len(outputs)),
c_stream,
ctypes.c_bool(graph_mode),
c_output_shapes_out,
)
return self._make_ait_outputs(outputs, c_output_shapes_out)
def run(
self,
inputs: Union[Dict[str, AITData], List[AITData]],
outputs: Union[Dict[str, AITData], List[AITData]],
stream_ptr: Optional[int] = None,
sync: bool = True,
graph_mode: bool = False,
) -> Dict[str, AITData]:
"""
Run the model.
Parameters
----------
inputs: Union[Dict[str, AITData], List[AITData]]
The inputs to use. AITData is a named tuple containing
the tensor's data_ptr, size, and dtype. If inputs is a list,
it must be ordered correctly (as specified by GetInputNameToIndexMap).
This parameter can also be a dictionary (name -> AITData).
outputs: Union[Dict[str, AITData], List[AITData]]
The outputs to use. Similar to inputs, can either be a list of ordered
outputs, or a dictionary (output name -> AITData).
These should be allocated with enough memory to store their maximum
size (which can be queried with GetOutputMaximumSize).
stream_ptr: int
A pointer to CUDA stream to run on. If None, use the legacy stream.
sync: bool
If True, synchronize the stream at the end of the run.
graph_mode: bool
If True, use a CUDA graph kernel (experimental)
Returns
-------
AITDatas with output shapes that are computed by shape inference. This may not be
the maximum shape. The output memory blobs that are passed in to Run()
should be interpreted and possibly truncated according to these sizes.
"""
return self._run_impl(
inputs, outputs, stream_ptr, sync, graph_mode, outputs_on_host=False
)
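# Example (sketch using the raw AITData interface; names, shapes, and dtypes are
# hypothetical). Output buffers must be allocated at their maximum size:
#
#   out_shape = module.get_output_maximum_shape("output0")
#   out_nbytes = math.prod(out_shape) * 2  # 2 bytes per float16 element
#   out_ptr = module.allocate_gpu_memory(out_nbytes)
#   outputs = {"output0": AITData(out_ptr, out_shape, "float16")}
#   shapes = module.run({"input0": x_ait}, outputs)
#   # shapes["output0"].shape holds the inferred (possibly smaller) shape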
def profile(
self,
inputs: Union[Dict[str, AITData], List[AITData]],
outputs: Union[Dict[str, AITData], List[AITData]],
num_iters: int,
filename: str,
stream_ptr: Optional[int] = None,
) -> None:
if isinstance(inputs, dict):
inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
if isinstance(outputs, dict):
outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
(
c_inputs,
c_outputs,
c_stream,
c_output_shapes_out,
) = self._prepare_run(
inputs,
outputs,
stream_ptr,
)
self.DLL.AITemplateModelContainerProfile(
self.handle,
c_inputs,
ctypes.c_size_t(len(inputs)),
c_outputs,
ctypes.c_size_t(len(outputs)),
c_stream,
ctypes.c_size_t(num_iters),
ctypes.c_char_p(filename.encode("utf-8")),
)
def profile_with_tensors(
self,
inputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
outputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
num_iters: int,
filename: str,
stream_ptr: Optional[int] = None,
) -> None:
_check_tensors_contiguous_and_on_gpu(
inputs,
name="inputs",
)
_check_tensors_contiguous_and_on_gpu(
outputs,
name="outputs",
)
self.profile(
_convert_tensor_args(inputs),
_convert_tensor_args(outputs),
num_iters,
filename,
stream_ptr,
)
def _interpret_tensors_as_shapes(
self,
outputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
outputs_ait: Dict[str, AITData],
) -> Dict[str, TorchTensor]:
if isinstance(outputs, dict):
return {
name: _reshape_tensor(tensor, outputs_ait[name].shape)
for name, tensor in outputs.items()
}
else:
return {
name: _reshape_tensor(outputs[idx], outputs_ait[name].shape)
for name, idx in self._output_name_to_index.items()
}
def run_with_tensors(
self,
inputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
outputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
stream_ptr: Optional[int] = None,
sync: bool = True,
graph_mode: bool = False,
) -> Dict[str, TorchTensor]:
"""
Run the model with torch.Tensors. See Run() for information about the
arguments.
Inputs may either be a dictionary (name -> torch.Tensor), or a list
of torch.Tensors ordered according to GetInputNameToIndexMap. Outputs
can also be a dictionary, or a list ordered according to GetOutputNameToIndexMap.
"""
_check_tensors_contiguous_and_on_gpu(
inputs,
name="inputs",
)
_check_tensors_contiguous_and_on_gpu(
outputs,
name="outputs",
)
outputs_ait = self.run(
_convert_tensor_args(inputs),
_convert_tensor_args(outputs),
stream_ptr=stream_ptr,
sync=sync,
graph_mode=graph_mode,
)
return self._interpret_tensors_as_shapes(outputs, outputs_ait)
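# Example (sketch, assumes PyTorch with CUDA; input/output names and shapes are
# hypothetical). Outputs are preallocated at their maximum shapes:
#
#   x = torch.randn(1, 16, dtype=torch.float16, device="cuda")
#   y = torch.empty(module.get_output_maximum_shape(0),
#                   dtype=torch.float16, device="cuda")
#   result = module.run_with_tensors({"input0": x}, [y])
#   # result["output0"] is y reinterpreted with the inferred output shape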
def _run_with_outputs_on_host(
self,
inputs: Union[Dict[str, AITData], List[AITData]],
outputs: Union[Dict[str, AITData], List[AITData]],
stream_ptr: Optional[int] = None,
graph_mode: bool = False,
) -> Dict[str, AITData]:
"""
Like Run(), but takes host memory outputs. Note that there is no sync parameter;
the stream will always be synchronized after copying the outputs to the host.
Warning: don't use this! It's not optimal with respect to performance.
It's here in case you need it for debugging purposes.
"""
return self._run_impl(
inputs, outputs, stream_ptr, graph_mode=graph_mode, outputs_on_host=True
)
def _run_with_tensors_outputs_on_host(
self,
inputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
outputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
stream_ptr: Optional[int] = None,
graph_mode: bool = False,
) -> Dict[str, TorchTensor]:
"""
Like RunWithTensors(), but takes host memory output tensors.
Warning: don't use this! It's not optimal with respect to performance.
It's here for use if you need it for debugging.
"""
_check_tensors_contiguous_and_on_gpu(
inputs,
name="inputs",
)
_check_tensors_contiguous_and_on_host(
outputs,
name="outputs",
)
output_shapes = self._run_with_outputs_on_host(
_convert_tensor_args(inputs),
_convert_tensor_args(outputs),
stream_ptr=stream_ptr,
graph_mode=graph_mode,
)
return self._interpret_tensors_as_shapes(outputs, output_shapes)
def benchmark(
self,
inputs: Union[Dict[str, AITData], List[AITData]],
outputs: Union[Dict[str, AITData], List[AITData]],
stream_ptr: Optional[int] = None,
graph_mode: bool = False,
count: int = 10,
repeat: int = 1,
num_threads: int = 1,
use_unique_stream_per_thread: bool = False,
) -> Tuple[float, float, Dict[str, AITData]]:
"""
Benchmark the model. See run() for information on most parameters.
"""
if isinstance(inputs, dict):
inputs = self._dict_to_ordered_list(inputs, is_inputs=True)
if isinstance(outputs, dict):
outputs = self._dict_to_ordered_list(outputs, is_inputs=False)
(
c_inputs,
c_outputs,
c_stream,
c_output_shapes_out,
) = self._prepare_run(
inputs,
outputs,
stream_ptr,
)
time_ms = []
runtime_ms = ctypes.c_float()
for _ in range(repeat):
self.DLL.AITemplateModelContainerBenchmark(
self.handle,
c_inputs,
ctypes.c_size_t(len(inputs)),
c_outputs,
ctypes.c_size_t(len(outputs)),
c_stream,
ctypes.c_bool(graph_mode),
ctypes.c_size_t(count),
ctypes.c_size_t(num_threads),
ctypes.c_bool(use_unique_stream_per_thread),
ctypes.byref(runtime_ms),
c_output_shapes_out,
)
time_ms.append(runtime_ms.value)
mean = np.mean(time_ms)
std = np.std(time_ms)
return (mean, std, self._make_ait_outputs(outputs, c_output_shapes_out))
def benchmark_with_tensors(
self,
inputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
outputs: Union[List[TorchTensor], Dict[str, TorchTensor]],
stream_ptr: Optional[int] = None,
graph_mode: bool = False,
count: int = 10,
repeat: int = 1,
num_threads: int = 1,
use_unique_stream_per_thread: bool = False,
) -> Tuple[float, float, Dict[str, TorchTensor]]:
"""
Benchmark the model. See run_with_tensors() for information on most parameters.
"""
_check_tensors_contiguous_and_on_gpu(
inputs,
name="inputs",
)
_check_tensors_contiguous_and_on_gpu(
outputs,
name="outputs",
)
mean, std, ait_outputs = self.benchmark(
_convert_tensor_args(inputs),
_convert_tensor_args(outputs),
stream_ptr,
graph_mode,
count,
repeat,
num_threads,
use_unique_stream_per_thread,
)
return (mean, std, self._interpret_tensors_as_shapes(outputs, ait_outputs))
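# Example (sketch; tensors as in the run_with_tensors example above). Returns
# the mean and standard deviation, over `repeat` rounds, of the runtime in
# milliseconds reported by the container:
#
#   mean_ms, std_ms, _ = module.benchmark_with_tensors(
#       {"input0": x}, [y], count=100, repeat=3
#   )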
def _get_map_helper(self, n: int, get_name_func) -> Dict[str, int]:
result = {}
for i in range(n):
c_name = ctypes.c_char_p()
c_idx = ctypes.c_size_t(i)
get_name_func(c_idx, ctypes.byref(c_name))
name = c_name.value.decode("utf-8")
result[name] = i
return result
def _construct_input_name_to_index_map(self) -> Dict[str, int]:
num_inputs = ctypes.c_size_t()
self.DLL.AITemplateModelContainerGetNumInputs(
self.handle, ctypes.byref(num_inputs)
)
get_input_name = (
lambda idx, name: self.DLL.AITemplateModelContainerGetInputName(
self.handle, idx, name
)
)
return self._get_map_helper(num_inputs.value, get_input_name)
def get_input_name_to_index_map(self) -> Dict[str, int]:
"""
Get the name to index mapping. Note that the ordering of inputs
is not guaranteed to be deterministic.
If using run()'s list interface, this ordering must be used!
"""
# Copy so people can't modify our version of the map
return self._input_name_to_index.copy()
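# Example (sketch): when using run()'s list interface, order the inputs
# according to this map. The names below are hypothetical:
#
#   name_to_idx = module.get_input_name_to_index_map()
#   ordered = [None] * len(name_to_idx)
#   for name, idx in name_to_idx.items():
#       ordered[idx] = my_inputs[name]  # my_inputs: Dict[str, AITData]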
def _construct_output_name_to_index_map(self) -> Dict[str, int]:
num_outputs = ctypes.c_size_t()
self.DLL.AITemplateModelContainerGetNumOutputs(
self.handle, ctypes.byref(num_outputs)
)
get_output_name = (
lambda idx, name: self.DLL.AITemplateModelContainerGetOutputName(
self.handle, idx, name
)
)
return self._get_map_helper(num_outputs.value, get_output_name)
def get_output_name_to_index_map(self) -> Dict[str, int]:
"""
Get the name to index mapping. Unlike inputs, outputs
have a guaranteed ordering; the order that outputs were
provided to `compile_model` is always used as the internal
name to index mapping.
If using run()'s list interface, this ordering must be used!
"""
# Copy so people can't modify our version of the map
return self._output_name_to_index.copy()
def set_constant(self, name: str, tensor: AITData):
"""
Set a constant. All constants must have values before calling run().
Note that the pointer inside tensor must be valid for the entire
duration of run().
"""
b_name = name.encode("utf-8")
c_name = ctypes.c_char_p(b_name)
c_tensor = self._convert_single_param_to_c_format(tensor)
self.DLL.AITemplateModelContainerSetConstant(
self.handle, c_name, ctypes.byref(c_tensor)
)
def set_many_constants(self, tensors: Dict[str, AITData]):
"""
Bulk set many constants at once. More efficient than set_constant()
since it only has to acquire the lock once.
"""
c_names = (ctypes.c_char_p * len(tensors))()
c_tensors = (_CFormatAITData * len(tensors))()
ait_tensors = {
name.encode("utf-8"): self._convert_single_param_to_c_format(tensor)
for name, tensor in tensors.items()
}
for i, (name_bytes, tensor) in enumerate(ait_tensors.items()):
c_names[i] = ctypes.c_char_p(name_bytes)
c_tensors[i] = tensor
num_tensors = ctypes.c_size_t(len(tensors))
self.DLL.AITemplateModelContainerSetManyConstants(
self.handle, c_names, c_tensors, num_tensors
)
def set_double_buffer_constant(
self, name: str, tensor: AITData, stream_ptr: Optional[int] = None
):
"""
Set a constant in the double (standby) buffer. All constants must have
values before calling run(). Note that the pointer inside tensor must be
valid for the entire duration of run().
"""
b_name = name.encode("utf-8")
c_name = ctypes.c_char_p(b_name)
c_tensor = self._convert_single_param_to_c_format(tensor)
self.DLL.AITemplateModelContainerSetDoubleBufferConstant(
self.handle, ctypes.c_void_p(stream_ptr), c_name, ctypes.byref(c_tensor)
)
def set_many_double_buffer_constants(
self, tensors: Dict[str, AITData], stream_ptr: Optional[int] = None
):
"""
Bulk set many constants at once in the double (standby) buffer. More
efficient than set_double_buffer_constant() since it only has to acquire
the lock once.
"""
c_names = (ctypes.c_char_p * len(tensors))()
c_tensors = (_CFormatAITData * len(tensors))()
ait_tensors = {
name.encode("utf-8"): self._convert_single_param_to_c_format(tensor)
for name, tensor in tensors.items()
}
for i, (name_bytes, tensor) in enumerate(ait_tensors.items()):
c_names[i] = ctypes.c_char_p(name_bytes)
c_tensors[i] = tensor
num_tensors = ctypes.c_size_t(len(tensors))
self.DLL.AITemplateModelContainerSetManyDoubleBufferConstants(
self.handle, ctypes.c_void_p(stream_ptr), c_names, c_tensors, num_tensors
)
def set_many_constants_with_tensors(self, tensors: Dict[str, TorchTensor]):
ait_tensors = {}
for name, tensor in tensors.items():
if not tensor.is_contiguous() or not tensor.is_cuda:
raise ValueError(f"Constant {name} must be contiguous and on the GPU.")
self.torch_constant_tensors[name] = tensor
ait_tensors[name] = torch_to_ait_data(tensor)
self.set_many_constants(ait_tensors)
def set_double_buffer_constant_with_tensor(
self, name: str, tensor: TorchTensor, stream_ptr: Optional[int] = None
):
"""
Set a constant with a PyTorch tensor.
Model will store a reference to the given tensor in
torch_constant_tensors until it is explicitly deleted or replaced.
"""
if not tensor.is_contiguous() or not tensor.is_cuda:
raise ValueError(f"Constant {name} must be contiguous and on the GPU.")
self.torch_constant_tensors[name] = tensor
self.set_double_buffer_constant(name, torch_to_ait_data(tensor), stream_ptr)
def set_many_double_buffer_constants_with_tensors(
self, tensors: Dict[str, TorchTensor], stream_ptr: Optional[int] = None
):
ait_tensors = {}
for name, tensor in tensors.items():
if not tensor.is_contiguous() or not tensor.is_cuda:
raise ValueError(f"Constant {name} must be contiguous and on the GPU.")
self.torch_constant_tensors[name] = tensor
ait_tensors[name] = torch_to_ait_data(tensor)
self.set_many_double_buffer_constants(ait_tensors, stream_ptr)
def set_constant_with_tensor(self, name: str, tensor: TorchTensor):
"""
Set a constant with a PyTorch tensor.
Model will store a reference to the given tensor in
torch_constant_tensors until it is explicitly deleted or replaced.
"""
if not tensor.is_contiguous() or not tensor.is_cuda:
raise ValueError(f"Constant {name} must be contiguous and on the GPU.")
self.torch_constant_tensors[name] = tensor
self.set_constant(name, torch_to_ait_data(tensor))
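# Example (sketch, assumes PyTorch with CUDA; the constant name is hypothetical
# and must match an unbound constant reported by get_constant_names()):
#
#   w = torch.randn(128, 128, dtype=torch.float16, device="cuda")
#   module.set_constant_with_tensor("w0", w)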
def get_output_maximum_shape(
self, output_idx_or_name: Union[int, str]
) -> List[int]:
"""
Get the maximum output shape. The input here can either be an output name
or an index. The index is the runtime's internal index (as specified by
GetOutputNameToIndexMap).
"""
if isinstance(output_idx_or_name, int):
output_idx = output_idx_or_name
elif isinstance(output_idx_or_name, str):
if output_idx_or_name not in self._output_name_to_index:
raise ValueError(
f"Name {output_idx_or_name} not in OutputNameToIndexMap! Available names: {list(self._output_name_to_index.keys())}"
)
output_idx = self._output_name_to_index[output_idx_or_name]
else:
raise TypeError(
f"output_idx_or_name must be str or int, but got {type(output_idx_or_name)}"
)
class Shape(ctypes.Structure):
_fields_ = [
("shape_data", ctypes.POINTER(ctypes.c_longlong)),
("size", ctypes.c_size_t),
]
raw_shape = Shape()
self.DLL.AITemplateModelContainerGetMaximumOutputShape(
self.handle, output_idx, ctypes.byref(raw_shape)
)
return [raw_shape.shape_data[idx] for idx in range(raw_shape.size)]
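# Example (sketch; the output name is hypothetical). The maximum shape bounds
# the buffer size that run() requires for that output:
#
#   max_shape = module.get_output_maximum_shape("output0")
#   max_elems = math.prod(max_shape)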
def get_output_dtype(self, index):
"""
Get the expected dtype of an output.
"""
output = ctypes.c_int()
self.DLL.AITemplateModelContainerGetOutputDtype(
self.handle, index, ctypes.byref(output)
)
return output.value
def allocate_gpu_memory(
self, nbytes: int, stream_ptr: Optional[int] = None, sync: bool = True
) -> int:
"""
Helper function for allocating memory on the GPU. Can be useful if
third-party libraries like PyTorch or pycuda are not available.
The pointer returned by this function must be freed by free_gpu_memory
to avoid memory leaks.
"""
ptr = ctypes.c_void_p()
self.DLL.AITemplateDeviceMalloc(
ctypes.byref(ptr),
ctypes.c_size_t(nbytes),
ctypes.c_void_p(stream_ptr),
ctypes.c_bool(sync),
)
return ptr.value
def free_gpu_memory(
self, ptr: int, stream_ptr: Optional[int] = None, sync: bool = True
) -> None:
"""
Helper function for freeing memory on the GPU. Can be useful if
third-party libraries like PyTorch or pycuda are not available.
"""
if ptr in self._allocated_ait_data:
self._allocated_ait_data.remove(ptr)
self.DLL.AITemplateDeviceFree(
ctypes.c_void_p(ptr), ctypes.c_void_p(stream_ptr), ctypes.c_bool(sync)
)
def memcpy(
self,
dst: int,
src: int,
count: int,
kind: AITemplateMemcpyKind,
stream_ptr: Optional[int] = None,
sync: bool = True,
) -> None:
"""
Helper function for copying memory to, from, or between GPU buffers. Can be useful if
third-party libraries like PyTorch or pycuda are not available.
Supports D2H, H2D, and D2D copies. The copy direction can be
specified by the AITemplateMemcpyKind enum.
"""
self.DLL.AITemplateMemcpy(
ctypes.c_void_p(dst),
ctypes.c_void_p(src),
ctypes.c_size_t(count),
ctypes.c_int(kind.value),
ctypes.c_void_p(stream_ptr),
ctypes.c_bool(sync),
)
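# Example (sketch): a host-to-device copy using only these helpers. The numpy
# array stands in for any host buffer:
#
#   host = np.ones(1024, dtype=np.float16)
#   dev_ptr = module.allocate_gpu_memory(host.nbytes)
#   module.memcpy(dev_ptr, host.ctypes.data, host.nbytes,
#                 AITemplateMemcpyKind.HostToDevice)
#   module.free_gpu_memory(dev_ptr)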
def get_num_runtimes(self) -> int:
"""
Get the number of runtimes this model container stores.
"""
out = ctypes.c_size_t()
self.DLL.AITemplateModelContainerGetNumRuntimes(self.handle, ctypes.byref(out))
return out.value
def numpy_to_ait_data(
self, arr: np.ndarray, stream_ptr: Optional[int] = None, sync: bool = True
) -> AITData:
"""
Convert a numpy array to AIT-usable data. Mallocs and copies
on the given stream.
The allocated buffer should be manually freed with free_gpu_memory.
As a safeguard, Model keeps track of pointers allocated with this method
and frees any that remain when the Model is closed.
"""
dtype = str(arr.dtype)
shape = list(arr.shape)
gpu_mem = self.allocate_gpu_memory(arr.nbytes, stream_ptr=stream_ptr, sync=sync)
self._allocated_ait_data.add(gpu_mem)
self.memcpy(
gpu_mem,
arr.ctypes._data.value,
arr.nbytes,
AITemplateMemcpyKind.HostToDevice,
sync=sync,
stream_ptr=stream_ptr,
)
return AITData(gpu_mem, shape, dtype)
def ait_data_to_numpy(
self,
ait_data: AITData,
stream_ptr: Optional[int] = None,
sync: bool = True,
) -> np.ndarray:
"""
Create numpy array from an AITData.
Copies on the given stream.
"""
arr = np.empty(ait_data.shape, dtype=ait_data.dtype)
self.memcpy(
arr.ctypes._data.value,
ait_data.data_ptr,
arr.nbytes,
AITemplateMemcpyKind.DeviceToHost,
sync=sync,
stream_ptr=stream_ptr,
)
return arr
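# Example (sketch): a full numpy round trip through the runtime. Input/output
# names and shapes are hypothetical:
#
#   x_np = np.random.rand(1, 16).astype(np.float16)
#   x_ait = module.numpy_to_ait_data(x_np)
#   out_shape = module.get_output_maximum_shape("output0")
#   out_ait = module.numpy_to_ait_data(np.zeros(out_shape, dtype=np.float16))
#   shapes = module.run({"input0": x_ait}, {"output0": out_ait})
#   y_np = module.ait_data_to_numpy(shapes["output0"])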
def fold_constants(
self,
stream_ptr: Optional[int] = None,
sync: bool = True,
double_buffer: bool = False,
):
if double_buffer:
self.DLL.AITemplateModelContainerFoldConstantsInDoubleBuffer(
self.handle,
ctypes.c_void_p(stream_ptr),
ctypes.c_bool(sync),
)
else:
self.DLL.AITemplateModelContainerFoldConstants(
self.handle,
ctypes.c_void_p(stream_ptr),
ctypes.c_bool(sync),
)
def swap_constants(self):
self.DLL.AITemplateModelContainerSwapConstants(self.handle)
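# Example (sketch of the double-buffer constant update workflow, assuming the
# model uses constant folding; the constant name is hypothetical): stage new
# values in the standby buffer, fold them, then swap buffers to activate them.
#
#   module.set_many_double_buffer_constants_with_tensors({"w0": new_w0})
#   module.fold_constants(sync=True, double_buffer=True)
#   module.swap_constants()  # make the updated constants active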
def _get_constant_names_impl(
self, unbound_constants_only: bool, constant_folding_only: bool
) -> List[str]:
num_constants = ctypes.c_size_t()
constant_folding_inputs_only = ctypes.c_bool(constant_folding_only)
unbound_constants_only_ = ctypes.c_bool(unbound_constants_only)
self.DLL.AITemplateModelContainerGetNumConstants(
self.handle,
unbound_constants_only_,
constant_folding_inputs_only,
ctypes.byref(num_constants),
)
names = (ctypes.c_char_p * num_constants.value)()
self.DLL.AITemplateModelContainerGetConstantNames(
self.handle, unbound_constants_only_, constant_folding_inputs_only, names
)
return [name.decode("utf-8") for name in names]
def get_constant_names(
self, unbound_constants_only: bool = True, constant_folding_only: bool = False
) -> List[str]:
return self._get_constant_names_impl(
unbound_constants_only, constant_folding_only
)
def get_constant_folding_input_names(
self, unbound_constants_only: bool = True
) -> List[str]:
return self._get_constant_names_impl(unbound_constants_only, True)