@@ -258,14 +258,22 @@ def cudagraphify_impl(
     device_index: int,
     is_backward: bool,
     is_inference: bool,
+    stack_traces: Optional[StackTraces] = None,
 ):
     manager = get_container(device_index).get_tree_manager()
+    assert not (is_backward and is_inference)
+    mode = (
+        CompilationMode.BACKWARD
+        if is_backward
+        else (CompilationMode.INFERENCE if is_inference else CompilationMode.FORWARD)
+    )
+
     return manager.add_function(
         model,
         inputs,
         static_input_idxs,
-        is_backward,
-        is_inference,
+        stack_traces,
+        mode,
     )
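As a side note, here is a minimal, self-contained sketch of the mode selection this hunk performs; the CompilationMode enum below is a simplified stand-in for illustration, not inductor's actual class. Backward and inference are mutually exclusive, and forward is the fallback.

from enum import Enum, auto

class CompilationMode(Enum):  # simplified stand-in, for illustration only
    FORWARD = auto()
    BACKWARD = auto()
    INFERENCE = auto()

def derive_mode(is_backward: bool, is_inference: bool) -> CompilationMode:
    # mirrors the assert and conditional added in cudagraphify_impl above
    assert not (is_backward and is_inference)
    if is_backward:
        return CompilationMode.BACKWARD
    return CompilationMode.INFERENCE if is_inference else CompilationMode.FORWARD

assert derive_mode(False, True) is CompilationMode.INFERENCE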
@@ -351,6 +359,8 @@ def map_to_ref(t: Optional[Tensor]) -> Optional[StorageWeakRefWrapper]:
 # For each node in the path, for each output, is the output alive
 PathLiveness = List[List[bool]]

+StackTraces = List[Optional[str]]
+

 class CUDAWarmupNode:
     """
@@ -378,6 +388,7 @@ def __init__(
         cuda_graphs_pool: Tuple[int, int],
         existing_cuda_graph: torch.cuda.Graph,
         device_index: int,
+        stack_traces: Optional[StackTraces],
     ):
         self.wrapped_function = wrapped_function
         self.parent = parent
@@ -386,6 +397,7 @@ def __init__(
         self.existing_cuda_graph = existing_cuda_graph
         self.has_run = False
         self.device_index = device_index
+        self.stack_traces = stack_traces

     def run(self, new_inputs):
         assert not self.has_run, "Wrapped function should never be run twice"
@@ -403,7 +415,7 @@ def run(self, new_inputs):
             ):
                 non_cudagraph_inps.add(new_inputs[i].untyped_storage().data_ptr())

-        if config.triton.debug_cudagraph_trees:
+        if config.triton.fast_cudagraph_asserts:
             refs = list(self.path_live_weakrefs())
             check_memory_pool(self.cuda_graphs_pool, refs)
@@ -425,7 +437,7 @@ def run(self, new_inputs):
             ]
         )

-        if config.triton.debug_cudagraph_trees:
+        if config.triton.fast_cudagraph_asserts:
             out_refs = self.path_live_weakrefs()
             new_storages = [
                 t for t in out_refs if t.data_ptr() not in non_cudagraph_inps
@@ -436,16 +448,22 @@ def run(self, new_inputs):

     def path_live_weakrefs(self) -> Generator[StorageWeakRefWrapper]:
         "Returns all live storages weakrefs that created by nodes in this path"
+        for stor_ref, _ in self.path_live_weakrefs_and_stacktraces():
+            yield stor_ref
+
+    def path_live_weakrefs_and_stacktraces(
+        self,
+    ) -> Generator[Tuple[StorageWeakRefWrapper, Optional[str]]]:
         nodes = []
         node = self
         while node:
             nodes.append(node)
             node = node.parent

         for node in reversed(nodes):
-            for output in node.outputs_weakrefs:
+            for i, output in enumerate(node.outputs_weakrefs):
                 if is_live(output):
-                    yield output
+                    yield output, (node.stack_traces[i] if node.stack_traces else None)

     def all_outputs_are_dead(self):
         return not list(self.path_live_weakrefs())
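To illustrate the pairing contract of path_live_weakrefs_and_stacktraces, here is a hedged, self-contained toy; the name pair_with_traces and the string outputs are made up for the example. Each live output is matched positionally with its recorded trace, and None is used whenever a node carries no stack traces.

from typing import Iterator, List, Optional, Tuple

def pair_with_traces(
    outputs: List[str], stack_traces: Optional[List[Optional[str]]]
) -> Iterator[Tuple[str, Optional[str]]]:
    # same index-based pairing as the generator above, minus the liveness check
    for i, out in enumerate(outputs):
        yield out, (stack_traces[i] if stack_traces else None)

assert list(pair_with_traces(["buf0", "buf1"], None)) == [("buf0", None), ("buf1", None)]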
@@ -486,12 +504,14 @@ def __init__(
         inputs: List[Tensor],
         cuda_graphs_pool: Tuple[int, int],
         device_index: int,
+        stack_traces: Optional[StackTraces],
     ):
         assert isinstance(inputs, (list, tuple))

         self.wrapped_function = wrapped_function
         self.id = id
         self.device = device_index
+        self.stack_traces = stack_traces

         # if this is a root parent will be None. use weakref to prevent reference cycle
         self._parent = weakref.ref(parent) if parent is not None else None
@@ -510,6 +530,9 @@ def __init__(
         self.path_weakrefs: LevelList[OutputList[Optional[StorageWeakRefWrapper]]] = [
             node.outputs_weakrefs for node in self._path_from_root
         ]
+        self.path_stacktraces: LevelList[StackTraces] = [
+            node.stack_traces for node in self._path_from_root
+        ]

         # tensors which are outputs of previous graphs in the tree
         self.cudagraph_managed_idxs: List[int] = [
@@ -616,7 +639,7 @@ def __init__(
         self.checkpointed_caching_state: Optional[AllocatorState] = None

     def run(self, new_inputs):
-        if config.triton.debug_cudagraph_trees:
+        if config.triton.slow_cudagraph_asserts:
             self.debug_check_invariants_before_invocation()

         assert len(self.static_input_data_ptrs) == len(new_inputs)
@@ -677,7 +700,7 @@ def all_outputs_are_dead(self):
     def _record(self, model, stream, inputs):
         "Record the model"

-        if config.triton.debug_cudagraph_trees:
+        if config.triton.fast_cudagraph_asserts:
             # need to use parent live weakrefs because live_indices isnt set yet
             memory = (
                 [] if self.parent is None else list(self.parent.path_live_weakrefs())
@@ -720,6 +743,13 @@ def _add_first_outputs(self, outputs):
                 and o.untyped_storage().data_ptr() in self.static_input_storage_ptrs
             )

+        if self.stack_traces is None:
+            self.stack_traces = [None for _ in range(len(outputs))]
+        else:
+            assert len(self.stack_traces) == len(
+                outputs
+            ), "Wrong number of stack traces passed in"
+
         self._add_replayed_outputs(outputs)
         self.recorded_liveness_after_graph = self._get_liveness(self.path_weakrefs)
@@ -734,7 +764,7 @@ def _add_first_outputs(self, outputs):
                 self.live_indices_after_graph.append((depth, output_index))

         self.debug_check_invariants_after_invocation()
-        if config.triton.debug_cudagraph_trees:
+        if config.triton.fast_cudagraph_asserts:
             check_memory_pool(self.cuda_graphs_pool, list(self.path_live_weakrefs()))

     def _add_replayed_outputs(self, outputs):
@@ -816,7 +846,7 @@ def _get_liveness(
     def debug_assert_invariants(
         self, expected_liveness: List[List[bool]], newly_dead: List[PathOutputIndex]
     ):
-        if not config.triton.debug_cudagraph_trees:
+        if not config.triton.slow_cudagraph_asserts:
             return

         for i, node in enumerate(self._path_from_root):
@@ -1066,6 +1096,8 @@ def __init__(self, device_index: int):
         # mapping from function id to wrapped function
         self.ids_to_funcs: Dict[FunctionID, WrappedFunction] = {}

+        self.ids_to_stack_traces: Dict[FunctionID, StackTraces] = {}
+
         self.warmed_up_functions: Set[FunctionID] = set()

         with torch.cuda.device(device_index):
@@ -1194,6 +1226,7 @@ def record_function(self, new_inputs, function_id) -> List[Optional[Tensor]]:
             new_inputs,
             self.cuda_graphs_thread_pool,
             self.device_index,
+            self.ids_to_stack_traces[function_id],
         )
         if self.current_node is None:
             self.roots[function_id].append(node)
@@ -1220,6 +1253,7 @@ def run_eager(self, new_inputs, function_id: FunctionID):
             self.cuda_graphs_thread_pool,
             self.graph,
             self.device_index,
+            self.ids_to_stack_traces[function_id],
         )
         self.current_node = node
         self.path_state = ExecutionState.WARMUP
@@ -1240,22 +1274,15 @@ def add_function(
         model,
         inputs,
         static_input_idxs,
-        is_backward,
-        is_inference,
+        stack_traces,
+        mode,
     ) -> Callable:
         id = self.new_func_id()
+        self.ids_to_stack_traces[id] = stack_traces
         self.ids_to_funcs[id] = WrappedFunction(
             model, remove_unaligned_input_idxs(inputs, static_input_idxs), id
         )
-        self.id_to_mode[id] = (
-            CompilationMode.BACKWARD
-            if is_backward
-            else (
-                CompilationMode.INFERENCE if is_inference else CompilationMode.FORWARD
-            )
-        )
-
-        comp_context = torch._functorch.aot_autograd.get_graph_being_compiled()
+        self.id_to_mode[id] = mode
         fn = functools.partial(self.run, function_id=id)

         # container needs to set clean up when fn dies
@@ -1345,9 +1372,21 @@ def try_end_curr_warmup(self):
     def dealloc_current_path_weakrefs(self):
         # TODO: we could also allow the these weak refs to continue to be allocated,
         # but that adds some complications.
-        for t in self.current_node.path_live_weakrefs():
+        for t, stack_trace in self.current_node.path_live_weakrefs_and_stacktraces():
+            # TODO: dont need to test t(), but would need to deduplicate storages
             if t():
                 torch._C._free_And_Remove_DeleterFn(t())
+                stack_trace = (
+                    stack_trace.strip()
+                    if stack_trace
+                    else "[Could not find stack trace]"
+                )
+                warnings.warn(
+                    f"CUDAGraphTrees triggered deallocating tensor output from {stack_trace}. "
+                    "Subsequent use of this storage may return garbage result. "
+                    "Outside of torch.compile(), clone the corresponding tensor for safety, or "
+                    "deallocate the corresponding output no longer in use."
+                )

     def clear_current_node_outputs_and_set_to_none(self):
         self.current_node.clear_path_outputs()
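A hedged sketch of the remediation the new warning recommends; run_and_keep and compiled_fn are hypothetical names, not part of this PR. Cloning an output outside of torch.compile() copies it out of the CUDA-graph-owned storage before the tree reclaims that memory.

import torch

def run_and_keep(compiled_fn, *args):
    out = compiled_fn(*args)
    # clone so later reads don't observe garbage once the private pool is reused
    return out.clone() if isinstance(out, torch.Tensor) else out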
@@ -1377,7 +1416,7 @@ def apply_checkpoint_execution_state_in_allocator(self):
             torch._C._cuda_cudaCachingAllocator_raw_delete(ptr)

         # Now the live blocks should be exactly equal to the live storages in private pool
-        if config.triton.debug_cudagraph_trees:
+        if config.triton.fast_cudagraph_asserts:
             check_memory_pool(self.cuda_graphs_thread_pool, live_storages_wrappers)

     def live_cudagraph_pool_storages_in_curr_execution(