torch 2.6.0 vs. torch 2.7.0.dev

Diff statistics:

                   Changes        Words changed   Total words   Words changed (%)   Lines
torch 2.6.0        22 removals    27 removed      1477          1.83                208
torch 2.7.0.dev    63 additions   322 added       1772          18.17               242
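For context, the op sequence, input spec, and settings in both logs are consistent with a repro script along the following lines (a sketch reconstructed from the logs, not the verbatim original; the module and variable names are assumed):

import torch
import torch_tensorrt

class MyModule(torch.nn.Module):
    def forward(self, x):
        x = x + x                  # aten.add.Tensor
        y = x.to(torch.float32)    # lowered to aten._to_copy.default
        y = y * y                  # aten.mul.Tensor
        z = y.to(torch.float16)    # lowered to aten._to_copy.default
        return z - z               # aten.sub.Tensor

model = MyModule().eval().cuda()
inputs = [torch.randn(1, 3, 5, 7, dtype=torch.float16, device="cuda")]

trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    enabled_precisions={torch.float32},  # matches enabled_precisions={<dtype.f32: 7>} in the settings dump
    use_explicit_typing=True,            # matches use_explicit_typing=True
    min_block_size=1,                    # matches min_block_size=1
    debug=True,
)

torch.testing.assert_close(trt_model(*inputs), model(*inputs))
print("assert_close passed")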
Merged log (unified-diff style): lines prefixed "- " appear only in the torch 2.6.0 log, lines prefixed "+ " appear only in the torch 2.7.0.dev log, and lines indented with two spaces are identical in both.

  Unable to import quantization op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models
  TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops
- [01/31/2025-21:55:59] [TRT] [W] Functionality provided through tensorrt.plugin module is experimental.
+ [01/31/2025-21:48:35] [TRT] [W] Functionality provided through tensorrt.plugin module is experimental.
  DEBUG:torch_tensorrt.dynamo.lowering.passes.remove_detach:Removed 0 detach nodes:
  graph():
      %x : [num_users=1] = placeholder[target=x]
      %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
      %to : [num_users=1] = call_function[target=torch.ops.aten.to.dtype](args = (%add, torch.float32), kwargs = {})
      %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%to, %to), kwargs = {})
      %to_1 : [num_users=1] = call_function[target=torch.ops.aten.to.dtype](args = (%mul, torch.float16), kwargs = {})
      %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%to_1, %to_1), kwargs = {})
      return (sub,)
  DEBUG:torch_tensorrt.dynamo._compiler:Input graph: graph():
      %x : [num_users=1] = placeholder[target=x]
-     %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
+     %add : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
      %_to_copy : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%add,), kwargs = {dtype: torch.float32})
-     %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
+     %_assert_tensor_metadata : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%add, None, None, torch.float16), kwargs = {})
+     %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
      %_to_copy_1 : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%mul,), kwargs = {dtype: torch.float16})
+     %_assert_tensor_metadata_1 : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%mul, None, None, torch.float32), kwargs = {})
      %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%_to_copy_1, %_to_copy_1), kwargs = {})
      return (sub,)
  DEBUG:torch_tensorrt.dynamo.lowering.passes.constant_folding:Graph after constant folding:
  graph():
      %x : [num_users=1] = placeholder[target=x]
-     %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
+     %add : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
      %_to_copy : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%add,), kwargs = {dtype: torch.float32})
-     %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
+     %_assert_tensor_metadata : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%add, None, None, torch.float16), kwargs = {})
+     %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
      %_to_copy_1 : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%mul,), kwargs = {dtype: torch.float16})
+     %_assert_tensor_metadata_1 : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%mul, None, None, torch.float32), kwargs = {})
      %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%_to_copy_1, %_to_copy_1), kwargs = {})
      return (sub,)
  DEBUG:torch_tensorrt.dynamo.lowering.passes.remove_assert_scalar:Removed 0 assert_scalar nodes:
  graph():
      %x : [num_users=1] = placeholder[target=x]
-     %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
+     %add : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
      %_to_copy : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%add,), kwargs = {dtype: torch.float32})
-     %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
+     %_assert_tensor_metadata : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%add, None, None, torch.float16), kwargs = {})
+     %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
      %_to_copy_1 : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%mul,), kwargs = {dtype: torch.float16})
+     %_assert_tensor_metadata_1 : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%mul, None, None, torch.float32), kwargs = {})
      %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%_to_copy_1, %_to_copy_1), kwargs = {})
      return (sub,)
  DEBUG:torch_tensorrt.dynamo.lowering.passes.accumulate_fp32_matmul:Skipping FP32 accumulation for matmul layers as use_fp32_acc is not enabled in the compilation settings
  DEBUG:torch_tensorrt.dynamo._compiler:Lowered Input graph: graph():
      %x : [num_users=1] = placeholder[target=x]
-     %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
+     %add : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
      %_to_copy : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%add,), kwargs = {dtype: torch.float32})
-     %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
+     %_assert_tensor_metadata : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%add, None, None, torch.float16), kwargs = {})
+     %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
      %_to_copy_1 : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%mul,), kwargs = {dtype: torch.float16})
+     %_assert_tensor_metadata_1 : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%mul, None, None, torch.float32), kwargs = {})
      %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%_to_copy_1, %_to_copy_1), kwargs = {})
      return (sub,)
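The two aten._assert_tensor_metadata nodes that torch 2.7.0.dev's export inserts survive all of the lowering passes above, and no TensorRT converter is registered for them. Since they have num_users=0 and exist only as runtime checks, a cleanup pass of roughly the following shape could erase them before partitioning (a hypothetical sketch for illustration, not an existing torch_tensorrt lowering pass):

import torch
from torch.fx import GraphModule

def remove_assert_tensor_metadata(gm: GraphModule) -> GraphModule:
    # The assert nodes produce no tensor outputs (num_users=0), so erasing
    # them cannot change any dataflow in the graph.
    for node in list(gm.graph.nodes):
        if node.op == "call_function" and node.target is torch.ops.aten._assert_tensor_metadata.default:
            gm.graph.erase_node(node)
    gm.graph.lint()
    gm.recompile()
    return gm

With the asserts gone, the partitioner below would again see 5 supported operators out of 5 instead of 5 out of 7.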
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.add.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.add.Tensor
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.mul.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.mul.Tensor
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.sub.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.sub.Tensor
  DEBUG:torch_tensorrt.dynamo.partitioning._global_partitioner:
  Supported Nodes:
  - torch.ops.aten.add.Tensor + Operator Count: 1
  - torch.ops.aten._to_copy.default + Operator Count: 2
  - torch.ops.aten.mul.Tensor + Operator Count: 1
  - torch.ops.aten.sub.Tensor + Operator Count: 1
  DEBUG:torch_tensorrt.dynamo.partitioning._global_partitioner:
  All Nodes Supported
- DEBUG:torch_tensorrt.dynamo._compiler:Detected support for 5 operators out of 5 in subgraph.
+ DEBUG:torch_tensorrt.dynamo._compiler:Detected support for 5 operators out of 7 in subgraph.
  INFO:torch_tensorrt.dynamo._compiler:Partitioning the graph via the fast partitioner
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.add.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.add.Tensor
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.mul.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.mul.Tensor
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.sub.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.sub.Tensor
  DEBUG:torch_tensorrt.dynamo.partitioning._adjacency_partitioner:
  Number of TensorRT-Accelerated Engines Generated: 1
  DEBUG:torch_tensorrt.dynamo.partitioning._adjacency_partitioner:
  Supported Nodes:
  - torch.ops.aten.add.Tensor + Operator Count: 1
  - torch.ops.aten._to_copy.default + Operator Count: 2
  - torch.ops.aten.mul.Tensor + Operator Count: 1
  - torch.ops.aten.sub.Tensor + Operator Count: 1
  DEBUG:torch_tensorrt.dynamo.partitioning._adjacency_partitioner:
  All Nodes Supported
  DEBUG:torch_tensorrt.dynamo._compiler:Updated metadata for node: _run_on_acc_0 with its corresponding submodule outputs
  DEBUG:torch_tensorrt.dynamo._compiler:Converting submodule: _run_on_acc_0
  Input shapes: [(1, 3, 5, 7)]
  graph():
      %x : [num_users=1] = placeholder[target=x]
-     %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
+     %add : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%x, %x), kwargs = {})
      %_to_copy : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%add,), kwargs = {dtype: torch.float32})
-     %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
+     %mul : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%_to_copy, %_to_copy), kwargs = {})
      %_to_copy_1 : [num_users=1] = call_function[target=torch.ops.aten._to_copy.default](args = (%mul,), kwargs = {dtype: torch.float16})
      %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%_to_copy_1, %_to_copy_1), kwargs = {})
-     return sub
+     return (add, mul, sub)
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.add.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.add.Tensor
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.mul.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.mul.Tensor
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.sub.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.sub.Tensor
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node x (kind: x, args: ())
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Adding input to in-progress INetwork: x [shape=[1, 3, 5, 7], dtype=DataType.HALF]
  INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node x [x] (Inputs: () | Outputs: (x: (1, 3, 5, 7)@torch.float16))
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /add (kind: aten.add.Tensor, args: ('x <Node>', 'x <Node>'))
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.add.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.add.Tensor
  INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /add [aten.add.Tensor] (Inputs: (x: (1, 3, 5, 7)@torch.float16, x: (1, 3, 5, 7)@torch.float16) | Outputs: (add: (1, 3, 5, 7)@torch.float16))
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /_to_copy (kind: aten._to_copy.default, args: ('add <Node>',))
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /_to_copy [aten._to_copy.default] (Inputs: (add: (1, 3, 5, 7)@torch.float16) | Outputs: (_to_copy: (1, 3, 5, 7)@torch.float32))
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /mul (kind: aten.mul.Tensor, args: ('_to_copy <Node>', '_to_copy <Node>'))
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.mul.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.mul.Tensor
  INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /mul [aten.mul.Tensor] (Inputs: (_to_copy: (1, 3, 5, 7)@torch.float32, _to_copy: (1, 3, 5, 7)@torch.float32) | Outputs: (mul: (1, 3, 5, 7)@torch.float32))
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /_to_copy_1 (kind: aten._to_copy.default, args: ('mul <Node>',))
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten._to_copy.default: 2
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten._to_copy.default
  INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /_to_copy_1 [aten._to_copy.default] (Inputs: (mul: (1, 3, 5, 7)@torch.float32) | Outputs: (_to_copy_1: (1, 3, 5, 7)@torch.float16))
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node /sub (kind: aten.sub.Tensor, args: ('_to_copy_1 <Node>', '_to_copy_1 <Node>'))
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Converter options for aten.sub.Tensor: 1
  DEBUG:torch_tensorrt.dynamo.conversion._ConverterRegistry:Selecting converter option 0 for converting aten.sub.Tensor
  INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node /sub [aten.sub.Tensor] (Inputs: (_to_copy_1: (1, 3, 5, 7)@torch.float16, _to_copy_1: (1, 3, 5, 7)@torch.float16) | Outputs: (sub: (1, 3, 5, 7)@torch.float16))
- DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node output (kind: output, args: ('sub <Node>',))
+ DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converting node output (kind: output, args: (('add <Node>', 'mul <Node>', 'sub <Node>'),))
  DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Marking output output0 [shape=(1, 3, 5, 7), dtype=DataType.HALF]
+ DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Marking output output1 [shape=(1, 3, 5, 7), dtype=DataType.FLOAT]
+ DEBUG:torch_tensorrt.dynamo.conversion._TRTInterpreter:Marking output output2 [shape=(1, 3, 5, 7), dtype=DataType.HALF]
- INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node output [output] (Inputs: (sub: (1, 3, 5, 7)@torch.float16) | Outputs: (output: ))
+ INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Converted node output [output] (Inputs: ((add, mul, sub)) | Outputs: (output: ))
- INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT INetwork construction elapsed time: 0:00:00.002928
+ INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT INetwork construction elapsed time: 0:00:00.005905
  INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Not found cached TRT engines. Start building engine.
- INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Build TRT engine elapsed time: 0:00:06.303962
+ INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:Build TRT engine elapsed time: 0:00:00.172416
- INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT Engine uses: 31212 bytes of Memory
+ INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT Engine uses: 35204 bytes of Memory
  DEBUG: [Torch-TensorRT] - Deserializing Device Info: 0%8%9%0%NVIDIA GeForce RTX 4060 Ti
  DEBUG: [Torch-TensorRT] - Deserialized Device Info: Device(ID: 0, Name: NVIDIA GeForce RTX 4060 Ti, SM Capability: 8.9, Type: GPU)
  DEBUG: [Torch-TensorRT] - Target Device: Device(ID: 0, Name: NVIDIA GeForce RTX 4060 Ti, SM Capability: 8.9, Type: GPU)
  DEBUG: [Torch-TensorRT] - Setting Device(ID: 0, Name: NVIDIA GeForce RTX 4060 Ti, SM Capability: 8.9, Type: GPU) as active device
  INFO: [Torch-TensorRT] - Loaded engine size: 0 MiB
- DEBUG: [Torch-TensorRT] - Deserialization required 110 microseconds.
+ DEBUG: [Torch-TensorRT] - Deserialization required 107 microseconds.
  DEBUG: [Torch-TensorRT] - Total per-runner device persistent memory is 0
- DEBUG: [Torch-TensorRT] - Total per-runner host persistent memory is 240
+ DEBUG: [Torch-TensorRT] - Total per-runner host persistent memory is 320
  DEBUG: [Torch-TensorRT] - Allocated device scratch memory of size 1024
  DEBUG: [Torch-TensorRT] - - Runner scratch: 1024 bytes
  INFO: [Torch-TensorRT] - [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 0 (MiB)
  DEBUG: [Torch-TensorRT] - CUDA lazy loading is enabled.
  DEBUG: [Torch-TensorRT] - Input binding name: x has TensorRT binding index: 0, Torch binding index: 0
  DEBUG: [Torch-TensorRT] - Output binding name: output0 has TensorRT binding index: 1, Torch binding index: 1
+ DEBUG: [Torch-TensorRT] - Output binding name: output1 has TensorRT binding index: 2, Torch binding index: 2
+ DEBUG: [Torch-TensorRT] - Output binding name: output2 has TensorRT binding index: 3, Torch binding index: 3
  DEBUG: [Torch-TensorRT] - Torch-TensorRT TensorRT Engine:
    Name: _run_on_acc_0_engine
    Inputs: [
      id: 0
        name: x
        shape: [1, 3, 5, 7]
        dtype: Half
    ]
    Outputs: [
      id: 0
        name: output0
        shape: [1, 3, 5, 7]
        dtype: Half
+     id: 1
+       name: output1
+       shape: [1, 3, 5, 7]
+       dtype: Float
+     id: 2
+       name: output2
+       shape: [1, 3, 5, 7]
+       dtype: Half
    ]
    Device: Device(ID: 0, Name: NVIDIA GeForce RTX 4060 Ti, SM Capability: 8.9, Type: GPU)
    Hardware Compatibility: Disabled
    Target Platform: windows_x86_64
+ DEBUG:torch_tensorrt.dynamo._compiler:Submodule in PyTorch: _run_on_gpu_1
+ graph():
+     %add : [num_users=1] = placeholder[target=add]
+     %_assert_tensor_metadata : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%add, None, None, torch.float16), kwargs = {})
+     %mul : [num_users=1] = placeholder[target=mul]
+     %_assert_tensor_metadata_1 : [num_users=0] = call_function[target=torch.ops.aten._assert_tensor_metadata.default](args = (%mul, None, None, torch.float32), kwargs = {})
+     return ()
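So the extra _run_on_gpu_1 submodule that 2.7.0.dev produces does nothing but replay the two metadata checks in eager mode, on tensors (add and mul) that the TRT engine now has to return as additional outputs. The op itself is a pure check with no outputs; it can be exercised directly to see what the fallback submodule runs (a stand-in tensor; the argument order is taken from the graph dump above):

import torch

t = torch.zeros(1, 3, 5, 7, dtype=torch.float16)
# (tensor, size=None, stride=None, dtype) -- returns None, raises on a dtype mismatch
torch.ops.aten._assert_tensor_metadata.default(t, None, None, torch.float16)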
  DEBUG:torch_tensorrt.dynamo._DryRunTracker:
  ++++++++++++++++++++++++++++++++++++++++++++++++++ Dry-Run Results for Graph ++++++++++++++++++++++++++++++++++++++++++++++++++
- The graph consists of 5 Total Operators, of which 5 operators are supported, 100.0% coverage
+ The graph consists of 7 Total Operators, of which 5 operators are supported, 71.43% coverage
+ The following nodes are currently set to run in Torch:
+ Node: torch.ops.aten._assert_tensor_metadata.default, with layer location: /_assert_tensor_metadata
+ Node: torch.ops.aten._assert_tensor_metadata.default, with layer location: /_assert_tensor_metadata_1
+ Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner
  Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, debug=True, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, assume_dynamic_shape_support=False, sparse_weights=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False, timing_cache_path='C:\\Users\\HolyWu\\AppData\\Local\\Temp\\torch_tensorrt_engine_cache\\timing_cache.bin', lazy_engine_init=False, cache_built_engines=False, reuse_cached_engines=False, use_explicit_typing=True, use_fp32_acc=False, refit_identical_engine_weights=False, strip_engine_weights=False, immutable_weights=True, enable_weight_streaming=False, enable_cross_compile_for_windows=False, use_aot_joint_export=True)
  Graph Structure:
  Inputs: List[Tensor: (1, 3, 5, 7)@float16]
  ...
  TRT Engine #1 - Submodule name: _run_on_acc_0
  Engine Inputs: List[Tensor: (1, 3, 5, 7)@float16]
  Number of Operators in Engine: 5
- Engine Outputs: List[Tensor: (1, 3, 5, 7)@float16]
+ Engine Outputs: List[Tensor: (1, 3, 5, 7)@float16, Tensor: (1, 3, 5, 7)@float32, Tensor: (1, 3, 5, 7)@float16]
  ...
  Outputs: List[Tensor: (1, 3, 5, 7)@float16]
  ------------------------- Aggregate Stats -------------------------
  Average Number of Operators per TRT Engine: 5.0
  Most Operators in a TRT Engine: 5
  ********** Recommendations **********
  - For minimal graph segmentation, select min_block_size=5 which would generate 1 TRT engine(s)
  - The current level of graph segmentation is equivalent to selecting min_block_size=5 which generates 1 TRT engine(s)
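The same coverage report can be produced without paying for an engine build: the settings dump above includes a dryrun field, so rerunning the compile call from the repro sketch with dryrun=True should print the partitioning summary and skip engine construction (a usage sketch based on that CompilationSettings field, reusing the hypothetical names from the repro above):

trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    min_block_size=1,
    debug=True,
    dryrun=True,  # report supported/unsupported operators without building TRT engines
)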
  DEBUG: [Torch-TensorRT] - Attempting to run engine (ID: _run_on_acc_0_engine); Hardware Compatible: 0
  DEBUG: [Torch-TensorRT] - Input shape changed None -> (1,3,5,7)
  DEBUG: [Torch-TensorRT] - Input Name: x Shape: [1, 3, 5, 7]
  DEBUG: [Torch-TensorRT] - Output Name: output0 Shape: [1, 3, 5, 7]
+ DEBUG: [Torch-TensorRT] - Output Name: output1 Shape: [1, 3, 5, 7]
+ DEBUG: [Torch-TensorRT] - Output Name: output2 Shape: [1, 3, 5, 7]
  assert_close passed