Bad left good right

Created Diff never expires
286 removals
Words removed1056
Total words2157
Words removed (%)48.96
559 lines
237 additions
Words added737
Total words1838
Words added (%)40.10
508 lines
//
//
// Generated by LLVM NVPTX Back-End
// Generated by LLVM NVPTX Back-End
//
//


.version 8.4
.version 8.4
.target sm_90a
.target sm_90a
.address_size 64
.address_size 64


// .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel
// .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel
// @triton_f4_to_bf16_kernel
// @triton_f4_to_bf16_kernel
.visible .entry triton_f4_to_bf16_kernel(
.visible .entry triton_f4_to_bf16_kernel(
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1,
.param .u32 triton_f4_to_bf16_kernel_param_2,
.param .u32 triton_f4_to_bf16_kernel_param_2
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_3
)
)
.reqntid 128, 1, 1
.reqntid 128, 1, 1
{
{
.reg .pred %p<25>;
.reg .pred %p<25>;
.reg .b16 %rs<61>;
.reg .b16 %rs<65>;
.reg .b32 %r<132>;
.reg .b32 %r<111>;
.reg .f32 %f<9>;
.reg .b64 %rd<13>;
.reg .b64 %rd<13>;
.loc 1 178 0 // custom_cast.py:178:0
.loc 1 178 0 // custom_cast.py:178:0
$L__func_begin0:
$L__func_begin0:
.loc 1 178 0 // custom_cast.py:178:0
.loc 1 178 0 // custom_cast.py:178:0


// %bb.0:
// %bb.0:
ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0];
ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0];
ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1];
ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1];
$L__tmp0:
$L__tmp0:
.loc 1 194 24 // custom_cast.py:194:24
.loc 1 194 24 // custom_cast.py:194:24
mov.u32 %r5, %ctaid.x;
// begin inline asm
ld.param.u32 %r6, [triton_f4_to_bf16_kernel_param_2];
mov.u32 %r1, %ctaid.x;
// end inline asm
ld.param.u32 %r14, [triton_f4_to_bf16_kernel_param_2];
.loc 1 195 37 // custom_cast.py:195:37
.loc 1 195 37 // custom_cast.py:195:37
shl.b32 %r7, %r6, 1;
shl.b32 %r15, %r14, 1;
.loc 1 198 27 // custom_cast.py:198:27
.loc 1 198 27 // custom_cast.py:198:27
shl.b32 %r8, %r5, 9;
shl.b32 %r16, %r1, 9;
.loc 1 199 47 // custom_cast.py:199:47
.loc 1 199 47 // custom_cast.py:199:47
mov.u32 %r9, %tid.x;
mov.u32 %r17, %tid.x;
shl.b32 %r10, %r9, 2;
shl.b32 %r18, %r17, 2;
and.b32 %r11, %r10, 508;
and.b32 %r19, %r18, 508;
.loc 1 199 34 // custom_cast.py:199:34
.loc 1 199 34 // custom_cast.py:199:34
or.b32 %r12, %r11, %r8;
or.b32 %r20, %r16, %r19;
or.b32 %r13, %r12, 1;
or.b32 %r21, %r20, 1;
or.b32 %r14, %r12, 2;
or.b32 %r22, %r20, 2;
or.b32 %r15, %r12, 3;
or.b32 %r23, %r20, 3;
.loc 1 201 27 // custom_cast.py:201:27
.loc 1 201 27 // custom_cast.py:201:27
setp.lt.s32 %p1, %r12, %r6;
setp.lt.s32 %p1, %r20, %r14;
setp.lt.s32 %p2, %r13, %r6;
setp.lt.s32 %p2, %r21, %r14;
setp.lt.s32 %p3, %r14, %r6;
setp.lt.s32 %p3, %r22, %r14;
setp.lt.s32 %p4, %r15, %r6;
setp.lt.s32 %p4, %r23, %r14;
.loc 1 204 31 // custom_cast.py:204:31
.loc 1 204 31 // custom_cast.py:204:31
cvt.s64.s32 %rd11, %r12;
cvt.s64.s32 %rd11, %r20;
add.s64 %rd1, %rd9, %rd11;
add.s64 %rd1, %rd9, %rd11;
add.s64 %rd2, %rd1, 1;
add.s64 %rd2, %rd1, 1;
add.s64 %rd3, %rd1, 2;
add.s64 %rd3, %rd1, 2;
add.s64 %rd4, %rd1, 3;
add.s64 %rd4, %rd1, 3;
.loc 1 204 23 // custom_cast.py:204:23
.loc 1 204 23 // custom_cast.py:204:23
// begin inline asm
// begin inline asm
mov.u16 %rs1, 0x0;
mov.u16 %rs1, 0x0;
@%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ];
@%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs2, 0x0;
mov.u16 %rs2, 0x0;
@%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ];
@%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs3, 0x0;
mov.u16 %rs3, 0x0;
@%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ];
@%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs4, 0x0;
mov.u16 %rs4, 0x0;
@%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ];
@%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ];
// end inline asm
// end inline asm
$L__tmp1:
$L__tmp1:
.loc 1 151 26 // custom_cast.py:151:26
and.b16 %rs5, %rs1, 1;
and.b16 %rs6, %rs2, 1;
and.b16 %rs7, %rs3, 1;
and.b16 %rs8, %rs4, 1;
.loc 1 123 29 // custom_cast.py:123:29
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs9, %rs1, 4;
shr.u16 %rs13, %rs1, 4;
and.b16 %rs10, %rs9, 15;
shr.u16 %rs14, %rs2, 4;
shr.u16 %rs15, %rs3, 4;
shr.u16 %rs16, %rs4, 4;
.loc 1 131 18 // custom_cast.py:131:18
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r16, {%rs10, %rs11};
and.b16 %rs17, %rs13, 8;
mov.b32 %r17, {%rs10, %rs1};
and.b16 %rs18, %rs14, 8;
and.b32 %r18, %r17, 524296;
and.b16 %rs19, %rs15, 8;
and.b16 %rs20, %rs16, 8;
.loc 1 134 16 // custom_cast.py:134:16
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r19, %r17, 458759;
and.b16 %rs21, %rs13, 7;
and.b16 %rs22, %rs1, 6;
and.b16 %rs23, %rs1, 7;
and.b16 %rs24, %rs14, 7;
and.b16 %rs25, %rs2, 6;
and.b16 %rs26, %rs2, 7;
and.b16 %rs27, %rs15, 7;
and.b16 %rs28, %rs3, 6;
and.b16 %rs29, %rs3, 7;
and.b16 %rs30, %rs16, 7;
and.b16 %rs31, %rs4, 6;
and.b16 %rs32, %rs4, 7;
.loc 1 137 25 // custom_cast.py:137:25
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs12, %rs13}, %r19;
setp.eq.s16 %p9, %rs21, 0;
setp.eq.s16 %p9, %rs12, 0;
setp.eq.s16 %p10, %rs23, 0;
setp.eq.s16 %p10, %rs13, 0;
setp.eq.s16 %p11, %rs24, 0;
setp.eq.s16 %p12, %rs26, 0;
setp.eq.s16 %p13, %rs27, 0;
setp.eq.s16 %p14, %rs29, 0;
setp.eq.s16 %p15, %rs30, 0;
setp.eq.s16 %p16, %rs32, 0;
.loc 1 143 29 // custom_cast.py:143:29
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p11, %rs13, 1;
setp.eq.s16 %p17, %rs21, 1;
setp.eq.s16 %p12, %rs12, 1;
setp.eq.s16 %p18, %rs23, 1;
setp.eq.s16 %p19, %rs24, 1;
setp.eq.s16 %p20, %rs26, 1;
setp.eq.s16 %p21, %rs27, 1;
setp.eq.s16 %p22, %rs29, 1;
setp.eq.s16 %p23, %rs30, 1;
setp.eq.s16 %p24, %rs32, 1;
.loc 1 146 29 // custom_cast.py:146:29
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs14, %rs12, 1;
shr.u16 %rs33, %rs1, 5;
shr.u16 %rs15, %rs13, 1;
and.b16 %rs34, %rs33, 3;
shr.u16 %rs35, %rs22, 1;
shr.u16 %rs36, %rs2, 5;
and.b16 %rs37, %rs36, 3;
shr.u16 %rs38, %rs25, 1;
shr.u16 %rs39, %rs3, 5;
and.b16 %rs40, %rs39, 3;
shr.u16 %rs41, %rs28, 1;
shr.u16 %rs42, %rs4, 5;
and.b16 %rs43, %rs42, 3;
shr.u16 %rs44, %rs31, 1;
.loc 1 147 56 // custom_cast.py:147:56
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs16, %rs14, 126;
add.s16 %rs45, %rs34, 126;
.loc 1 151 26 // custom_cast.py:151:26
add.s16 %rs46, %rs35, 126;
mov.b32 %r20, {%rs17, %rs15};
add.s16 %rs47, %rs37, 126;
and.b32 %r21, %r16, 1;
add.s16 %rs48, %rs38, 126;
mov.b32 %r22, 8257536;
add.s16 %rs49, %rs40, 126;
add.s16x2 %r23, %r20, %r22;
add.s16 %rs50, %rs41, 126;
.loc 1 152 34 // custom_cast.py:152:34
add.s16 %rs51, %rs43, 126;
{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r23; }
add.s16 %rs52, %rs44, 126;
cvt.u32.u16 %r24, %rs18;
{ .reg .b16 tmp; mov.b32 {%rs19, tmp}, %r21; }
cvt.u32.u16 %r25, %rs19;
.loc 1 148 39 // custom_cast.py:148:39
.loc 1 148 39 // custom_cast.py:148:39
cvt.u32.u16 %r26, %rs16;
cvt.u32.u16 %r24, %rs45;
cvt.u32.u16 %r27, %rs5;
cvt.u32.u16 %r25, %rs46;
.loc 1 152 48 // custom_cast.py:152:48
cvt.u32.u16 %r26, %rs47;
shl.b32 %r28, %r24, 23;
cvt.u32.u16 %r27, %rs48;
shl.b32 %r29, %r25, 22;
cvt.u32.u16 %r28, %rs49;
cvt.u32.u16 %r29, %rs50;
cvt.u32.u16 %r30, %rs51;
cvt.u32.u16 %r31, %rs52;
.loc 1 148 52 // custom_cast.py:148:52
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r30, %r26, 23;
shl.b32 %r32, %r24, 23;
shl.b32 %r31, %r27, 22;
shl.b32 %r33, %r25, 23;
.loc 1 156 30 // custom_cast.py:156:30
shl.b32 %r34, %r26, 23;
or.b32 %r32, %r29, %r30;
shl.b32 %r35, %r27, 23;
or.b32 %r33, %r28, %r31;
shl.b32 %r36, %r28, 23;
.loc 1 158 48 // custom_cast.py:158:48
shl.b32 %r37, %r29, 23;
selp.b32 %r34, 0, %r33, %p10;
shl.b32 %r38, %r30, 23;
selp.b32 %r35, 0, %r32, %p9;
shl.b32 %r39, %r31, 23;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r36, 1056964608, %r35, %p12;
selp.b32 %r37, 1056964608, %r34, %p11;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs20, %rs21}, %r18;
cvt.u32.u16 %r38, %rs21;
cvt.u32.u16 %r39, %rs20;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r40, %r39, 28;
shl.b32 %r41, %r38, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r42, %r37, %r41;
or.b32 %r43, %r36, %r40;
.loc 1 173 23 // custom_cast.py:173:23
mov.b32 %f1, %r43;
mov.b32 %f2, %r42;
.loc 1 174 23 // custom_cast.py:174:23
cvt.rn.bf16x2.f32 %r1, %f2, %f1;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs22, %rs2, 4;
and.b16 %rs23, %rs22, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r44, {%rs23, %rs24};
mov.b32 %r45, {%rs23, %rs2};
and.b32 %r46, %r45, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r47, %r45, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs25, %rs26}, %r47;
setp.eq.s16 %p13, %rs25, 0;
setp.eq.s16 %p14, %rs26, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p15, %rs26, 1;
setp.eq.s16 %p16, %rs25, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs27, %rs25, 1;
shr.u16 %rs28, %rs26, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs29, %rs27, 126;
.loc 1 151 26 // custom_cast.py:151:26
.loc 1 151 26 // custom_cast.py:151:26
mov.b32 %r48, {%rs30, %rs28};
and.b16 %rs53, %rs13, 1;
and.b32 %r49, %r44, 1;
and.b16 %rs54, %rs14, 1;
add.s16x2 %r50, %r48, %r22;
and.b16 %rs55, %rs15, 1;
and.b16 %rs56, %rs16, 1;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
{ .reg .b16 tmp; mov.b32 {tmp, %rs31}, %r50; }
cvt.u32.u16 %r40, %rs53;
cvt.u32.u16 %r51, %rs31;
$L__tmp2:
{ .reg .b16 tmp; mov.b32 {%rs32, tmp}, %r49; }
.loc 1 204 23 // custom_cast.py:204:23
cvt.u32.u16 %r52, %rs32;
cvt.u32.u16 %r41, %rs1;
.loc 1 148 39 // custom_cast.py:148:39
$L__tmp3:
cvt.u32.u16 %r53, %rs29;
cvt.u32.u16 %r54, %rs6;
.loc 1 152 48 // custom_cast.py:152:48
shl.b32 %r55, %r51, 23;
shl.b32 %r56, %r52, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r57, %r53, 23;
shl.b32 %r58, %r54, 22;
.loc 1 156 30 // custom_cast.py:156:30
or.b32 %r59, %r56, %r57;
or.b32 %r60, %r55, %r58;
.loc 1 158 48 // custom_cast.py:158:48
selp.b32 %r61, 0, %r60, %p14;
selp.b32 %r62, 0, %r59, %p13;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r63, 1056964608, %r62, %p16;
selp.b32 %r64, 1056964608, %r61, %p15;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs33, %rs34}, %r46;
cvt.u32.u16 %r65, %rs34;
cvt.u32.u16 %r66, %rs33;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r67, %r66, 28;
shl.b32 %r68, %r65, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r69, %r64, %r68;
or.b32 %r70, %r63, %r67;
.loc 1 173 23 // custom_cast.py:173:23
mov.b32 %f3, %r70;
mov.b32 %f4, %r69;
.loc 1 174 23 // custom_cast.py:174:23
cvt.rn.bf16x2.f32 %r2, %f4, %f3;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs35, %rs3, 4;
and.b16 %rs36, %rs35, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r71, {%rs36, %rs37};
mov.b32 %r72, {%rs36, %rs3};
and.b32 %r73, %r72, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r74, %r72, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs38, %rs39}, %r74;
setp.eq.s16 %p17, %rs38, 0;
setp.eq.s16 %p18, %rs39, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p19, %rs39, 1;
setp.eq.s16 %p20, %rs38, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs40, %rs38, 1;
shr.u16 %rs41, %rs39, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs42, %rs40, 126;
.loc 1 151 26 // custom_cast.py:151:26
mov.b32 %r75, {%rs43, %rs41};
and.b32 %r76, %r71, 1;
add.s16x2 %r77, %r75, %r22;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
{ .reg .b16 tmp; mov.b32 {tmp, %rs44}, %r77; }
and.b32 %r42, %r41, 1;
cvt.u32.u16 %r78, %rs44;
cvt.u32.u16 %r43, %rs54;
{ .reg .b16 tmp; mov.b32 {%rs45, tmp}, %r76; }
$L__tmp4:
cvt.u32.u16 %r79, %rs45;
.loc 1 204 23 // custom_cast.py:204:23
.loc 1 148 39 // custom_cast.py:148:39
cvt.u32.u16 %r44, %rs2;
cvt.u32.u16 %r80, %rs42;
$L__tmp5:
cvt.u32.u16 %r81, %rs7;
.loc 1 152 48 // custom_cast.py:152:48
shl.b32 %r82, %r78, 23;
shl.b32 %r83, %r79, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r84, %r80, 23;
shl.b32 %r85, %r81, 22;
.loc 1 156 30 // custom_cast.py:156:30
or.b32 %r86, %r83, %r84;
or.b32 %r87, %r82, %r85;
.loc 1 158 48 // custom_cast.py:158:48
selp.b32 %r88, 0, %r87, %p18;
selp.b32 %r89, 0, %r86, %p17;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r90, 1056964608, %r89, %p20;
selp.b32 %r91, 1056964608, %r88, %p19;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs46, %rs47}, %r73;
cvt.u32.u16 %r92, %rs47;
cvt.u32.u16 %r93, %rs46;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r94, %r93, 28;
shl.b32 %r95, %r92, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r96, %r91, %r95;
or.b32 %r97, %r90, %r94;
.loc 1 173 23 // custom_cast.py:173:23
mov.b32 %f5, %r97;
mov.b32 %f6, %r96;
.loc 1 174 23 // custom_cast.py:174:23
cvt.rn.bf16x2.f32 %r3, %f6, %f5;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs48, %rs4, 4;
and.b16 %rs49, %rs48, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r98, {%rs49, %rs50};
mov.b32 %r99, {%rs49, %rs4};
and.b32 %r100, %r99, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r101, %r99, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs51, %rs52}, %r101;
setp.eq.s16 %p21, %rs51, 0;
setp.eq.s16 %p22, %rs52, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p23, %rs52, 1;
setp.eq.s16 %p24, %rs51, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs53, %rs51, 1;
shr.u16 %rs54, %rs52, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs55, %rs53, 126;
.loc 1 151 26 // custom_cast.py:151:26
mov.b32 %r102, {%rs56, %rs54};
and.b32 %r103, %r98, 1;
add.s16x2 %r104, %r102, %r22;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
{ .reg .b16 tmp; mov.b32 {tmp, %rs57}, %r104; }
and.b32 %r45, %r44, 1;
cvt.u32.u16 %r105, %rs57;
cvt.u32.u16 %r46, %rs55;
{ .reg .b16 tmp; mov.b32 {%rs58, tmp}, %r103; }
$L__tmp6:
cvt.u32.u16 %r106, %rs58;
.loc 1 204 23 // custom_cast.py:204:23
.loc 1 148 39 // custom_cast.py:148:39
cvt.u32.u16 %r47, %rs3;
cvt.u32.u16 %r107, %rs55;
$L__tmp7:
cvt.u32.u16 %r108, %rs8;
.loc 1 152 34 // custom_cast.py:152:34
and.b32 %r48, %r47, 1;
cvt.u32.u16 %r49, %rs56;
$L__tmp8:
.loc 1 204 23 // custom_cast.py:204:23
cvt.u32.u16 %r50, %rs4;
$L__tmp9:
.loc 1 152 34 // custom_cast.py:152:34
and.b32 %r51, %r50, 1;
.loc 1 152 48 // custom_cast.py:152:48
.loc 1 152 48 // custom_cast.py:152:48
shl.b32 %r109, %r105, 23;
shl.b32 %r52, %r40, 22;
shl.b32 %r110, %r106, 22;
shl.b32 %r53, %r42, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r54, %r43, 22;
shl.b32 %r111, %r107, 23;
shl.b32 %r55, %r45, 22;
shl.b32 %r112, %r108, 22;
shl.b32 %r56, %r46, 22;
shl.b32 %r57, %r48, 22;
shl.b32 %r58, %r49, 22;
shl.b32 %r59, %r51, 22;
.loc 1 156 30 // custom_cast.py:156:30
.loc 1 156 30 // custom_cast.py:156:30
or.b32 %r113, %r110, %r111;
or.b32 %r60, %r32, %r52;
or.b32 %r114, %r109, %r112;
or.b32 %r61, %r33, %r53;
or.b32 %r62, %r34, %r54;
or.b32 %r63, %r35, %r55;
or.b32 %r64, %r36, %r56;
or.b32 %r65, %r37, %r57;
or.b32 %r66, %r38, %r58;
or.b32 %r67, %r39, %r59;
.loc 1 158 48 // custom_cast.py:158:48
.loc 1 158 48 // custom_cast.py:158:48
selp.b32 %r115, 0, %r114, %p22;
selp.b32 %r68, 0, %r60, %p9;
selp.b32 %r116, 0, %r113, %p21;
selp.b32 %r69, 0, %r61, %p10;
selp.b32 %r70, 0, %r62, %p11;
selp.b32 %r71, 0, %r63, %p12;
selp.b32 %r72, 0, %r64, %p13;
selp.b32 %r73, 0, %r65, %p14;
selp.b32 %r74, 0, %r66, %p15;
selp.b32 %r75, 0, %r67, %p16;
.loc 1 160 63 // custom_cast.py:160:63
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r117, 1056964608, %r116, %p24;
selp.b32 %r76, 1056964608, %r68, %p17;
selp.b32 %r118, 1056964608, %r115, %p23;
selp.b32 %r77, 1056964608, %r69, %p18;
selp.b32 %r78, 1056964608, %r70, %p19;
selp.b32 %r79, 1056964608, %r71, %p20;
selp.b32 %r80, 1056964608, %r72, %p21;
selp.b32 %r81, 1056964608, %r73, %p22;
selp.b32 %r82, 1056964608, %r74, %p23;
selp.b32 %r83, 1056964608, %r75, %p24;
.loc 1 163 26 // custom_cast.py:163:26
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs59, %rs60}, %r100;
cvt.u32.u16 %r84, %rs17;
cvt.u32.u16 %r119, %rs60;
and.b32 %r85, %r41, 8;
cvt.u32.u16 %r120, %rs59;
cvt.u32.u16 %r86, %rs18;
and.b32 %r87, %r44, 8;
cvt.u32.u16 %r88, %rs19;
and.b32 %r89, %r47, 8;
cvt.u32.u16 %r90, %rs20;
and.b32 %r91, %r50, 8;
.loc 1 164 8 // custom_cast.py:164:8
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r121, %r120, 28;
shl.b32 %r92, %r84, 28;
shl.b32 %r122, %r119, 28;
shl.b32 %r93, %r85, 28;
shl.b32 %r94, %r86, 28;
shl.b32 %r95, %r87, 28;
shl.b32 %r96, %r88, 28;
shl.b32 %r97, %r89, 28;
shl.b32 %r98, %r90, 28;
shl.b32 %r99, %r91, 28;
.loc 1 166 22 // custom_cast.py:166:22
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r123, %r118, %r122;
or.b32 %r2, %r76, %r92;
or.b32 %r124, %r117, %r121;
or.b32 %r3, %r77, %r93;
.loc 1 173 23 // custom_cast.py:173:23
or.b32 %r4, %r78, %r94;
mov.b32 %f7, %r124;
or.b32 %r5, %r79, %r95;
mov.b32 %f8, %r123;
or.b32 %r6, %r80, %r96;
or.b32 %r7, %r81, %r97;
or.b32 %r8, %r82, %r98;
or.b32 %r9, %r83, %r99;
.loc 1 174 23 // custom_cast.py:174:23
.loc 1 174 23 // custom_cast.py:174:23
cvt.rn.bf16x2.f32 %r4, %f8, %f7;
// begin inline asm
$L__tmp2:
cvt.rn.bf16.f32 %rs5, %r2;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs6, %r3;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs7, %r4;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs8, %r5;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs9, %r6;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs10, %r7;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs11, %r8;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs12, %r9;
// end inline asm
$L__tmp10:
.loc 1 220 28 // custom_cast.py:220:28
.loc 1 220 28 // custom_cast.py:220:28
shl.b32 %r125, %r5, 10;
shl.b32 %r100, %r1, 10;
.loc 1 221 49 // custom_cast.py:221:49
.loc 1 221 49 // custom_cast.py:221:49
shl.b32 %r126, %r9, 3;
shl.b32 %r101, %r17, 3;
and.b32 %r127, %r126, 1016;
and.b32 %r102, %r101, 1016;
.loc 1 221 36 // custom_cast.py:221:36
.loc 1 221 36 // custom_cast.py:221:36
or.b32 %r128, %r127, %r125;
or.b32 %r103, %r100, %r102;
or.b32 %r129, %r128, 2;
or.b32 %r104, %r103, 2;
or.b32 %r130, %r128, 4;
or.b32 %r105, %r103, 4;
or.b32 %r131, %r128, 6;
or.b32 %r106, %r103, 6;
.loc 1 222 29 // custom_cast.py:222:29
.loc 1 222 29 // custom_cast.py:222:29
setp.lt.s32 %p5, %r128, %r7;
setp.lt.s32 %p5, %r103, %r15;
setp.lt.s32 %p6, %r129, %r7;
setp.lt.s32 %p6, %r104, %r15;
setp.lt.s32 %p7, %r130, %r7;
setp.lt.s32 %p7, %r105, %r15;
setp.lt.s32 %p8, %r131, %r7;
setp.lt.s32 %p8, %r106, %r15;
.loc 1 224 26 // custom_cast.py:224:26
.loc 1 224 26 // custom_cast.py:224:26
mul.wide.s32 %rd12, %r128, 2;
mul.wide.s32 %rd12, %r103, 2;
add.s64 %rd5, %rd10, %rd12;
add.s64 %rd5, %rd10, %rd12;
add.s64 %rd6, %rd5, 4;
add.s64 %rd6, %rd5, 4;
add.s64 %rd7, %rd5, 8;
add.s64 %rd7, %rd5, 8;
add.s64 %rd8, %rd5, 12;
add.s64 %rd8, %rd5, 12;
.loc 1 224 39 // custom_cast.py:224:39
.loc 1 224 39 // custom_cast.py:224:39
mov.b32 %r107, {%rs5, %rs6};
// begin inline asm
// begin inline asm
@%p5 st.global.b32 [ %rd5 + 0 ], { %r1 };
@%p5 st.global.b32 [ %rd5 + 0 ], { %r107 };
// end inline asm
// end inline asm
mov.b32 %r108, {%rs7, %rs8};
// begin inline asm
// begin inline asm
@%p6 st.global.b32 [ %rd6 + 0 ], { %r2 };
@%p6 st.global.b32 [ %rd6 + 0 ], { %r108 };
// end inline asm
// end inline asm
mov.b32 %r109, {%rs9, %rs10};
// begin inline asm
// begin inline asm
@%p7 st.global.b32 [ %rd7 + 0 ], { %r3 };
@%p7 st.global.b32 [ %rd7 + 0 ], { %r109 };
// end inline asm
// end inline asm
mov.b32 %r110, {%rs11, %rs12};
// begin inline asm
// begin inline asm
@%p8 st.global.b32 [ %rd8 + 0 ], { %r4 };
@%p8 st.global.b32 [ %rd8 + 0 ], { %r110 };
// end inline asm
// end inline asm
.loc 1 224 4 // custom_cast.py:224:4
.loc 1 224 4 // custom_cast.py:224:4
ret;
ret;
$L__tmp3:
$L__tmp11:
$L__func_end0:
$L__func_end0:
// -- End function
// -- End function
}
}
.file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py"
.file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py"
.section .debug_abbrev
.section .debug_abbrev
{
{
.b8 1 // Abbreviation Code
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
.b8 0 // EOM(3)
}
}
.section .debug_info
.section .debug_info
{
{
.b32 161 // Length of Unit
.b32 161 // Length of Unit
.b8 2 // DWARF version number
.b8 2 // DWARF version number
.b8 0
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit
.b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 116 // DW_AT_producer
.b8 114
.b8 114
.b8 105
.b8 105
.b8 116
.b8 116
.b8 111
.b8 111
.b8 110
.b8 110
.b8 0
.b8 0
.b8 2 // DW_AT_language
.b8 2 // DW_AT_language
.b8 0
.b8 0
.b8 99 // DW_AT_name
.b8 99 // DW_AT_name
.b8 117
.b8 117
.b8 115
.b8 115
.b8 116
.b8 116
.b8 111
.b8 111
.b8 109
.b8 109
.b8 95
.b8 95
.b8 99
.b8 99
.b8 97
.b8 97
.b8 115
.b8 115
.b8 116
.b8 116
.b8 46
.b8 46
.b8 112
.b8 112
.b8 121
.b8 121
.b8 0
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 47 // DW_AT_comp_dir
.b8 104
.b8 104
.b8 111
.b8 111
.b8 109
.b8 109
.b8 101
.b8 101
.b8 47
.b8 47
.b8 100
.b8 100
.b8 114
.b8 114
.b8 105
.b8 105
.b8 115
.b8 115
.b8 115
.b8 115
.b8 112
.b8 112
.b8 103
.b8 103
.b8 47
.b8 47
.b8 109
.b8 109
.b8 101
.b8 101
.b8 116
.b8 116
.b8 97
.b8 97
.b8 47
.b8 47
.b8 97
.b8 97
.b8 111
.b8 111
.b8 47
.b8 47
.b8 116
.b8 116
.b8 111
.b8 111
.b8 114
.b8 114
.b8 99
.b8 99
.b8 104
.b8 104
.b8 97
.b8 97
.b8 111
.b8 111
.b8 47
.b8 47
.b8 112
.b8 112
.b8 114
.b8 114
.b8 111
.b8 111
.b8 116
.b8 116
.b8 111
.b8 111
.b8 116
.b8 116
.b8 121
.b8 121
.b8 112
.b8 112
.b8 101
.b8 101
.b8 47
.b8 47
.b8 109
.b8 109
.b8 120
.b8 120
.b8 95
.b8 95
.b8 102
.b8 102
.b8 111
.b8 111
.b8 114
.b8 114
.b8 109
.b8 109
.b8 97
.b8 97
.b8 116
.b8 116
.b8 115
.b8 115
.b8 0
.b8 0
.b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram
.b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram
.b8 116 // DW_AT_name
.b8 116 // DW_AT_name
.b8 114
.b8 114
.b8 105
.b8 105
.b8 116
.b8 116
.b8 111
.b8 111
.b8 110
.b8 110
.b8 95
.b8 95
.b8 102
.b8 102
.b8 52
.b8 52
.b8 95
.b8 95
.b8 116
.b8 116
.b8 111
.b8 111
.b8 95
.b8 95
.b8 98
.b8 98
.b8 102
.b8 102
.b8 49
.b8 49
.b8 54
.b8 54
.b8 95
.b8 95
.b8 107
.b8 107
.b8 101
.b8 101
.b8 114
.b8 114
.b8 110
.b8 110
.b8 101
.b8 101
.b8 108
.b8 108
.b8 0
.b8 0
.b8 1 // DW_AT_inline
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram
.b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 91 // DW_AT_abstract_origin
.b32 91 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine
.b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine
.b32 91 // DW_AT_abstract_origin
.b32 91 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp2 // DW_AT_high_pc
.b64 $L__tmp10 // DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 1 // DW_AT_call_file
.b8 216 // DW_AT_call_line
.b8 216 // DW_AT_call_line
.b8 8 // DW_AT_call_column
.b8 8 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
}
.section .debug_macinfo { }
.section .debug_macinfo { }