pad

45 removals / 48 additions (52 → 54 lines)
-__global__ void nvfuser_pointwise_f0_c1_r0_g2(Tensor<float, 3, 3> T1, Tensor<float, 3, 3> T2, Tensor<float, 3, 3> T6) {
+__global__ void nvfuser_pointwise_f0_c1_r0_g2(Tensor<float, 3, 3> T1, Tensor<float, 3, 3> T2, nvfuser_index_t i0, Tensor<float, 3, 3> T6) {
   NVFUSER_DEFINE_MAGIC_ZERO;
-  if (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + (512 * ((nvfuser_index_t)blockIdx.x))) < 524288)) {
+  nvfuser_index_t i1;
+  i1 = i0 - 8;
+  if (((((4 * ((nvfuser_index_t)threadIdx.x)) + 3) + (512 * ((nvfuser_index_t)blockIdx.x))) < ((i0 * T1.logical_size[0LL]) * T1.logical_size[1LL]))) {
     Array<float, 4, 4> T7;
     #pragma unroll
-    for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
+    for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
+      float T3[1];
+      T3[0] = 0;
+      T3[0]
+         = ((((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL]))) % (8 + T2.logical_size[2LL])) < T2.logical_size[2LL]) ? T2[((((T1.logical_size[1LL] * T2.alloc_stride[1LL]) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) / ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL])))) + (T2.alloc_stride[1LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL]))) / (8 + T2.logical_size[2LL])))) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL]))) % (8 + T2.logical_size[2LL])))] : 0.000000000e+00f;
       float T4[1];
       T4[0] = 0;
       T4[0]
-         = (((-8 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) >= (-(i0 + nvfuser_zero))) && ((-16 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) < (-(i0 + nvfuser_zero)))) ? T1[((((-8 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) + (T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4))) + ((32 * T1.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (i0 + nvfuser_zero))] : 0.000000000e+00f;
+         = (((((-i0) + 8) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) % ((i0 + -8) + T1.logical_size[2LL]))) >= 0) && ((((-i0) + 8) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) % ((i0 + -8) + T1.logical_size[2LL]))) < T1.logical_size[2LL])) ? T1[(((((-i0) + 8) + ((T1.alloc_stride[1LL] * T1.logical_size[1LL]) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) / (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))))) + (T1.alloc_stride[1LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) / ((i0 + -8) + T1.logical_size[2LL])))) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) % ((i0 + -8) + T1.logical_size[2LL])))] : 0.000000000e+00f;
       float T5[1];
       T5[0]
          = -T4[0];
-      float T3[1];
-      T3[0] = 0;
-      T3[0]
-         = ((-8 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) < (-(i0 + nvfuser_zero))) ? T2[((((4 * (((nvfuser_index_t)threadIdx.x) % 4)) + (T2.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4))) + ((32 * T2.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (i0 + nvfuser_zero))] : 0.000000000e+00f;
-      T7[i0]
+      T7[i2]
          = T3[0]
          + T5[0];
     }
     NVFUSER_UPDATE_MAGIC_ZERO;
     loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T6[((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x)))], &T7[0]);
   } else {
     Array<float, 4, 4> T7;
     #pragma unroll
-    for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
+    for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
+      float T3[1];
+      T3[0] = 0;
+      if ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) < ((i0 * T1.logical_size[0LL]) * T1.logical_size[1LL]))) {
+        T3[0]
+           = ((((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL]))) % (8 + T2.logical_size[2LL])) < T2.logical_size[2LL]) ? T2[((((T1.logical_size[1LL] * T2.alloc_stride[1LL]) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) / ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL])))) + (T2.alloc_stride[1LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL]))) / (8 + T2.logical_size[2LL])))) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % ((8 * T1.logical_size[1LL]) + (T1.logical_size[1LL] * T2.logical_size[2LL]))) % (8 + T2.logical_size[2LL])))] : 0.000000000e+00f;
+      }
       float T4[1];
       T4[0] = 0;
-      if ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) < 524288)) {
+      if ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) < ((((T1.logical_size[0LL] * T1.logical_size[1LL]) * T1.logical_size[2LL]) + ((i0 * T1.logical_size[0LL]) * T1.logical_size[1LL])) + ((-8 * T1.logical_size[0LL]) * T1.logical_size[1LL])))) {
         T4[0]
-           = (((-8 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) >= (-(i0 + nvfuser_zero))) && ((-16 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) < (-(i0 + nvfuser_zero)))) ? T1[((((-8 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) + (T1.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4))) + ((32 * T1.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (i0 + nvfuser_zero))] : 0.000000000e+00f;
+           = (((((-i0) + 8) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) % ((i0 + -8) + T1.logical_size[2LL]))) >= 0) && ((((-i0) + 8) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) % ((i0 + -8) + T1.logical_size[2LL]))) < T1.logical_size[2LL])) ? T1[(((((-i0) + 8) + ((T1.alloc_stride[1LL] * T1.logical_size[1LL]) * ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) / (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))))) + (T1.alloc_stride[1LL] * (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) / ((i0 + -8) + T1.logical_size[2LL])))) + (((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) + (i2 + nvfuser_zero)) % (((T1.logical_size[1LL] * T1.logical_size[2LL]) + (i0 * T1.logical_size[1LL])) + (-8 * T1.logical_size[1LL]))) % ((i0 + -8) + T1.logical_size[2LL])))] : 0.000000000e+00f;
       }
       float T5[1];
       T5[0]
          = -T4[0];
-      float T3[1];
-      T3[0] = 0;
-      if ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) < 524288)) {
-        T3[0]
-           = ((-8 + (4 * (((nvfuser_index_t)threadIdx.x) % 4))) < (-(i0 + nvfuser_zero))) ? T2[((((4 * (((nvfuser_index_t)threadIdx.x) % 4)) + (T2.alloc_stride[1LL] * (((nvfuser_index_t)threadIdx.x) / 4))) + ((32 * T2.alloc_stride[1LL]) * ((nvfuser_index_t)blockIdx.x))) + (i0 + nvfuser_zero))] : 0.000000000e+00f;
-      }
-      T7[i0]
+      T7[i2]
          = T3[0]
          + T5[0];
     }
     NVFUSER_UPDATE_MAGIC_ZERO;
-    if ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) < 524288)) {
+    if ((((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x))) < ((i0 * T1.logical_size[0LL]) * T1.logical_size[1LL]))) {
       loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T6[((4 * ((nvfuser_index_t)threadIdx.x)) + (512 * ((nvfuser_index_t)blockIdx.x)))], &T7[0]);
     }
   }
 }
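
The new kernel receives the symbolic pad extent i0 as a runtime argument, so every predicate that compared against the hard-coded total extent 524288 now compares against bounds built from i0 and T1.logical_size; the serial loop index is renamed from i0 to i2 because i0 now names that parameter, and the T3 (T2 load) block is hoisted above the T4 (T1 load) block. The overall structure is unchanged: a fast path taken when all four vectorized elements are in bounds, and a slow path that predicates each element individually. Below is a minimal standalone sketch of that predication pattern, not nvFuser output; scale_vec4, in, out, and extent are hypothetical names, and it assumes vec_size 4 with 128 threads per block (hence the 512-element block stride seen above).

// Hypothetical sketch, not generated by nvFuser: the fast-path/slow-path
// vectorization guard from the kernel above, with the extent supplied at
// runtime instead of baked in as the constant 524288.
typedef long long index_t;  // stands in for nvfuser_index_t

__global__ void scale_vec4(const float* in, float* out, index_t extent) {
  // Each thread handles 4 contiguous elements; with 128 threads per block
  // the per-block stride is 512 elements, as in the generated code.
  index_t base = (4 * (index_t)threadIdx.x) + (512 * (index_t)blockIdx.x);
  if ((base + 3) < extent) {
    // Fast path: the last lane of the vector is in bounds, so all four
    // elements are, and no per-element guard is needed.
    #pragma unroll
    for (index_t i = 0; i < 4; ++i) {
      out[base + i] = 2.0f * in[base + i];
    }
  } else {
    // Slow path: predicate every element individually.
    #pragma unroll
    for (index_t i = 0; i < 4; ++i) {
      if ((base + i) < extent) {
        out[base + i] = 2.0f * in[base + i];
      }
    }
  }
}

Baking the extent in, as the old kernel does, lets the compiler fold the predicates but ties the compiled binary to one input shape; passing it at runtime, as the new kernel does with i0, presumably lets the same kernel be reused across dynamic shapes at the cost of a few extra integer operations.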