FusionReshapeConcreteDomain3_CUDA

Created Diff never expires
193 removals
214 lines
197 additions
218 lines
// NOTE(review): this text is a flattened two-way diff, not a single compilable
// kernel. Two revisions of the same generated kernel `kernel1` are interleaved
// line by line: for each pair, the first line comes from the first revision
// (names such as i1114, i2408, b10651) and the second line from the second
// revision (names such as i1495, i3673, b17447). Most pairs differ only in the
// numeric suffix of the generated temporaries; the real differences are in the
// predicate set-up that precedes the top-level `if`.
//
// What each revision visibly does per thread:
//   * computes a flat base offset blockIdx.x * 512 + 4 * threadIdx.x,
//   * loads 4 consecutive logical elements from each of T2, T1 and T0 into the
//     local arrays T9, T8 and T7, decomposing the flat offset into per-tensor
//     indices with integer / and %,
//   * computes T10[i] = (T7[i] + T8[i]) + T9[i],
//   * writes T10 to T6 at the flat base offset through loadLocalToGlobal.
// The top-level `if` selects between a branch that performs the loads without
// per-element guards and an `else` branch that guards each access.
// NVFUSER_DEFINE_MAGIC_ZERO / NVFUSER_UPDATE_MAGIC_ZERO, nvfuser_zero, ceilDiv,
// Tensor, Array and loadLocalToGlobal are defined outside this view.
__global__ void kernel1(Tensor<float, 4> T0, Tensor<float, 4> T1, Tensor<float, 3> T2, Tensor<float, 3> T6) {
__global__ void kernel1(Tensor<float, 4> T0, Tensor<float, 4> T1, Tensor<float, 3> T2, Tensor<float, 3> T6) {
NVFUSER_DEFINE_MAGIC_ZERO
NVFUSER_DEFINE_MAGIC_ZERO
// Flat offset of this thread's first element: 512 elements per block, 4 per thread.
int i1114;
int i1495;
i1114 = (((nvfuser_index_t)blockIdx.x) * 512) + (4 * ((nvfuser_index_t)threadIdx.x));
i1495 = (((nvfuser_index_t)blockIdx.x) * 512) + (4 * ((nvfuser_index_t)threadIdx.x));
// Product of T2's two inner extents.
int i1116;
int i1497;
i1116 = T2.size[1] * T2.size[2];
i1497 = T2.size[1] * T2.size[2];
// The same product rounded up to a multiple of 3; used below as the divisor
// that splits the flat offset into T2's outer index and inner remainder.
int i1118;
int i1499;
i1118 = 3 * (ceilDiv(i1116, 3));
i1499 = 3 * (ceilDiv(i1497, 3));
// Divisors used to decompose the flat offset for T0 / T1 indexing.
int i2408;
int i3673;
i2408 = T0.size[3] * 96;
i3673 = T0.size[3] * 96;
int i2411;
int i3676;
i2411 = T1.stride[1] * 4;
i3676 = T1.stride[1] * 4;
int i2418;
int i3683;
i2418 = 32 * T0.size[3];
i3683 = 32 * T0.size[3];
int i4017;
int i6483;
i4017 = T0.stride[1] * 4;
i6483 = T0.stride[1] * 4;
// Flat offset of the LAST of this thread's 4 elements (base + 3); the
// predicates below are evaluated on this offset.
int i10722;
int i17516;
i10722 = i1114 + 3;
i17516 = i1495 + 3;
// NOTE(review): from here until the top-level `if`, the two lines of a pair no
// longer hold corresponding statements, because the second revision declares
// extra temporaries. Read each revision's lines separately in this stretch.
//   First revision : i10732 = i10722 % i2408 (remainder of the per-element offset).
//   Second revision: i17470 = blockIdx.x * 128 + threadIdx.x (per-vector index),
//                    i17471 = T0.size[3] * 24,
//                    i17526 = ((i17470 % i17471) * 4) + 3.
// i17526 and i17470 / i17471 appear to be algebraically equal to i10732 and
// i10722 / i2408 for non-negative values (barring overflow) -- TODO confirm.
int i10732;
int i17470;
i10732 = i10722 % i2408;
i17470 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x);
int i10736;
int i17471;
i10736 = (i10732 % i2418) / T0.size[3];
i17471 = T0.size[3] * 24;
int i10648;
int i17526;
i10648 = T0.size[3] * 1344;
i17526 = ((i17470 % i17471) * 4) + 3;
bool b10651;
int i17530;
b10651 = ((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv(i10648, 4)), 128));
i17530 = (i17526 % i3683) / T0.size[3];
bool b10724;
int i17444;
b10724 = (i10722 / i1118) < 14;
i17444 = T0.size[3] * 1344;
bool b10727;
bool b17447;
b10727 = (i10722 % i1118) < i1116;
b17447 = ((nvfuser_index_t)blockIdx.x) < (ceilDiv((ceilDiv(i17444, 4)), 128));
bool b10744;
bool b17518;
b10744 = i10722 < i10648;
b17518 = (i17516 / i1499) < 14;
bool b11733;
bool b17521;
b11733 = (b10724 && b10727) && b10651;
b17521 = (i17516 % i1499) < i1497;
bool b12964;
bool b17538;
b12964 = b10744 && b10651;
b17538 = i17516 < i17444;
// First revision's top-level predicate: block index in range, last element
// inside every checked extent of T2, T1 and T0.
if (((((((b10651 && b10724) && b10727) && ((i10722 / i2408) < 14)) && ((((i10732 / i2418) * 4) + (i10736 / 8)) < 12)) && ((i10736 % 8) < 8)) && b10744)) {
// The next five lines belong to the second revision only: its combined guards
// (counterparts of b11733 and b12964) and its top-level predicate.
bool b18649;
b18649 = (b17518 && b17521) && b17447;
bool b20468;
b20468 = b17538 && b17447;
if (((((((b17447 && b17518) && b17521) && ((i17470 / i17471) < 14)) && ((((i17526 / i3683) * 4) + (i17530 / 8)) < 12)) && ((i17530 % 8) < 8)) && b17538)) {
// ---- Branch taken when the top-level predicate holds: no per-element guards.
// One-to-one pairing of the two revisions resumes here.
// T9: zero-fill, then load 4 elements of T2.
float T9[4];
float T9[4];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
T9[i212] = 0;
T9[i212] = 0;
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
// Flat offset of element i212; nvfuser_zero is added to the loop index
// (presumably introduced by NVFUSER_DEFINE_MAGIC_ZERO -- defined elsewhere).
int i1115;
int i1496;
i1115 = i1114 + (i212 + nvfuser_zero);
i1496 = i1495 + (i212 + nvfuser_zero);
int i1143;
int i1524;
i1143 = i1115 % i1118;
i1524 = i1496 % i1499;
T9[i212]
T9[i212]
= T2[(((T2.stride[0] * (i1115 / i1118)) + (T2.stride[1] * (i1143 / T2.size[2]))) + (T2.stride[2] * (i1143 % T2.size[2])))];
= T2[(((T2.stride[0] * (i1496 / i1499)) + (T2.stride[1] * (i1524 / T2.size[2]))) + (T2.stride[2] * (i1524 % T2.size[2])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// T8: zero-fill, then load 4 elements of T1. The index expression uses no
// T1.stride[3] term.
float T8[4];
float T8[4];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
T8[i207] = 0;
T8[i207] = 0;
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
int i2446;
int i3711;
i2446 = i1114 + (i207 + nvfuser_zero);
i3711 = i1495 + (i207 + nvfuser_zero);
int i2449;
int i3714;
i2449 = i2446 % i2408;
i3714 = i3711 % i3673;
int i2454;
int i3719;
i2454 = (i2449 % i2418) / T0.size[3];
i3719 = (i3714 % i3683) / T0.size[3];
T8[i207]
T8[i207]
= T1[((((T1.stride[0] * (i2446 / i2408)) + (i2411 * (i2449 / i2418))) + (T1.stride[1] * (i2454 / 8))) + (T1.stride[2] * (i2454 % 8)))];
= T1[((((T1.stride[0] * (i3711 / i3673)) + (i3676 * (i3714 / i3683))) + (T1.stride[1] * (i3719 / 8))) + (T1.stride[2] * (i3719 % 8)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// T7: zero-fill, then load 4 elements of T0 using all four strides.
float T7[4];
float T7[4];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
T7[i202] = 0;
T7[i202] = 0;
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
int i4063;
int i6529;
i4063 = i1114 + (i202 + nvfuser_zero);
i6529 = i1495 + (i202 + nvfuser_zero);
int i4066;
int i6532;
i4066 = i4063 % i2408;
i6532 = i6529 % i3673;
int i4070;
int i6536;
i4070 = i4066 % i2418;
i6536 = i6532 % i3683;
int i4071;
int i6537;
i4071 = i4070 / T0.size[3];
i6537 = i6536 / T0.size[3];
T7[i202]
T7[i202]
= T0[(((((T0.stride[0] * (i4063 / i2408)) + (i4017 * (i4066 / i2418))) + (T0.stride[1] * (i4071 / 8))) + (T0.stride[2] * (i4071 % 8))) + (T0.stride[3] * (i4070 % T0.size[3])))];
= T0[(((((T0.stride[0] * (i6529 / i3673)) + (i6483 * (i6532 / i3683))) + (T0.stride[1] * (i6537 / 8))) + (T0.stride[2] * (i6537 % 8))) + (T0.stride[3] * (i6536 % T0.size[3])))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// Element-wise compute: T10[i] = (T7[i] + T8[i]) + T9[i], staged through the
// single-element temporaries T3, T4 and T5.
Array<float, 4, 4> T10;
Array<float, 4, 4> T10;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i213 = 0; i213 < 4; ++i213) {
for(nvfuser_index_t i213 = 0; i213 < 4; ++i213) {
float T3[1];
float T3[1];
T3[0]
T3[0]
= T7[i213]
= T7[i213]
+ T8[i213];
+ T8[i213];
float T4[1];
float T4[1];
T4[0]
T4[0]
= T3[0];
= T3[0];
float T5[1];
float T5[1];
T5[0]
T5[0]
= T9[i213];
= T9[i213];
T10[i213]
T10[i213]
= T4[0]
= T4[0]
+ T5[0];
+ T5[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// Store the 4 results to T6 at the flat base offset (presumably one 4-wide
// write; loadLocalToGlobal is defined elsewhere -- TODO confirm).
loadLocalToGlobal<float, 4, false>( &T6[i1114], &T10[0]);
loadLocalToGlobal<float, 4, false>( &T6[i1495], &T10[0]);
// ---- Branch taken when the top-level predicate fails: same pipeline, but
// accesses are guarded individually.
} else {
} else {
// T9: zero-fill only when the block-index predicate holds; the T2 load is
// guarded by the combined T2 predicate (b11733 / b18649).
float T9[4];
float T9[4];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
if (b10651) {
if (b17447) {
T9[i212] = 0;
T9[i212] = 0;
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
for(nvfuser_index_t i212 = 0; i212 < 4; ++i212) {
int i5391;
int i8238;
i5391 = i1114 + (i212 + nvfuser_zero);
i8238 = i1495 + (i212 + nvfuser_zero);
int i5394;
int i8241;
i5394 = i5391 % i1118;
i8241 = i8238 % i1499;
if (b11733) {
if (b18649) {
T9[i212]
T9[i212]
= T2[(((T2.stride[0] * (i5391 / i1118)) + (T2.stride[1] * (i5394 / T2.size[2]))) + (T2.stride[2] * (i5394 % T2.size[2])))];
= T2[(((T2.stride[0] * (i8238 / i1499)) + (T2.stride[1] * (i8241 / T2.size[2]))) + (T2.stride[2] * (i8241 % T2.size[2])))];
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// T8: unlike T9 and T7 in this branch, the zero-fill is unconditional and the
// T1 load is guarded per element by extent checks computed inside the loop.
float T8[4];
float T8[4];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
T8[i207] = 0;
T8[i207] = 0;
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
for(nvfuser_index_t i207 = 0; i207 < 4; ++i207) {
int i6697;
int i10428;
i6697 = i1114 + (i207 + nvfuser_zero);
i10428 = i1495 + (i207 + nvfuser_zero);
int i6700;
int i10431;
i6700 = i6697 % i2408;
i10431 = i10428 % i3673;
int i6705;
int i10436;
i6705 = (i6700 % i2418) / T0.size[3];
i10436 = (i10431 % i3683) / T0.size[3];
int i6698;
int i10429;
i6698 = i6697 / i2408;
i10429 = i10428 / i3673;
int i6701;
int i10432;
i6701 = i6700 / i2418;
i10432 = i10431 / i3683;
int i6706;
int i10437;
i6706 = i6705 / 8;
i10437 = i10436 / 8;
int i6709;
int i10440;
i6709 = i6705 % 8;
i10440 = i10436 % 8;
// Per-element check against the constants 14, 12 and 8 before reading T1.
if ((((i6698 < 14) && (((4 * i6701) + i6706) < 12)) && (i6709 < 8))) {
if ((((i10429 < 14) && (((4 * i10432) + i10437) < 12)) && (i10440 < 8))) {
T8[i207]
T8[i207]
= T1[((((T1.stride[0] * i6698) + (i2411 * i6701)) + (T1.stride[1] * i6706)) + (T1.stride[2] * i6709))];
= T1[((((T1.stride[0] * i10429) + (i3676 * i10432)) + (T1.stride[1] * i10437)) + (T1.stride[2] * i10440))];
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// T7: zero-fill only when the block-index predicate holds; the T0 load is
// guarded by the combined T0 predicate (b12964 / b20468).
float T7[4];
float T7[4];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
if (b10651) {
if (b17447) {
T7[i202] = 0;
T7[i202] = 0;
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
for(nvfuser_index_t i202 = 0; i202 < 4; ++i202) {
int i8314;
int i13246;
i8314 = i1114 + (i202 + nvfuser_zero);
i13246 = i1495 + (i202 + nvfuser_zero);
int i8317;
int i13249;
i8317 = i8314 % i2408;
i13249 = i13246 % i3673;
int i8321;
int i13253;
i8321 = i8317 % i2418;
i13253 = i13249 % i3683;
int i8322;
int i13254;
i8322 = i8321 / T0.size[3];
i13254 = i13253 / T0.size[3];
if (b12964) {
if (b20468) {
T7[i202]
T7[i202]
= T0[(((((T0.stride[0] * (i8314 / i2408)) + (i4017 * (i8317 / i2418))) + (T0.stride[1] * (i8322 / 8))) + (T0.stride[2] * (i8322 % 8))) + (T0.stride[3] * (i8321 % T0.size[3])))];
= T0[(((((T0.stride[0] * (i13246 / i3673)) + (i6483 * (i13249 / i3683))) + (T0.stride[1] * (i13254 / 8))) + (T0.stride[2] * (i13254 % 8))) + (T0.stride[3] * (i13253 % T0.size[3])))];
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// Same element-wise compute as in the first branch, with every statement
// guarded by the block-index predicate (b10651 / b17447).
Array<float, 4, 4> T10;
Array<float, 4, 4> T10;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i213 = 0; i213 < 4; ++i213) {
for(nvfuser_index_t i213 = 0; i213 < 4; ++i213) {
float T3[1];
float T3[1];
if (b10651) {
if (b17447) {
T3[0]
T3[0]
= T7[i213]
= T7[i213]
+ T8[i213];
+ T8[i213];
}
}
float T4[1];
float T4[1];
if (b10651) {
if (b17447) {
T4[0]
T4[0]
= T3[0];
= T3[0];
}
}
float T5[1];
float T5[1];
if (b10651) {
if (b17447) {
T5[0]
T5[0]
= T9[i213];
= T9[i213];
}
}
if (b10651) {
if (b17447) {
T10[i213]
T10[i213]
= T4[0]
= T4[0]
+ T5[0];
+ T5[0];
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
// The store is issued only when the block index is in range and the last of
// the 4 elements (base + 3) lies below T0.size[3] * 1344.
if ((b10651 && b10744)) {
if ((b17447 && b17538)) {
loadLocalToGlobal<float, 4, false>( &T6[i1114], &T10[0]);
loadLocalToGlobal<float, 4, false>( &T6[i1495], &T10[0]);
}
}
}
}
}
}