Optimize the gemv_t_vector.c kernel for RISCV64_ZVL256B target #5427
@@ -27,110 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" | ||
#if !defined(DOUBLE) | ||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) | ||
#define FLOAT_V_T vfloat32m2_t | ||
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | ||
#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e32m1) | ||
#define FLOAT_V_T vfloat32m8_t | ||
#define FLOAT_V_T_M1 vfloat32m1_t | ||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) | ||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) | ||
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | ||
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | ||
#ifdef RISCV_0p10_INTRINSICS | ||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl) | ||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m8_f32m1(v_res, va, vb, gvl) | ||
#else | ||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1) | ||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) | ||
#endif | ||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) | ||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) | ||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m8) | ||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | ||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | ||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) | ||
#define xint_t int | ||
#else | ||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) | ||
#define FLOAT_V_T vfloat64m2_t | ||
#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) | ||
#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e64m1) | ||
#define FLOAT_V_T vfloat64m8_t | ||
#define FLOAT_V_T_M1 vfloat64m1_t | ||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) | ||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) | ||
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) | ||
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) | ||
#ifdef RISCV_0p10_INTRINSICS | ||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl) | ||
#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m8_f64m1(v_res, va, vb, gvl) | ||
#else | ||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1) | ||
#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) | ||
#endif | ||
#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) | ||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) | ||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m8) | ||
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) | ||
#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | ||
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) | ||
#define xint_t long long | ||
#endif | ||
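For context (standard RVV arithmetic, not part of the patch): on a ZVL256B target VLEN is 256 bits, so one e32/m8 vector value spans 8 architectural registers and holds up to VLEN × LMUL / SEW = 256 × 8 / 32 = 64 floats (32 doubles at e64/m8), versus 16 floats per value at the previous e32/m2.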
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
     BLASLONG i = 0, j = 0, k = 0;
     BLASLONG ix = 0, iy = 0;
     FLOAT *a_ptr = a;
     FLOAT temp;

     FLOAT_V_T va, vr, vx;
     unsigned int gvl = 0;
     FLOAT_V_T_M1 v_res;
+    size_t vlmax = VSETVL_MAX_M1();
+
+#ifndef RISCV_0p10_INTRINSICS
+    FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3;
+    FLOAT_V_T_M1 vec0, vec1, vec2, vec3;
+    FLOAT *a_ptrs[4], *y_ptrs[4];
+#endif

     if(inc_x == 1){
-        for(i = 0; i < n; i++){
-            v_res = VFMVVF_FLOAT_M1(0, 1);
-            gvl = VSETVL(m);
-            j = 0;
-            vr = VFMVVF_FLOAT(0, gvl);
-            for(k = 0; k < m/gvl; k++){
-                va = VLEV_FLOAT(&a_ptr[j], gvl);
-                vx = VLEV_FLOAT(&x[j], gvl);
-                vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
-                v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
-                j += gvl;
-            }
-            if(j < m){
-                gvl = VSETVL(m-j);
-                va = VLEV_FLOAT(&a_ptr[j], gvl);
-                vx = VLEV_FLOAT(&x[j], gvl);
-                vr = VFMULVV_FLOAT(va, vx, gvl);
-                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
-            }
-            temp = (FLOAT)EXTRACT_FLOAT(v_res);
-            y[iy] += alpha * temp;
-            iy += inc_y;
-            a_ptr += lda;
-        }
+#ifndef RISCV_0p10_INTRINSICS
+        BLASLONG anr = n - n % 4;
+        for (; i < anr; i += 4) {
+            gvl = VSETVL(m);
+            j = 0;
+            for (int l = 0; l < 4; l++) {
+                a_ptrs[l] = a + (i + l) * lda;
+                y_ptrs[l] = y + (i + l) * inc_y;
+            }
+            vec0 = VFMVVF_FLOAT_M1(0.0, vlmax);
+            vec1 = VFMVVF_FLOAT_M1(0.0, vlmax);
+            vec2 = VFMVVF_FLOAT_M1(0.0, vlmax);
+            vec3 = VFMVVF_FLOAT_M1(0.0, vlmax);
+            vr0 = VFMVVF_FLOAT(0.0, gvl);
+            vr1 = VFMVVF_FLOAT(0.0, gvl);
+            vr2 = VFMVVF_FLOAT(0.0, gvl);
+            vr3 = VFMVVF_FLOAT(0.0, gvl);
+            for (k = 0; k < m / gvl; k++) {
+                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
+                va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
+                va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
+                va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
+
+                vx = VLEV_FLOAT(x + j, gvl);
+                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
+                vr1 = VFMULVV_FLOAT(va1, vx, gvl);
+                vr2 = VFMULVV_FLOAT(va2, vx, gvl);
+                vr3 = VFMULVV_FLOAT(va3, vx, gvl);
+                // Floating-point addition does not satisfy the associative law, that is, (a + b) + c ≠ a + (b + c),
+                // so piecewise multiplication and reduction must be performed inside the loop body.
+                vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
+                vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
+                vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
+                vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
[Inline review comments on the reduction block above]

Reviewer: Part of the issue is register spillage of the vectors. It looks like you have more than 32 in play here. You shouldn't use more than m4 (instead of m8).

Reviewer: This is the inner loop with m4 [assembly listing omitted]. With m8 [listing omitted] — see how bad it is.

Reviewer: BTW, even with m4, the code is slower than before.

Reviewer: ISTR that the benchmarks do not use non-trivial values of alpha and beta by default, which may be part of the difference (if I interpret the parameters to your private testOpenBLAS correctly).
+                j += gvl;
+            }
+            if (j < m) {
+                gvl = VSETVL(m - j);
+                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
+                va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl);
+                va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl);
+                va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl);
+
+                vx = VLEV_FLOAT(x + j, gvl);
+                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
+                vr1 = VFMULVV_FLOAT(va1, vx, gvl);
+                vr2 = VFMULVV_FLOAT(va2, vx, gvl);
+                vr3 = VFMULVV_FLOAT(va3, vx, gvl);
+                vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl);
+                vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl);
+                vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl);
+                vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl);
+            }
+            *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0));
+            *y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1));
+            *y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2));
+            *y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3));
+        }
+        // deal with the tail
+        for (; i < n; i++) {
+            v_res = VFMVVF_FLOAT_M1(0, vlmax);
+            gvl = VSETVL(m);
+            j = 0;
+            a_ptrs[0] = a + i * lda;
+            y_ptrs[0] = y + i * inc_y;
+            vr0 = VFMVVF_FLOAT(0, gvl);
+            for (k = 0; k < m / gvl; k++) {
+                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
+                vx = VLEV_FLOAT(x + j, gvl);
+                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
+                v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
+                j += gvl;
+            }
+            if (j < m) {
+                gvl = VSETVL(m - j);
+                va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl);
+                vx = VLEV_FLOAT(x + j, gvl);
+                vr0 = VFMULVV_FLOAT(va0, vx, gvl);
+                v_res = VFREDSUM_FLOAT(vr0, v_res, gvl);
+            }
+            *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res));
+        }
+#else
+        for(i = 0; i < n; i++){
+            v_res = VFMVVF_FLOAT_M1(0, 1);
+            gvl = VSETVL(m);
+            j = 0;
+            vr = VFMVVF_FLOAT(0, gvl);
+            for(k = 0; k < m/gvl; k++){
+                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                vx = VLEV_FLOAT(&x[j], gvl);
+                vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop
+                v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail
+                j += gvl;
+            }
+            if(j < m){
+                gvl = VSETVL(m-j);
+                va = VLEV_FLOAT(&a_ptr[j], gvl);
+                vx = VLEV_FLOAT(&x[j], gvl);
+                vr = VFMULVV_FLOAT(va, vx, gvl);
+                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
+            }
+            temp = (FLOAT)EXTRACT_FLOAT(v_res);
+            y[iy] += alpha * temp;
+            iy += inc_y;
+            a_ptr += lda;
+        }
+#endif
     } else {
         BLASLONG stride_x = inc_x * sizeof(FLOAT);
         for(i = 0; i < n; i++){
             v_res = VFMVVF_FLOAT_M1(0, 1);
             gvl = VSETVL(m);
             j = 0;
             ix = 0;
             vr = VFMVVF_FLOAT(0, gvl);
             for(k = 0; k < m/gvl; k++){
                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 vr = VFMULVV_FLOAT(va, vx, gvl);
                 v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                 j += gvl;
                 ix += inc_x * gvl;
             }
             if(j < m){
                 gvl = VSETVL(m-j);
                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 vr = VFMULVV_FLOAT(va, vx, gvl);
                 v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
             }
             temp = (FLOAT)EXTRACT_FLOAT(v_res);
             y[iy] += alpha * temp;
             iy += inc_y;
             a_ptr += lda;
         }
     }

     return (0);
 }
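For reference, this transposed-GEMV kernel computes y := alpha * Aᵀx + y: each column of A contributes one dot product of length m, which is what the vector loop reduces per iteration of i (the 4-way unrolled path processes four such columns at once).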
Comment: Have you tried ordered (vs. unordered) reduction sums?
Reply: Yes, I tried vfredosum. Like vfredusum, it has precision issues and failed the correctness verification.
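For reference, a minimal sketch of the two reduction flavors being compared, written against the upstream RVV 1.0 (`__riscv_`-prefixed) intrinsics rather than this file's RISCV_RVV macros (illustrative, not code from the PR):

#include <riscv_vector.h>

// Given a partial-product vector v (LMUL=8) and a running scalar
// accumulator acc (LMUL=1), both return acc + sum(v[0..vl-1]).
vfloat32m1_t reduce_unordered(vfloat32m8_t v, vfloat32m1_t acc, size_t vl) {
    // Unordered: summation order unspecified, typically faster.
    return __riscv_vfredusum_vs_f32m8_f32m1(v, acc, vl);
}

vfloat32m1_t reduce_ordered(vfloat32m8_t v, vfloat32m1_t acc, size_t vl) {
    // Ordered: strict element order 0..vl-1, matches a sequential scalar sum.
    return __riscv_vfredosum_vs_f32m8_f32m1(v, acc, vl);
}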
Reply: May I ask whether you enabled -O3 optimization when compiling? Or could you provide the test code and test procedure so that I can reproduce the problem? A performance regression of this kind shouldn't happen; the degradation you report is too severe.
Comment: Note that when building for RVV 1.0 targets, the inner loop in this kernel is unrolled four times, and vector temporaries are used to hold the accumulators and the inputs. That means 8 vector temporaries in total. However, there are only 32 architectural vector registers, and with LMUL=8 each temporary occupies 8 of them; it is therefore impossible to avoid stack spills, and catastrophic performance, at LMUL=8.
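A quick register-budget calculation makes this concrete (illustrative arithmetic; temporary counts taken from the unrolled loop above):

/* Unrolled inner loop: va0..va3 (loads) + vr0..vr3 (products) = 8 vector
 * values live at once, plus vx for the shared slice of x.
 *   LMUL=8: 8 * 8 = 64 registers (+8 for vx)  -> far over the 32 available
 *   LMUL=4: 8 * 4 = 32 (+4 for vx)            -> still over, spills remain
 *   LMUL=2: 8 * 2 = 16 (+2 for vx)            -> fits comfortably
 */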
The code path that is /not/ using RVV 1.0 intrinsics is not unrolled in this way. Do you perhaps not have RISCV_0p10_INTRINSICS defined in your build? Some of the other kernels use a different LMUL depending on build parameters. Perhaps that is the way forward here: LMUL=2 when the RVV 1.0 path is in use, LMUL=8 when it is not?
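A minimal sketch of what that split could look like for the single-precision macros, keyed off the file's existing RISCV_0p10_INTRINSICS guard (the split itself is the suggestion above, not code from the PR; only a subset of the macros is shown):

#if !defined(DOUBLE)
#ifdef RISCV_0p10_INTRINSICS
/* Non-unrolled 0.10 path: one accumulator live, wide LMUL is affordable. */
#define VSETVL(n)     RISCV_RVV(vsetvl_e32m8)(n)
#define FLOAT_V_T     vfloat32m8_t
#define VLEV_FLOAT    RISCV_RVV(vle32_v_f32m8)
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m8)
#else
/* 4-way unrolled RVV 1.0 path: 8+ temporaries live, keep LMUL small. */
#define VSETVL(n)     RISCV_RVV(vsetvl_e32m2)(n)
#define FLOAT_V_T     vfloat32m2_t
#define VLEV_FLOAT    RISCV_RVV(vle32_v_f32m2)
#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2)
#endif
#endif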
Comment: @yuanjia111 What tests are failing if you move the vfredusum_vs out of the inner loop? gemv_t_rvv.c doesn't have it in the loop and is used by some platforms. I had a similar problem with GEMV on PowerPC (for BF16); it was where the beta value was applied that caused the issue.
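For concreteness, a sketch of that alternative for the inc_x == 1 loop: accumulate with the (removed) VFMACCVV_FLOAT macro and reduce once after the loop, as gemv_t_rvv.c reportedly does. This is exactly the reordering the in-diff comment warns diverges from the scalar path; tail iteration omitted for brevity (illustrative, not the PR's code):

vr = VFMVVF_FLOAT(0, gvl);                  // running vector accumulator
for (k = 0; k < m / gvl; k++) {
    va = VLEV_FLOAT(&a_ptr[j], gvl);
    vx = VLEV_FLOAT(&x[j], gvl);
    vr = VFMACCVV_FLOAT(vr, va, vx, gvl);   // vr += va * vx, no per-iteration reduction
    j += gvl;
}
v_res = VFMVVF_FLOAT_M1(0, vlmax);
v_res = VFREDSUM_FLOAT(vr, v_res, gvl);     // single reduction outside the loop
temp = (FLOAT)EXTRACT_FLOAT(v_res);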
Reply: @ChipKerchner
[screenshots omitted]
(1) My gcc version is: gcc version 14.2.1 20240801 (openEuler 14.2.1-6.oe2403sp1).
(2) If the vfredusum_vs is moved out of the inner loop, the utest reports an error [screenshot omitted]: it turns out that the checksum there (accurate to 12 decimal places) fails.
Reply: @sergei-lewis I added prints to the code and confirmed that it takes the rvv 1.0 branch, not the RISCV_0p10_INTRINSICS one. I will take some time today to analyze the performance tests and the inconsistency with your results.
Reply: @ChipKerchner, thank you, I will test with the manual unrolling in gemv_t_vector.c removed.
Reply: @ChipKerchner, I retested and found that removing the manual four-way loop unrolling in gemv_t_vector.c and simply changing LMUL to 8 gave the best performance. (Comparative test scenarios: manual four-way unrolling with LMUL=4; manual four-way unrolling with LMUL=8; no manual unrolling with LMUL=2.)

@ChipKerchner, @sergei-lewis, @martin-frbg, thank you very much for your suggestions and discussion!