Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Vectorising np.linalg.qr #19151

Merged
merged 26 commits into from Jul 14, 2021
Merged

ENH: Vectorising np.linalg.qr #19151

merged 26 commits into from Jul 14, 2021

Conversation

czgdp1807
Copy link
Member

Fixes #11741

Comments

This is my first PR to numpy. I thought of pushing the early changes so that reviews can be provided as soon as possible and I can keep updating my local branch frequently, keeping the things up to date.

As of now I have just initialized the framework for implementing QR decomposition at C level. I haven't made any change to Python code, will do that once the C code looks good. See, #11741 (comment)

@czgdp1807
Copy link
Member Author

Please let me know if the above additions require any modifications. A bunch of questions,

  1. How to access the lapack_routine (used in Python definition of QR (np.linalg.qr)) for doing the decomposition in C code?
  2. How to decide which types would be filled in arrays ending with _types suffix (such as, lstsq_types, svd_1_3_types, etc.)?

ping @eric-wieser @mattip

Thank you. Apologies if I missed out on any guidelines.

@eric-wieser
Copy link
Member

eric-wieser commented Jun 1, 2021

Note you'll need qr versions of all of these lstsq functions:

/* -------------------------------------------------------------------------- */
/* least squares */
/*
 * Parameter block for the LAPACK xGELSD least-squares drivers
 * (sgelsd/dgelsd/cgelsd/zgelsd).  Field names mirror the Fortran
 * dummy-argument names of the routine; see the LAPACK xGELSD
 * documentation for the exact semantics of each argument.
 */
typedef struct gelsd_params_struct
{
fortran_int M;     /* number of rows of A */
fortran_int N;     /* number of columns of A */
fortran_int NRHS;  /* number of right-hand-side columns in B */
void *A;           /* input matrix data (LAPACK/Fortran layout) */
fortran_int LDA;   /* leading dimension of A */
void *B;           /* right-hand sides on input; solution on output */
fortran_int LDB;   /* leading dimension of B; sized max(m, n) by init below */
void *S;           /* output: singular values of A */
void *RCOND;       /* threshold passed through to xGELSD (see LAPACK docs) */
fortran_int RANK;  /* output: effective rank determined by xGELSD */
void *WORK;        /* scratch workspace */
fortran_int LWORK; /* size of WORK, or -1 to request a workspace-size query */
void *RWORK;       /* real scratch workspace (complex variants only, else NULL) */
void *IWORK;       /* integer scratch workspace */
} GELSD_PARAMS_t;
/*
 * Diagnostic dump of a GELSD_PARAMS_t through TRACE_TXT, tagged with
 * `name`.  Pointer fields are printed as addresses and integer fields
 * as ints; intended purely for debugging/tracing, with no effect on
 * the computation.
 */
static inline void
dump_gelsd_params(const char *name,
GELSD_PARAMS_t *params)
{
TRACE_TXT("\n%s:\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18p\n",
name,
"A", params->A,
"B", params->B,
"S", params->S,
"WORK", params->WORK,
"RWORK", params->RWORK,
"IWORK", params->IWORK,
"M", (int)params->M,
"N", (int)params->N,
"NRHS", (int)params->NRHS,
"LDA", (int)params->LDA,
"LDB", (int)params->LDB,
"LWORK", (int)params->LWORK,
"RANK", (int)params->RANK,
"RCOND", params->RCOND);
}
/**begin repeat
#TYPE=FLOAT,DOUBLE#
#lapack_func=sgelsd,dgelsd#
#ftyp=fortran_real,fortran_doublereal#
*/
/*
 * Thin wrapper invoking the real-typed LAPACK routine (@lapack_func@)
 * with the arguments packed in *params.  Returns the Fortran INFO
 * value: callers treat 0 as success and any nonzero value as failure
 * (see the init_* workspace query and the *_lstsq loop below).
 */
static inline fortran_int
call_@lapack_func@(GELSD_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
params->A, &params->LDA,
params->B, &params->LDB,
params->S,
params->RCOND, &params->RANK,
params->WORK, &params->LWORK,
params->IWORK,
&rv);
return rv;
}
/*
 * Allocate and initialize *params for a real-typed xGELSD call on an
 * (m, n) system with nrhs right-hand sides.
 *
 * Two heap blocks are allocated:
 *   - mem_buff  holds A, B and S contiguously;
 *   - mem_buff2 holds WORK and IWORK, sized via a LAPACK workspace
 *     query (LWORK = -1).
 * On success returns 1; params->A and params->WORK then own the two
 * blocks (released later by release_@lapack_func@).  On failure
 * returns 0, frees everything and zeroes *params.
 */
static inline int
init_@lapack_func@(GELSD_PARAMS_t *params,
fortran_int m,
fortran_int n,
fortran_int nrhs)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *mem_buff2 = NULL;
npy_uint8 *a, *b, *s, *work, *iwork;
fortran_int min_m_n = fortran_int_min(m, n);
fortran_int max_m_n = fortran_int_max(m, n);
/* Widen the dimensions to size_t before multiplying so the byte-size
 * products below are not computed in (possibly narrower) fortran_int. */
size_t safe_min_m_n = min_m_n;
size_t safe_max_m_n = max_m_n;
size_t safe_m = m;
size_t safe_n = n;
size_t safe_nrhs = nrhs;
size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
/* B is sized max(m, n) rows: it holds the m-row input and the n-row
 * solution in the same storage. */
size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
size_t s_size = safe_min_m_n * sizeof(@ftyp@);
fortran_int work_count;
size_t work_size;
size_t iwork_size;
/* LAPACK requires leading dimensions of at least 1. */
fortran_int lda = fortran_int_max(1, m);
fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
mem_buff = malloc(a_size + b_size + s_size);
if (!mem_buff)
goto error;
/* Carve A, B and S out of the single allocation. */
a = mem_buff;
b = a + a_size;
s = b + b_size;
params->M = m;
params->N = n;
params->NRHS = nrhs;
params->A = a;
params->B = b;
params->S = s;
params->LDA = lda;
params->LDB = ldb;
{
/* compute optimal work size */
@ftyp@ work_size_query;
fortran_int iwork_size_query;
params->WORK = &work_size_query;
params->IWORK = &iwork_size_query;
params->RWORK = NULL;
/* LWORK = -1 asks xGELSD to only report the optimal workspace sizes
 * (written into WORK[0] and IWORK[0]) instead of solving. */
params->LWORK = -1;
if (call_@lapack_func@(params) != 0)
goto error;
work_count = (fortran_int)work_size_query;
work_size = (size_t) work_size_query * sizeof(@ftyp@);
iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
}
mem_buff2 = malloc(work_size + iwork_size);
if (!mem_buff2)
goto error;
work = mem_buff2;
iwork = work + work_size;
params->WORK = work;
params->RWORK = NULL;   /* unused by the real-typed routines */
params->IWORK = iwork;
params->LWORK = work_count;
return 1;
error:
/* free(NULL) is a no-op, so this is safe whichever malloc failed. */
TRACE_TXT("%s failed init\n", __FUNCTION__);
free(mem_buff);
free(mem_buff2);
memset(params, 0, sizeof(*params));
return 0;
}
/**end repeat**/
/**begin repeat
#TYPE=CFLOAT,CDOUBLE#
#ftyp=fortran_complex,fortran_doublecomplex#
#frealtyp=fortran_real,fortran_doublereal#
#typ=COMPLEX_t,DOUBLECOMPLEX_t#
#lapack_func=cgelsd,zgelsd#
*/
/*
 * Thin wrapper invoking the complex-typed LAPACK routine
 * (@lapack_func@) with the arguments packed in *params.  Unlike the
 * real variant, the complex xGELSD interface takes an additional RWORK
 * argument.  Returns the Fortran INFO value (0 means success).
 */
static inline fortran_int
call_@lapack_func@(GELSD_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
params->A, &params->LDA,
params->B, &params->LDB,
params->S,
params->RCOND, &params->RANK,
params->WORK, &params->LWORK,
params->RWORK, params->IWORK,
&rv);
return rv;
}
/*
 * Allocate and initialize *params for a complex-typed xGELSD call on
 * an (m, n) system with nrhs right-hand sides.
 *
 * Mirrors the real-typed init above, with two differences:
 *   - S holds real (not complex) singular values, so it is sized with
 *     the real base type @frealtyp@;
 *   - an RWORK real scratch buffer is allocated alongside WORK/IWORK,
 *     all sized via the LWORK = -1 workspace query.
 * Returns 1 on success (params->A and params->WORK own the two heap
 * blocks); returns 0 on failure after freeing and zeroing *params.
 */
static inline int
init_@lapack_func@(GELSD_PARAMS_t *params,
fortran_int m,
fortran_int n,
fortran_int nrhs)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *mem_buff2 = NULL;
npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
fortran_int min_m_n = fortran_int_min(m, n);
fortran_int max_m_n = fortran_int_max(m, n);
/* Widen to size_t before multiplying to keep byte-size arithmetic out
 * of fortran_int. */
size_t safe_min_m_n = min_m_n;
size_t safe_max_m_n = max_m_n;
size_t safe_m = m;
size_t safe_n = n;
size_t safe_nrhs = nrhs;
size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
/* B is sized max(m, n) rows: input and solution share this storage. */
size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
/* Singular values are always real, even for complex A. */
size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
fortran_int work_count;
size_t work_size, rwork_size, iwork_size;
fortran_int lda = fortran_int_max(1, m);
fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
mem_buff = malloc(a_size + b_size + s_size);
if (!mem_buff)
goto error;
/* Carve A, B and S out of the single allocation. */
a = mem_buff;
b = a + a_size;
s = b + b_size;
params->M = m;
params->N = n;
params->NRHS = nrhs;
params->A = a;
params->B = b;
params->S = s;
params->LDA = lda;
params->LDB = ldb;
{
/* compute optimal work size */
@ftyp@ work_size_query;
@frealtyp@ rwork_size_query;
fortran_int iwork_size_query;
params->WORK = &work_size_query;
params->IWORK = &iwork_size_query;
params->RWORK = &rwork_size_query;
/* LWORK = -1: query-only call; optimal sizes come back in the
 * first element of each workspace argument. */
params->LWORK = -1;
if (call_@lapack_func@(params) != 0)
goto error;
/* The complex query returns the size in the real part (.r). */
work_count = (fortran_int)work_size_query.r;
work_size = (size_t )work_size_query.r * sizeof(@ftyp@);
rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
}
mem_buff2 = malloc(work_size + rwork_size + iwork_size);
if (!mem_buff2)
goto error;
work = mem_buff2;
rwork = work + work_size;
iwork = rwork + rwork_size;
params->WORK = work;
params->RWORK = rwork;
params->IWORK = iwork;
params->LWORK = work_count;
return 1;
error:
/* free(NULL) is a no-op, so this is safe whichever malloc failed. */
TRACE_TXT("%s failed init\n", __FUNCTION__);
free(mem_buff);
free(mem_buff2);
memset(params, 0, sizeof(*params));
return 0;
}
/**end repeat**/
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
#lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
#dot_func=sdot,ddot,cdotc,zdotc#
#typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
#basetyp = npy_float, npy_double, npy_float, npy_double#
#ftyp = fortran_real, fortran_doublereal,
fortran_complex, fortran_doublecomplex#
#cmplx = 0, 0, 1, 1#
*/
/*
 * Release the two heap blocks acquired by init_@lapack_func@:
 * params->A points at the A/B/S buffer and params->WORK at the
 * workspace buffer (see init above).  Zeroes *params afterwards so a
 * stale pointer cannot be reused.
 */
static inline void
release_@lapack_func@(GELSD_PARAMS_t* params)
{
/* A and WORK contain allocated blocks */
free(params->A);
free(params->WORK);
memset(params, 0, sizeof(*params));
}
/**
 * Compute the squared l2 norm of a contiguous vector of n elements.
 * For complex element types the template expands the accumulation to
 * re*re + im*im per element; the result is always the real base type
 * of @TYPE@.  No overflow/underflow scaling is performed.
 */
static @basetyp@
@TYPE@_abs2(@typ@ *p, npy_intp n) {
npy_intp i;
@basetyp@ res = 0;
for (i = 0; i < n; i++) {
@typ@ el = p[i];
#if @cmplx@
res += el.real*el.real + el.imag*el.imag;
#else
res += el*el;
#endif
}
return res;
}
/*
 * gufunc inner loop implementing lstsq for @TYPE@.
 *
 * Seven operands (INIT_OUTER_LOOP_7 / args[0..6]):
 *   args[0]: a         (m, n)        input matrix
 *   args[1]: b         (m, nrhs)     right-hand sides
 *   args[2]: rcond     scalar, passed straight through to xGELSD
 *   args[3]: x         (n, nrhs)     output: least-squares solution
 *   args[4]: resid     (nrhs,)       output: squared residual sums
 *   args[5]: rank      scalar npy_int output: effective rank
 *   args[6]: s         (min(m, n),)  output: singular values
 *
 * Core dimensions m, n, nrhs are taken from dimensions[0..2].  If the
 * LAPACK call fails for an outer-loop element, its outputs are filled
 * with NaN, rank is set to -1, and the floating-point invalid flag is
 * raised at the end via set_fp_invalid_or_clear.
 */
static void
@TYPE@_lstsq(char **args, npy_intp const *dimensions, npy_intp const *steps,
             void *NPY_UNUSED(func))
{
GELSD_PARAMS_t params;
int error_occurred = get_fp_invalid_and_clear();
fortran_int n, m, nrhs;
fortran_int excess;
INIT_OUTER_LOOP_7
m = (fortran_int)dimensions[0];
n = (fortran_int)dimensions[1];
nrhs = (fortran_int)dimensions[2];
/* Number of equations beyond the number of unknowns; residuals are
 * only computed when excess >= 0 (over/exactly determined system). */
excess = m - n;
if (init_@lapack_func@(&params, m, n, nrhs)) {
LINEARIZE_DATA_t a_in, b_in, x_out, s_out, r_out;
init_linearize_data(&a_in, n, m, steps[1], steps[0]);
/* B shares storage for the m-row input and n-row solution, hence the
 * max(n, m) row-pitch for both b_in and x_out. */
init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
init_linearize_data_ex(&x_out, nrhs, n, steps[5], steps[4], fortran_int_max(n, m));
init_linearize_data(&r_out, 1, nrhs, 1, steps[6]);
init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[7]);
BEGIN_OUTER_LOOP_7
int not_ok;
/* Copy this outer-loop element into the Fortran-layout buffers. */
linearize_@TYPE@_matrix(params.A, args[0], &a_in);
linearize_@TYPE@_matrix(params.B, args[1], &b_in);
params.RCOND = args[2];
not_ok = call_@lapack_func@(&params);
if (!not_ok) {
/* On success xGELSD overwrote B with the solution. */
delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
*(npy_int*) args[5] = params.RANK;
delinearize_@REALTYPE@_matrix(args[6], params.S, &s_out);
/* Note that linalg.lstsq discards this when excess == 0 */
if (excess >= 0 && params.RANK == n) {
/* Compute the residuals as the square sum of each column */
int i;
char *resid = args[4];
/* Per the xGELSD contract, rows n..m-1 of each solution column
 * in B hold the residual components (see LAPACK docs). */
@ftyp@ *components = (@ftyp@ *)params.B + n;
for (i = 0; i < nrhs; i++) {
@ftyp@ *vector = components + i*m;
/* Numpy and fortran floating types are the same size,
 * so this cast is safe */
@basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
memcpy(
resid + i*r_out.column_strides,
&abs2, sizeof(abs2));
}
}
else {
/* Note that this is always discarded by linalg.lstsq */
nan_@REALTYPE@_matrix(args[4], &r_out);
}
} else {
/* LAPACK reported failure: NaN-fill all outputs for this element. */
error_occurred = 1;
nan_@TYPE@_matrix(args[3], &x_out);
nan_@REALTYPE@_matrix(args[4], &r_out);
*(npy_int*) args[5] = -1;
nan_@REALTYPE@_matrix(args[6], &s_out);
}
END_OUTER_LOOP
release_@lapack_func@(&params);
}
set_fp_invalid_or_clear(error_occurred);
}

Not just the final @TYPE@_lstsq definition.

@czgdp1807
Copy link
Member Author

Note you'll need qr versions of all of these lstsq functions:

Sure. So, can we say that this is the kind of pattern to be followed for every new function added to umath_linalg.c.src?

@mattip
Copy link
Member

mattip commented Jun 1, 2021

... this is the kind of pattern to be followed for every new function added ...

It depends on the underlying LAPACK/BLAS functionality. In general, I think it would be better to add functionality to SciPy and not to Numpy. In fact, you may be able to reuse some ideas or code from the scipy.linalg.qr implementation.

@czgdp1807
Copy link
Member Author

In the recent push I have written a generic implementation of QR decomposition. I will add specific ones for 'r', 'raw', 'economic' and 'reduced_m' and 'reduced_n' separately in future commits.

@czgdp1807 czgdp1807 changed the title [WIP] Vectorising np.linalg.qr Vectorising np.linalg.qr Jun 5, 2021
@czgdp1807 czgdp1807 requested a review from mattip June 5, 2021 08:38
@czgdp1807
Copy link
Member Author

Can someone please restart this build on Travis CI? It seems like the build errored due to some issue on the Travis servers. Thanks.

@czgdp1807
Copy link
Member Author

ping @eric-wieser Please let me know if any updates are required to this PR. Thanks.

@czgdp1807
Copy link
Member Author

@eric-wieser @mattip Are there any improvements to be made here? Please let me know.

@mattip
Copy link
Member

mattip commented Jun 13, 2021

@eric-wieser any more review here?

@czgdp1807 czgdp1807 requested a review from seberg June 18, 2021 15:09
@czgdp1807
Copy link
Member Author

I have factored out common code in initialization for sorgqr, dorgqr and their complex counterparts. Let me know if I can do more to reduce the size of the PR. Thanks.

Copy link
Contributor

@pearu pearu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This review part covers only the Python part of the PR: mostly nits, clarifications, and request to improve testing coverage.

numpy/linalg/linalg.py Outdated Show resolved Hide resolved
numpy/linalg/linalg.py Outdated Show resolved Hide resolved
numpy/linalg/linalg.py Show resolved Hide resolved
numpy/linalg/tests/test_linalg.py Outdated Show resolved Hide resolved
numpy/linalg/tests/test_linalg.py Outdated Show resolved Hide resolved
numpy/linalg/tests/test_linalg.py Outdated Show resolved Hide resolved
numpy/linalg/tests/test_linalg.py Outdated Show resolved Hide resolved
numpy/linalg/tests/test_linalg.py Outdated Show resolved Hide resolved
@czgdp1807
Copy link
Member Author

Hi. Thanks a lot for the reviews. I will be addressing them by pushing changes tomorrow morning.

Copy link
Contributor

@pearu pearu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This review part covers the C code. My comments are mostly optimization nits and clarifications.

PS: github did not do a good job in displaying diffs this time..

numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
numpy/linalg/umath_linalg.c.src Outdated Show resolved Hide resolved
@czgdp1807
Copy link
Member Author

czgdp1807 commented Jun 24, 2021

I have addressed the reviews and the tests seem to pass. Please let me know if there are any other concerns or there is something more that can be done. Thank you.

Copy link
Contributor

@pearu pearu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a couple of optimization nits and a BC question.

In addition, it is not clear to me why to define function pairs, qr_r_raw_m and qr_r_raw_n, when they both use the same C implementation. The only difference appears in gufunc descriptor signatures: "(m,n),(m)->(m,m)" vs "(m,n),(n)->(m,n)".
Would it be possible join these signatures to "(m, n),(k)->(m, k)" and let the implementation to check that k == min(m, n)?
The same note applies to other function pairs with _m and _n suffixes.

numpy/linalg/linalg.py Outdated Show resolved Hide resolved
numpy/linalg/linalg.py Outdated Show resolved Hide resolved
numpy/linalg/linalg.py Outdated Show resolved Hide resolved
numpy/linalg/linalg.py Outdated Show resolved Hide resolved
numpy/linalg/linalg.py Outdated Show resolved Hide resolved
@czgdp1807
Copy link
Member Author

In addition, it is not clear to me why to define function pairs, qr_r_raw_m and qr_r_raw_n, when they both use the same C implementation. The only difference appears in gufunc descriptor signatures: "(m,n),(m)->(m,m)" vs "(m,n),(n)->(m,n)".
Would it be possible join these signatures to "(m, n),(k)->(m, k)" and let the implementation to check that k == min(m, n)?
The same note applies to other function pairs with _m and _n suffixes.

Here's the thread - #19151 (comment) - which covers the same.

@pearu
Copy link
Contributor

pearu commented Jul 9, 2021

Here's the thread - #19151 (comment) - which covers the same.

OK, we have:

  • qr_r_raw_m and qr_r_raw_n with signatures "(m,n)->(m)" and "(m,n)->(n)", respectively
  • qr_reduced_m and qr_reduced_n with signatures "(m,n),(m)->(m,m)" and "(m,n),(n)->(m,n)", respectively
  • qr_complete_m and qr_complete_n with signatures "(m,n),(m)->(m,m)" and "(m,n),(n)->(m,m)", respectively.

So:

  • nothing can be done regarding qr_r_raw_*
  • qr_reduced_* signatures can be merged into "(m, n), (k) -> (m, k)", notice that the comment in the referenced thread does not apply here: all rhs dimensions are present in the lhs
  • qr_complete_* signatures can be merged into "(m, n), (k) -> (m, m)", again the thread comment does not apply here.

Copy link
Contributor

@pearu pearu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug report.

numpy/linalg/linalg.py Outdated Show resolved Hide resolved
@czgdp1807
Copy link
Member Author

I have addressed all the reviews. Let me know if there's something else to be done. Thanks.

Copy link
Contributor

@pearu pearu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, thanks @czgdp1807 !

@czgdp1807
Copy link
Member Author

Thanks @pearu for the feedback and thorough reviews.

@mattip mattip merged commit a4e931f into numpy:main Jul 14, 2021
@mattip
Copy link
Member

mattip commented Jul 14, 2021

Thanks @czgdp1807

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Vectorize np.linalg.qr
5 participants